From 747bda7bb5b1644a06734900326847a5d353c448 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 19 Apr 2024 10:29:00 +0200 Subject: fbdev/deferred-io: Provide get_page hook in struct fb_deferred_io Add a callback for drivers to provide framebuffer pages to fbdev's deferred-I/O helpers. Implementations need to acquire a reference on the page before returning it. Returning NULL generates a SIGBUS signal. This will be useful for DRM's fbdev emulation with GEM-shmem buffer objects. v2: - fix typo in commit message (Javier) Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Link: https://patchwork.freedesktop.org/patch/msgid/20240419083331.7761-8-tzimmermann@suse.de --- include/linux/fb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 811e47f9d1c3..5358edbb9c0b 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -225,6 +225,7 @@ struct fb_deferred_io { struct mutex lock; /* mutex that protects the pageref list */ struct list_head pagereflist; /* list of pagerefs for touched pages */ /* callback */ + struct page *(*get_page)(struct fb_info *info, unsigned long offset); void (*deferred_io)(struct fb_info *info, struct list_head *pagelist); }; #endif -- cgit v1.2.3 From c72ceafbd12cf95e088681ae5e535ef1a78bf0ed Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Fri, 29 Mar 2024 16:24:42 -0500 Subject: mm: Introduce AS_INACCESSIBLE for encrypted/confidential memory filemap users like guest_memfd may use page cache pages to allocate/manage memory that is only intended to be accessed by guests via hardware protections like encryption. Writes to memory of this sort in common paths like truncation may cause unexpected behavior such as writing garbage instead of zeros when attempting to zero pages, or worse, triggering hardware protections that are considered fatal as far as the kernel is concerned. Introduce a new address_space flag, AS_INACCESSIBLE, and use this initially to prevent zero'ing of pages during truncation, with the understanding that it is up to the owner of the mapping to handle this specially if needed. This is admittedly a rather blunt solution, but it seems like there are no other places that should take into account the flag to keep its promise. Link: https://lore.kernel.org/lkml/ZR9LYhpxTaTk6PJX@google.com/ Cc: Matthew Wilcox Suggested-by: Sean Christopherson Signed-off-by: Michael Roth Message-ID: <20240329212444.395559-5-michael.roth@amd.com> Acked-by: Vlastimil Babka Signed-off-by: Paolo Bonzini --- include/linux/pagemap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2df35e65557d..f879c1d54da7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -207,6 +207,7 @@ enum mapping_flags { AS_STABLE_WRITES, /* must wait for writeback before modifying folio contents */ AS_UNMOVABLE, /* The mapping cannot be moved, ever */ + AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping */ }; /** -- cgit v1.2.3 From 3bb2531e20bff99f9cd8719ee7aea8bd82687173 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 May 2024 12:54:03 -0400 Subject: KVM: guest_memfd: Add hook for initializing memory guest_memfd pages are generally expected to be in some arch-defined initial state prior to using them for guest memory. For SEV-SNP this initial state is 'private', or 'guest-owned', and requires additional operations to move these pages into a 'private' state by updating the corresponding entries the RMP table. Allow for an arch-defined hook to handle updates of this sort, and go ahead and implement one for x86 so KVM implementations like AMD SVM can register a kvm_x86_ops callback to handle these updates for SEV-SNP guests. The preparation callback is always called when allocating/grabbing folios via gmem, and it is up to the architecture to keep track of whether or not the pages are already in the expected state (e.g. the RMP table in the case of SEV-SNP). In some cases, it is necessary to defer the preparation of the pages to handle things like in-place encryption of initial guest memory payloads before marking these pages as 'private'/'guest-owned'. Add an argument (always true for now) to kvm_gmem_get_folio() that allows for the preparation callback to be bypassed. To detect possible issues in the way userspace initializes memory, it is only possible to add an unprepared page if it is not already included in the filemap. Link: https://lore.kernel.org/lkml/ZLqVdvsF11Ddo7Dq@google.com/ Co-developed-by: Michael Roth Signed-off-by: Michael Roth Message-Id: <20231230172351.574091-5-michael.roth@amd.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index afbc99264ffa..1af069ab657c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2443,4 +2443,9 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, } #endif /* CONFIG_KVM_PRIVATE_MEM */ +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); +bool kvm_arch_gmem_prepare_needed(struct kvm *kvm); +#endif + #endif -- cgit v1.2.3 From 1f6c06b177513e8a47c43e95d1985dbd9cff3ddd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Feb 2024 12:09:06 -0500 Subject: KVM: guest_memfd: Add interface for populating gmem pages with user data During guest run-time, kvm_arch_gmem_prepare() is issued as needed to prepare newly-allocated gmem pages prior to mapping them into the guest. In the case of SEV-SNP, this mainly involves setting the pages to private in the RMP table. However, for the GPA ranges comprising the initial guest payload, which are encrypted/measured prior to starting the guest, the gmem pages need to be accessed prior to setting them to private in the RMP table so they can be initialized with the userspace-provided data. Additionally, an SNP firmware call is needed afterward to encrypt them in-place and measure the contents into the guest's launch digest. While it is possible to bypass the kvm_arch_gmem_prepare() hooks so that this handling can be done in an open-coded/vendor-specific manner, this may expose more gmem-internal state/dependencies to external callers than necessary. Try to avoid this by implementing an interface that tries to handle as much of the common functionality inside gmem as possible, while also making it generic enough to potentially be usable/extensible for TDX as well. Suggested-by: Sean Christopherson Signed-off-by: Michael Roth Co-developed-by: Michael Roth Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1af069ab657c..1ae65774d9fa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2448,4 +2448,31 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord bool kvm_arch_gmem_prepare_needed(struct kvm *kvm); #endif +/** + * kvm_gmem_populate() - Populate/prepare a GPA range with guest data + * + * @kvm: KVM instance + * @gfn: starting GFN to be populated + * @src: userspace-provided buffer containing data to copy into GFN range + * (passed to @post_populate, and incremented on each iteration + * if not NULL) + * @npages: number of pages to copy from userspace-buffer + * @post_populate: callback to issue for each gmem page that backs the GPA + * range + * @opaque: opaque data to pass to @post_populate callback + * + * This is primarily intended for cases where a gmem-backed GPA range needs + * to be initialized with userspace-provided data prior to being mapped into + * the guest as a private page. This should be called with the slots->lock + * held so that caller-enforced invariants regarding the expected memory + * attributes of the GPA range do not race with KVM_SET_MEMORY_ATTRIBUTES. + * + * Returns the number of pages that were populated. + */ +typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + void __user *src, int order, void *opaque); + +long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, + kvm_gmem_populate_cb post_populate, void *opaque); + #endif -- cgit v1.2.3 From a90764f0e4ed5350cc9caeb384e0fb356c032587 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Sat, 30 Dec 2023 11:23:21 -0600 Subject: KVM: guest_memfd: Add hook for invalidating memory In some cases, like with SEV-SNP, guest memory needs to be updated in a platform-specific manner before it can be safely freed back to the host. Wire up arch-defined hooks to the .free_folio kvm_gmem_aops callback to allow for special handling of this sort when freeing memory in response to FALLOC_FL_PUNCH_HOLE operations and when releasing the inode, and go ahead and define an arch-specific hook for x86 since it will be needed for handling memory used for SEV-SNP guests. Signed-off-by: Michael Roth Message-Id: <20231230172351.574091-6-michael.roth@amd.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1ae65774d9fa..b43b96f876fe 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2475,4 +2475,8 @@ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); +#endif + #endif -- cgit v1.2.3 From ad27ce155566f2b4400fa865859834592bd18777 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 1 May 2024 03:51:57 -0500 Subject: KVM: SEV: Add KVM_SEV_SNP_LAUNCH_FINISH command Add a KVM_SEV_SNP_LAUNCH_FINISH command to finalize the cryptographic launch digest which stores the measurement of the guest at launch time. Also extend the existing SNP firmware data structures to support disabling the use of Versioned Chip Endorsement Keys (VCEK) by guests as part of this command. While finalizing the launch flow, the code also issues the LAUNCH_UPDATE SNP firmware commands to encrypt/measure the initial VMSA pages for each configured vCPU, which requires setting the RMP entries for those pages to private, so also add handling to clean up the RMP entries for these pages whening freeing vCPUs during shutdown. Signed-off-by: Brijesh Singh Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Harald Hoyer Signed-off-by: Ashish Kalra Message-ID: <20240501085210.2213060-8-michael.roth@amd.com> Signed-off-by: Paolo Bonzini --- include/linux/psp-sev.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 3705c2044fc0..903ddfea8585 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -658,6 +658,7 @@ struct sev_data_snp_launch_update { * @id_auth_paddr: system physical address of ID block authentication structure * @id_block_en: indicates whether ID block is present * @auth_key_en: indicates whether author key is present in authentication structure + * @vcek_disabled: indicates whether use of VCEK is allowed for attestation reports * @rsvd: reserved * @host_data: host-supplied data for guest, not interpreted by firmware */ @@ -667,7 +668,8 @@ struct sev_data_snp_launch_finish { u64 id_auth_paddr; u8 id_block_en:1; u8 auth_key_en:1; - u64 rsvd:62; + u8 vcek_disabled:1; + u64 rsvd:61; u8 host_data[32]; } __packed; -- cgit v1.2.3 From 3bb8dce41c023e39ec4d79a83742403b5064a276 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 May 2024 09:38:36 +0200 Subject: wifi: ieee80211: add missing doc short descriptions Some structures erroneously don't have a short description, add the missing descriptions. Link: https://msgid.link/20240515093852.16f4355e918e.I940276a4fb006ada68ab1a3e6077e3229fff0f14@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index de2dce743ee2..713266ce48a7 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1101,7 +1101,7 @@ enum ieee80211_vht_opmode_bits { }; /** - * enum ieee80211_s1g_chanwidth + * enum ieee80211_s1g_chanwidth - S1G channel widths * These are defined in IEEE802.11-2016ah Table 10-20 * as BSS Channel Width * @@ -4145,7 +4145,7 @@ enum ieee80211_idle_options { }; /** - * struct ieee80211_bss_max_idle_period_ie + * struct ieee80211_bss_max_idle_period_ie - BSS max idle period element struct * * This structure refers to "BSS Max idle period element" * @@ -4180,7 +4180,7 @@ enum ieee80211_sa_query_action { }; /** - * struct ieee80211_bssid_index + * struct ieee80211_bssid_index - multiple BSSID index element structure * * This structure refers to "Multiple BSSID-index element" * @@ -4195,7 +4195,8 @@ struct ieee80211_bssid_index { }; /** - * struct ieee80211_multiple_bssid_configuration + * struct ieee80211_multiple_bssid_configuration - multiple BSSID configuration + * element structure * * This structure refers to "Multiple BSSID Configuration element" * -- cgit v1.2.3 From 54856871298c9a8717450351699a8e6fe706101c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 May 2024 09:38:38 +0200 Subject: wifi: ieee80211: remove ieee80211_next_tbtt_present() This is actually completely equivalent to the other function ieee80211_is_s1g_short_beacon(), but open-codes the logic. Implement the necessary logic in ieee80211_is_s1g_short_beacon() and remove ieee80211_next_tbtt_present(). Link: https://msgid.link/20240515093852.774ced74dea8.I152525b4cff6e6a25be6c48fe6a4b89f17bab8a9@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 713266ce48a7..72b351abb4f6 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -604,25 +604,15 @@ static inline bool ieee80211_is_s1g_beacon(__le16 fc) } /** - * ieee80211_next_tbtt_present - check if IEEE80211_FTYPE_EXT && - * IEEE80211_STYPE_S1G_BEACON && IEEE80211_S1G_BCN_NEXT_TBTT - * @fc: frame control bytes in little-endian byteorder - */ -static inline bool ieee80211_next_tbtt_present(__le16 fc) -{ - return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) == - cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON) && - fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT); -} - -/** - * ieee80211_is_s1g_short_beacon - check if next tbtt present bit is set. Only - * true for S1G beacons when they're short. + * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is an S1G short beacon, + * i.e. it is an S1G beacon with 'next TBTT' flag set */ static inline bool ieee80211_is_s1g_short_beacon(__le16 fc) { - return ieee80211_is_s1g_beacon(fc) && ieee80211_next_tbtt_present(fc); + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT)); } /** -- cgit v1.2.3 From 3c75e99c7036a87f159dbd526060554afb3482f5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 May 2024 09:38:39 +0200 Subject: wifi: ieee80211: document function return values These are all missing, as pointed out when running kernel-doc. Add return value documentation and fix some small things while at it. Link: https://msgid.link/20240515093852.1cd5ad8f354d.Idc16e9767fa42de80b659c32efc58aea38c26996@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 106 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 84 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 72b351abb4f6..595f83783f0e 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -373,6 +373,7 @@ struct ieee80211_trigger { /** * ieee80211_has_tods - check if IEEE80211_FCTL_TODS is set * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame has to-DS set */ static inline bool ieee80211_has_tods(__le16 fc) { @@ -382,6 +383,7 @@ static inline bool ieee80211_has_tods(__le16 fc) /** * ieee80211_has_fromds - check if IEEE80211_FCTL_FROMDS is set * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame has from-DS set */ static inline bool ieee80211_has_fromds(__le16 fc) { @@ -391,6 +393,7 @@ static inline bool ieee80211_has_fromds(__le16 fc) /** * ieee80211_has_a4 - check if IEEE80211_FCTL_TODS and IEEE80211_FCTL_FROMDS are set * @fc: frame control bytes in little-endian byteorder + * Return: whether or not it's a 4-address frame (from-DS and to-DS set) */ static inline bool ieee80211_has_a4(__le16 fc) { @@ -401,6 +404,7 @@ static inline bool ieee80211_has_a4(__le16 fc) /** * ieee80211_has_morefrags - check if IEEE80211_FCTL_MOREFRAGS is set * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame has more fragments (more frags bit set) */ static inline bool ieee80211_has_morefrags(__le16 fc) { @@ -410,6 +414,7 @@ static inline bool ieee80211_has_morefrags(__le16 fc) /** * ieee80211_has_retry - check if IEEE80211_FCTL_RETRY is set * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the retry flag is set */ static inline bool ieee80211_has_retry(__le16 fc) { @@ -419,6 +424,7 @@ static inline bool ieee80211_has_retry(__le16 fc) /** * ieee80211_has_pm - check if IEEE80211_FCTL_PM is set * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the power management flag is set */ static inline bool ieee80211_has_pm(__le16 fc) { @@ -428,6 +434,7 @@ static inline bool ieee80211_has_pm(__le16 fc) /** * ieee80211_has_moredata - check if IEEE80211_FCTL_MOREDATA is set * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the more data flag is set */ static inline bool ieee80211_has_moredata(__le16 fc) { @@ -437,6 +444,7 @@ static inline bool ieee80211_has_moredata(__le16 fc) /** * ieee80211_has_protected - check if IEEE80211_FCTL_PROTECTED is set * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the protected flag is set */ static inline bool ieee80211_has_protected(__le16 fc) { @@ -446,6 +454,7 @@ static inline bool ieee80211_has_protected(__le16 fc) /** * ieee80211_has_order - check if IEEE80211_FCTL_ORDER is set * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the order flag is set */ static inline bool ieee80211_has_order(__le16 fc) { @@ -455,6 +464,7 @@ static inline bool ieee80211_has_order(__le16 fc) /** * ieee80211_is_mgmt - check if type is IEEE80211_FTYPE_MGMT * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame type is management */ static inline bool ieee80211_is_mgmt(__le16 fc) { @@ -465,6 +475,7 @@ static inline bool ieee80211_is_mgmt(__le16 fc) /** * ieee80211_is_ctl - check if type is IEEE80211_FTYPE_CTL * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame type is control */ static inline bool ieee80211_is_ctl(__le16 fc) { @@ -475,6 +486,7 @@ static inline bool ieee80211_is_ctl(__le16 fc) /** * ieee80211_is_data - check if type is IEEE80211_FTYPE_DATA * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a data frame */ static inline bool ieee80211_is_data(__le16 fc) { @@ -485,6 +497,7 @@ static inline bool ieee80211_is_data(__le16 fc) /** * ieee80211_is_ext - check if type is IEEE80211_FTYPE_EXT * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame type is extended */ static inline bool ieee80211_is_ext(__le16 fc) { @@ -496,6 +509,7 @@ static inline bool ieee80211_is_ext(__le16 fc) /** * ieee80211_is_data_qos - check if type is IEEE80211_FTYPE_DATA and IEEE80211_STYPE_QOS_DATA is set * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a QoS data frame */ static inline bool ieee80211_is_data_qos(__le16 fc) { @@ -510,6 +524,8 @@ static inline bool ieee80211_is_data_qos(__le16 fc) /** * ieee80211_is_data_present - check if type is IEEE80211_FTYPE_DATA and has data * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a QoS data frame that has data + * (i.e. is not null data) */ static inline bool ieee80211_is_data_present(__le16 fc) { @@ -524,6 +540,7 @@ static inline bool ieee80211_is_data_present(__le16 fc) /** * ieee80211_is_assoc_req - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ASSOC_REQ * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is an association request */ static inline bool ieee80211_is_assoc_req(__le16 fc) { @@ -534,6 +551,7 @@ static inline bool ieee80211_is_assoc_req(__le16 fc) /** * ieee80211_is_assoc_resp - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ASSOC_RESP * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is an association response */ static inline bool ieee80211_is_assoc_resp(__le16 fc) { @@ -544,6 +562,7 @@ static inline bool ieee80211_is_assoc_resp(__le16 fc) /** * ieee80211_is_reassoc_req - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_REASSOC_REQ * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a reassociation request */ static inline bool ieee80211_is_reassoc_req(__le16 fc) { @@ -554,6 +573,7 @@ static inline bool ieee80211_is_reassoc_req(__le16 fc) /** * ieee80211_is_reassoc_resp - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_REASSOC_RESP * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a reassociation response */ static inline bool ieee80211_is_reassoc_resp(__le16 fc) { @@ -564,6 +584,7 @@ static inline bool ieee80211_is_reassoc_resp(__le16 fc) /** * ieee80211_is_probe_req - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_PROBE_REQ * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a probe request */ static inline bool ieee80211_is_probe_req(__le16 fc) { @@ -574,6 +595,7 @@ static inline bool ieee80211_is_probe_req(__le16 fc) /** * ieee80211_is_probe_resp - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_PROBE_RESP * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a probe response */ static inline bool ieee80211_is_probe_resp(__le16 fc) { @@ -584,6 +606,7 @@ static inline bool ieee80211_is_probe_resp(__le16 fc) /** * ieee80211_is_beacon - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_BEACON * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a (regular, not S1G) beacon */ static inline bool ieee80211_is_beacon(__le16 fc) { @@ -595,6 +618,7 @@ static inline bool ieee80211_is_beacon(__le16 fc) * ieee80211_is_s1g_beacon - check if IEEE80211_FTYPE_EXT && * IEEE80211_STYPE_S1G_BEACON * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is an S1G beacon */ static inline bool ieee80211_is_s1g_beacon(__le16 fc) { @@ -618,6 +642,7 @@ static inline bool ieee80211_is_s1g_short_beacon(__le16 fc) /** * ieee80211_is_atim - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ATIM * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is an ATIM frame */ static inline bool ieee80211_is_atim(__le16 fc) { @@ -628,6 +653,7 @@ static inline bool ieee80211_is_atim(__le16 fc) /** * ieee80211_is_disassoc - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_DISASSOC * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a disassociation frame */ static inline bool ieee80211_is_disassoc(__le16 fc) { @@ -638,6 +664,7 @@ static inline bool ieee80211_is_disassoc(__le16 fc) /** * ieee80211_is_auth - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_AUTH * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is an authentication frame */ static inline bool ieee80211_is_auth(__le16 fc) { @@ -648,6 +675,7 @@ static inline bool ieee80211_is_auth(__le16 fc) /** * ieee80211_is_deauth - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_DEAUTH * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a deauthentication frame */ static inline bool ieee80211_is_deauth(__le16 fc) { @@ -658,6 +686,7 @@ static inline bool ieee80211_is_deauth(__le16 fc) /** * ieee80211_is_action - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ACTION * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is an action frame */ static inline bool ieee80211_is_action(__le16 fc) { @@ -668,6 +697,7 @@ static inline bool ieee80211_is_action(__le16 fc) /** * ieee80211_is_back_req - check if IEEE80211_FTYPE_CTL && IEEE80211_STYPE_BACK_REQ * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a block-ACK request frame */ static inline bool ieee80211_is_back_req(__le16 fc) { @@ -678,6 +708,7 @@ static inline bool ieee80211_is_back_req(__le16 fc) /** * ieee80211_is_back - check if IEEE80211_FTYPE_CTL && IEEE80211_STYPE_BACK * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a block-ACK frame */ static inline bool ieee80211_is_back(__le16 fc) { @@ -688,6 +719,7 @@ static inline bool ieee80211_is_back(__le16 fc) /** * ieee80211_is_pspoll - check if IEEE80211_FTYPE_CTL && IEEE80211_STYPE_PSPOLL * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a PS-poll frame */ static inline bool ieee80211_is_pspoll(__le16 fc) { @@ -698,6 +730,7 @@ static inline bool ieee80211_is_pspoll(__le16 fc) /** * ieee80211_is_rts - check if IEEE80211_FTYPE_CTL && IEEE80211_STYPE_RTS * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is an RTS frame */ static inline bool ieee80211_is_rts(__le16 fc) { @@ -708,6 +741,7 @@ static inline bool ieee80211_is_rts(__le16 fc) /** * ieee80211_is_cts - check if IEEE80211_FTYPE_CTL && IEEE80211_STYPE_CTS * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a CTS frame */ static inline bool ieee80211_is_cts(__le16 fc) { @@ -718,6 +752,7 @@ static inline bool ieee80211_is_cts(__le16 fc) /** * ieee80211_is_ack - check if IEEE80211_FTYPE_CTL && IEEE80211_STYPE_ACK * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is an ACK frame */ static inline bool ieee80211_is_ack(__le16 fc) { @@ -728,6 +763,7 @@ static inline bool ieee80211_is_ack(__le16 fc) /** * ieee80211_is_cfend - check if IEEE80211_FTYPE_CTL && IEEE80211_STYPE_CFEND * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a CF-end frame */ static inline bool ieee80211_is_cfend(__le16 fc) { @@ -738,6 +774,7 @@ static inline bool ieee80211_is_cfend(__le16 fc) /** * ieee80211_is_cfendack - check if IEEE80211_FTYPE_CTL && IEEE80211_STYPE_CFENDACK * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a CF-end-ack frame */ static inline bool ieee80211_is_cfendack(__le16 fc) { @@ -748,6 +785,7 @@ static inline bool ieee80211_is_cfendack(__le16 fc) /** * ieee80211_is_nullfunc - check if frame is a regular (non-QoS) nullfunc frame * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a nullfunc frame */ static inline bool ieee80211_is_nullfunc(__le16 fc) { @@ -758,6 +796,7 @@ static inline bool ieee80211_is_nullfunc(__le16 fc) /** * ieee80211_is_qos_nullfunc - check if frame is a QoS nullfunc frame * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a QoS nullfunc frame */ static inline bool ieee80211_is_qos_nullfunc(__le16 fc) { @@ -768,6 +807,7 @@ static inline bool ieee80211_is_qos_nullfunc(__le16 fc) /** * ieee80211_is_trigger - check if frame is trigger frame * @fc: frame control field in little-endian byteorder + * Return: whether or not the frame is a trigger frame */ static inline bool ieee80211_is_trigger(__le16 fc) { @@ -778,6 +818,7 @@ static inline bool ieee80211_is_trigger(__le16 fc) /** * ieee80211_is_any_nullfunc - check if frame is regular or QoS nullfunc frame * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is a nullfunc or QoS nullfunc frame */ static inline bool ieee80211_is_any_nullfunc(__le16 fc) { @@ -787,6 +828,8 @@ static inline bool ieee80211_is_any_nullfunc(__le16 fc) /** * ieee80211_is_first_frag - check if IEEE80211_SCTL_FRAG is not set * @seq_ctrl: frame sequence control bytes in little-endian byteorder + * Return: whether or not the frame is the first fragment (also true if + * it's not fragmented at all) */ static inline bool ieee80211_is_first_frag(__le16 seq_ctrl) { @@ -796,6 +839,7 @@ static inline bool ieee80211_is_first_frag(__le16 seq_ctrl) /** * ieee80211_is_frag - check if a frame is a fragment * @hdr: 802.11 header of the frame + * Return: whether or not the frame is a fragment */ static inline bool ieee80211_is_frag(struct ieee80211_hdr *hdr) { @@ -2349,6 +2393,8 @@ struct ieee80211_eht_operation_info { * @max_vht_nss: current maximum NSS as advertised by the STA in * operating mode notification, can be 0 in which case the * capability data will be used to derive this (from MCS support) + * Return: The maximum NSS that can be used for the given bandwidth/MCS + * combination * * Due to the VHT Extended NSS Bandwidth Support, the maximum NSS can * vary for a given BW/MCS. This function parses the data. @@ -4317,6 +4363,7 @@ struct ieee80211_he_6ghz_capa { /** * ieee80211_get_qos_ctl - get pointer to qos control bytes * @hdr: the frame + * Return: a pointer to the QoS control field in the frame header * * The qos ctrl bytes come after the frame_control, duration, seq_num * and 3 or 4 addresses of length ETH_ALEN. Checks frame_control to choose @@ -4339,6 +4386,7 @@ static inline u8 *ieee80211_get_qos_ctl(struct ieee80211_hdr *hdr) /** * ieee80211_get_tid - get qos TID * @hdr: the frame + * Return: the TID from the QoS control field */ static inline u8 ieee80211_get_tid(struct ieee80211_hdr *hdr) { @@ -4350,6 +4398,7 @@ static inline u8 ieee80211_get_tid(struct ieee80211_hdr *hdr) /** * ieee80211_get_SA - get pointer to SA * @hdr: the frame + * Return: a pointer to the source address (SA) * * Given an 802.11 frame, this function returns the offset * to the source address (SA). It does not verify that the @@ -4369,6 +4418,7 @@ static inline u8 *ieee80211_get_SA(struct ieee80211_hdr *hdr) /** * ieee80211_get_DA - get pointer to DA * @hdr: the frame + * Return: a pointer to the destination address (DA) * * Given an 802.11 frame, this function returns the offset * to the destination address (DA). It does not verify that @@ -4387,6 +4437,7 @@ static inline u8 *ieee80211_get_DA(struct ieee80211_hdr *hdr) /** * ieee80211_is_bufferable_mmpdu - check if frame is bufferable MMPDU * @skb: the skb to check, starting with the 802.11 header + * Return: whether or not the MMPDU is bufferable */ static inline bool ieee80211_is_bufferable_mmpdu(struct sk_buff *skb) { @@ -4425,6 +4476,7 @@ static inline bool ieee80211_is_bufferable_mmpdu(struct sk_buff *skb) /** * _ieee80211_is_robust_mgmt_frame - check if frame is a robust management frame * @hdr: the frame (buffer must include at least the first octet of payload) + * Return: whether or not the frame is a robust management frame */ static inline bool _ieee80211_is_robust_mgmt_frame(struct ieee80211_hdr *hdr) { @@ -4461,6 +4513,7 @@ static inline bool _ieee80211_is_robust_mgmt_frame(struct ieee80211_hdr *hdr) /** * ieee80211_is_robust_mgmt_frame - check if skb contains a robust mgmt frame * @skb: the skb containing the frame, length will be checked + * Return: whether or not the frame is a robust management frame */ static inline bool ieee80211_is_robust_mgmt_frame(struct sk_buff *skb) { @@ -4473,6 +4526,7 @@ static inline bool ieee80211_is_robust_mgmt_frame(struct sk_buff *skb) * ieee80211_is_public_action - check if frame is a public action frame * @hdr: the frame * @len: length of the frame + * Return: whether or not the frame is a public action frame */ static inline bool ieee80211_is_public_action(struct ieee80211_hdr *hdr, size_t len) @@ -4518,8 +4572,9 @@ ieee80211_is_protected_dual_of_public_action(struct sk_buff *skb) /** * _ieee80211_is_group_privacy_action - check if frame is a group addressed - * privacy action frame + * privacy action frame * @hdr: the frame + * Return: whether or not the frame is a group addressed privacy action frame */ static inline bool _ieee80211_is_group_privacy_action(struct ieee80211_hdr *hdr) { @@ -4535,8 +4590,9 @@ static inline bool _ieee80211_is_group_privacy_action(struct ieee80211_hdr *hdr) /** * ieee80211_is_group_privacy_action - check if frame is a group addressed - * privacy action frame + * privacy action frame * @skb: the skb containing the frame, length will be checked + * Return: whether or not the frame is a group addressed privacy action frame */ static inline bool ieee80211_is_group_privacy_action(struct sk_buff *skb) { @@ -4548,6 +4604,7 @@ static inline bool ieee80211_is_group_privacy_action(struct sk_buff *skb) /** * ieee80211_tu_to_usec - convert time units (TU) to microseconds * @tu: the TUs + * Return: the time value converted to microseconds */ static inline unsigned long ieee80211_tu_to_usec(unsigned long tu) { @@ -4559,6 +4616,7 @@ static inline unsigned long ieee80211_tu_to_usec(unsigned long tu) * @tim: the TIM IE * @tim_len: length of the TIM IE * @aid: the AID to look for + * Return: whether or not traffic is indicated in the TIM for the given AID */ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, u8 tim_len, u16 aid) @@ -4585,8 +4643,10 @@ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, } /** - * ieee80211_get_tdls_action - get tdls packet action (or -1, if not tdls packet) + * ieee80211_get_tdls_action - get TDLS action code * @skb: the skb containing the frame, length will not be checked + * Return: the TDLS action code, or -1 if it's not an encapsulated TDLS action + * frame * * This function assumes the frame is a data frame, and that the network header * is in the correct place. @@ -4626,6 +4686,7 @@ static inline int ieee80211_get_tdls_action(struct sk_buff *skb) /** * ieee80211_action_contains_tpc - checks if the frame contains TPC element * @skb: the skb containing the frame, length will be checked + * Return: %true if the frame contains a TPC element, %false otherwise * * This function checks if it's either TPC report action frame or Link * Measurement report action frame as defined in IEEE Std. 802.11-2012 8.5.2.5 @@ -4743,6 +4804,7 @@ struct element { * @element: element pointer after for_each_element() or friends * @data: same data pointer as passed to for_each_element() or friends * @datalen: same data length as passed to for_each_element() or friends + * Return: %true if all elements were iterated, %false otherwise; see notes * * This function returns %true if all the data was parsed or considered * while walking the elements. Only use this if your for_each_element() @@ -4946,6 +5008,7 @@ struct ieee80211_mle_tdls_common_info { * ieee80211_mle_common_size - check multi-link element common size * @data: multi-link element, must already be checked for size using * ieee80211_mle_size_ok() + * Return: the size of the multi-link element's "common" subfield */ static inline u8 ieee80211_mle_common_size(const u8 *data) { @@ -4978,11 +5041,10 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) /** * ieee80211_mle_get_link_id - returns the link ID * @data: the basic multi link element + * Return: the link ID, or -1 if not present * * The element is assumed to be of the correct type (BASIC) and big enough, * this must be checked using ieee80211_mle_type_ok(). - * - * If the BSS link ID can't be found, -1 will be returned */ static inline int ieee80211_mle_get_link_id(const u8 *data) { @@ -5002,12 +5064,10 @@ static inline int ieee80211_mle_get_link_id(const u8 *data) /** * ieee80211_mle_get_bss_param_ch_cnt - returns the BSS parameter change count * @data: pointer to the basic multi link element + * Return: the BSS Parameter Change Count field value, or -1 if not present * * The element is assumed to be of the correct type (BASIC) and big enough, * this must be checked using ieee80211_mle_type_ok(). - * - * If the BSS parameter change count value can't be found (the presence bit - * for it is clear), -1 will be returned. */ static inline int ieee80211_mle_get_bss_param_ch_cnt(const u8 *data) @@ -5030,13 +5090,13 @@ ieee80211_mle_get_bss_param_ch_cnt(const u8 *data) /** * ieee80211_mle_get_eml_med_sync_delay - returns the medium sync delay - * @data: pointer to the multi link EHT IE + * @data: pointer to the multi-link element + * Return: the medium synchronization delay field value from the multi-link + * element, or the default value (%IEEE80211_MED_SYNC_DELAY_DEFAULT) + * if not present * * The element is assumed to be of the correct type (BASIC) and big enough, * this must be checked using ieee80211_mle_type_ok(). - * - * If the medium synchronization is not present, then the default value is - * returned. */ static inline u16 ieee80211_mle_get_eml_med_sync_delay(const u8 *data) { @@ -5060,12 +5120,12 @@ static inline u16 ieee80211_mle_get_eml_med_sync_delay(const u8 *data) /** * ieee80211_mle_get_eml_cap - returns the EML capability - * @data: pointer to the multi link EHT IE + * @data: pointer to the multi-link element + * Return: the EML capability field value from the multi-link element, + * or 0 if not present * * The element is assumed to be of the correct type (BASIC) and big enough, * this must be checked using ieee80211_mle_type_ok(). - * - * If the EML capability is not present, 0 will be returned. */ static inline u16 ieee80211_mle_get_eml_cap(const u8 *data) { @@ -5091,13 +5151,12 @@ static inline u16 ieee80211_mle_get_eml_cap(const u8 *data) /** * ieee80211_mle_get_mld_capa_op - returns the MLD capabilities and operations. - * @data: pointer to the multi link EHT IE + * @data: pointer to the multi-link element + * Return: the MLD capabilities and operations field value from the multi-link + * element, or 0 if not present * * The element is assumed to be of the correct type (BASIC) and big enough, * this must be checked using ieee80211_mle_type_ok(). - * - * If the MLD capabilities and operations field is not present, 0 will be - * returned. */ static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data) { @@ -5128,12 +5187,11 @@ static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data) /** * ieee80211_mle_get_mld_id - returns the MLD ID - * @data: pointer to the multi link element + * @data: pointer to the multi-link element + * Return: The MLD ID in the given multi-link element, or 0 if not present * * The element is assumed to be of the correct type (BASIC) and big enough, * this must be checked using ieee80211_mle_type_ok(). - * - * If the MLD ID is not present, 0 will be returned. */ static inline u8 ieee80211_mle_get_mld_id(const u8 *data) { @@ -5168,6 +5226,7 @@ static inline u8 ieee80211_mle_get_mld_id(const u8 *data) * ieee80211_mle_size_ok - validate multi-link element size * @data: pointer to the element data * @len: length of the containing element + * Return: whether or not the multi-link element size is OK */ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) { @@ -5237,6 +5296,7 @@ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) * @data: pointer to the element data * @type: expected type of the element * @len: length of the containing element + * Return: whether or not the multi-link element type matches and size is OK */ static inline bool ieee80211_mle_type_ok(const u8 *data, u8 type, size_t len) { @@ -5280,6 +5340,7 @@ struct ieee80211_mle_per_sta_profile { * profile size * @data: pointer to the sub element data * @len: length of the containing sub element + * Return: %true if the STA profile is large enough, %false otherwise */ static inline bool ieee80211_mle_basic_sta_prof_size_ok(const u8 *data, size_t len) @@ -5364,6 +5425,7 @@ ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(const struct ieee80211_mle_per_sta * element sta profile size. * @data: pointer to the sub element data * @len: length of the containing sub element + * Return: %true if the STA profile is large enough, %false otherwise */ static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data, size_t len) -- cgit v1.2.3 From 9d222c12834d80849afbad7b42822b8ffbbc025f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 May 2024 09:38:40 +0200 Subject: wifi: ieee80211: document two FTM related functions Add some documentation to ieee80211_is_timing_measurement() and ieee80211_is_ftm(). Link: https://msgid.link/20240515093852.229aa69e972c.Ifae6762a698e79cd5a49a055fe4c32330e826200@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 595f83783f0e..fb50a99daa93 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4731,6 +4731,11 @@ static inline bool ieee80211_action_contains_tpc(struct sk_buff *skb) return true; } +/** + * ieee80211_is_timing_measurement - check if frame is timing measurement response + * @skb: the SKB to check + * Return: whether or not the frame is a valid timing measurement response + */ static inline bool ieee80211_is_timing_measurement(struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *)skb->data; @@ -4750,6 +4755,11 @@ static inline bool ieee80211_is_timing_measurement(struct sk_buff *skb) return false; } +/** + * ieee80211_is_ftm - check if frame is FTM response + * @skb: the SKB to check + * Return: whether or not the frame is a valid FTM response action frame + */ static inline bool ieee80211_is_ftm(struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *)skb->data; -- cgit v1.2.3 From 8592fd7ccc95a7cf222985e6297041463e8b4e9d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 6 May 2024 21:37:54 +0200 Subject: wifi: ieee80211/ath11k: remove IEEE80211_MAX_NUM_PWR_LEVEL The define IEEE80211_MAX_NUM_PWR_LEVEL doesn't make much sense. Yes, that table has a maximum value of 8, and the table will actually remain that way, but EHT introduced a way to encode more levels for 320 MHz channels. Remove IEEE80211_MAX_NUM_PWR_LEVEL and, for ath11k being the only user, add ATH11K_NUM_PWR_LEVELS, where it makes sense since it cannot support 320 MHz channels. Acked-by: Kalle Valo Link: https://msgid.link/20240506214536.9818e5471055.Icece7e47e963d6b68e0d97ba13c102b37fbaa689@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index fb50a99daa93..1c3a683a3ee2 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2833,11 +2833,6 @@ struct ieee80211_he_6ghz_oper { * So it it totally max 8 Transmit Power Envelope element. */ #define IEEE80211_TPE_MAX_IE_COUNT 8 -/* - * In "Table 9-277—Meaning of Maximum Transmit Power Count subfield" - * of "IEEE Std 802.11ax™‐2021", the max power level is 8. - */ -#define IEEE80211_MAX_NUM_PWR_LEVEL 8 #define IEEE80211_TPE_MAX_POWER_COUNT 8 -- cgit v1.2.3 From 39dc8b8ea387ce7f4fe2d2d6d550ed52aa9aa040 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 6 May 2024 21:37:56 +0200 Subject: wifi: mac80211: pass parsed TPE data to drivers Instead of passing the full TPE elements, in all their glory and mixed up data formats for HE backward compatibility, parse them fully into the right values, and pass that to the drivers. Also introduce proper validation already in mac80211, so that drivers don't need to do it, and parse the EHT portions. The code now passes the values in the right order according to the channel used by an interface, which could also be a subset of the data advertised by the AP, if we couldn't connect with the full bandwidth (for whatever reason.) Also add kunit tests for the more complicated bits of it. Reviewed-by: Miriam Rachel Korenblit Acked-by: Kalle Valo Link: https://msgid.link/20240506214536.2aa839969b60.I265b28209e0b29772b2f125f7f83de44a4da877b@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 102 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 1c3a683a3ee2..769008a51809 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2825,17 +2825,6 @@ struct ieee80211_he_6ghz_oper { u8 minrate; } __packed; -/* - * In "9.4.2.161 Transmit Power Envelope element" of "IEEE Std 802.11ax-2021", - * it show four types in "Table 9-275a-Maximum Transmit Power Interpretation - * subfield encoding", and two category for each type in "Table E-12-Regulatory - * Info subfield encoding in the United States". - * So it it totally max 8 Transmit Power Envelope element. - */ -#define IEEE80211_TPE_MAX_IE_COUNT 8 - -#define IEEE80211_TPE_MAX_POWER_COUNT 8 - /* transmit power interpretation type of transmit power envelope element */ enum ieee80211_tx_power_intrpt_type { IEEE80211_TPE_LOCAL_EIRP, @@ -2844,24 +2833,107 @@ enum ieee80211_tx_power_intrpt_type { IEEE80211_TPE_REG_CLIENT_EIRP_PSD, }; +/* category type of transmit power envelope element */ +enum ieee80211_tx_power_category_6ghz { + IEEE80211_TPE_CAT_6GHZ_DEFAULT = 0, + IEEE80211_TPE_CAT_6GHZ_SUBORDINATE = 1, +}; + +/* + * For IEEE80211_TPE_LOCAL_EIRP / IEEE80211_TPE_REG_CLIENT_EIRP, + * setting to 63.5 dBm means no constraint. + */ +#define IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT 127 + +/* + * For IEEE80211_TPE_LOCAL_EIRP_PSD / IEEE80211_TPE_REG_CLIENT_EIRP_PSD, + * setting to 127 indicates no PSD limit for the 20 MHz channel. + */ +#define IEEE80211_TPE_PSD_NO_LIMIT 127 + /** * struct ieee80211_tx_pwr_env - Transmit Power Envelope - * @tx_power_info: Transmit Power Information field - * @tx_power: Maximum Transmit Power field + * @info: Transmit Power Information field + * @variable: Maximum Transmit Power field * * This structure represents the payload of the "Transmit Power * Envelope element" as described in IEEE Std 802.11ax-2021 section * 9.4.2.161 */ struct ieee80211_tx_pwr_env { - u8 tx_power_info; - s8 tx_power[IEEE80211_TPE_MAX_POWER_COUNT]; + u8 info; + u8 variable[]; } __packed; #define IEEE80211_TX_PWR_ENV_INFO_COUNT 0x7 #define IEEE80211_TX_PWR_ENV_INFO_INTERPRET 0x38 #define IEEE80211_TX_PWR_ENV_INFO_CATEGORY 0xC0 +#define IEEE80211_TX_PWR_ENV_EXT_COUNT 0xF + +static inline bool ieee80211_valid_tpe_element(const u8 *data, u8 len) +{ + const struct ieee80211_tx_pwr_env *env = (const void *)data; + u8 count, interpret, category; + u8 needed = sizeof(*env); + u8 N; /* also called N in the spec */ + + if (len < needed) + return false; + + count = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_COUNT); + interpret = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_INTERPRET); + category = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_CATEGORY); + + switch (category) { + case IEEE80211_TPE_CAT_6GHZ_DEFAULT: + case IEEE80211_TPE_CAT_6GHZ_SUBORDINATE: + break; + default: + return false; + } + + switch (interpret) { + case IEEE80211_TPE_LOCAL_EIRP: + case IEEE80211_TPE_REG_CLIENT_EIRP: + if (count > 3) + return false; + + /* count == 0 encodes 1 value for 20 MHz, etc. */ + needed += count + 1; + + if (len < needed) + return false; + + /* there can be extension fields not accounted for in 'count' */ + + return true; + case IEEE80211_TPE_LOCAL_EIRP_PSD: + case IEEE80211_TPE_REG_CLIENT_EIRP_PSD: + if (count > 4) + return false; + + N = count ? 1 << (count - 1) : 1; + needed += N; + + if (len < needed) + return false; + + if (len > needed) { + u8 K = u8_get_bits(env->variable[N], + IEEE80211_TX_PWR_ENV_EXT_COUNT); + + needed += 1 + K; + if (len < needed) + return false; + } + + return true; + } + + return false; +} + /* * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size * @he_oper_ie: byte data of the He Operations IE, stating from the byte -- cgit v1.2.3 From 4d25ca2d6801cfcf26f7f39c561611ba5be99bf8 Mon Sep 17 00:00:00 2001 From: Abhishek Chauhan Date: Thu, 9 May 2024 14:18:32 -0700 Subject: net: Rename mono_delivery_time to tstamp_type for scalabilty mono_delivery_time was added to check if skb->tstamp has delivery time in mono clock base (i.e. EDT) otherwise skb->tstamp has timestamp in ingress and delivery_time at egress. Renaming the bitfield from mono_delivery_time to tstamp_type is for extensibilty for other timestamps such as userspace timestamp (i.e. SO_TXTIME) set via sock opts. As we are renaming the mono_delivery_time to tstamp_type, it makes sense to start assigning tstamp_type based on enum defined in this commit. Earlier we used bool arg flag to check if the tstamp is mono in function skb_set_delivery_time, Now the signature of the functions accepts tstamp_type to distinguish between mono and real time. Also skb_set_delivery_type_by_clockid is a new function which accepts clockid to determine the tstamp_type. In future tstamp_type:1 can be extended to support userspace timestamp by increasing the bitfield. Signed-off-by: Abhishek Chauhan Reviewed-by: Willem de Bruijn Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509211834.3235191-2-quic_abchauha@quicinc.com Signed-off-by: Martin KaFai Lau --- include/linux/skbuff.h | 52 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c0b97c93a6de..3a721cc3b644 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -706,6 +706,11 @@ typedef unsigned int sk_buff_data_t; typedef unsigned char *sk_buff_data_t; #endif +enum skb_tstamp_type { + SKB_CLOCK_REALTIME, + SKB_CLOCK_MONOTONIC, +}; + /** * DOC: Basic sk_buff geometry * @@ -823,10 +828,8 @@ typedef unsigned char *sk_buff_data_t; * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required - * @mono_delivery_time: When set, skb->tstamp has the - * delivery_time in mono clock base (i.e. EDT). Otherwise, the - * skb->tstamp has the (rcv) timestamp at ingress and - * delivery_time at egress. + * @tstamp_type: When set, skb->tstamp has the + * delivery_time clock base of skb->tstamp. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @alloc_cpu: CPU which did the skb allocation. @@ -954,7 +957,7 @@ struct sk_buff { /* private: */ __u8 __mono_tc_offset[0]; /* public: */ - __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ + __u8 tstamp_type:1; /* See skb_tstamp_type */ #ifdef CONFIG_NET_XGRESS __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ __u8 tc_skip_classify:1; @@ -4183,7 +4186,7 @@ static inline void skb_get_new_timestampns(const struct sk_buff *skb, static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); - skb->mono_delivery_time = 0; + skb->tstamp_type = SKB_CLOCK_REALTIME; } static inline ktime_t net_timedelta(ktime_t t) @@ -4192,10 +4195,33 @@ static inline ktime_t net_timedelta(ktime_t t) } static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, - bool mono) + u8 tstamp_type) { skb->tstamp = kt; - skb->mono_delivery_time = kt && mono; + + if (kt) + skb->tstamp_type = tstamp_type; + else + skb->tstamp_type = SKB_CLOCK_REALTIME; +} + +static inline void skb_set_delivery_type_by_clockid(struct sk_buff *skb, + ktime_t kt, clockid_t clockid) +{ + u8 tstamp_type = SKB_CLOCK_REALTIME; + + switch (clockid) { + case CLOCK_REALTIME: + break; + case CLOCK_MONOTONIC: + tstamp_type = SKB_CLOCK_MONOTONIC; + break; + default: + WARN_ON_ONCE(1); + kt = 0; + } + + skb_set_delivery_time(skb, kt, tstamp_type); } DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); @@ -4205,8 +4231,8 @@ DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); */ static inline void skb_clear_delivery_time(struct sk_buff *skb) { - if (skb->mono_delivery_time) { - skb->mono_delivery_time = 0; + if (skb->tstamp_type) { + skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); else @@ -4216,7 +4242,7 @@ static inline void skb_clear_delivery_time(struct sk_buff *skb) static inline void skb_clear_tstamp(struct sk_buff *skb) { - if (skb->mono_delivery_time) + if (skb->tstamp_type) return; skb->tstamp = 0; @@ -4224,7 +4250,7 @@ static inline void skb_clear_tstamp(struct sk_buff *skb) static inline ktime_t skb_tstamp(const struct sk_buff *skb) { - if (skb->mono_delivery_time) + if (skb->tstamp_type) return 0; return skb->tstamp; @@ -4232,7 +4258,7 @@ static inline ktime_t skb_tstamp(const struct sk_buff *skb) static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond) { - if (!skb->mono_delivery_time && skb->tstamp) + if (skb->tstamp_type != SKB_CLOCK_MONOTONIC && skb->tstamp) return skb->tstamp; if (static_branch_unlikely(&netstamp_needed_key) || cond) -- cgit v1.2.3 From 1693c5db6ab8262e6f5263f9d211855959aa5acd Mon Sep 17 00:00:00 2001 From: Abhishek Chauhan Date: Thu, 9 May 2024 14:18:33 -0700 Subject: net: Add additional bit to support clockid_t timestamp type tstamp_type is now set based on actual clockid_t compressed into 2 bits. To make the design scalable for future needs this commit bring in the change to extend the tstamp_type:1 to tstamp_type:2 to support other clockid_t timestamp. We now support CLOCK_TAI as part of tstamp_type as part of this commit with existing support CLOCK_MONOTONIC and CLOCK_REALTIME. Signed-off-by: Abhishek Chauhan Reviewed-by: Willem de Bruijn Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509211834.3235191-3-quic_abchauha@quicinc.com Signed-off-by: Martin KaFai Lau --- include/linux/skbuff.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3a721cc3b644..1e5c97daaa37 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -709,6 +709,8 @@ typedef unsigned char *sk_buff_data_t; enum skb_tstamp_type { SKB_CLOCK_REALTIME, SKB_CLOCK_MONOTONIC, + SKB_CLOCK_TAI, + __SKB_CLOCK_MAX = SKB_CLOCK_TAI, }; /** @@ -957,7 +959,7 @@ struct sk_buff { /* private: */ __u8 __mono_tc_offset[0]; /* public: */ - __u8 tstamp_type:1; /* See skb_tstamp_type */ + __u8 tstamp_type:2; /* See skb_tstamp_type */ #ifdef CONFIG_NET_XGRESS __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ __u8 tc_skip_classify:1; @@ -1087,15 +1089,16 @@ struct sk_buff { #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) -/* if you move tc_at_ingress or mono_delivery_time +/* if you move tc_at_ingress or tstamp_type * around, you also must adapt these constants. */ #ifdef __BIG_ENDIAN_BITFIELD -#define SKB_MONO_DELIVERY_TIME_MASK (1 << 7) -#define TC_AT_INGRESS_MASK (1 << 6) +#define SKB_TSTAMP_TYPE_MASK (3 << 6) +#define SKB_TSTAMP_TYPE_RSHIFT (6) +#define TC_AT_INGRESS_MASK (1 << 5) #else -#define SKB_MONO_DELIVERY_TIME_MASK (1 << 0) -#define TC_AT_INGRESS_MASK (1 << 1) +#define SKB_TSTAMP_TYPE_MASK (3) +#define TC_AT_INGRESS_MASK (1 << 2) #endif #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) @@ -4216,6 +4219,9 @@ static inline void skb_set_delivery_type_by_clockid(struct sk_buff *skb, case CLOCK_MONOTONIC: tstamp_type = SKB_CLOCK_MONOTONIC; break; + case CLOCK_TAI: + tstamp_type = SKB_CLOCK_TAI; + break; default: WARN_ON_ONCE(1); kt = 0; -- cgit v1.2.3 From 2c1713a8f1c94033a6e00aae4693ab03e8a3b9f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 18 May 2024 16:58:47 +0200 Subject: bpf: constify member bpf_sysctl_kern:: Table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysctl core is preparing to only expose instances of struct ctl_table as "const". This will also affect the ctl_table argument of sysctl handlers, for which bpf_sysctl_kern::table is also used. As the function prototype of all sysctl handlers throughout the tree needs to stay consistent that change will be done in one commit. To reduce the size of that final commit, switch this utility type which is not bound by "typedef proc_handler" to "const struct ctl_table". No functional change. Signed-off-by: Thomas Weißschuh Signed-off-by: Daniel Borkmann Reviewed-by: Joel Granados Link: https://lore.kernel.org/bpf/20240518-sysctl-const-handler-bpf-v1-1-f0d7186743c1@weissschuh.net --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 0f12cf01070e..b02aea291b7e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1406,7 +1406,7 @@ struct bpf_sock_ops_kern { struct bpf_sysctl_kern { struct ctl_table_header *head; - struct ctl_table *table; + const struct ctl_table *table; void *cur_val; size_t cur_len; void *new_val; -- cgit v1.2.3 From 73e75e6fc352bdca08f7e0893d5b6bb37171bdd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Tue, 21 May 2024 11:21:26 +0200 Subject: cgroup/pids: Separate semantics of pids.events related to pids.max MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, when pids.max limit is breached in the hierarchy, the event is counted and reported in the cgroup where the forking task resides. This decouples the limit and the notification caused by the limit making it hard to detect when the actual limit was effected. Redefine the pids.events:max as: the number of times the limit of the cgroup was hit. (Implementation differentiates also "forkfail" event but this is currently not exposed as it would better fit into pids.stat. It also differs from pids.events:max only when pids.max is configured on non-leaf cgroups.) Since it changes semantics of the original "max" event, introduce this change only in the v2 API of the controller and add a cgroup2 mount option to revert to the legacy behavior. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ea48c861cd36..b36690ca0d3f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -119,7 +119,12 @@ enum { /* * Enable hugetlb accounting for the memory controller. */ - CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), + CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), + + /* + * Enable legacy local pids.events. + */ + CGRP_ROOT_PIDS_LOCAL_EVENTS = (1 << 20), }; /* cftype->flags */ -- cgit v1.2.3 From 7a147670035ddec35f3fb2ace538ad56ad82c861 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 12 May 2024 13:11:21 +0200 Subject: regulator: consumer: Reorder fields in 'struct regulator_bulk_data' Based on pahole, 2 holes can be combined in 'struct regulator_bulk_data'. On x86_64 and allmodconfig, this shrinks the size of the structure from 32 to 24 bytes. This is usually a win, because this structure is often used for static global variables. As an example: Before: text data bss dec hex filename 3557 162 0 3719 e87 drivers/gpu/drm/msm/dsi/dsi_cfg.o After: text data bss dec hex filename 3477 162 0 3639 e37 drivers/gpu/drm/msm/dsi/dsi_cfg.o Signed-off-by: Christophe JAILLET Link: https://msgid.link/r/35c4edf2dbc6d4f24fb771341ded2989ae32f779.1715512259.git.christophe.jaillet@wanadoo.fr Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 59d0b9a79e6e..e6f81fc1fb17 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -128,11 +128,11 @@ struct regulator; * * @supply: The name of the supply. Initialised by the user before * using the bulk regulator APIs. + * @consumer: The regulator consumer for the supply. This will be managed + * by the bulk API. * @init_load_uA: After getting the regulator, regulator_set_load() will be * called with this load. Initialised by the user before * using the bulk regulator APIs. - * @consumer: The regulator consumer for the supply. This will be managed - * by the bulk API. * * The regulator APIs provide a series of regulator_bulk_() API calls as * a convenience to consumers which require multiple supplies. This @@ -140,8 +140,8 @@ struct regulator; */ struct regulator_bulk_data { const char *supply; - int init_load_uA; struct regulator *consumer; + int init_load_uA; /* private: Internal use */ int ret; -- cgit v1.2.3 From f261172d39f358dcecce13c310690d3937e0cca6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 17 May 2024 22:40:20 +0300 Subject: spi: bitbang: Use typedef for txrx_*() callbacks With a typedef for the txrx_*() callbacks the code looks neater. Note that typedef for a function is okay to have. Signed-off-by: Andy Shevchenko Link: https://msgid.link/r/20240517194104.747328-2-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi_bitbang.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h index b930eca2ef7b..7ca08b430ed5 100644 --- a/include/linux/spi/spi_bitbang.h +++ b/include/linux/spi/spi_bitbang.h @@ -4,6 +4,8 @@ #include +typedef u32 (*spi_bb_txrx_word_fn)(struct spi_device *, unsigned int, u32, u8, unsigned int); + struct spi_bitbang { struct mutex lock; u8 busy; @@ -28,9 +30,8 @@ struct spi_bitbang { int (*txrx_bufs)(struct spi_device *spi, struct spi_transfer *t); /* txrx_word[SPI_MODE_*]() just looks like a shift register */ - u32 (*txrx_word[4])(struct spi_device *spi, - unsigned nsecs, - u32 word, u8 bits, unsigned flags); + spi_bb_txrx_word_fn txrx_word[4]; + int (*set_line_direction)(struct spi_device *spi, bool output); }; -- cgit v1.2.3 From b90cc232e2ce8c959b19dc4b183e23e7aec137ab Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 17 May 2024 22:40:22 +0300 Subject: spi: bitbang: Replace hard coded number of SPI modes Instead of using hard coded number of modes, replace it with SPI_MODE_X_MASK + 1 to show relation to the SPI modes. Signed-off-by: Andy Shevchenko Link: https://msgid.link/r/20240517194104.747328-4-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi_bitbang.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h index 7ca08b430ed5..d4cb83195f7a 100644 --- a/include/linux/spi/spi_bitbang.h +++ b/include/linux/spi/spi_bitbang.h @@ -30,7 +30,7 @@ struct spi_bitbang { int (*txrx_bufs)(struct spi_device *spi, struct spi_transfer *t); /* txrx_word[SPI_MODE_*]() just looks like a shift register */ - spi_bb_txrx_word_fn txrx_word[4]; + spi_bb_txrx_word_fn txrx_word[SPI_MODE_X_MASK + 1]; int (*set_line_direction)(struct spi_device *spi, bool output); }; -- cgit v1.2.3 From 983095eaf6c161ef73d96152bfc1a99ca051cd57 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 25 May 2024 18:00:31 +0200 Subject: dma-buf/fence-array: Add flex array to struct dma_fence_array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is an effort to get rid of all multiplications from allocation functions in order to prevent integer overflows [1][2]. The "struct dma_fence_array" can be refactored to add a flex array in order to have the "callback structures allocated behind the array" be more explicit. Do so: - makes the code more readable and safer. - allows using __counted_by() for additional checks - avoids some pointer arithmetic in dma_fence_array_enable_signaling() Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [1] Link: https://github.com/KSPP/linux/issues/160 [2] Signed-off-by: Christophe JAILLET Reviewed-by: Kees Cook Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/8b4e556e07b5dd78bb8a39b67ea0a43b199083c8.1716652811.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christian König --- include/linux/dma-fence-array.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h index ec7f25def392..29c5650c1038 100644 --- a/include/linux/dma-fence-array.h +++ b/include/linux/dma-fence-array.h @@ -33,6 +33,7 @@ struct dma_fence_array_cb { * @num_pending: fences in the array still pending * @fences: array of the fences * @work: internal irq_work function + * @callbacks: array of callback helpers */ struct dma_fence_array { struct dma_fence base; @@ -43,6 +44,8 @@ struct dma_fence_array { struct dma_fence **fences; struct irq_work work; + + struct dma_fence_array_cb callbacks[] __counted_by(num_fences); }; /** -- cgit v1.2.3 From 0edb555a65d1ef047a9805051c36922b52a38a9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 9 Oct 2023 12:37:26 +0200 Subject: platform: Make platform_driver::remove() return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct platform_driver::remove returning an integer made driver authors expect that returning an error code was proper error handling. However the driver core ignores the error and continues to remove the device because there is nothing the core could do anyhow and reentering the remove callback again is only calling for trouble. To prevent such wrong assumptions, change the return type of the remove callback to void. This was prepared by introducing an alternative remove callback returning void and converting all drivers to that. So .remove() can be changed without further changes in drivers. This corresponds to step b) of the plan outlined in commit 5c5a7680e67b ("platform: Provide a remove callback that returns no value"). Signed-off-by: Uwe Kleine-König --- include/linux/platform_device.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 7a41c72c1959..d422db6eec63 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -237,15 +237,14 @@ struct platform_driver { int (*probe)(struct platform_device *); /* - * Traditionally the remove callback returned an int which however is - * ignored by the driver core. This led to wrong expectations by driver - * authors who thought returning an error code was a valid error - * handling strategy. To convert to a callback returning void, new - * drivers should implement .remove_new() until the conversion it done - * that eventually makes .remove() return void. + * .remove_new() is a relic from a prototype conversion of .remove(). + * New drivers are supposed to implement .remove(). Once all drivers are + * converted to not use .remove_new any more, it will be dropped. */ - int (*remove)(struct platform_device *); - void (*remove_new)(struct platform_device *); + union { + void (*remove)(struct platform_device *); + void (*remove_new)(struct platform_device *); + }; void (*shutdown)(struct platform_device *); int (*suspend)(struct platform_device *, pm_message_t state); -- cgit v1.2.3 From c80c4490c280a1678e47d34d2a335a58f1318615 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 May 2024 12:45:31 +0000 Subject: cleanup: Standardize the header guard define's name At some point during early development, the header must have been named , as evidenced by the header guard name: #ifndef __LINUX_GUARDS_H #define __LINUX_GUARDS_H It ended up being , but the old guard name for a file name that was never upstream never changed. Do that now - and while at it, also use the canonical _LINUX prefix, instead of the less common __LINUX prefix. Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/171664113181.10875.8784434350512348496.tip-bot2@tip-bot2 --- include/linux/cleanup.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index c2d09bc4f976..cef68e8e09b6 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_GUARDS_H -#define __LINUX_GUARDS_H +#ifndef _LINUX_CLEANUP_H +#define _LINUX_CLEANUP_H #include @@ -247,4 +247,4 @@ __DEFINE_LOCK_GUARD_0(_name, _lock) { return class_##_name##_lock_ptr(_T); } -#endif /* __LINUX_GUARDS_H */ +#endif /* _LINUX_CLEANUP_H */ -- cgit v1.2.3 From 5350f6ec55df381cd99db2a6bde283328fb3f75a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 3 May 2024 21:42:30 +0300 Subject: mtd: cfi: Get rid of redundant 'else' In the snippets like the following if (...) return / goto / break / continue ...; else ... the 'else' is redundant. Get rid of it. Signed-off-by: Andy Shevchenko Reviewed-by: Jeff Johnson Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20240503184230.2927283-1-andriy.shevchenko@linux.intel.com --- include/linux/mtd/cfi.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index 947410faf9e2..35ca19ae21ae 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -308,32 +308,32 @@ static inline uint8_t cfi_read_query(struct map_info *map, uint32_t addr) { map_word val = map_read(map, addr); - if (map_bankwidth_is_1(map)) { + if (map_bankwidth_is_1(map)) return val.x[0]; - } else if (map_bankwidth_is_2(map)) { + if (map_bankwidth_is_2(map)) return cfi16_to_cpu(map, val.x[0]); - } else { - /* No point in a 64-bit byteswap since that would just be - swapping the responses from different chips, and we are - only interested in one chip (a representative sample) */ - return cfi32_to_cpu(map, val.x[0]); - } + /* + * No point in a 64-bit byteswap since that would just be + * swapping the responses from different chips, and we are + * only interested in one chip (a representative sample) + */ + return cfi32_to_cpu(map, val.x[0]); } static inline uint16_t cfi_read_query16(struct map_info *map, uint32_t addr) { map_word val = map_read(map, addr); - if (map_bankwidth_is_1(map)) { + if (map_bankwidth_is_1(map)) return val.x[0] & 0xff; - } else if (map_bankwidth_is_2(map)) { + if (map_bankwidth_is_2(map)) return cfi16_to_cpu(map, val.x[0]); - } else { - /* No point in a 64-bit byteswap since that would just be - swapping the responses from different chips, and we are - only interested in one chip (a representative sample) */ - return cfi32_to_cpu(map, val.x[0]); - } + /* + * No point in a 64-bit byteswap since that would just be + * swapping the responses from different chips, and we are + * only interested in one chip (a representative sample) + */ + return cfi32_to_cpu(map, val.x[0]); } void cfi_udelay(int us); -- cgit v1.2.3 From 56813b244e5f2ace887f04c4a69a2a0041992297 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 27 May 2024 11:27:47 +0200 Subject: w1: Add missing newline and fix typos in w1_bus_master comment - Add missing newline before @return - s/bytes/byte/ - s/handles/handle/ - s/exists/exist/ in dev_info() message Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20240527092746.263038-2-thorsten.blum@toblux.com [krzysztof: squash "w1: Fix typo in dev_info() message"] Signed-off-by: Krzysztof Kozlowski --- include/linux/w1.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/w1.h b/include/linux/w1.h index 9a2a0ef39018..064805bfae3f 100644 --- a/include/linux/w1.h +++ b/include/linux/w1.h @@ -85,7 +85,8 @@ typedef void (*w1_slave_found_callback)(struct w1_master *, u64); * * @data: the first parameter in all the functions below * - * @read_bit: Sample the line level @return the level read (0 or 1) + * @read_bit: Sample the line level + * @return the level read (0 or 1) * * @write_bit: Sets the line level * @@ -95,7 +96,7 @@ typedef void (*w1_slave_found_callback)(struct w1_master *, u64); * touch_bit(1) = write-1 / read cycle * @return the bit read (0 or 1) * - * @read_byte: Reads a bytes. Same as 8 touch_bit(1) calls. + * @read_byte: Reads a byte. Same as 8 touch_bit(1) calls. * @return the byte read * * @write_byte: Writes a byte. Same as 8 touch_bit(x) calls. @@ -114,7 +115,7 @@ typedef void (*w1_slave_found_callback)(struct w1_master *, u64); * @set_pullup: Put out a strong pull-up pulse of the specified duration. * @return -1=Error, 0=completed * - * @search: Really nice hardware can handles the different types of ROM search + * @search: Really nice hardware can handle the different types of ROM search * w1_master* is passed to the slave found callback. * u8 is search_type, W1_SEARCH or W1_ALARM_SEARCH * -- cgit v1.2.3 From 3aa63a569c64e708df547a8913c84e64a06e7853 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 17 May 2024 20:08:40 -0400 Subject: fs: switch timespec64 fields in inode to discrete integers Adjacent struct timespec64's pack poorly. Switch them to discrete integer fields for the seconds and nanoseconds. This shaves 8 bytes off struct inode with a garden-variety Fedora Kconfig on x86_64, but that also moves the i_lock into the previous cacheline, away from the fields it protects. To remedy that, move i_generation above the i_lock, which moves the new 4-byte hole to just after the i_fsnotify_mask in my setup. Amir has plans to use that to expand the i_fsnotify_mask, so add a comment to that effect as well. Cc: Amir Goldstein Suggested-by: Linus Torvalds Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240517-amtime-v1-1-7b804ca4be8f@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 48 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0283cf366c2a..639885621608 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -660,9 +660,13 @@ struct inode { }; dev_t i_rdev; loff_t i_size; - struct timespec64 __i_atime; - struct timespec64 __i_mtime; - struct timespec64 __i_ctime; /* use inode_*_ctime accessors! */ + time64_t i_atime_sec; + time64_t i_mtime_sec; + time64_t i_ctime_sec; + u32 i_atime_nsec; + u32 i_mtime_nsec; + u32 i_ctime_nsec; + u32 i_generation; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; @@ -719,10 +723,10 @@ struct inode { unsigned i_dir_seq; }; - __u32 i_generation; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ + /* 32-bit hole reserved for expanding i_fsnotify_mask */ struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif @@ -1538,23 +1542,27 @@ struct timespec64 inode_set_ctime_current(struct inode *inode); static inline time64_t inode_get_atime_sec(const struct inode *inode) { - return inode->__i_atime.tv_sec; + return inode->i_atime_sec; } static inline long inode_get_atime_nsec(const struct inode *inode) { - return inode->__i_atime.tv_nsec; + return inode->i_atime_nsec; } static inline struct timespec64 inode_get_atime(const struct inode *inode) { - return inode->__i_atime; + struct timespec64 ts = { .tv_sec = inode_get_atime_sec(inode), + .tv_nsec = inode_get_atime_nsec(inode) }; + + return ts; } static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode, struct timespec64 ts) { - inode->__i_atime = ts; + inode->i_atime_sec = ts.tv_sec; + inode->i_atime_nsec = ts.tv_nsec; return ts; } @@ -1563,28 +1571,32 @@ static inline struct timespec64 inode_set_atime(struct inode *inode, { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; + return inode_set_atime_to_ts(inode, ts); } static inline time64_t inode_get_mtime_sec(const struct inode *inode) { - return inode->__i_mtime.tv_sec; + return inode->i_mtime_sec; } static inline long inode_get_mtime_nsec(const struct inode *inode) { - return inode->__i_mtime.tv_nsec; + return inode->i_mtime_nsec; } static inline struct timespec64 inode_get_mtime(const struct inode *inode) { - return inode->__i_mtime; + struct timespec64 ts = { .tv_sec = inode_get_mtime_sec(inode), + .tv_nsec = inode_get_mtime_nsec(inode) }; + return ts; } static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode, struct timespec64 ts) { - inode->__i_mtime = ts; + inode->i_mtime_sec = ts.tv_sec; + inode->i_mtime_nsec = ts.tv_nsec; return ts; } @@ -1598,23 +1610,27 @@ static inline struct timespec64 inode_set_mtime(struct inode *inode, static inline time64_t inode_get_ctime_sec(const struct inode *inode) { - return inode->__i_ctime.tv_sec; + return inode->i_ctime_sec; } static inline long inode_get_ctime_nsec(const struct inode *inode) { - return inode->__i_ctime.tv_nsec; + return inode->i_ctime_nsec; } static inline struct timespec64 inode_get_ctime(const struct inode *inode) { - return inode->__i_ctime; + struct timespec64 ts = { .tv_sec = inode_get_ctime_sec(inode), + .tv_nsec = inode_get_ctime_nsec(inode) }; + + return ts; } static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) { - inode->__i_ctime = ts; + inode->i_ctime_sec = ts.tv_sec; + inode->i_ctime_nsec = ts.tv_nsec; return ts; } -- cgit v1.2.3 From 447e140e66fd226350b3ce86cffc965eaae4c856 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 8 May 2024 13:17:01 +0300 Subject: gpio: Remove legacy API documentation In order to discourage people to use old and legacy GPIO APIs remove the respective documentation completely. It also helps further cleanups of the legacy GPIO API leftovers, which is ongoing task. Signed-off-by: Andy Shevchenko Reviewed-by: Hu Haowen <2023002089@link.tyut.edu.cn> Link: https://lore.kernel.org/r/20240508101703.830066-1-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 56ac7e7a2889..063f71b18a7c 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * + * NOTE: This header *must not* be included. * * This is the LEGACY GPIO bulk include file, including legacy APIs. It is * used for GPIO drivers still referencing the global GPIO numberspace, @@ -16,8 +16,6 @@ struct device; -/* see Documentation/driver-api/gpio/legacy.rst */ - /* make these flag values available regardless of GPIO kconfig options */ #define GPIOF_DIR_OUT (0 << 0) #define GPIOF_DIR_IN (1 << 0) @@ -121,8 +119,6 @@ static inline int gpio_to_irq(unsigned gpio) int gpio_request_one(unsigned gpio, unsigned long flags, const char *label); -/* CONFIG_GPIOLIB: bindings for managed devices that want to request gpios */ - int devm_gpio_request(struct device *dev, unsigned gpio, const char *label); int devm_gpio_request_one(struct device *dev, unsigned gpio, unsigned long flags, const char *label); -- cgit v1.2.3 From 3fefd9b594aa6be9f120733f5bb57b07d47871a9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 17 Apr 2024 18:56:57 +0100 Subject: fs: Remove i_blocks_per_page The last caller has been converted to i_blocks_per_folio() so we can remove this wrapper. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Dave Kleikamp --- include/linux/pagemap.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3d69589c00a4..63f2f3602a7f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1536,10 +1536,4 @@ unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio) { return folio_size(folio) >> inode->i_blkbits; } - -static inline -unsigned int i_blocks_per_page(struct inode *inode, struct page *page) -{ - return i_blocks_per_folio(inode, page_folio(page)); -} #endif /* _LINUX_PAGEMAP_H */ -- cgit v1.2.3 From a0a44d9175b349df2462089140fb7f292100bd7c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 27 May 2024 11:01:28 +0200 Subject: mm, slab: don't wrap internal functions with alloc_hooks() The functions __kmalloc_noprof(), kmalloc_large_noprof(), kmalloc_trace_noprof() and their _node variants are all internal to the implementations of kmalloc_noprof() and kmalloc_node_noprof() and are only declared in the "public" slab.h and exported so that those implementations can be static inline and distinguish the build-time constant size variants. The only other users for some of the internal functions are slub_kunit and fortify_kunit tests which make very short-lived allocations. Therefore we can stop wrapping them with the alloc_hooks() macro. Instead add a __ prefix to all of them and a comment documenting these as internal. Also rename __kmalloc_trace() to __kmalloc_cache() which is more descriptive - it is a variant of __kmalloc() where the exact kmalloc cache has been already determined. The usage in fortify_kunit can be removed completely, as the internal functions should be tested already through kmalloc() tests in the test variant that passes non-constant allocation size. Reported-by: Kent Overstreet Cc: Suren Baghdasaryan Cc: Kees Cook Reviewed-by: Kent Overstreet Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 7247e217e21b..ed6bee5ec2b6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -528,9 +528,6 @@ static_assert(PAGE_SHIFT <= 20); #include -void *__kmalloc_noprof(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); -#define __kmalloc(...) alloc_hooks(__kmalloc_noprof(__VA_ARGS__)) - /** * kmem_cache_alloc - Allocate an object * @cachep: The cache to allocate from. @@ -568,31 +565,34 @@ static __always_inline void kfree_bulk(size_t size, void **p) kmem_cache_free_bulk(NULL, size, p); } -void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment - __alloc_size(1); -#define __kmalloc_node(...) alloc_hooks(__kmalloc_node_noprof(__VA_ARGS__)) - void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; #define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) -void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t flags, size_t size) - __assume_kmalloc_alignment __alloc_size(3); +/* + * The following functions are not to be used directly and are intended only + * for internal use from kmalloc() and kmalloc_node() + * with the exception of kunit tests + */ + +void *__kmalloc_noprof(size_t size, gfp_t flags) + __assume_kmalloc_alignment __alloc_size(1); + +void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) + __assume_kmalloc_alignment __alloc_size(1); -void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_kmalloc_alignment - __alloc_size(4); -#define kmalloc_trace(...) alloc_hooks(kmalloc_trace_noprof(__VA_ARGS__)) +void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t flags, size_t size) + __assume_kmalloc_alignment __alloc_size(3); -#define kmalloc_node_trace(...) alloc_hooks(kmalloc_node_trace_noprof(__VA_ARGS__)) +void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) + __assume_kmalloc_alignment __alloc_size(4); -void *kmalloc_large_noprof(size_t size, gfp_t flags) __assume_page_alignment - __alloc_size(1); -#define kmalloc_large(...) alloc_hooks(kmalloc_large_noprof(__VA_ARGS__)) +void *__kmalloc_large_noprof(size_t size, gfp_t flags) + __assume_page_alignment __alloc_size(1); -void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) __assume_page_alignment - __alloc_size(1); -#define kmalloc_large_node(...) alloc_hooks(kmalloc_large_node_noprof(__VA_ARGS__)) +void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) + __assume_page_alignment __alloc_size(1); /** * kmalloc - allocate kernel memory @@ -654,10 +654,10 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f unsigned int index; if (size > KMALLOC_MAX_CACHE_SIZE) - return kmalloc_large_noprof(size, flags); + return __kmalloc_large_noprof(size, flags); index = kmalloc_index(size); - return kmalloc_trace_noprof( + return __kmalloc_cache_noprof( kmalloc_caches[kmalloc_type(flags, _RET_IP_)][index], flags, size); } @@ -671,10 +671,10 @@ static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gf unsigned int index; if (size > KMALLOC_MAX_CACHE_SIZE) - return kmalloc_large_node_noprof(size, flags, node); + return __kmalloc_large_node_noprof(size, flags, node); index = kmalloc_index(size); - return kmalloc_node_trace_noprof( + return __kmalloc_cache_node_noprof( kmalloc_caches[kmalloc_type(flags, _RET_IP_)][index], flags, node, size); } -- cgit v1.2.3 From db003a28e03f95f2bcb63f037a2078b8870b1ecd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 May 2024 14:33:20 +0200 Subject: netfs: fix kernel doc for nets_wait_for_outstanding_io() The @inode parameter wasn't documented leading to new doc build warnings. Fixes: f89ea63f1c65 ("netfs, 9p: Fix race between umount and async request completion") Link: https://lore.kernel.org/r/20240528133050.7e09d78e@canb.auug.org.au Signed-off-by: Christian Brauner --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 3ca3906bb8da..5d0288938cc2 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -521,7 +521,7 @@ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) /** * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete - * @ctx: The netfs inode to wait on + * @inode: The netfs inode to wait on * * Wait for outstanding I/O requests of any type to complete. This is intended * to be called from inode eviction routines. This makes sure that any -- cgit v1.2.3 From 620c266f394932e5decc4b34683a75dfc59dc2f4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 24 May 2024 12:19:39 +0200 Subject: fhandle: relax open_by_handle_at() permission checks A current limitation of open_by_handle_at() is that it's currently not possible to use it from within containers at all because we require CAP_DAC_READ_SEARCH in the initial namespace. That's unfortunate because there are scenarios where using open_by_handle_at() from within containers. Two examples: (1) cgroupfs allows to encode cgroups to file handles and reopen them with open_by_handle_at(). (2) Fanotify allows placing filesystem watches they currently aren't usable in containers because the returned file handles cannot be used. Here's a proposal for relaxing the permission check for open_by_handle_at(). (1) Opening file handles when the caller has privileges over the filesystem (1.1) The caller has an unobstructed view of the filesystem. (1.2) The caller has permissions to follow a path to the file handle. This doesn't address the problem of opening a file handle when only a portion of a filesystem is exposed as is common in containers by e.g., bind-mounting a subtree. The proposal to solve this use-case is: (2) Opening file handles when the caller has privileges over a subtree (2.1) The caller is able to reach the file from the provided mount fd. (2.2) The caller has permissions to construct an unobstructed path to the file handle. (2.3) The caller has permissions to follow a path to the file handle. The relaxed permission checks are currently restricted to directory file handles which are what both cgroupfs and fanotify need. Handling disconnected non-directory file handles would lead to a potentially non-deterministic api. If a disconnected non-directory file handle is provided we may fail to decode a valid path that we could use for permission checking. That in itself isn't a problem as we would just return EACCES in that case. However, confusion may arise if a non-disconnected dentry ends up in the cache later and those opening the file handle would suddenly succeed. * It's potentially possible to use timing information (side-channel) to infer whether a given inode exists. I don't think that's particularly problematic. Thanks to Jann for bringing this to my attention. * An unrelated note (IOW, these are thoughts that apply to open_by_handle_at() generically and are unrelated to the changes here): Jann pointed out that we should verify whether deleted files could potentially be reopened through open_by_handle_at(). I don't think that's possible though. Another potential thing to check is whether open_by_handle_at() could be abused to open internal stuff like memfds or gpu stuff. I don't think so but I haven't had the time to completely verify this. This dates back to discussions Amir and I had quite some time ago and thanks to him for providing a lot of details around the export code and related patches! Link: https://lore.kernel.org/r/20240524-vfs-open_by_handle_at-v1-1-3d4b7d22736b@kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index bb37ad5cc954..893a1d21dc1c 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -158,6 +158,7 @@ struct fid { #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ +#define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ /** * struct export_operations - for nfsd to communicate with file systems @@ -305,6 +306,7 @@ static inline int exportfs_encode_fid(struct inode *inode, struct fid *fid, extern struct dentry *exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, + unsigned int flags, int (*acceptable)(void *, struct dentry *), void *context); extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, -- cgit v1.2.3 From 4edd7dc82bd6be6989c034ccbbbccdc61034e33b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 30 Apr 2024 11:43:43 +0530 Subject: PCI: endpoint: Rename core_init() callback in 'struct pci_epc_event_ops' to epc_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit core_init() callback is used to notify the EPC initialization event to the EPF drivers. The 'core' prefix was used indicate that the controller IP core has completed initialization. But it serves no purpose as the EPF driver will only care about the EPC initialization as a whole and there is no real benefit to distinguish the IP core part. Rename the core_init() callback in 'struct pci_epc_event_ops' to epc_init() to make it more clear. Link: https://lore.kernel.org/linux-pci/20240430-pci-epf-rework-v4-2-22832d0d456f@linaro.org Tested-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Reviewed-by: Niklas Cassel --- include/linux/pci-epf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index adee6a1b35db..afe3bfd88742 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -70,13 +70,13 @@ struct pci_epf_ops { /** * struct pci_epc_event_ops - Callbacks for capturing the EPC events - * @core_init: Callback for the EPC initialization complete event + * @epc_init: Callback for the EPC initialization complete event * @link_up: Callback for the EPC link up event * @link_down: Callback for the EPC link down event * @bme: Callback for the EPC BME (Bus Master Enable) event */ struct pci_epc_event_ops { - int (*core_init)(struct pci_epf *epf); + int (*epc_init)(struct pci_epf *epf); int (*link_up)(struct pci_epf *epf); int (*link_down)(struct pci_epf *epf); int (*bme)(struct pci_epf *epf); -- cgit v1.2.3 From f58838d7feb04809257f54a8b256786d2f9dfe01 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 30 Apr 2024 11:43:44 +0530 Subject: PCI: endpoint: Rename BME to Bus Master Enable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BME which stands for 'Bus Master Enable' is not defined in the PCIe base spec even though it is commonly referred in many places (vendor docs). To align with the spec, rename it to its expansion 'Bus Master Enable'. Suggested-by: Damien Le Moal Link: https://lore.kernel.org/linux-pci/20240430-pci-epf-rework-v4-3-22832d0d456f@linaro.org Link: https://lore.kernel.org/linux-pci/20240430-pci-epf-rework-v4-4-22832d0d456f@linaro.org Tested-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Signed-off-by: Krzysztof Wilczyński [bhelgaas: squash removal of irrelevant 'Link is enabled'] Signed-off-by: Bjorn Helgaas Reviewed-by: Niklas Cassel --- include/linux/pci-epc.h | 2 +- include/linux/pci-epf.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index acc5f96161fe..11115cd0fe5b 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -226,7 +226,7 @@ void pci_epc_linkup(struct pci_epc *epc); void pci_epc_linkdown(struct pci_epc *epc); void pci_epc_init_notify(struct pci_epc *epc); void pci_epc_notify_pending_init(struct pci_epc *epc, struct pci_epf *epf); -void pci_epc_bme_notify(struct pci_epc *epc); +void pci_epc_bus_master_enable_notify(struct pci_epc *epc); void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf, enum pci_epc_interface_type type); int pci_epc_write_header(struct pci_epc *epc, u8 func_no, u8 vfunc_no, diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index afe3bfd88742..dc759eb7157c 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -73,13 +73,13 @@ struct pci_epf_ops { * @epc_init: Callback for the EPC initialization complete event * @link_up: Callback for the EPC link up event * @link_down: Callback for the EPC link down event - * @bme: Callback for the EPC BME (Bus Master Enable) event + * @bus_master_enable: Callback for the EPC Bus Master Enable event */ struct pci_epc_event_ops { int (*epc_init)(struct pci_epf *epf); int (*link_up)(struct pci_epf *epf); int (*link_down)(struct pci_epf *epf); - int (*bme)(struct pci_epf *epf); + int (*bus_master_enable)(struct pci_epf *epf); }; /** -- cgit v1.2.3 From 4eed3dd7116814c82630b0f1b0f73fa96707134f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 7 May 2024 13:25:19 +0300 Subject: resource: Use typedef for alignf callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To make it simpler to declare resource constraint alignf callbacks, add typedef for it and document it. Suggested-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240507102523.57320-5-ilpo.jarvinen@linux.intel.com Tested-by: Lidong Wang Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Reviewed-by: Mika Westerberg --- include/linux/ioport.h | 22 ++++++++++++++++++---- include/linux/pci.h | 5 +---- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index db7fe25f3370..28266426e5bf 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -188,6 +188,23 @@ enum { #define DEFINE_RES_DMA(_dma) \ DEFINE_RES_DMA_NAMED((_dma), NULL) +/** + * typedef resource_alignf - Resource alignment callback + * @data: Private data used by the callback + * @res: Resource candidate range (an empty resource space) + * @size: The minimum size of the empty space + * @align: Alignment from the constraints + * + * Callback allows calculating resource placement and alignment beyond min, + * max, and align fields in the struct resource_constraint. + * + * Return: Start address for the resource. + */ +typedef resource_size_t (*resource_alignf)(void *data, + const struct resource *res, + resource_size_t size, + resource_size_t align); + /* PC/ISA/whatever - the normal PC address spaces: IO and memory */ extern struct resource ioport_resource; extern struct resource iomem_resource; @@ -207,10 +224,7 @@ extern void arch_remove_reservations(struct resource *avail); extern int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, - resource_size_t (*alignf)(void *, - const struct resource *, - resource_size_t, - resource_size_t), + resource_alignf alignf, void *alignf_data); struct resource *lookup_resource(struct resource *root, resource_size_t start); int adjust_resource(struct resource *res, resource_size_t start, diff --git a/include/linux/pci.h b/include/linux/pci.h index fb004fd4e889..760fdc4b37cc 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1551,10 +1551,7 @@ int __must_check pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res, resource_size_t size, resource_size_t align, resource_size_t min, unsigned long type_mask, - resource_size_t (*alignf)(void *, - const struct resource *, - resource_size_t, - resource_size_t), + resource_alignf alignf, void *alignf_data); -- cgit v1.2.3 From 2700225304e8e4f0f29f2bbb8ad0c112c9666d90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 7 May 2024 13:25:21 +0300 Subject: resource: Export find_resource_space() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI bridge window logic needs to find out in advance to the actual allocation if there is an empty space big enough to fit the window. Export find_resource_space() for the purpose. Also move the struct resource_constraint into generic header to be able to use the new interface. Link: https://lore.kernel.org/r/20240507102523.57320-7-ilpo.jarvinen@linux.intel.com Tested-by: Lidong Wang Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Reviewed-by: Mika Westerberg --- include/linux/ioport.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 28266426e5bf..6e9fb667a1c5 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -205,6 +205,25 @@ typedef resource_size_t (*resource_alignf)(void *data, resource_size_t size, resource_size_t align); +/** + * struct resource_constraint - constraints to be met while searching empty + * resource space + * @min: The minimum address for the memory range + * @max: The maximum address for the memory range + * @align: Alignment for the start address of the empty space + * @alignf: Additional alignment constraints callback + * @alignf_data: Data provided for @alignf callback + * + * Contains the range and alignment constraints that have to be met during + * find_resource_space(). @alignf can be NULL indicating no alignment beyond + * @align is necessary. + */ +struct resource_constraint { + resource_size_t min, max, align; + resource_alignf alignf; + void *alignf_data; +}; + /* PC/ISA/whatever - the normal PC address spaces: IO and memory */ extern struct resource ioport_resource; extern struct resource iomem_resource; @@ -278,6 +297,9 @@ static inline bool resource_union(const struct resource *r1, const struct resour return true; } +int find_resource_space(struct resource *root, struct resource *new, + resource_size_t size, struct resource_constraint *constraint); + /* Convenience shorthand with allocation */ #define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), 0) #define request_muxed_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), IORESOURCE_MUXED) -- cgit v1.2.3 From 7c327d56597d8de1680cf24e956b704270d3d84a Mon Sep 17 00:00:00 2001 From: Richard Maina Date: Wed, 29 May 2024 11:09:55 -0700 Subject: hwspinlock: Introduce hwspin_lock_bust() When a remoteproc crashes or goes down unexpectedly this can result in a state where locks held by the remoteproc will remain locked possibly resulting in deadlock. This new API hwspin_lock_bust() allows hwspinlock implementers to define a bust operation for freeing previously acquired hwspinlocks after verifying ownership of the acquired lock. Signed-off-by: Richard Maina Reviewed-by: Bjorn Andersson Signed-off-by: Chris Lew Link: https://lore.kernel.org/r/20240529-hwspinlock-bust-v3-1-c8b924ffa5a2@quicinc.com Signed-off-by: Bjorn Andersson --- include/linux/hwspinlock.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hwspinlock.h b/include/linux/hwspinlock.h index bfe7c1f1ac6d..f0231dbc4777 100644 --- a/include/linux/hwspinlock.h +++ b/include/linux/hwspinlock.h @@ -68,6 +68,7 @@ int __hwspin_lock_timeout(struct hwspinlock *, unsigned int, int, int __hwspin_trylock(struct hwspinlock *, int, unsigned long *); void __hwspin_unlock(struct hwspinlock *, int, unsigned long *); int of_hwspin_lock_get_id_byname(struct device_node *np, const char *name); +int hwspin_lock_bust(struct hwspinlock *hwlock, unsigned int id); int devm_hwspin_lock_free(struct device *dev, struct hwspinlock *hwlock); struct hwspinlock *devm_hwspin_lock_request(struct device *dev); struct hwspinlock *devm_hwspin_lock_request_specific(struct device *dev, @@ -127,6 +128,11 @@ void __hwspin_unlock(struct hwspinlock *hwlock, int mode, unsigned long *flags) { } +static inline int hwspin_lock_bust(struct hwspinlock *hwlock, unsigned int id) +{ + return 0; +} + static inline int of_hwspin_lock_get_id(struct device_node *np, int index) { return 0; -- cgit v1.2.3 From 2e3f0d693875db698891ffe89a18121bda5b95b8 Mon Sep 17 00:00:00 2001 From: Chris Lew Date: Wed, 29 May 2024 11:09:57 -0700 Subject: soc: qcom: smem: Add qcom_smem_bust_hwspin_lock_by_host() Add qcom_smem_bust_hwspin_lock_by_host to enable remoteproc to bust the hwspin_lock owned by smem. In the event the remoteproc crashes unexpectedly, the remoteproc driver can invoke this API to try and bust the hwspin_lock and release the lock if still held by the remoteproc device. Signed-off-by: Chris Lew Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20240529-hwspinlock-bust-v3-3-c8b924ffa5a2@quicinc.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/smem.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/smem.h b/include/linux/soc/qcom/smem.h index a36a3b9d4929..03187bc95851 100644 --- a/include/linux/soc/qcom/smem.h +++ b/include/linux/soc/qcom/smem.h @@ -14,4 +14,6 @@ phys_addr_t qcom_smem_virt_to_phys(void *p); int qcom_smem_get_soc_id(u32 *id); +int qcom_smem_bust_hwspin_lock_by_host(unsigned int host); + #endif -- cgit v1.2.3 From 73fc975318e0ab3385c5b3372c7b296ae58c8d6b Mon Sep 17 00:00:00 2001 From: Durai Manickam KR Date: Wed, 24 Apr 2024 11:03:45 +0530 Subject: drm: atmel-hlcdc: Define XLCDC specific registers The register address of the XLCDC IP used in SAM9X7 SoC family are different from the previous HLCDC. Defining those address space with valid macros. Signed-off-by: Durai Manickam KR [manikandan.m@microchip.com: Remove unused macro definitions] Signed-off-by: Manikandan Muralidharan Acked-by: Lee Jones Acked-by: Sam Ravnborg Signed-off-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20240424053351.589830-3-manikandan.m@microchip.com --- include/linux/mfd/atmel-hlcdc.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/atmel-hlcdc.h b/include/linux/mfd/atmel-hlcdc.h index a186119a49b5..80d675a03b39 100644 --- a/include/linux/mfd/atmel-hlcdc.h +++ b/include/linux/mfd/atmel-hlcdc.h @@ -22,6 +22,8 @@ #define ATMEL_HLCDC_DITHER BIT(6) #define ATMEL_HLCDC_DISPDLY BIT(7) #define ATMEL_HLCDC_MODE_MASK GENMASK(9, 8) +#define ATMEL_XLCDC_MODE_MASK GENMASK(10, 8) +#define ATMEL_XLCDC_DPI BIT(11) #define ATMEL_HLCDC_PP BIT(10) #define ATMEL_HLCDC_VSPSU BIT(12) #define ATMEL_HLCDC_VSPHO BIT(13) @@ -34,6 +36,12 @@ #define ATMEL_HLCDC_IDR 0x30 #define ATMEL_HLCDC_IMR 0x34 #define ATMEL_HLCDC_ISR 0x38 +#define ATMEL_XLCDC_ATTRE 0x3c + +#define ATMEL_XLCDC_BASE_UPDATE BIT(0) +#define ATMEL_XLCDC_OVR1_UPDATE BIT(1) +#define ATMEL_XLCDC_OVR3_UPDATE BIT(2) +#define ATMEL_XLCDC_HEO_UPDATE BIT(3) #define ATMEL_HLCDC_CLKPOL BIT(0) #define ATMEL_HLCDC_CLKSEL BIT(2) @@ -48,6 +56,8 @@ #define ATMEL_HLCDC_DISP BIT(2) #define ATMEL_HLCDC_PWM BIT(3) #define ATMEL_HLCDC_SIP BIT(4) +#define ATMEL_XLCDC_SD BIT(5) +#define ATMEL_XLCDC_CM BIT(6) #define ATMEL_HLCDC_SOF BIT(0) #define ATMEL_HLCDC_SYNCDIS BIT(1) -- cgit v1.2.3 From 020e6c22bd6e67592f38b47d0f1926a831482560 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 May 2024 15:36:57 -0700 Subject: kcsan: Add example to data_race() kerneldoc header Although the data_race() kerneldoc header accurately states what it does, some of the implications and usage patterns are non-obvious. Therefore, add a brief locking example and also state how to have KCSAN ignore accesses while also preventing the compiler from folding, spindling, or otherwise mutilating the access. [ paulmck: Apply Bart Van Assche feedback. ] [ paulmck: Apply feedback from Marco Elver. ] Reported-by: Bart Van Assche Signed-off-by: Paul E. McKenney Cc: Marco Elver Cc: Breno Leitao Cc: Jens Axboe --- include/linux/compiler.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 8c252e073bd8..68a24a3a6979 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -194,9 +194,17 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, * This data_race() macro is useful for situations in which data races * should be forgiven. One example is diagnostic code that accesses * shared variables but is not a part of the core synchronization design. + * For example, if accesses to a given variable are protected by a lock, + * except for diagnostic code, then the accesses under the lock should + * be plain C-language accesses and those in the diagnostic code should + * use data_race(). This way, KCSAN will complain if buggy lockless + * accesses to that variable are introduced, even if the buggy accesses + * are protected by READ_ONCE() or WRITE_ONCE(). * * This macro *does not* affect normal code generation, but is a hint - * to tooling that data races here are to be ignored. + * to tooling that data races here are to be ignored. If the access must + * be atomic *and* KCSAN should ignore the access, use both data_race() + * and READ_ONCE(), for example, data_race(READ_ONCE(x)). */ #define data_race(expr) \ ({ \ -- cgit v1.2.3 From 73287fe228721b05690e671adbcccc6cf5435be6 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 29 May 2024 23:59:39 -0700 Subject: bpf: pass bpf_struct_ops_link to callbacks in bpf_struct_ops. Pass an additional pointer of bpf_struct_ops_link to callback function reg, unreg, and update provided by subsystems defined in bpf_struct_ops. A bpf_struct_ops_map can be registered for multiple links. Passing a pointer of bpf_struct_ops_link helps subsystems to distinguish them. This pointer will be used in the later patches to let the subsystem initiate a detachment on a link that was registered to it previously. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240530065946.979330-2-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5e694a308081..19f8836382fc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1730,9 +1730,9 @@ struct bpf_struct_ops { int (*init_member)(const struct btf_type *t, const struct btf_member *member, void *kdata, const void *udata); - int (*reg)(void *kdata); - void (*unreg)(void *kdata); - int (*update)(void *kdata, void *old_kdata); + int (*reg)(void *kdata, struct bpf_link *link); + void (*unreg)(void *kdata, struct bpf_link *link); + int (*update)(void *kdata, void *old_kdata, struct bpf_link *link); int (*validate)(void *kdata); void *cfi_stubs; struct module *owner; -- cgit v1.2.3 From 1adddc97aa44c8783f9f0276ea70854d56f9f6df Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 29 May 2024 23:59:41 -0700 Subject: bpf: support epoll from bpf struct_ops links. Add epoll support to bpf struct_ops links to trigger EPOLLHUP event upon detachment. This patch implements the "poll" of the "struct file_operations" for BPF links and introduces a new "poll" operator in the "struct bpf_link_ops". By implementing "poll" of "struct bpf_link_ops" for the links of struct_ops, the file descriptor of a struct_ops link can be added to an epoll file descriptor to receive EPOLLHUP events. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240530065946.979330-4-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 19f8836382fc..5eb61120e4f5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1612,6 +1612,7 @@ struct bpf_link_ops { struct bpf_link_info *info); int (*update_map)(struct bpf_link *link, struct bpf_map *new_map, struct bpf_map *old_map); + __poll_t (*poll)(struct file *file, struct poll_table_struct *pts); }; struct bpf_tramp_link { -- cgit v1.2.3 From 67c3e8353f45c27800eecc46e00e8272f063f7d1 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 29 May 2024 23:59:42 -0700 Subject: bpf: export bpf_link_inc_not_zero. bpf_link_inc_not_zero() will be used by kernel modules. We will use it in bpf_testmod.c later. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240530065946.979330-5-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5eb61120e4f5..a834f4b761bc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2334,6 +2334,7 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); void bpf_link_inc(struct bpf_link *link); +struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); @@ -2705,6 +2706,11 @@ static inline void bpf_link_inc(struct bpf_link *link) { } +static inline struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) +{ + return NULL; +} + static inline void bpf_link_put(struct bpf_link *link) { } -- cgit v1.2.3 From 02d00dc73d8d0613f82063ed53d67912a422c21e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 29 May 2024 14:29:29 +0100 Subject: net: phylink: rename ovr_an_inband to default_an_inband Since ovr_an_inband no longer overrides every MLO_AN_xxx mode, rename it to reflect what it now does - it changes the default mode from MLO_AN_PHY to MLO_AN_INBAND. Fix up the two users of this. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Halaney Link: https://lore.kernel.org/r/E1sCJMv-00Ecr1-Sk@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 5ea6b2ad2396..a30a692acc32 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -141,7 +141,8 @@ enum phylink_op_type { * @mac_requires_rxc: if true, the MAC always requires a receive clock from PHY. * The PHY driver should start the clock signal as soon as * possible and avoid stopping it during suspend events. - * @ovr_an_inband: if true, override PCS to MLO_AN_INBAND + * @default_an_inband: if true, defaults to MLO_AN_INBAND rather than + * MLO_AN_PHY. A fixed-link specification will override. * @get_fixed_state: callback to execute to determine the fixed link state, * if MAC link is at %MLO_AN_FIXED mode. * @supported_interfaces: bitmap describing which PHY_INTERFACE_MODE_xxx @@ -154,7 +155,7 @@ struct phylink_config { bool poll_fixed_state; bool mac_managed_pm; bool mac_requires_rxc; - bool ovr_an_inband; + bool default_an_inband; void (*get_fixed_state)(struct phylink_config *config, struct phylink_link_state *state); DECLARE_PHY_INTERFACE_MASK(supported_interfaces); -- cgit v1.2.3 From 83f55b01dd903040d6ab64f4f9d78a27391a5916 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 29 May 2024 14:29:40 +0100 Subject: net: stmmac: rename xpcs_an_inband to default_an_inband Rename xpcs_an_inband to default_an_inband to reflect the change in phylink and its changed functionality. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Halaney Link: https://lore.kernel.org/r/E1sCJN6-00EcrD-43@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index f92c195c76ed..8f0f156d50d3 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -83,7 +83,7 @@ struct stmmac_priv; struct stmmac_mdio_bus_data { unsigned int phy_mask; unsigned int has_xpcs; - unsigned int xpcs_an_inband; + unsigned int default_an_inband; int *irqs; int probed_phy_irq; bool needs_reset; -- cgit v1.2.3 From 5607ca92e6274dfb85d0ff7c4e91e6c4ddb6d25c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 31 May 2024 13:41:22 +0200 Subject: leds: core: Add led_mc_set_brightness() function Add a new led_mc_set_brightness() function for in kernel color/brightness changing of multi-color LEDs. led-class-multicolor can be build as a module and led_mc_set_brightness() will have the builtin callers, so put led_mc_set_brightness() inside led-core instead, just like how led_set_brightness() is part of the core and not of the led-class object. This also adds a new LED_MULTI_COLOR led_classdev flag to allow led_mc_set_brightness() to verify that it is operating on a multi-color LED classdev, avoiding casting the passed in LED classdev to a multi-color LED classdev, when it actually is not a multi-color LED. Signed-off-by: Hans de Goede Reviewed-by: Jacek Anaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240531114124.45346-5-hdegoede@redhat.com Signed-off-by: Lee Jones --- include/linux/leds.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 6300313c46b7..ae07e44f1dcb 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -107,6 +107,7 @@ struct led_classdev { #define LED_BRIGHT_HW_CHANGED BIT(21) #define LED_RETAIN_AT_SHUTDOWN BIT(22) #define LED_INIT_DEFAULT_TRIGGER BIT(23) +#define LED_MULTI_COLOR BIT(24) /* set_brightness_work / blink_timer flags, atomic, private. */ unsigned long work_flags; @@ -373,6 +374,25 @@ void led_set_brightness(struct led_classdev *led_cdev, unsigned int brightness); */ int led_set_brightness_sync(struct led_classdev *led_cdev, unsigned int value); +/** + * led_mc_set_brightness - set mc LED color intensity values and brightness + * @led_cdev: the LED to set + * @intensity_value: array of per color intensity values to set + * @num_colors: amount of entries in intensity_value array + * @brightness: the brightness to set the LED to + * + * Set a multi-color LED's per color intensity values and brightness. + * If necessary, this cancels the software blink timer. This function is + * guaranteed not to sleep. + * + * Calling this function on a non multi-color led_classdev or with the wrong + * num_colors value is an error. In this case an error will be logged once + * and the call will do nothing. + */ +void led_mc_set_brightness(struct led_classdev *led_cdev, + unsigned int *intensity_value, unsigned int num_colors, + unsigned int brightness); + /** * led_update_brightness - update LED brightness * @led_cdev: the LED to query -- cgit v1.2.3 From 0921a57c91648b08857b47a2f26fa7942f06120f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 31 May 2024 13:41:23 +0200 Subject: leds: trigger: Add led_mc_trigger_event() function Add a new led_mc_trigger_event() function for triggers which want to change the color of a multi-color LED based on their trigger conditions. Signed-off-by: Hans de Goede Reviewed-by: Jacek Anaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240531114124.45346-6-hdegoede@redhat.com Signed-off-by: Lee Jones --- include/linux/leds.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index ae07e44f1dcb..517b6198df07 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -510,6 +510,9 @@ void led_trigger_register_simple(const char *name, struct led_trigger **trigger); void led_trigger_unregister_simple(struct led_trigger *trigger); void led_trigger_event(struct led_trigger *trigger, enum led_brightness event); +void led_mc_trigger_event(struct led_trigger *trig, + unsigned int *intensity_value, unsigned int num_colors, + enum led_brightness brightness); void led_trigger_blink(struct led_trigger *trigger, unsigned long delay_on, unsigned long delay_off); void led_trigger_blink_oneshot(struct led_trigger *trigger, @@ -552,6 +555,9 @@ static inline void led_trigger_register_simple(const char *name, static inline void led_trigger_unregister_simple(struct led_trigger *trigger) {} static inline void led_trigger_event(struct led_trigger *trigger, enum led_brightness event) {} +static inline void led_mc_trigger_event(struct led_trigger *trig, + unsigned int *intensity_value, unsigned int num_colors, + enum led_brightness brightness) {} static inline void led_trigger_blink(struct led_trigger *trigger, unsigned long delay_on, unsigned long delay_off) {} -- cgit v1.2.3 From 9af12f57f1f9785f231d31a7365ad244c656b7ff Mon Sep 17 00:00:00 2001 From: Kate Hsuan Date: Fri, 31 May 2024 13:41:24 +0200 Subject: power: supply: power-supply-leds: Add charging_orange_full_green trigger for RGB LED Add a charging_orange_full_green LED trigger and the trigger is based on led_mc_trigger_event() which can set an RGB LED when the trigger is triggered. The LED will show orange when the battery status is charging. The LED will show green when the battery status is full. Link: https://lore.kernel.org/linux-leds/f40a0b1a-ceac-e269-c2dd-0158c5b4a1ad@gmail.com/ Signed-off-by: Kate Hsuan Acked-by: Sebastian Reichel Reviewed-by: Andy Shevchenko [hdegoede@redhat.com change color order to RGB] Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240531114124.45346-7-hdegoede@redhat.com Signed-off-by: Lee Jones --- include/linux/power_supply.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 8e5705a56b85..c852cc882501 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -319,6 +319,8 @@ struct power_supply { char *online_trig_name; struct led_trigger *charging_blink_full_solid_trig; char *charging_blink_full_solid_trig_name; + struct led_trigger *charging_orange_full_green_trig; + char *charging_orange_full_green_trig_name; #endif }; -- cgit v1.2.3 From b44d79d6bad16c30978c2cee3421133d3f181494 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Mon, 27 May 2024 16:29:33 +0300 Subject: platform/x86/intel/tpmi: Add support for performance limit reasons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add TPMI ID 0x0C (Perf Limit Reasons) to the list of supported TPMI IDs. Reviewed-by: Andy Shevchenko Reviewed-by: Ilpo Järvinen Signed-off-by: Tero Kristo Link: https://lore.kernel.org/r/20240527133400.483634-2-tero.kristo@linux.intel.com Signed-off-by: Ilpo Järvinen --- include/linux/intel_tpmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index 1e880cb0f454..a88ac937d2c2 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -21,6 +21,7 @@ enum intel_tpmi_id { TPMI_ID_PEM = 1, /* Power and Perf excursion Monitor */ TPMI_ID_UNCORE = 2, /* Uncore Frequency Scaling */ TPMI_ID_SST = 5, /* Speed Select Technology */ + TPMI_ID_PLR = 0xc, /* Performance Limit Reasons */ TPMI_CONTROL_ID = 0x80, /* Special ID for getting feature status */ TPMI_INFO_ID = 0x81, /* Special ID for PCI BDF and Package ID information */ }; -- cgit v1.2.3 From d36842bacf8e3491f555059f27de57b3436cc3ff Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Mon, 27 May 2024 16:29:34 +0300 Subject: platform/x86/intel/tpmi: Add API to get debugfs root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new API to get the debugfs root directory for TPMI. This allows any TPMI devices to add their own debugfs items under the same directory structure. Reviewed-by: Andy Shevchenko Reviewed-by: Ilpo Järvinen Signed-off-by: Tero Kristo Link: https://lore.kernel.org/r/20240527133400.483634-3-tero.kristo@linux.intel.com Signed-off-by: Ilpo Järvinen --- include/linux/intel_tpmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index a88ac937d2c2..ff480b47ae64 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -54,4 +54,5 @@ struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int int tpmi_get_resource_count(struct auxiliary_device *auxdev); int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, bool *read_blocked, bool *write_blocked); +struct dentry *tpmi_get_debugfs_dir(struct auxiliary_device *auxdev); #endif -- cgit v1.2.3 From 4d2bcefa965b06a1f2be6912456bcfa86a34f184 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 31 May 2024 13:29:02 +0100 Subject: mm: Reduce the number of slab->folio casts Mark a few more folio functions as taking a const folio pointer, which allows us to remove a few places in slab which cast away the const. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Vlastimil Babka --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9849dfda44d4..a983d371fafa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1105,7 +1105,7 @@ static inline unsigned int compound_order(struct page *page) * * Return: The order of the folio. */ -static inline unsigned int folio_order(struct folio *folio) +static inline unsigned int folio_order(const struct folio *folio) { if (!folio_test_large(folio)) return 0; @@ -2145,7 +2145,7 @@ static inline struct folio *folio_next(struct folio *folio) * it from being split. It is not necessary for the folio to be locked. * Return: The base-2 logarithm of the size of this folio. */ -static inline unsigned int folio_shift(struct folio *folio) +static inline unsigned int folio_shift(const struct folio *folio) { return PAGE_SHIFT + folio_order(folio); } @@ -2158,7 +2158,7 @@ static inline unsigned int folio_shift(struct folio *folio) * it from being split. It is not necessary for the folio to be locked. * Return: The number of bytes in this folio. */ -static inline size_t folio_size(struct folio *folio) +static inline size_t folio_size(const struct folio *folio) { return PAGE_SIZE << folio_order(folio); } -- cgit v1.2.3 From b7c5e64fecfa88764791679cca4786ac65de739e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 29 May 2024 22:52:30 -0600 Subject: vfio: Create vfio_fs_type with inode per device By linking all the device fds we provide to userspace to an address space through a new pseudo fs, we can use tools like unmap_mapping_range() to zap all vmas associated with a device. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240530045236.1005864-2-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 8b1a29820409..000a6cab2d31 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -64,6 +64,7 @@ struct vfio_device { struct completion comp; struct iommufd_access *iommufd_access; void (*put_kvm)(struct kvm *kvm); + struct inode *inode; #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; u8 iommufd_attached:1; -- cgit v1.2.3 From aac6db75a9fc2c7a6f73e152df8f15101dda38e6 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 29 May 2024 22:52:31 -0600 Subject: vfio/pci: Use unmap_mapping_range() With the vfio device fd tied to the address space of the pseudo fs inode, we can use the mm to track all vmas that might be mmap'ing device BARs, which removes our vma_list and all the complicated lock ordering necessary to manually zap each related vma. Note that we can no longer store the pfn in vm_pgoff if we want to use unmap_mapping_range() to zap a selective portion of the device fd corresponding to BAR mappings. This also converts our mmap fault handler to use vmf_insert_pfn() because we no longer have a vma_list to avoid the concurrency problem with io_remap_pfn_range(). The goal is to eventually use the vm_ops huge_fault handler to avoid the additional faulting overhead, but vmf_insert_pfn_{pmd,pud}() need to learn about pfnmaps first. Also, Jason notes that a race exists between unmap_mapping_range() and the fops mmap callback if we were to call io_remap_pfn_range() to populate the vma on mmap. Specifically, mmap_region() does call_mmap() before it does vma_link_file() which gives a window where the vma is populated but invisible to unmap_mapping_range(). Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240530045236.1005864-3-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index a2c8b8bba711..f87067438ed4 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -93,8 +93,6 @@ struct vfio_pci_core_device { struct list_head sriov_pfs_item; struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; - struct mutex vma_lock; - struct list_head vma_list; struct rw_semaphore memory_lock; }; -- cgit v1.2.3 From 055afc34fd219c8e2290d736ad186edd58870a9e Mon Sep 17 00:00:00 2001 From: Unnathi Chalicheemala Date: Fri, 31 May 2024 09:45:25 -0700 Subject: soc: qcom: llcc: Add regmap for Broadcast_AND region Until SM8450, there was only one broadcast region (Broadcast_OR) used to broadcast write and check for status bit 0. >From SM8450 onwards another broadcast region (Broadcast_AND) has been added which checks for status bit 1. This hasn't been updated and Broadcast_OR region was wrongly being used to check for status bit 1 all along. Hence define new regmap structure for Broadcast_AND region and initialize this regmap when HW block version is greater than 4.1, otherwise initialize as a NULL pointer for backwards compatibility. Switch from broadcast_OR to broadcast_AND region (when defined in DT) for checking status bit 1 as Broadcast_OR region checks only for bit 0. Signed-off-by: Unnathi Chalicheemala Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/9cf19928a67eaa577ae0f02de5bf86276be34ea2.1717014052.git.quic_uchalich@quicinc.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/llcc-qcom.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 1a886666bbb6..9e9f528b1370 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -115,7 +115,8 @@ struct llcc_edac_reg_offset { /** * struct llcc_drv_data - Data associated with the llcc driver * @regmaps: regmaps associated with the llcc device - * @bcast_regmap: regmap associated with llcc broadcast offset + * @bcast_regmap: regmap associated with llcc broadcast OR offset + * @bcast_and_regmap: regmap associated with llcc broadcast AND offset * @cfg: pointer to the data structure for slice configuration * @edac_reg_offset: Offset of the LLCC EDAC registers * @lock: mutex associated with each slice @@ -129,6 +130,7 @@ struct llcc_edac_reg_offset { struct llcc_drv_data { struct regmap **regmaps; struct regmap *bcast_regmap; + struct regmap *bcast_and_regmap; const struct llcc_slice_config *cfg; const struct llcc_edac_reg_offset *edac_reg_offset; struct mutex lock; -- cgit v1.2.3 From 964a504b1261b641797d93cdbf79b68ff7d85720 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 31 May 2024 15:47:00 +0200 Subject: power: supply: leds: Add power_supply_[un]register_led_trigger() Add power_supply_[un]register_led_trigger() helper functions. The primary goal of this is as a preparation patch for adding an activate callback to the power-supply LED triggers to ensure that power-supply LEDs get the correct initial value when the LED gets registered after the power_supply has been registered (this will use the psy back pointer). There also is quite a lot of code duplication in the existing LED trigger registration in the form of the kasprintf() for the name-template for each trigger + related error handling. This duplication is removed by these new helpers. Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240531134702.166145-2-hdegoede@redhat.com Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index c852cc882501..e218a8ab72ee 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -310,17 +310,11 @@ struct power_supply { #ifdef CONFIG_LEDS_TRIGGERS struct led_trigger *charging_full_trig; - char *charging_full_trig_name; struct led_trigger *charging_trig; - char *charging_trig_name; struct led_trigger *full_trig; - char *full_trig_name; struct led_trigger *online_trig; - char *online_trig_name; struct led_trigger *charging_blink_full_solid_trig; - char *charging_blink_full_solid_trig_name; struct led_trigger *charging_orange_full_green_trig; - char *charging_orange_full_green_trig_name; #endif }; -- cgit v1.2.3 From 6c951a84dab9c0e051aec54d7497f25e910a4953 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 31 May 2024 15:47:01 +0200 Subject: power: supply: leds: Share trig pointer for online and charging_full Either 5 different LED triggers are registered for battery power-supply devices or a single online LED trigger is used for non battery power-supply devices. These 5 / 1 LED trigger(s) are never used at the same time. So there is no need for a separate LED trigger pointer for the online trigger. Rename the first battery trigger from charging_full_trig to just trig and use this for the online trigger too. Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240531134702.166145-3-hdegoede@redhat.com Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index e218a8ab72ee..65082ef75692 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -309,10 +309,9 @@ struct power_supply { #endif #ifdef CONFIG_LEDS_TRIGGERS - struct led_trigger *charging_full_trig; + struct led_trigger *trig; struct led_trigger *charging_trig; struct led_trigger *full_trig; - struct led_trigger *online_trig; struct led_trigger *charging_blink_full_solid_trig; struct led_trigger *charging_orange_full_green_trig; #endif -- cgit v1.2.3 From a14a569a9918a0c7e340257a17dbc088bb27db72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 29 May 2024 08:27:11 +0200 Subject: platform/chrome: cros_ec_proto: Introduce cros_ec_cmd_readmem() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To read from the EC memory different mechanism are possible. ECs connected via LPC expose their memory via a ->cmd_readmem operation. Other protocols require the usage of EC_CMD_READ_MEMMAP, which on the other hand is not implemented by LPC ECs. Provide a helper that automatically selects the correct mechanism. Signed-off-by: Thomas Weißschuh Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20240529-cros_ec-hwmon-v4-1-5cdf0c5db50a@weissschuh.net Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_proto.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 8865e350c12a..1ddc52603f9a 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -261,6 +261,8 @@ int cros_ec_get_sensor_count(struct cros_ec_dev *ec); int cros_ec_cmd(struct cros_ec_device *ec_dev, unsigned int version, int command, const void *outdata, size_t outsize, void *indata, size_t insize); +int cros_ec_cmd_readmem(struct cros_ec_device *ec_dev, u8 offset, u8 size, void *dest); + /** * cros_ec_get_time_ns() - Return time in ns. * -- cgit v1.2.3 From 6b2e29977518ec13ef3022f234ff8f3014c243da Mon Sep 17 00:00:00 2001 From: Lakshmi Sowjanya D Date: Mon, 13 May 2024 16:08:02 +0530 Subject: timekeeping: Provide infrastructure for converting to/from a base clock Hardware time stamps like provided by PTP clock implementations are based on a clock which feeds both the PCIe device and the system clock. For further processing the underlying hardwarre clock timestamp must be converted to the system clock. Right now this requires drivers to invoke an architecture specific conversion function, e.g. to convert the ART (Always Running Timer) timestamp to a TSC timestamp. As the system clock is aware of the underlying base clock, this can be moved to the core code by providing a base clock property for the system clock which contains the conversion factors and assigning a clocksource ID to the base clock. Add the required data structures and the conversion infrastructure in the core code to prepare for converting X86 and the related PTP drivers over. [ tglx: Added a missing READ_ONCE(). Massaged change log ] Co-developed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner Co-developed-by: Christopher S. Hall Signed-off-by: Christopher S. Hall Signed-off-by: Lakshmi Sowjanya D Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240513103813.5666-2-lakshmi.sowjanya.d@intel.com --- include/linux/clocksource.h | 27 +++++++++++++++++++++++++++ include/linux/timekeeping.h | 2 ++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 0ad8b550bb4b..d35b677b08fe 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -21,6 +21,7 @@ #include #include +struct clocksource_base; struct clocksource; struct module; @@ -50,6 +51,7 @@ struct module; * multiplication * @name: Pointer to clocksource name * @list: List head for registration (internal) + * @freq_khz: Clocksource frequency in khz. * @rating: Rating value for selection (higher is better) * To avoid rating inflation the following * list should give you a guide as to how @@ -70,6 +72,8 @@ struct module; * validate the clocksource from which the snapshot was * taken. * @flags: Flags describing special properties + * @base: Hardware abstraction for clock on which a clocksource + * is based * @enable: Optional function to enable the clocksource * @disable: Optional function to disable the clocksource * @suspend: Optional suspend function for the clocksource @@ -107,10 +111,12 @@ struct clocksource { u64 max_cycles; const char *name; struct list_head list; + u32 freq_khz; int rating; enum clocksource_ids id; enum vdso_clock_mode vdso_clock_mode; unsigned long flags; + struct clocksource_base *base; int (*enable)(struct clocksource *cs); void (*disable)(struct clocksource *cs); @@ -306,4 +312,25 @@ static inline unsigned int clocksource_get_max_watchdog_retry(void) void clocksource_verify_percpu(struct clocksource *cs); +/** + * struct clocksource_base - hardware abstraction for clock on which a clocksource + * is based + * @id: Defaults to CSID_GENERIC. The id value is used for conversion + * functions which require that the current clocksource is based + * on a clocksource_base with a particular ID in certain snapshot + * functions to allow callers to validate the clocksource from + * which the snapshot was taken. + * @freq_khz: Nominal frequency of the base clock in kHz + * @offset: Offset between the base clock and the clocksource + * @numerator: Numerator of the clock ratio between base clock and the clocksource + * @denominator: Denominator of the clock ratio between base clock and the clocksource + */ +struct clocksource_base { + enum clocksource_ids id; + u32 freq_khz; + u64 offset; + u32 numerator; + u32 denominator; +}; + #endif /* _LINUX_CLOCKSOURCE_H */ diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 0ea7823b7f31..b2ee182d891e 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -310,10 +310,12 @@ struct system_device_crosststamp { * timekeeping code to verify comparability of two cycle values. * The default ID, CSID_GENERIC, does not identify a specific * clocksource. + * @use_nsecs: @cycles is in nanoseconds. */ struct system_counterval_t { u64 cycles; enum clocksource_ids cs_id; + bool use_nsecs; }; /* -- cgit v1.2.3 From 3a52886c8f972c3a5b70bfec330c71817cd7fc63 Mon Sep 17 00:00:00 2001 From: Lakshmi Sowjanya D Date: Mon, 13 May 2024 16:08:03 +0530 Subject: x86/tsc: Provide ART base clock information for TSC The core code provides a new mechanism to allow conversion between ART and TSC. This allows to replace the x86 specific ART/TSC conversion functions. Prepare for removal by filling in the base clock conversion information for ART and associating the base clock to the TSC clocksource. The existing conversion functions will be removed once the usage sites are converted over to the new model. [ tglx: Massaged change log ] Co-developed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner Co-developed-by: Christopher S. Hall Signed-off-by: Christopher S. Hall Signed-off-by: Lakshmi Sowjanya D Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240513103813.5666-3-lakshmi.sowjanya.d@intel.com --- include/linux/clocksource_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h index a4fa3436940c..2bb4d8c2f1b0 100644 --- a/include/linux/clocksource_ids.h +++ b/include/linux/clocksource_ids.h @@ -9,6 +9,7 @@ enum clocksource_ids { CSID_X86_TSC_EARLY, CSID_X86_TSC, CSID_X86_KVM_CLK, + CSID_X86_ART, CSID_MAX, }; -- cgit v1.2.3 From 02ecee07ca30f76f2a0f1381661a688b8e501ab0 Mon Sep 17 00:00:00 2001 From: Lakshmi Sowjanya D Date: Mon, 13 May 2024 16:08:10 +0530 Subject: timekeeping: Add function to convert realtime to base clock PPS (Pulse Per Second) generates a hardware pulse every second based on CLOCK_REALTIME. This works fine when the pulse is generated in software from a hrtimer callback function. For hardware which generates the pulse by programming a timer it is required to convert CLOCK_REALTIME to the underlying hardware clock. The X86 Timed IO device is based on the Always Running Timer (ART), which is the base clock of the TSC, which is usually the system clocksource on X86. The core code already has functionality to convert base clock timestamps to system clocksource timestamps, but there is no support for converting the other way around. Provide the required functionality to support such devices in a generic way to avoid code duplication in drivers: 1) ktime_real_to_base_clock() to convert a CLOCK_REALTIME timestamp to a base clock timestamp 2) timekeeping_clocksource_has_base() to allow drivers to validate that the system clocksource is based on a particular clocksource ID. [ tglx: Simplify timekeeping_clocksource_has_base() and add missing READ_ONCE() ] Co-developed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner Co-developed-by: Christopher S. Hall Signed-off-by: Christopher S. Hall Signed-off-by: Lakshmi Sowjanya D Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240513103813.5666-10-lakshmi.sowjanya.d@intel.com --- include/linux/timekeeping.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index b2ee182d891e..fc12a9ba2c88 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -318,6 +318,10 @@ struct system_counterval_t { bool use_nsecs; }; +extern bool ktime_real_to_base_clock(ktime_t treal, + enum clocksource_ids base_id, u64 *cycles); +extern bool timekeeping_clocksource_has_base(enum clocksource_ids id); + /* * Get cross timestamp between system clock and device clock */ -- cgit v1.2.3 From fcf544ac64397776a18246eba5a8ca72a47c4405 Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Wed, 24 Apr 2024 05:49:00 -0700 Subject: soc: xilinx: Add cb event for subsystem restart Add support to register subsystem restart events from firmware for Versal and Versal NET platforms. This event is received when firmware requests for subsystem restart. After receiving this event, the kernel needs to be restarted. Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20240424124900.29287-1-jay.buddhabhatti@amd.com Signed-off-by: Michal Simek --- include/linux/firmware/xlnx-event-manager.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-event-manager.h b/include/linux/firmware/xlnx-event-manager.h index 82e8254b0f80..645dd34155e6 100644 --- a/include/linux/firmware/xlnx-event-manager.h +++ b/include/linux/firmware/xlnx-event-manager.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Xilinx Event Management Driver + * + * Copyright (C) 2024, Advanced Micro Devices, Inc. + */ #ifndef _FIRMWARE_XLNX_EVENT_MANAGER_H_ #define _FIRMWARE_XLNX_EVENT_MANAGER_H_ @@ -7,6 +12,11 @@ #define CB_MAX_PAYLOAD_SIZE (4U) /*In payload maximum 32bytes */ +#define EVENT_SUBSYSTEM_RESTART (4U) + +#define PM_DEV_ACPU_0_0 (0x1810c0afU) +#define PM_DEV_ACPU_0 (0x1810c003U) + /************************** Exported Function *****************************/ typedef void (*event_cb_func_t)(const u32 *payload, void *data); -- cgit v1.2.3 From 494c55a1ec0ab40198cf43f5a41c7c5e0b70e7fc Mon Sep 17 00:00:00 2001 From: Ronak Jain Date: Thu, 25 Apr 2024 02:59:13 -0700 Subject: firmware: xilinx: Move FIRMWARE_VERSION_MASK to xlnx-zynqmp.h Move FIRMWARE_VERSION_MASK macro to xlnx-zynqmp.h so that other drivers can use it for verifying the supported firmware version. Signed-off-by: Ronak Jain Signed-off-by: Anand Ashok Dumbre Signed-off-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/20240425095913.919390-1-ronak.jain@amd.com Signed-off-by: Michal Simek --- include/linux/firmware/xlnx-zynqmp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 1a069a56c961..d7d07afc0532 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -52,6 +52,9 @@ #define API_ID_MASK GENMASK(7, 0) #define MODULE_ID_MASK GENMASK(11, 8) +/* Firmware feature check version mask */ +#define FIRMWARE_VERSION_MASK 0xFFFFU + /* ATF only commands */ #define TF_A_PM_REGISTER_SGI 0xa04 #define PM_GET_TRUSTZONE_VERSION 0xa03 -- cgit v1.2.3 From a5b7365f28c191df6b93f60942d2b9a9fe71746c Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 3 Jun 2024 14:58:41 +0800 Subject: soundwire: bus: add stream refcount The notion of stream is by construction based on a multi-bus capability, to allow for aggregation of Peripheral devices or functions located on different segments. We currently count how many master_rt contexts are used by a stream, but we don't have the dual refcount of how many streams are allocated on a given bus. This refcount will be useful to check if BTP/BRA streams can be allocated. Note that the stream_refcount is modified in sdw_master_rt_alloc() and sdw_master_rt_free() which are both called with the bus_lock mutex held, so there's no need for refcount_ primitives for additional protection. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20240603065841.4860-2-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 13e96d8b7423..94fc1b57c57b 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -903,6 +903,7 @@ struct sdw_master_ops { * meaningful if multi_link is set. If set to 1, hardware-based * synchronization will be used even if a stream only uses a single * SoundWire segment. + * @stream_refcount: number of streams currently using this bus */ struct sdw_bus { struct device *dev; @@ -933,6 +934,7 @@ struct sdw_bus { u32 bank_switch_timeout; bool multi_link; int hw_sync_min_links; + int stream_refcount; }; int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, -- cgit v1.2.3 From 9b5fd115e7d5a98b82054cff5c96f6768ee06845 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 3 Jun 2024 15:02:40 +0800 Subject: soundwire: intel_ace2.x: add AC timing extensions for PantherLake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ACE3 IP used in PantherLake exposes new bitfields in the ACTMCTL register to better control clocks/delays. These bitfields were reserved/zero in the ACE2.x IP, to simplify the integration the new bifields are added unconditionally. The behavior will only be impacted when the firmware exposes DSD properties to set non-zero values. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Reviewed-by: Ranjani Sridharan Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20240603070240.5165-1-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 8e78417156e3..d537587b4499 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -182,6 +182,11 @@ #define SDW_SHIM2_INTEL_VS_ACTMCTL_DODSE BIT(2) #define SDW_SHIM2_INTEL_VS_ACTMCTL_DOAIS GENMASK(4, 3) #define SDW_SHIM2_INTEL_VS_ACTMCTL_DOAISE BIT(5) +#define SDW_SHIM3_INTEL_VS_ACTMCTL_CLSS BIT(6) +#define SDW_SHIM3_INTEL_VS_ACTMCTL_CLDS GENMASK(11, 7) +#define SDW_SHIM3_INTEL_VS_ACTMCTL_DODSE2 GENMASK(13, 12) +#define SDW_SHIM3_INTEL_VS_ACTMCTL_DOAISE2 BIT(14) +#define SDW_SHIM3_INTEL_VS_ACTMCTL_CLDE BIT(15) /** * struct sdw_intel_stream_params_data: configuration passed during -- cgit v1.2.3 From 32fe91524e1651b9a0e11ddfbc63de9aa485cf48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 13 May 2024 11:25:18 +0200 Subject: sysctl: constify ctl_table arguments of utility function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In a future commit the proc_handlers themselves will change to "const struct ctl_table". As a preparation for that adapt the internal helper. Signed-off-by: Thomas Weißschuh Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- include/linux/sysctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 09db2f2e6488..54fbec062772 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -237,7 +237,7 @@ extern struct ctl_table_header *register_sysctl_mount_point(const char *path); void do_sysctl_args(void); bool sysctl_is_alias(char *param); -int do_proc_douintvec(struct ctl_table *table, int write, +int do_proc_douintvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, -- cgit v1.2.3 From 3d9a0a25336433d48ddca1e1f4fea25fd4a3b1d8 Mon Sep 17 00:00:00 2001 From: Sreenath Vijayan Date: Thu, 30 May 2024 13:15:47 +0530 Subject: printk: Rename console_replay_all() and update context Rename console_replay_all() to console_try_replay_all() to make clear that the implementation is best effort. Also, the function should not be called in NMI context as it takes locks, so update the comment in code. Fixes: 693f75b91a91 ("printk: Add function to replay kernel log on consoles") Fixes: 1b743485e27f ("tty/sysrq: Replay kernel log messages on consoles via sysrq") Suggested-by: Petr Mladek Signed-off-by: Shimoyashiki Taichi Signed-off-by: Sreenath Vijayan Link: https://lore.kernel.org/r/Zlguq/wU21Z8MqI4@sreenath.vijayan@sony.com Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek --- include/linux/printk.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 40afab23881a..ca4c9271daf6 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -195,7 +195,7 @@ void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; void printk_trigger_flush(void); -void console_replay_all(void); +void console_try_replay_all(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -275,7 +275,7 @@ static inline void dump_stack(void) static inline void printk_trigger_flush(void) { } -static inline void console_replay_all(void) +static inline void console_try_replay_all(void) { } #endif -- cgit v1.2.3 From 96a02b9fa95108269cd0cd012f091f86bd6f41a4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 15 May 2024 17:08:04 +0200 Subject: KVM: Unexport kvm_debugfs_dir After faf01aef0570 ("KVM: PPC: Merge powerpc's debugfs entry content into generic entry") kvm_debugfs_dir is not used anywhere else outside of kvm_main.c Unexport it and make it static. Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240515150804.9354-1-bp@kernel.org Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 692c01e41a18..c80fe03a1fa4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1955,8 +1955,6 @@ struct _kvm_stats_desc { HALT_POLL_HIST_COUNT), \ STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking) -extern struct dentry *kvm_debugfs_dir; - ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, const struct _kvm_stats_desc *desc, void *stats, size_t size_stats, -- cgit v1.2.3 From 61df7b82820494368bd46071ca97e43a3dfc3b11 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 2 May 2024 17:57:51 -0400 Subject: lsm: fixup the inode xattr capability handling The current security_inode_setxattr() and security_inode_removexattr() hooks rely on individual LSMs to either call into the associated capability hooks (cap_inode_setxattr() or cap_inode_removexattr()), or return a magic value of 1 to indicate that the LSM layer itself should perform the capability checks. Unfortunately, with the default return value for these LSM hooks being 0, an individual LSM hook returning a 1 will cause the LSM hook processing to exit early, potentially skipping a LSM. Thankfully, with the exception of the BPF LSM, none of the LSMs which currently register inode xattr hooks should end up returning a value of 1, and in the BPF LSM case, with the BPF LSM hooks executing last there should be no real harm in stopping processing of the LSM hooks. However, the reliance on the individual LSMs to either call the capability hooks themselves, or signal the LSM with a return value of 1, is fragile and relies on a specific set of LSMs being enabled. This patch is an effort to resolve, or minimize, these issues. Before we discuss the solution, there are a few observations and considerations that we need to take into account: * BPF LSM registers an implementation for every LSM hook, and that implementation simply returns the hook's default return value, a 0 in this case. We want to ensure that the default BPF LSM behavior results in the capability checks being called. * SELinux and Smack do not expect the traditional capability checks to be applied to the xattrs that they "own". * SELinux and Smack are currently written in such a way that the xattr capability checks happen before any additional LSM specific access control checks. SELinux does apply SELinux specific access controls to all xattrs, even those not "owned" by SELinux. * IMA and EVM also register xattr hooks but assume that the LSM layer and specific LSMs have already authorized the basic xattr operation. In order to ensure we perform the capability based access controls before the individual LSM access controls, perform only one capability access control check for each operation, and clarify the logic around applying the capability controls, we need a mechanism to determine if any of the enabled LSMs "own" a particular xattr and want to take responsibility for controlling access to that xattr. The solution in this patch is to create a new LSM hook, 'inode_xattr_skipcap', that is not exported to the rest of the kernel via a security_XXX() function, but is used by the LSM layer to determine if a LSM wants to control access to a given xattr and avoid the traditional capability controls. Registering an inode_xattr_skipcap hook is optional, if a LSM declines to register an implementation, or uses an implementation that simply returns the default value (0), there is no effect as the LSM continues to enforce the capability based controls (unless another LSM takes ownership of the xattr). If none of the LSMs signal that the capability checks should be skipped, the capability check is performed and if access is granted the individual LSM xattr access control hooks are executed, keeping with the DAC-before-LSM convention. Cc: stable@vger.kernel.org Acked-by: Casey Schaufler Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index f804b76cde44..8fd87f823d3a 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -144,6 +144,7 @@ LSM_HOOK(int, 0, inode_setattr, struct mnt_idmap *idmap, struct dentry *dentry, LSM_HOOK(void, LSM_RET_VOID, inode_post_setattr, struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) LSM_HOOK(int, 0, inode_getattr, const struct path *path) +LSM_HOOK(int, 0, inode_xattr_skipcap, const char *name) LSM_HOOK(int, 0, inode_setxattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) -- cgit v1.2.3 From 32d99593bdc91442fe6183d97b060210cc9c8193 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 16 Mar 2024 00:10:15 -0700 Subject: rcu: Add lockdep_assert_in_rcu_read_lock() and friends There is no direct RCU counterpart to lockdep_assert_irqs_disabled() and friends. Although it is possible to construct them, it would be more convenient to have the following lockdep assertions: lockdep_assert_in_rcu_read_lock() lockdep_assert_in_rcu_read_lock_bh() lockdep_assert_in_rcu_read_lock_sched() lockdep_assert_in_rcu_reader() This commit therefore creates them. Reported-by: Jens Axboe Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index dfd2399f2cde..8470a85f6563 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -421,11 +421,71 @@ static inline void rcu_preempt_sleep_check(void) { } "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) +// See RCU_LOCKDEP_WARN() for an explanation of the double call to +// debug_lockdep_rcu_enabled(). +static inline bool lockdep_assert_rcu_helper(bool c) +{ + return debug_lockdep_rcu_enabled() && + (c || !rcu_is_watching() || !rcu_lockdep_current_cpu_online()) && + debug_lockdep_rcu_enabled(); +} + +/** + * lockdep_assert_in_rcu_read_lock - WARN if not protected by rcu_read_lock() + * + * Splats if lockdep is enabled and there is no rcu_read_lock() in effect. + */ +#define lockdep_assert_in_rcu_read_lock() \ + WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map))) + +/** + * lockdep_assert_in_rcu_read_lock_bh - WARN if not protected by rcu_read_lock_bh() + * + * Splats if lockdep is enabled and there is no rcu_read_lock_bh() in effect. + * Note that local_bh_disable() and friends do not suffice here, instead an + * actual rcu_read_lock_bh() is required. + */ +#define lockdep_assert_in_rcu_read_lock_bh() \ + WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_bh_lock_map))) + +/** + * lockdep_assert_in_rcu_read_lock_sched - WARN if not protected by rcu_read_lock_sched() + * + * Splats if lockdep is enabled and there is no rcu_read_lock_sched() + * in effect. Note that preempt_disable() and friends do not suffice here, + * instead an actual rcu_read_lock_sched() is required. + */ +#define lockdep_assert_in_rcu_read_lock_sched() \ + WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_sched_lock_map))) + +/** + * lockdep_assert_in_rcu_reader - WARN if not within some type of RCU reader + * + * Splats if lockdep is enabled and there is no RCU reader of any + * type in effect. Note that regions of code protected by things like + * preempt_disable, local_bh_disable(), and local_irq_disable() all qualify + * as RCU readers. + * + * Note that this will never trigger in PREEMPT_NONE or PREEMPT_VOLUNTARY + * kernels that are not also built with PREEMPT_COUNT. But if you have + * lockdep enabled, you might as well also enable PREEMPT_COUNT. + */ +#define lockdep_assert_in_rcu_reader() \ + WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map) && \ + !lock_is_held(&rcu_bh_lock_map) && \ + !lock_is_held(&rcu_sched_lock_map) && \ + preemptible())) + #else /* #ifdef CONFIG_PROVE_RCU */ #define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c)) #define rcu_sleep_check() do { } while (0) +#define lockdep_assert_in_rcu_read_lock() do { } while (0) +#define lockdep_assert_in_rcu_read_lock_bh() do { } while (0) +#define lockdep_assert_in_rcu_read_lock_sched() do { } while (0) +#define lockdep_assert_in_rcu_reader() do { } while (0) + #endif /* #else #ifdef CONFIG_PROVE_RCU */ /* -- cgit v1.2.3 From ce418966a83320837f59f04a646fa4716e132dc8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 25 Apr 2024 16:18:32 +0200 Subject: rcu/nocb: Fix segcblist state machine comments about bypass The parts explaining the bypass lifecycle in (de-)offloading are out of date and/or wrong. Bypass is simply enabled whenever SEGCBLIST_RCU_CORE flag is off. Fix the comments accordingly. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 659d13a7ddaa..1e5b4ef167ca 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -141,7 +141,7 @@ struct rcu_cblist { * | SEGCBLIST_KTHREAD_GP | * | | * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | - * | handles callbacks concurrently. Bypass enqueue is enabled. | + * | handles callbacks concurrently. Bypass enqueue is disabled. | * | Invoke RCU core so we make sure not to preempt it in the middle with | * | leaving some urgent work unattended within a jiffy. | * ---------------------------------------------------------------------------- @@ -154,8 +154,7 @@ struct rcu_cblist { * | SEGCBLIST_KTHREAD_GP | * | | * | CB/GP kthreads and local rcu_core() handle callbacks concurrently | - * | holding nocb_lock. Wake up CB and GP kthreads if necessary. Disable | - * | bypass enqueue. | + * | holding nocb_lock. Wake up CB and GP kthreads if necessary. | * ---------------------------------------------------------------------------- * | * v @@ -185,7 +184,7 @@ struct rcu_cblist { * | | * | Callbacks processed by rcu_core() from softirqs or local | * | rcuc kthread, while holding nocb_lock. Forbid nocb_timer to be armed. | - * | Flush pending nocb_timer. Flush nocb bypass callbacks. | + * | Flush pending nocb_timer. | * ---------------------------------------------------------------------------- * | * v -- cgit v1.2.3 From aa97b9a56906f5965a7c5752790d174cadc8b820 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 25 Apr 2024 16:18:33 +0200 Subject: rcu/nocb: Fix segcblist state machine stale comments about timers The (de-)offloading process used to take care about the NOCB timer when it depended on the per-rdp NOCB locking. However this isn't the case anymore for a long while. It can now safely be armed and run during the (de-)offloading process, which doesn't care about it anymore. Update the comments accordingly. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 1e5b4ef167ca..8018045989af 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -80,8 +80,7 @@ struct rcu_cblist { * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED | * | | * | Callbacks processed by rcu_core() from softirqs or local | - * | rcuc kthread, while holding nocb_lock. Waking up CB and GP kthreads, | - * | allowing nocb_timer to be armed. | + * | rcuc kthread, while holding nocb_lock. Waking up CB and GP kthreads. | * ---------------------------------------------------------------------------- * | * v @@ -183,8 +182,7 @@ struct rcu_cblist { * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | * | | * | Callbacks processed by rcu_core() from softirqs or local | - * | rcuc kthread, while holding nocb_lock. Forbid nocb_timer to be armed. | - * | Flush pending nocb_timer. | + * | rcuc kthread, while holding nocb_lock. | * ---------------------------------------------------------------------------- * | * v -- cgit v1.2.3 From 483d5bf23125a9127ffcb3f7a3b3539b34df67d4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 25 Apr 2024 16:18:34 +0200 Subject: rcu/nocb: Use kthread parking instead of ad-hoc implementation Upon NOCB deoffloading, the rcuo kthread must be forced to sleep until the corresponding rdp is ever offloaded again. The deoffloader clears the SEGCBLIST_OFFLOADED flag, wakes up the rcuo kthread which then notices that change and clears in turn its SEGCBLIST_KTHREAD_CB flag before going to sleep, until it ever sees the SEGCBLIST_OFFLOADED flag again, should a re-offloading happen. Upon NOCB offloading, the rcuo kthread must be forced to wake up and handle callbacks until the corresponding rdp is ever deoffloaded again. The offloader sets the SEGCBLIST_OFFLOADED flag, wakes up the rcuo kthread which then notices that change and sets in turn its SEGCBLIST_KTHREAD_CB flag before going to check callbacks, until it ever sees the SEGCBLIST_OFFLOADED flag cleared again, should a de-offloading happen again. This is all a crude ad-hoc and error-prone kthread (un-)parking re-implementation. Consolidate the behaviour with the appropriate API instead. [ paulmck: Apply Qiang Zhang feedback provided in Link: below. ] Link: https://lore.kernel.org/all/20240509074046.15629-1-qiang.zhang1211@gmail.com/ Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 81 +++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 8018045989af..ba95c06675e1 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -84,31 +84,31 @@ struct rcu_cblist { * ---------------------------------------------------------------------------- * | * v - * ----------------------------------- - * | | - * v v - * --------------------------------------- ----------------------------------| - * | SEGCBLIST_RCU_CORE | | | SEGCBLIST_RCU_CORE | | - * | SEGCBLIST_LOCKING | | | SEGCBLIST_LOCKING | | - * | SEGCBLIST_OFFLOADED | | | SEGCBLIST_OFFLOADED | | - * | SEGCBLIST_KTHREAD_CB | | SEGCBLIST_KTHREAD_GP | - * | | | | - * | | | | - * | CB kthread woke up and | | GP kthread woke up and | - * | acknowledged SEGCBLIST_OFFLOADED. | | acknowledged SEGCBLIST_OFFLOADED| - * | Processes callbacks concurrently | | | - * | with rcu_core(), holding | | | - * | nocb_lock. | | | - * --------------------------------------- ----------------------------------- - * | | - * ----------------------------------- + * ---------------------------------------------------------------------------- + * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED | + * | + unparked CB kthread | + * | | + * | CB kthread got unparked and processes callbacks concurrently with | + * | rcu_core(), holding nocb_lock. | + * --------------------------------------------------------------------------- + * | + * v + * ---------------------------------------------------------------------------| + * | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_GP | + * | + unparked CB kthread | + * | | + * | GP kthread woke up and acknowledged nocb_lock. | + * ---------------------------------------- ----------------------------------- * | * v * |--------------------------------------------------------------------------| - * | SEGCBLIST_LOCKING | | - * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | * | SEGCBLIST_KTHREAD_GP | | - * | SEGCBLIST_KTHREAD_CB | + * | + unparked CB kthread | * | | * | Kthreads handle callbacks holding nocb_lock, local rcu_core() stops | * | handling callbacks. Enable bypass queueing. | @@ -124,8 +124,8 @@ struct rcu_cblist { * |--------------------------------------------------------------------------| * | SEGCBLIST_LOCKING | | * | SEGCBLIST_OFFLOADED | | - * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | + * | + unparked CB kthread | * | | * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | * | ignores callbacks. Bypass enqueue is enabled. | @@ -136,8 +136,8 @@ struct rcu_cblist { * | SEGCBLIST_RCU_CORE | | * | SEGCBLIST_LOCKING | | * | SEGCBLIST_OFFLOADED | | - * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | + * | + unparked CB kthread | * | | * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | * | handles callbacks concurrently. Bypass enqueue is disabled. | @@ -149,40 +149,31 @@ struct rcu_cblist { * |--------------------------------------------------------------------------| * | SEGCBLIST_RCU_CORE | | * | SEGCBLIST_LOCKING | | - * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | + * | + unparked CB kthread | * | | * | CB/GP kthreads and local rcu_core() handle callbacks concurrently | - * | holding nocb_lock. Wake up CB and GP kthreads if necessary. | + * | holding nocb_lock. Wake up GP kthread if necessary. | * ---------------------------------------------------------------------------- * | * v - * ----------------------------------- - * | | - * v v - * ---------------------------------------------------------------------------| - * | | | - * | SEGCBLIST_RCU_CORE | | SEGCBLIST_RCU_CORE | | - * | SEGCBLIST_LOCKING | | SEGCBLIST_LOCKING | | - * | SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | - * | | | - * | GP kthread woke up and | CB kthread woke up and | - * | acknowledged the fact that | acknowledged the fact that | - * | SEGCBLIST_OFFLOADED got cleared. | SEGCBLIST_OFFLOADED got cleared. | - * | | The CB kthread goes to sleep | - * | The callbacks from the target CPU | until it ever gets re-offloaded. | - * | will be ignored from the GP kthread | | - * | loop. | | + * |--------------------------------------------------------------------------| + * | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | + * | + unparked CB kthread | + * | | + * | GP kthread woke up and acknowledged the fact that SEGCBLIST_OFFLOADED | + * | got cleared. The callbacks from the target CPU will be ignored from the| + * | GP kthread loop. | * ---------------------------------------------------------------------------- - * | | - * ----------------------------------- * | * v * ---------------------------------------------------------------------------- * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | + * | + parked CB kthread | * | | - * | Callbacks processed by rcu_core() from softirqs or local | - * | rcuc kthread, while holding nocb_lock. | + * | CB kthread is parked. Callbacks processed by rcu_core() from softirqs or | + * | local rcuc kthread, while holding nocb_lock. | * ---------------------------------------------------------------------------- * | * v -- cgit v1.2.3 From 9855c37edf0009cc276cecfee09f7e76e2380212 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 25 Apr 2024 16:24:04 +0200 Subject: Revert "rcu-tasks: Fix synchronize_rcu_tasks() VS zap_pid_ns_processes()" This reverts commit 28319d6dc5e2ffefa452c2377dd0f71621b5bff0. The race it fixed was subject to conditions that don't exist anymore since: 1612160b9127 ("rcu-tasks: Eliminate deadlocks involving do_exit() and RCU tasks") This latter commit removes the use of SRCU that used to cover the RCU-tasks blind spot on exit between the tasklist's removal and the final preemption disabling. The task is now placed instead into a temporary list inside which voluntary sleeps are accounted as RCU-tasks quiescent states. This would disarm the deadlock initially reported against PID namespace exit. Signed-off-by: Frederic Weisbecker Reviewed-by: Oleg Nesterov Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index dfd2399f2cde..61cb3de236af 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -209,7 +209,6 @@ void synchronize_rcu_tasks_rude(void); #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); -void exit_tasks_rcu_stop(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_classic_qs(t, preempt) do { } while (0) @@ -218,7 +217,6 @@ void exit_tasks_rcu_finish(void); #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } -static inline void exit_tasks_rcu_stop(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ -- cgit v1.2.3 From 4ce6e8a859f0503d97aac6869bc3b1a24b15601d Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 28 May 2024 13:33:29 -0700 Subject: hwmon: Add PEC attribute support to hardware monitoring core Several hardware monitoring chips optionally support Packet Error Checking (PEC). For some chips, PEC support can be enabled simply by setting I2C_CLIENT_PEC in the i2c client data structure. Others require chip specific code to enable or disable PEC support. Introduce hwmon_chip_pec and HWMON_C_PEC to simplify adding configurable PEC support for hardware monitoring drivers. A driver can set HWMON_C_PEC in its chip information data to indicate PEC support. If a chip requires chip specific code to enable or disable PEC support, the driver only needs to implement support for the hwmon_chip_pec attribute to its write function. Packet Error Checking is only supported for SMBus devices. HWMON_C_PEC must therefore only be set by a driver if the parent device is an I2C device. Attempts to set HWMON_C_PEC on any other device type is not supported and rejected. The code calls i2c_check_functionality() to check if PEC is supported by the I2C/SMBus controller. This function is only available if CONFIG_I2C is enabled and reachable. For this reason, the added code needs to depend on reachability of CONFIG_I2C. Cc: Radu Sabau Acked-by: Nuno Sa Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index edf96f249eb5..e94314760aab 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -45,6 +45,7 @@ enum hwmon_chip_attributes { hwmon_chip_power_samples, hwmon_chip_temp_samples, hwmon_chip_beep_enable, + hwmon_chip_pec, }; #define HWMON_C_TEMP_RESET_HISTORY BIT(hwmon_chip_temp_reset_history) @@ -60,6 +61,7 @@ enum hwmon_chip_attributes { #define HWMON_C_POWER_SAMPLES BIT(hwmon_chip_power_samples) #define HWMON_C_TEMP_SAMPLES BIT(hwmon_chip_temp_samples) #define HWMON_C_BEEP_ENABLE BIT(hwmon_chip_beep_enable) +#define HWMON_C_PEC BIT(hwmon_chip_pec) enum hwmon_temp_attributes { hwmon_temp_enable, -- cgit v1.2.3 From f85d39dd7ed89ffdd622bc1de247ffba8d961504 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 27 May 2024 19:35:38 +0200 Subject: kcov, usb: disable interrupts in kcov_remote_start_usb_softirq After commit 8fea0c8fda30 ("usb: core: hcd: Convert from tasklet to BH workqueue"), usb_giveback_urb_bh() runs in the BH workqueue with interrupts enabled. Thus, the remote coverage collection section in usb_giveback_urb_bh()-> __usb_hcd_giveback_urb() might be interrupted, and the interrupt handler might invoke __usb_hcd_giveback_urb() again. This breaks KCOV, as it does not support nested remote coverage collection sections within the same context (neither in task nor in softirq). Update kcov_remote_start/stop_usb_softirq() to disable interrupts for the duration of the coverage collection section to avoid nested sections in the softirq context (in addition to such in the task context, which are already handled). Reported-by: Tetsuo Handa Closes: https://lore.kernel.org/linux-usb/0f4d1964-7397-485b-bc48-11c01e2fcbca@I-love.SAKURA.ne.jp/ Closes: https://syzkaller.appspot.com/bug?extid=0438378d6f157baae1a2 Suggested-by: Alan Stern Fixes: 8fea0c8fda30 ("usb: core: hcd: Convert from tasklet to BH workqueue") Cc: stable@vger.kernel.org Acked-by: Dmitry Vyukov Signed-off-by: Andrey Konovalov Link: https://lore.kernel.org/r/20240527173538.4989-1-andrey.konovalov@linux.dev Signed-off-by: Greg Kroah-Hartman --- include/linux/kcov.h | 47 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kcov.h b/include/linux/kcov.h index b851ba415e03..1068a7318d89 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -55,21 +55,47 @@ static inline void kcov_remote_start_usb(u64 id) /* * The softirq flavor of kcov_remote_*() functions is introduced as a temporary - * work around for kcov's lack of nested remote coverage sections support in - * task context. Adding support for nested sections is tracked in: - * https://bugzilla.kernel.org/show_bug.cgi?id=210337 + * workaround for KCOV's lack of nested remote coverage sections support. + * + * Adding support is tracked in https://bugzilla.kernel.org/show_bug.cgi?id=210337. + * + * kcov_remote_start_usb_softirq(): + * + * 1. Only collects coverage when called in the softirq context. This allows + * avoiding nested remote coverage collection sections in the task context. + * For example, USB/IP calls usb_hcd_giveback_urb() in the task context + * within an existing remote coverage collection section. Thus, KCOV should + * not attempt to start collecting coverage within the coverage collection + * section in __usb_hcd_giveback_urb() in this case. + * + * 2. Disables interrupts for the duration of the coverage collection section. + * This allows avoiding nested remote coverage collection sections in the + * softirq context (a softirq might occur during the execution of a work in + * the BH workqueue, which runs with in_serving_softirq() > 0). + * For example, usb_giveback_urb_bh() runs in the BH workqueue with + * interrupts enabled, so __usb_hcd_giveback_urb() might be interrupted in + * the middle of its remote coverage collection section, and the interrupt + * handler might invoke __usb_hcd_giveback_urb() again. */ -static inline void kcov_remote_start_usb_softirq(u64 id) +static inline unsigned long kcov_remote_start_usb_softirq(u64 id) { - if (in_serving_softirq()) + unsigned long flags = 0; + + if (in_serving_softirq()) { + local_irq_save(flags); kcov_remote_start_usb(id); + } + + return flags; } -static inline void kcov_remote_stop_softirq(void) +static inline void kcov_remote_stop_softirq(unsigned long flags) { - if (in_serving_softirq()) + if (in_serving_softirq()) { kcov_remote_stop(); + local_irq_restore(flags); + } } #ifdef CONFIG_64BIT @@ -103,8 +129,11 @@ static inline u64 kcov_common_handle(void) } static inline void kcov_remote_start_common(u64 id) {} static inline void kcov_remote_start_usb(u64 id) {} -static inline void kcov_remote_start_usb_softirq(u64 id) {} -static inline void kcov_remote_stop_softirq(void) {} +static inline unsigned long kcov_remote_start_usb_softirq(u64 id) +{ + return 0; +} +static inline void kcov_remote_stop_softirq(unsigned long flags) {} #endif /* CONFIG_KCOV */ #endif /* _LINUX_KCOV_H */ -- cgit v1.2.3 From fe8db0bbe04d31ab17dc60e205c4a3365c92e67c Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Fri, 10 May 2024 20:12:42 +0000 Subject: usb: typec: Update sysfs when setting ops When adding altmode ops, update the sysfs group so that visibility is also recalculated. Reviewed-by: Heikki Krogerus Reviewed-by: Benson Leung Signed-off-by: Abhishek Pandit-Subedi Signed-off-by: Jameson Thies Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240510201244.2968152-3-jthies@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index b35b427561ab..549275f8ac1b 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -167,6 +167,9 @@ struct typec_port *typec_altmode2port(struct typec_altmode *alt); void typec_altmode_update_active(struct typec_altmode *alt, bool active); +void typec_altmode_set_ops(struct typec_altmode *alt, + const struct typec_altmode_ops *ops); + enum typec_plug_index { TYPEC_PLUG_SOP_P, TYPEC_PLUG_SOP_PP, -- cgit v1.2.3 From 971187350602d03c4a27c0783ff412502b95720a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Jul 2023 14:17:19 +0100 Subject: driver core: remove devm_device_add_groups() There is no more in-kernel users of this function, and no driver should ever be using it, so remove it from the kernel. Acked-by: Dmitry Torokhov Acked-by: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20230704131715.44454-8-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index fc3bd7116ab9..ace039151cb8 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1220,8 +1220,6 @@ static inline void device_remove_group(struct device *dev, return device_remove_groups(dev, groups); } -int __must_check devm_device_add_groups(struct device *dev, - const struct attribute_group **groups); int __must_check devm_device_add_group(struct device *dev, const struct attribute_group *grp); -- cgit v1.2.3 From 44a45be57f85165761fdabf072f9a97aa026ff61 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 23 May 2024 13:00:00 +0200 Subject: sysfs: Unbreak the build around sysfs_bin_attr_simple_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Günter reports build breakage for m68k "m5208evb_defconfig" plus CONFIG_BLK_DEV_INITRD=y caused by commit 66bc1a173328 ("treewide: Use sysfs_bin_attr_simple_read() helper"). The defconfig disables CONFIG_SYSFS, so sysfs_bin_attr_simple_read() is not compiled into the kernel. But init/initramfs.c references that function in the initializer of a struct bin_attribute. Add an empty static inline to avoid the build breakage. Fixes: 66bc1a173328 ("treewide: Use sysfs_bin_attr_simple_read() helper") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/r/e12b0027-b199-4de7-b83d-668171447ccc@roeck-us.net Signed-off-by: Lukas Wunner Tested-by: Guenter Roeck Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/05f4290439a58730738a15b0c99cd8576c4aa0d9.1716461752.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index a7d725fbf739..c4e64dc11206 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -750,6 +750,15 @@ static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } + +static inline ssize_t sysfs_bin_attr_simple_read(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return 0; +} #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, -- cgit v1.2.3 From 42675b723b4842bca7bfb0f209aa9a493a10324a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Jun 2024 15:07:05 -0400 Subject: function_graph: Convert ret_stack to a series of longs In order to make it possible to have multiple callbacks registered with the function_graph tracer, the retstack needs to be converted from an array of ftrace_ret_stack structures to an array of longs. This will allow to store the list of callbacks on the stack for the return side of the functions. Link: https://lore.kernel.org/linux-trace-kernel/171509092742.162236.4427737821399314856.stgit@devnote2 Link: https://lore.kernel.org/linux-trace-kernel/20240603190821.073111754@goodmis.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 61591ac6eab6..352939dab3a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1402,7 +1402,7 @@ struct task_struct { int curr_ret_depth; /* Stack of return addresses for return function tracing: */ - struct ftrace_ret_stack *ret_stack; + unsigned long *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; -- cgit v1.2.3 From 7aa1eaef9f4282c9acd39588b1fdc9dda7e73f34 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Jun 2024 15:07:08 -0400 Subject: function_graph: Allow multiple users to attach to function graph Allow for multiple users to attach to function graph tracer at the same time. Only 16 simultaneous users can attach to the tracer. This is because there's an array that stores the pointers to the attached fgraph_ops. When a function being traced is entered, each of the ftrace_ops entryfunc is called and if it returns non zero, its index into the array will be added to the shadow stack. On exit of the function being traced, the shadow stack will contain the indexes of the ftrace_ops on the array that want their retfunc to be called. Because a function may sleep for a long time (if a task sleeps itself), the return of the function may be literally days later. If the ftrace_ops is removed, its place on the array is replaced with a ftrace_ops that contains the stub functions and that will be called when the function finally returns. If another ftrace_ops is added that happens to get the same index into the array, its return function may be called. But that's actually the way things current work with the old function graph tracer. If one tracer is removed and another is added, the new one will get the return calls of the function traced by the previous one, thus this is not a regression. This can be fixed by adding a counter to each time the array item is updated and save that on the shadow stack as well, such that it won't be called if the index saved does not match the index on the array. Note, being able to filter functions when both are called is not completely handled yet, but that shouldn't be too hard to manage. Co-developed with Masami Hiramatsu: Link: https://lore.kernel.org/linux-trace-kernel/171509096221.162236.8806372072523195752.stgit@devnote2 Link: https://lore.kernel.org/linux-trace-kernel/20240603190821.555493396@goodmis.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 800995c425e0..8f00a7415bd5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1038,6 +1038,7 @@ extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace); struct fgraph_ops { trace_func_graph_ent_t entryfunc; trace_func_graph_ret_t retfunc; + int idx; }; /* @@ -1072,7 +1073,7 @@ function_graph_enter(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp); struct ftrace_ret_stack * -ftrace_graph_get_ret_stack(struct task_struct *task, int idx); +ftrace_graph_get_ret_stack(struct task_struct *task, int skip); unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, unsigned long *retp); -- cgit v1.2.3 From 37238abe3cb47b8daaa8706c9949f67b2a705cf1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Jun 2024 15:07:11 -0400 Subject: ftrace/function_graph: Pass fgraph_ops to function graph callbacks Pass the fgraph_ops structure to the function graph callbacks. This will allow callbacks to add a descriptor to a fgraph_ops private field that wil be added in the future and use it for the callbacks. This will be useful when more than one callback can be registered to the function graph tracer. Co-developed with Masami Hiramatsu: Link: https://lore.kernel.org/linux-trace-kernel/171509098588.162236.4787930115997357578.stgit@devnote2 Link: https://lore.kernel.org/linux-trace-kernel/20240603190822.035147698@goodmis.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8f00a7415bd5..4d817b85ac0d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1027,11 +1027,15 @@ struct ftrace_graph_ret { unsigned long long rettime; } __packed; +struct fgraph_ops; + /* Type of the callback handlers for tracing function graph*/ -typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ -typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ +typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *, + struct fgraph_ops *); /* return */ +typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *, + struct fgraph_ops *); /* entry */ -extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace); +extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct fgraph_ops *gops); #ifdef CONFIG_FUNCTION_GRAPH_TRACER -- cgit v1.2.3 From 26dda5631d1bb2f254f4c94aa87ee6c92a89cfdb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Jun 2024 15:07:12 -0400 Subject: ftrace: Allow function_graph tracer to be enabled in instances Now that function graph tracing can handle more than one user, allow it to be enabled in the ftrace instances. Note, the filtering of the functions is still joined by the top level set_ftrace_filter and friends, as well as the graph and nograph files. Co-developed with Masami Hiramatsu: Link: https://lore.kernel.org/linux-trace-kernel/171509099743.162236.1699959255446248163.stgit@devnote2 Link: https://lore.kernel.org/linux-trace-kernel/20240603190822.190630762@goodmis.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4d817b85ac0d..fd656e6d6b7c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1042,6 +1042,7 @@ extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct fgraph struct fgraph_ops { trace_func_graph_ent_t entryfunc; trace_func_graph_ret_t retfunc; + void *private; int idx; }; -- cgit v1.2.3 From ab6b84630382914ffcbab59f4913c9a60971d034 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Jun 2024 15:07:13 -0400 Subject: ftrace: Allow ftrace startup flags to exist without dynamic ftrace Some of the flags for ftrace_startup() may be exposed even when CONFIG_DYNAMIC_FTRACE is not configured in. This is fine as the difference between dynamic ftrace and static ftrace is done within the internals of ftrace itself. No need to have use cases fail to compile because dynamic ftrace is disabled. This change is needed to move some of the logic of what is passed to ftrace_startup() out of the parameters of ftrace_startup(). Link: https://lore.kernel.org/linux-trace-kernel/171509100890.162236.4362350342549122222.stgit@devnote2 Link: https://lore.kernel.org/linux-trace-kernel/20240603190822.350654104@goodmis.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fd656e6d6b7c..586018744785 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -509,6 +509,15 @@ static inline void stack_tracer_disable(void) { } static inline void stack_tracer_enable(void) { } #endif +enum { + FTRACE_UPDATE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_START_FUNC_RET = (1 << 3), + FTRACE_STOP_FUNC_RET = (1 << 4), + FTRACE_MAY_SLEEP = (1 << 5), +}; + #ifdef CONFIG_DYNAMIC_FTRACE void ftrace_arch_code_modify_prepare(void); @@ -603,15 +612,6 @@ void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); void ftrace_free_filter(struct ftrace_ops *ops); void ftrace_ops_set_global_filter(struct ftrace_ops *ops); -enum { - FTRACE_UPDATE_CALLS = (1 << 0), - FTRACE_DISABLE_CALLS = (1 << 1), - FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_START_FUNC_RET = (1 << 3), - FTRACE_STOP_FUNC_RET = (1 << 4), - FTRACE_MAY_SLEEP = (1 << 5), -}; - /* * The FTRACE_UPDATE_* enum is used to pass information back * from the ftrace_update_record() and ftrace_test_record() -- cgit v1.2.3 From 5fccc7552ccbc521bad61653ee739b1196b1bc53 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 3 Jun 2024 15:07:14 -0400 Subject: ftrace: Add subops logic to allow one ops to manage many There are cases where a single system will use a single function callback to handle multiple users. For example, to allow function_graph tracer to have multiple users where each can trace their own set of functions, it is useful to only have one ftrace_ops registered to ftrace that will call a function by the function_graph tracer to handle the multiplexing with the different registered function_graph tracers. Add a "subop_list" to the ftrace_ops that will hold a list of other ftrace_ops that the top ftrace_ops will manage. The function ftrace_startup_subops() that takes the manager ftrace_ops and a subop ftrace_ops it will manage. If there are no subops with the ftrace_ops yet, it will copy the ftrace_ops subop filters to the manager ftrace_ops and register that with ftrace_startup(), and adds the subop to its subop_list. If the manager ops already has something registered, it will then merge the new subop filters with what it has and enable the new functions that covers all the subops it has. To remove a subop, ftrace_shutdown_subops() is called which will use the subop_list of the manager ops to rebuild all the functions it needs to trace, and update the ftrace records to only call the functions it now has registered. If there are no more functions registered, it will then call ftrace_shutdown() to disable itself completely. Note, it is up to the manager ops callback to always make sure that the subops callbacks are called if its filter matches, as there are times in the update where the callback could be calling more functions than those that are currently registered. This could be updated to handle other systems other than function_graph, for example, fprobes could use this (but will need an interface to call ftrace_startup_subops()). Link: https://lore.kernel.org/linux-trace-kernel/20240603190822.508431129@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 586018744785..978a1d3b270a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -334,6 +334,7 @@ struct ftrace_ops { unsigned long trampoline; unsigned long trampoline_size; struct list_head list; + struct list_head subop_list; ftrace_ops_func_t ops_func; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS unsigned long direct_call; -- cgit v1.2.3 From d9bbfbd14f58d2955cc7a3efa8ae6d4e09ee5995 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 3 Jun 2024 15:07:15 -0400 Subject: ftrace: Allow subops filtering to be modified The subops filters use a "manager" ops to enable and disable its filters. The manager ops can handle more than one subops, and its filter is what controls what functions get set. Add a ftrace_hash_move_and_update_subops() function that will update the manager ops when the subops filters change. Link: https://lore.kernel.org/linux-trace-kernel/20240603190822.673932251@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 978a1d3b270a..63238a9a9270 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -227,6 +227,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * ftrace_enabled. * DIRECT - Used by the direct ftrace_ops helper for direct functions * (internal ftrace only, should not be used by others) + * SUBOP - Is controlled by another op in field managed. */ enum { FTRACE_OPS_FL_ENABLED = BIT(0), @@ -247,6 +248,7 @@ enum { FTRACE_OPS_FL_TRACE_ARRAY = BIT(15), FTRACE_OPS_FL_PERMANENT = BIT(16), FTRACE_OPS_FL_DIRECT = BIT(17), + FTRACE_OPS_FL_SUBOP = BIT(18), }; #ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS @@ -336,6 +338,7 @@ struct ftrace_ops { struct list_head list; struct list_head subop_list; ftrace_ops_func_t ops_func; + struct ftrace_ops *managed; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS unsigned long direct_call; #endif -- cgit v1.2.3 From c132be2c4fcc1150ad0791c2a85dd4c9ad0bd0c8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Jun 2024 15:07:16 -0400 Subject: function_graph: Have the instances use their own ftrace_ops for filtering Allow for instances to have their own ftrace_ops part of the fgraph_ops that makes the funtion_graph tracer filter on the set_ftrace_filter file of the instance and not the top instance. This uses the new ftrace_startup_subops(), by using graph_ops as the "manager ops" that defines the callback function and adds the functions defined by the filters of the ops for each trace instance. The callback defined by the manager ops will call the registered fgraph ops that were added to the fgraph_array. Co-developed with Masami Hiramatsu: Link: https://lore.kernel.org/linux-trace-kernel/171509102088.162236.15758883237657317789.stgit@devnote2 Link: https://lore.kernel.org/linux-trace-kernel/20240603190822.832946261@goodmis.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 63238a9a9270..8f865689e868 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1046,6 +1046,7 @@ extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct fgraph struct fgraph_ops { trace_func_graph_ent_t entryfunc; trace_func_graph_ret_t retfunc; + struct ftrace_ops ops; /* for the hash lists */ void *private; int idx; }; -- cgit v1.2.3 From df3ec5da6a1e7f6e142680d7c5266d3af187170b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 3 Jun 2024 15:07:17 -0400 Subject: function_graph: Add pid tracing back to function graph tracer Now that the function_graph has a main callback that handles the function graph subops tracing, it no longer honors the pid filtering of ftrace. Add back this logic in the function_graph code to update the gops callback for the entry function to test if it should trace the current task or not. Link: https://lore.kernel.org/linux-trace-kernel/20240603190822.991720703@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8f865689e868..e31ec8516de1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1040,6 +1040,7 @@ typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *, struct fgraph_ops *); /* entry */ extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct fgraph_ops *gops); +bool ftrace_pids_enabled(struct ftrace_ops *ops); #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -1048,6 +1049,7 @@ struct fgraph_ops { trace_func_graph_ret_t retfunc; struct ftrace_ops ops; /* for the hash lists */ void *private; + trace_func_graph_ent_t saved_func; int idx; }; -- cgit v1.2.3 From 4497412a1f7b5d9e0849f125652f2cc58cdba562 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Jun 2024 15:07:19 -0400 Subject: function_graph: Add "task variables" per task for fgraph_ops Add a "task variables" array on the tasks shadow ret_stack that is the size of longs for each possible registered fgraph_ops. That's a total of 16, taking up 8 * 16 = 128 bytes (out of a page size 4k). This will allow for fgraph_ops to do specific features on a per task basis having a way to maintain state for each task. Co-developed with Masami Hiramatsu: Link: https://lore.kernel.org/linux-trace-kernel/171509104383.162236.12239656156685718550.stgit@devnote2 Link: https://lore.kernel.org/linux-trace-kernel/20240603190823.308806126@goodmis.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e31ec8516de1..82cfc8e09cfd 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1089,6 +1089,7 @@ ftrace_graph_get_ret_stack(struct task_struct *task, int skip); unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, unsigned long *retp); +unsigned long *fgraph_get_task_var(struct fgraph_ops *gops); /* * Sometimes we don't want to trace a function with the function -- cgit v1.2.3 From 12117f3307b63f287756d7ec8cc4f11b94e1206a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Jun 2024 15:07:20 -0400 Subject: function_graph: Move set_graph_function tests to shadow stack global var The use of the task->trace_recursion for the logic used for the set_graph_function was a bit of an abuse of that variable. Now that there exists global vars that are per stack for registered graph traces, use that instead. Link: https://lore.kernel.org/linux-trace-kernel/171509105520.162236.10339831553995971290.stgit@devnote2 Link: https://lore.kernel.org/linux-trace-kernel/20240603190823.472955399@goodmis.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_recursion.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index 24ea8ac049b4..02e6afc6d7fe 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -44,9 +44,6 @@ enum { */ TRACE_IRQ_BIT, - /* Set if the function is in the set_graph_function file */ - TRACE_GRAPH_BIT, - /* * In the very unlikely case that an interrupt came in * at a start of graph tracing, and we want to trace @@ -60,7 +57,7 @@ enum { * that preempted a softirq start of a function that * preempted normal context!!!! Luckily, it can't be * greater than 3, so the next two bits are a mask - * of what the depth is when we set TRACE_GRAPH_BIT + * of what the depth is when we set TRACE_GRAPH_FL */ TRACE_GRAPH_DEPTH_START_BIT, -- cgit v1.2.3 From 068da098eb504469dc195137ae35eeacfe0c8de9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Jun 2024 15:07:21 -0400 Subject: function_graph: Move graph depth stored data to shadow stack global var The use of the task->trace_recursion for the logic used for the function graph depth was a bit of an abuse of that variable. Now that there exists global vars that are per stack for registered graph traces, use that instead. Link: https://lore.kernel.org/linux-trace-kernel/171509106728.162236.2398372644430125344.stgit@devnote2 Link: https://lore.kernel.org/linux-trace-kernel/20240603190823.634870264@goodmis.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_recursion.h | 29 ----------------------------- 1 file changed, 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index 02e6afc6d7fe..fdfb6f66718a 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -44,25 +44,6 @@ enum { */ TRACE_IRQ_BIT, - /* - * In the very unlikely case that an interrupt came in - * at a start of graph tracing, and we want to trace - * the function in that interrupt, the depth can be greater - * than zero, because of the preempted start of a previous - * trace. In an even more unlikely case, depth could be 2 - * if a softirq interrupted the start of graph tracing, - * followed by an interrupt preempting a start of graph - * tracing in the softirq, and depth can even be 3 - * if an NMI came in at the start of an interrupt function - * that preempted a softirq start of a function that - * preempted normal context!!!! Luckily, it can't be - * greater than 3, so the next two bits are a mask - * of what the depth is when we set TRACE_GRAPH_FL - */ - - TRACE_GRAPH_DEPTH_START_BIT, - TRACE_GRAPH_DEPTH_END_BIT, - /* * To implement set_graph_notrace, if this bit is set, we ignore * function graph tracing of called functions, until the return @@ -78,16 +59,6 @@ enum { #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) #define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) -#define trace_recursion_depth() \ - (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) -#define trace_recursion_set_depth(depth) \ - do { \ - current->trace_recursion &= \ - ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ - current->trace_recursion |= \ - ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ - } while (0) - #define TRACE_CONTEXT_BITS 4 #define TRACE_FTRACE_START TRACE_FTRACE_BIT -- cgit v1.2.3 From b84214890a9bc56f0fe4ec4fc72f2307ed05096d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Jun 2024 15:07:22 -0400 Subject: function_graph: Move graph notrace bit to shadow stack global var The use of the task->trace_recursion for the logic used for the function graph no-trace was a bit of an abuse of that variable. Now that there exists global vars that are per stack for registered graph traces, use that instead. Link: https://lore.kernel.org/linux-trace-kernel/171509107907.162236.6564679266777519065.stgit@devnote2 Link: https://lore.kernel.org/linux-trace-kernel/20240603190823.796709456@goodmis.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_recursion.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index fdfb6f66718a..ae04054a1be3 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -44,13 +44,6 @@ enum { */ TRACE_IRQ_BIT, - /* - * To implement set_graph_notrace, if this bit is set, we ignore - * function graph tracing of called functions, until the return - * function is called to clear it. - */ - TRACE_GRAPH_NOTRACE_BIT, - /* Used to prevent recursion recording from recursing. */ TRACE_RECORD_RECURSION_BIT, }; -- cgit v1.2.3 From 91c46b0aa917546432b5b219494859cda0edc39e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Jun 2024 15:07:23 -0400 Subject: function_graph: Implement fgraph_reserve_data() and fgraph_retrieve_data() Added functions that can be called by a fgraph_ops entryfunc and retfunc to store state between the entry of the function being traced to the exit of the same function. The fgraph_ops entryfunc() may call fgraph_reserve_data() to store up to 32 words onto the task's shadow ret_stack and this then can be retrieved by fgraph_retrieve_data() called by the corresponding retfunc(). Co-developed with Masami Hiramatsu: Link: https://lore.kernel.org/linux-trace-kernel/171509109089.162236.11372474169781184034.stgit@devnote2 Link: https://lore.kernel.org/linux-trace-kernel/20240603190823.959703050@goodmis.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Sven Schnelle Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alan Maguire Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Guo Ren Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 82cfc8e09cfd..9f61556a9491 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1053,6 +1053,9 @@ struct fgraph_ops { int idx; }; +void *fgraph_reserve_data(int idx, int size_bytes); +void *fgraph_retrieve_data(int idx, int *size_bytes); + /* * Stack of return addresses for functions * of a thread. -- cgit v1.2.3 From 3aee48a8e01f98d3285a0064285b0152cdbb8a9e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 8 May 2024 21:46:57 +0300 Subject: misc: eeprom_93xx46: Hide legacy platform data in the driver First of all, there is no user for the platform data in the kernel. Second, it needs a lot of updates to follow the modern standards of the kernel, including proper Device Tree bindings and device property handling. For now, just hide the legacy platform data in the driver's code. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20240508184905.2102633-5-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/eeprom_93xx46.h | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 include/linux/eeprom_93xx46.h (limited to 'include/linux') diff --git a/include/linux/eeprom_93xx46.h b/include/linux/eeprom_93xx46.h deleted file mode 100644 index 34c2175e6a1e..000000000000 --- a/include/linux/eeprom_93xx46.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Module: eeprom_93xx46 - * platform description for 93xx46 EEPROMs. - */ -#include - -struct eeprom_93xx46_platform_data { - unsigned char flags; -#define EE_ADDR8 0x01 /* 8 bit addr. cfg */ -#define EE_ADDR16 0x02 /* 16 bit addr. cfg */ -#define EE_READONLY 0x08 /* forbid writing */ -#define EE_SIZE1K 0x10 /* 1 kb of data, that is a 93xx46 */ -#define EE_SIZE2K 0x20 /* 2 kb of data, that is a 93xx56 */ -#define EE_SIZE4K 0x40 /* 4 kb of data, that is a 93xx66 */ - - unsigned int quirks; -/* Single word read transfers only; no sequential read. */ -#define EEPROM_93XX46_QUIRK_SINGLE_WORD_READ (1 << 0) -/* Instructions such as EWEN are (addrlen + 2) in length. */ -#define EEPROM_93XX46_QUIRK_INSTRUCTION_LENGTH (1 << 1) -/* Add extra cycle after address during a read */ -#define EEPROM_93XX46_QUIRK_EXTRA_READ_CYCLE BIT(2) - - /* - * optional hooks to control additional logic - * before and after spi transfer. - */ - void (*prepare)(void *); - void (*finish)(void *); - struct gpio_desc *select; -}; -- cgit v1.2.3 From 1968845d358e108cfbfba45538d64b3cbdf04ac2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 31 May 2024 17:51:29 +0300 Subject: driver core: device.h: Group of_node handling declarations and definitions There are a few of_node related APIs defined in the driver core. Group the respective declarations and definitions in the header. There is no functional change. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240531145129.1506733-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index fc3bd7116ab9..56f266429229 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1031,13 +1031,6 @@ static inline void device_lock_assert(struct device *dev) lockdep_assert_held(&dev->mutex); } -static inline struct device_node *dev_of_node(struct device *dev) -{ - if (!IS_ENABLED(CONFIG_OF) || !dev) - return NULL; - return dev->of_node; -} - static inline bool dev_has_sync_state(struct device *dev) { if (!dev) @@ -1144,10 +1137,18 @@ void unlock_device_hotplug(void); int lock_device_hotplug_sysfs(void); int device_offline(struct device *dev); int device_online(struct device *dev); + void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode); -void device_set_of_node_from_dev(struct device *dev, const struct device *dev2); void device_set_node(struct device *dev, struct fwnode_handle *fwnode); +void device_set_of_node_from_dev(struct device *dev, const struct device *dev2); + +static inline struct device_node *dev_of_node(struct device *dev) +{ + if (!IS_ENABLED(CONFIG_OF) || !dev) + return NULL; + return dev->of_node; +} static inline int dev_num_vf(struct device *dev) { -- cgit v1.2.3 From fd7179ece035417f44f7ecff086d6df674d8a5bd Mon Sep 17 00:00:00 2001 From: David Lechner Date: Thu, 30 May 2024 10:14:08 -0500 Subject: iio: introduce struct iio_scan_type This gives the channel scan_type a named type so that it can be used to simplify code in later commits. Signed-off-by: David Lechner Reviewed-by: Nuno Sa Link: https://lore.kernel.org/r/20240530-iio-add-support-for-multiple-scan-types-v3-1-cbc4acea2cfa@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 55e2b22086a1..19de573a944a 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -173,6 +173,27 @@ struct iio_event_spec { unsigned long mask_shared_by_all; }; +/** + * struct iio_scan_type - specification for channel data format in buffer + * @sign: 's' or 'u' to specify signed or unsigned + * @realbits: Number of valid bits of data + * @storagebits: Realbits + padding + * @shift: Shift right by this before masking out realbits. + * @repeat: Number of times real/storage bits repeats. When the + * repeat element is more than 1, then the type element in + * sysfs will show a repeat value. Otherwise, the number + * of repetitions is omitted. + * @endianness: little or big endian + */ +struct iio_scan_type { + char sign; + u8 realbits; + u8 storagebits; + u8 shift; + u8 repeat; + enum iio_endian endianness; +}; + /** * struct iio_chan_spec - specification of a single channel * @type: What type of measurement is the channel making. @@ -184,17 +205,6 @@ struct iio_event_spec { * @scan_index: Monotonic index to give ordering in scans when read * from a buffer. * @scan_type: struct describing the scan type - * @scan_type.sign: 's' or 'u' to specify signed or unsigned - * @scan_type.realbits: Number of valid bits of data - * @scan_type.storagebits: Realbits + padding - * @scan_type.shift: Shift right by this before masking out - * realbits. - * @scan_type.repeat: Number of times real/storage bits repeats. - * When the repeat element is more than 1, then - * the type element in sysfs will show a repeat - * value. Otherwise, the number of repetitions - * is omitted. - * @scan_type.endianness: little or big endian * @info_mask_separate: What information is to be exported that is specific to * this channel. * @info_mask_separate_available: What availability information is to be @@ -245,14 +255,7 @@ struct iio_chan_spec { int channel2; unsigned long address; int scan_index; - struct { - char sign; - u8 realbits; - u8 storagebits; - u8 shift; - u8 repeat; - enum iio_endian endianness; - } scan_type; + struct iio_scan_type scan_type; long info_mask_separate; long info_mask_separate_available; long info_mask_shared_by_type; -- cgit v1.2.3 From d8f2bb50845f2797f594ffe3cac9417abff4d7b0 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Thu, 30 May 2024 10:14:10 -0500 Subject: iio: add support for multiple scan types per channel This adds new fields to the iio_channel structure to support multiple scan types per channel. This is useful for devices that support multiple resolution modes or other modes that require different data formats of the raw data. To make use of this, drivers need to implement the new callback get_current_scan_type() to resolve the scan type for a given channel based on the current state of the driver. There is a new scan_type_ext field in the iio_channel structure that should be used to store the scan types for any channel that has more than one. There is also a new flag has_ext_scan_type that acts as a type discriminator for the scan_type/ext_scan_type union. A union is used so that we don't grow the size of the iio_channel structure and also makes it clear that scan_type and ext_scan_type are mutually exclusive. The buffer code is the only code in the IIO core code that is using the scan_type field. This patch updates the buffer code to use the new iio_channel_validate_scan_type() function to ensure it is returning the correct scan type for the current state of the device when reading the sysfs attributes. The buffer validation code is also update to validate any additional scan types that are set in the scan_type_ext field. Part of that code is refactored to a new function to avoid duplication. Some userspace tools may need to be updated to re-read the scan type after writing any other attribute. During testing, we noticed that we had to restart iiod to get it to re-read the scan type after enabling oversampling on the ad7380 driver. Signed-off-by: David Lechner Reviewed-by: Nuno Sa Link: https://lore.kernel.org/r/20240530-iio-add-support-for-multiple-scan-types-v3-3-cbc4acea2cfa@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 19de573a944a..894309294182 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -204,7 +204,13 @@ struct iio_scan_type { * @address: Driver specific identifier. * @scan_index: Monotonic index to give ordering in scans when read * from a buffer. - * @scan_type: struct describing the scan type + * @scan_type: struct describing the scan type - mutually exclusive + * with ext_scan_type. + * @ext_scan_type: Used in rare cases where there is more than one scan + * format for a channel. When this is used, the flag + * has_ext_scan_type must be set and the driver must + * implement get_current_scan_type in struct iio_info. + * @num_ext_scan_type: Number of elements in ext_scan_type. * @info_mask_separate: What information is to be exported that is specific to * this channel. * @info_mask_separate_available: What availability information is to be @@ -248,6 +254,7 @@ struct iio_scan_type { * attributes but not for event codes. * @output: Channel is output. * @differential: Channel is differential. + * @has_ext_scan_type: True if ext_scan_type is used instead of scan_type. */ struct iio_chan_spec { enum iio_chan_type type; @@ -255,7 +262,13 @@ struct iio_chan_spec { int channel2; unsigned long address; int scan_index; - struct iio_scan_type scan_type; + union { + struct iio_scan_type scan_type; + struct { + const struct iio_scan_type *ext_scan_type; + unsigned int num_ext_scan_type; + }; + }; long info_mask_separate; long info_mask_separate_available; long info_mask_shared_by_type; @@ -273,6 +286,7 @@ struct iio_chan_spec { unsigned indexed:1; unsigned output:1; unsigned differential:1; + unsigned has_ext_scan_type:1; }; @@ -435,6 +449,9 @@ struct iio_trigger; /* forward declaration */ * for better event identification. * @validate_trigger: function to validate the trigger when the * current trigger gets changed. + * @get_current_scan_type: must be implemented by drivers that use ext_scan_type + * in the channel spec to return the index of the currently + * active ext_scan type for a channel. * @update_scan_mode: function to configure device and scan buffer when * channels have changed * @debugfs_reg_access: function to read or write register value of device @@ -519,6 +536,8 @@ struct iio_info { int (*validate_trigger)(struct iio_dev *indio_dev, struct iio_trigger *trig); + int (*get_current_scan_type)(const struct iio_dev *indio_dev, + const struct iio_chan_spec *chan); int (*update_scan_mode)(struct iio_dev *indio_dev, const unsigned long *scan_mask); int (*debugfs_reg_access)(struct iio_dev *indio_dev, @@ -804,6 +823,38 @@ static inline bool iio_read_acpi_mount_matrix(struct device *dev, } #endif +/** + * iio_get_current_scan_type - Get the current scan type for a channel + * @indio_dev: the IIO device to get the scan type for + * @chan: the channel to get the scan type for + * + * Most devices only have one scan type per channel and can just access it + * directly without calling this function. Core IIO code and drivers that + * implement ext_scan_type in the channel spec should use this function to + * get the current scan type for a channel. + * + * Returns: the current scan type for the channel or error. + */ +static inline const struct iio_scan_type +*iio_get_current_scan_type(const struct iio_dev *indio_dev, + const struct iio_chan_spec *chan) +{ + int ret; + + if (chan->has_ext_scan_type) { + ret = indio_dev->info->get_current_scan_type(indio_dev, chan); + if (ret < 0) + return ERR_PTR(ret); + + if (ret >= chan->num_ext_scan_type) + return ERR_PTR(-EINVAL); + + return &chan->ext_scan_type[ret]; + } + + return &chan->scan_type; +} + ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals); int iio_str_to_fixpoint(const char *str, int fract_mult, int *integer, -- cgit v1.2.3 From b6e6aca6c2b1b53fb4db4b672eaa2722a75aa6a2 Mon Sep 17 00:00:00 2001 From: Ramona Gradinariu Date: Mon, 27 May 2024 17:26:12 +0300 Subject: iio: imu: adis_buffer: Add buffer setup API with buffer attributes Add new API called devm_adis_setup_buffer_and_trigger_with_attrs() which also takes buffer attributes as a parameter. Rewrite devm_adis_setup_buffer_and_trigger() implementation such that it calls devm_adis_setup_buffer_and_trigger_with_attrs() with buffer attributes parameter NULL Reviewed-by: Nuno Sa Signed-off-by: Ramona Gradinariu Link: https://lore.kernel.org/r/20240527142618.275897-4-ramona.bolboaca13@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 8898966bc0f0..0fe3a2f63033 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -21,6 +21,7 @@ #define ADIS_REG_PAGE_ID 0x00 struct adis; +struct iio_dev_attr; /** * struct adis_timeouts - ADIS chip variant timeouts @@ -515,11 +516,19 @@ int adis_single_conversion(struct iio_dev *indio_dev, #define ADIS_ROT_CHAN(mod, addr, si, info_sep, info_all, bits) \ ADIS_MOD_CHAN(IIO_ROT, mod, addr, si, info_sep, info_all, bits) +#define devm_adis_setup_buffer_and_trigger(adis, indio_dev, trigger_handler) \ + devm_adis_setup_buffer_and_trigger_with_attrs((adis), (indio_dev), \ + (trigger_handler), NULL, \ + NULL) + #ifdef CONFIG_IIO_ADIS_LIB_BUFFER int -devm_adis_setup_buffer_and_trigger(struct adis *adis, struct iio_dev *indio_dev, - irq_handler_t trigger_handler); +devm_adis_setup_buffer_and_trigger_with_attrs(struct adis *adis, + struct iio_dev *indio_dev, + irq_handler_t trigger_handler, + const struct iio_buffer_setup_ops *ops, + const struct iio_dev_attr **buffer_attrs); int devm_adis_probe_trigger(struct adis *adis, struct iio_dev *indio_dev); @@ -529,8 +538,11 @@ int adis_update_scan_mode(struct iio_dev *indio_dev, #else /* CONFIG_IIO_BUFFER */ static inline int -devm_adis_setup_buffer_and_trigger(struct adis *adis, struct iio_dev *indio_dev, - irq_handler_t trigger_handler) +devm_adis_setup_buffer_and_trigger_with_attrs(struct adis *adis, + struct iio_dev *indio_dev, + irq_handler_t trigger_handler, + const struct iio_buffer_setup_ops *ops, + const struct iio_dev_attr **buffer_attrs) { return 0; } -- cgit v1.2.3 From 01724ce2d9405b2246bbb69701c74880eb56a34b Mon Sep 17 00:00:00 2001 From: Ramona Gradinariu Date: Mon, 27 May 2024 17:26:15 +0300 Subject: iio: imu: adis_trigger: Allow level interrupts for FIFO readings Currently, adis library allows configuration only for edge interrupts, needed for data ready sampling. This patch removes the restriction for level interrupts for devices which have FIFO support. Furthermore, in case of devices which have FIFO support, devm_request_threaded_irq is used for interrupt allocation, to avoid flooding the processor with the FIFO watermark level interrupt, which is active until enough data has been read from the FIFO. Reviewed-by: Nuno Sa Signed-off-by: Ramona Gradinariu Link: https://lore.kernel.org/r/20240527142618.275897-7-ramona.bolboaca13@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 0fe3a2f63033..4bb0a53cf7ea 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -85,6 +85,7 @@ struct adis_data { bool unmasked_drdy; bool has_paging; + bool has_fifo; unsigned int burst_reg_cmd; unsigned int burst_len; -- cgit v1.2.3 From ba098ed9829c5de4d65e5708d08d3508f2e70a7d Mon Sep 17 00:00:00 2001 From: Daisuke Nojiri Date: Tue, 4 Jun 2024 10:01:48 -0700 Subject: platform/chrome: Add struct ec_response_get_next_event_v3 Add struct ec_response_get_next_event_v3 to upgrade EC_CMD_GET_NEXT_EVENT to version 3. Signed-off-by: Daisuke Nojiri Link: https://lore.kernel.org/r/20240604170552.2517189-1-dnojiri@chromium.org Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_commands.h | 34 ++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index ecc47d5fe239..ec598057d1da 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -3463,6 +3463,34 @@ union __ec_align_offset1 ec_response_get_next_data_v1 { }; BUILD_ASSERT(sizeof(union ec_response_get_next_data_v1) == 16); +union __ec_align_offset1 ec_response_get_next_data_v3 { + uint8_t key_matrix[18]; + + /* Unaligned */ + uint32_t host_event; + uint64_t host_event64; + + struct __ec_todo_unpacked { + /* For aligning the fifo_info */ + uint8_t reserved[3]; + struct ec_response_motion_sense_fifo_info info; + } sensor_fifo; + + uint32_t buttons; + + uint32_t switches; + + uint32_t fp_events; + + uint32_t sysrq; + + /* CEC events from enum mkbp_cec_event */ + uint32_t cec_events; + + uint8_t cec_message[16]; +}; +BUILD_ASSERT(sizeof(union ec_response_get_next_data_v3) == 18); + struct ec_response_get_next_event { uint8_t event_type; /* Followed by event data if any */ @@ -3475,6 +3503,12 @@ struct ec_response_get_next_event_v1 { union ec_response_get_next_data_v1 data; } __ec_align1; +struct ec_response_get_next_event_v3 { + uint8_t event_type; + /* Followed by event data if any */ + union ec_response_get_next_data_v3 data; +} __ec_align1; + /* Bit indices for buttons and switches.*/ /* Buttons */ #define EC_MKBP_POWER_BUTTON 0 -- cgit v1.2.3 From 106d6739823369c734a8fc3b13634274eee4f60e Mon Sep 17 00:00:00 2001 From: Daisuke Nojiri Date: Tue, 4 Jun 2024 16:08:25 -0700 Subject: platform/chrome: cros_ec_proto: Upgrade get_next_event to v3 Upgrade EC_CMD_GET_NEXT_EVENT to version 3. The max supported version will be v3. So, we speak v3 even if the EC says it supports v4+. Signed-off-by: Daisuke Nojiri Link: https://lore.kernel.org/r/20240604230837.2878737-1-dnojiri@chromium.org [tzungbi: uint32_t -> u32 per suggested by checkpatch.pl] Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_proto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 1ddc52603f9a..6e9225bdf903 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -185,7 +185,7 @@ struct cros_ec_device { bool host_sleep_v1; struct blocking_notifier_head event_notifier; - struct ec_response_get_next_event_v1 event_data; + struct ec_response_get_next_event_v3 event_data; int event_size; u32 host_event_wake_mask; u32 last_resume_result; -- cgit v1.2.3 From 93bbbcb1e762a49fc18ee1272545b77371595f1e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 25 May 2024 02:30:39 +0000 Subject: mm/memblock: fix a typo in description of for_each_mem_region() No functional change. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240525023040.13509-2-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- include/linux/memblock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e2082240586d..6cf18dc2b4d0 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -565,7 +565,7 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo } /** - * for_each_mem_region - itereate over memory regions + * for_each_mem_region - iterate over memory regions * @region: loop variable */ #define for_each_mem_region(region) \ -- cgit v1.2.3 From 172e422ffea20a89bfdc672741c1aad6fbb5044e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 12 May 2024 13:30:07 +0200 Subject: fsnotify: clear PARENT_WATCHED flags lazily In some setups directories can have many (usually negative) dentries. Hence __fsnotify_update_child_dentry_flags() function can take a significant amount of time. Since the bulk of this function happens under inode->i_lock this causes a significant contention on the lock when we remove the watch from the directory as the __fsnotify_update_child_dentry_flags() call from fsnotify_recalc_mask() races with __fsnotify_update_child_dentry_flags() calls from __fsnotify_parent() happening on children. This can lead upto softlockup reports reported by users. Fix the problem by calling fsnotify_update_children_dentry_flags() to set PARENT_WATCHED flags only when parent starts watching children. When parent stops watching children, clear false positive PARENT_WATCHED flags lazily in __fsnotify_parent() for each accessed child. Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Stephen Brennan Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 4dd6143db271..8be029bc50b1 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -594,12 +594,14 @@ static inline __u32 fsnotify_parent_needed_mask(__u32 mask) static inline int fsnotify_inode_watches_children(struct inode *inode) { + __u32 parent_mask = READ_ONCE(inode->i_fsnotify_mask); + /* FS_EVENT_ON_CHILD is set if the inode may care */ - if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD)) + if (!(parent_mask & FS_EVENT_ON_CHILD)) return 0; /* this inode might care about child events, does it care about the * specific set of events that can happen on a child? */ - return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD; + return parent_mask & FS_EVENTS_POSS_ON_CHILD; } /* @@ -613,7 +615,7 @@ static inline void fsnotify_update_flags(struct dentry *dentry) /* * Serialisation of setting PARENT_WATCHED on the dentries is provided * by d_lock. If inotify_inode_watched changes after we have taken - * d_lock, the following __fsnotify_update_child_dentry_flags call will + * d_lock, the following fsnotify_set_children_dentry_flags call will * find our entry, so it will spin until we complete here, and update * us with the new state. */ -- cgit v1.2.3 From f0dc887f21d18791037c0166f652c67da761f16f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 27 May 2024 17:34:47 -0700 Subject: sched/core: Move preempt_model_*() helpers from sched.h to preempt.h Move the declarations and inlined implementations of the preempt_model_*() helpers to preempt.h so that they can be referenced in spinlock.h without creating a potential circular dependency between spinlock.h and sched.h. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ankur Arora Link: https://lkml.kernel.org/r/20240528003521.979836-2-ankur.a.arora@oracle.com --- include/linux/preempt.h | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 41 ----------------------------------------- 2 files changed, 41 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 7233e9cf1bab..ce76f1a45722 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -481,4 +481,45 @@ DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) +#ifdef CONFIG_PREEMPT_DYNAMIC + +extern bool preempt_model_none(void); +extern bool preempt_model_voluntary(void); +extern bool preempt_model_full(void); + +#else + +static inline bool preempt_model_none(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_NONE); +} +static inline bool preempt_model_voluntary(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); +} +static inline bool preempt_model_full(void) +{ + return IS_ENABLED(CONFIG_PREEMPT); +} + +#endif + +static inline bool preempt_model_rt(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_RT); +} + +/* + * Does the preemption model allow non-cooperative preemption? + * + * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with + * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the + * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the + * PREEMPT_NONE model. + */ +static inline bool preempt_model_preemptible(void) +{ + return preempt_model_full() || preempt_model_rt(); +} + #endif /* __LINUX_PREEMPT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 61591ac6eab6..90691d99027e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2064,47 +2064,6 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock); __cond_resched_rwlock_write(lock); \ }) -#ifdef CONFIG_PREEMPT_DYNAMIC - -extern bool preempt_model_none(void); -extern bool preempt_model_voluntary(void); -extern bool preempt_model_full(void); - -#else - -static inline bool preempt_model_none(void) -{ - return IS_ENABLED(CONFIG_PREEMPT_NONE); -} -static inline bool preempt_model_voluntary(void) -{ - return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); -} -static inline bool preempt_model_full(void) -{ - return IS_ENABLED(CONFIG_PREEMPT); -} - -#endif - -static inline bool preempt_model_rt(void) -{ - return IS_ENABLED(CONFIG_PREEMPT_RT); -} - -/* - * Does the preemption model allow non-cooperative preemption? - * - * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with - * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the - * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the - * PREEMPT_NONE model. - */ -static inline bool preempt_model_preemptible(void) -{ - return preempt_model_full() || preempt_model_rt(); -} - static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); -- cgit v1.2.3 From c793a62823d1ce8f70d9cfc7803e3ea436277cda Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 27 May 2024 17:34:48 -0700 Subject: sched/core: Drop spinlocks on contention iff kernel is preemptible Use preempt_model_preemptible() to detect a preemptible kernel when deciding whether or not to reschedule in order to drop a contended spinlock or rwlock. Because PREEMPT_DYNAMIC selects PREEMPTION, kernels built with PREEMPT_DYNAMIC=y will yield contended locks even if the live preemption model is "none" or "voluntary". In short, make kernels with dynamically selected models behave the same as kernels with statically selected models. Somewhat counter-intuitively, NOT yielding a lock can provide better latency for the relevant tasks/processes. E.g. KVM x86's mmu_lock, a rwlock, is often contended between an invalidation event (takes mmu_lock for write) and a vCPU servicing a guest page fault (takes mmu_lock for read). For _some_ setups, letting the invalidation task complete even if there is mmu_lock contention provides lower latency for *all* tasks, i.e. the invalidation completes sooner *and* the vCPU services the guest page fault sooner. But even KVM's mmu_lock behavior isn't uniform, e.g. the "best" behavior can vary depending on the host VMM, the guest workload, the number of vCPUs, the number of pCPUs in the host, why there is lock contention, etc. In other words, simply deleting the CONFIG_PREEMPTION guard (or doing the opposite and removing contention yielding entirely) needs to come with a big pile of data proving that changing the status quo is a net positive. Opportunistically document this side effect of preempt=full, as yielding contended spinlocks can have significant, user-visible impact. Fixes: c597bfddc9e9 ("sched: Provide Kconfig support for default dynamic preempt mode") Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ankur Arora Reviewed-by: Chen Yu Link: https://lore.kernel.org/kvm/ef81ff36-64bb-4cfe-ae9b-e3acf47bff24@proxmox.com --- include/linux/spinlock.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 3fcd20de6ca8..63dd8cf3c3c2 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -462,11 +462,10 @@ static __always_inline int spin_is_contended(spinlock_t *lock) */ static inline int spin_needbreak(spinlock_t *lock) { -#ifdef CONFIG_PREEMPTION + if (!preempt_model_preemptible()) + return 0; + return spin_is_contended(lock); -#else - return 0; -#endif } /* @@ -479,11 +478,10 @@ static inline int spin_needbreak(spinlock_t *lock) */ static inline int rwlock_needbreak(rwlock_t *lock) { -#ifdef CONFIG_PREEMPTION + if (!preempt_model_preemptible()) + return 0; + return rwlock_is_contended(lock); -#else - return 0; -#endif } /* -- cgit v1.2.3 From dff60734fc7606fabde668ab6a26feacec8787cc Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Jun 2024 17:52:56 +0200 Subject: vfs: retire user_path_at_empty and drop empty arg from getname_flags No users after do_readlinkat started doing the job on its own. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240604155257.109500-3-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- include/linux/namei.h | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 639885621608..bfc1e6407bf6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2701,7 +2701,7 @@ static inline struct file *file_clone_open(struct file *file) } extern int filp_close(struct file *, fl_owner_t id); -extern struct filename *getname_flags(const char __user *, int, int *); +extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); extern struct filename *getname(const char __user *); extern struct filename *getname_kernel(const char *); diff --git a/include/linux/namei.h b/include/linux/namei.h index 967aa9ea9f96..8ec8fed3bce8 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -50,13 +50,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; extern int path_pts(struct path *path); -extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); - -static inline int user_path_at(int dfd, const char __user *name, unsigned flags, - struct path *path) -{ - return user_path_at_empty(dfd, name, flags, path, NULL); -} +extern int user_path_at(int, const char __user *, unsigned, struct path *); struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, -- cgit v1.2.3 From 758191c9ea7bcc45dd99398a538ae4ab27c4029e Mon Sep 17 00:00:00 2001 From: Yoray Zack Date: Tue, 4 Jun 2024 00:22:17 +0300 Subject: net/mlx5e: SHAMPO, Use KSMs instead of KLMs KSM Mkey is KLM Mkey with a fixed buffer size. Due to this fact, it is a faster mechanism than KLM. SHAMPO feature used KLMs Mkeys for memory mappings of its headers buffer. As it used KLMs with the same buffer size for each entry, we can use KSMs instead. This commit changes the Mkeys that map the SHAMPO headers buffer from KLMs to KSMs. Signed-off-by: Yoray Zack Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240603212219.1037656-13-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index d7bb31d9a446..da09bfaa7b81 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -294,6 +294,7 @@ enum { #define MLX5_UMR_FLEX_ALIGNMENT 0x40 #define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_klm)) +#define MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_ksm)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit v1.2.3 From 99be56171fa9ffea494dfe3d4a7f6e7e51630c2e Mon Sep 17 00:00:00 2001 From: Yoray Zack Date: Tue, 4 Jun 2024 00:22:18 +0300 Subject: net/mlx5e: SHAMPO, Re-enable HW-GRO Add back HW-GRO to the reported features. As the current implementation of HW-GRO uses KSMs with a specific fixed buffer size (256B) to map its headers buffer, we reported the feature only if the NIC is supporting KSM and the minimum value for buffer size is below the requested one. iperf3 bandwidth comparison: +---------+--------+--------+-----------+ | streams | SW GRO | HW GRO | Unit | |---------+--------+--------+-----------| | 1 | 36 | 42 | Gbits/sec | | 4 | 34 | 39 | Gbits/sec | | 8 | 31 | 35 | Gbits/sec | +---------+--------+--------+-----------+ A downstream patch will add skb fragment coalescing which will improve performance considerably. Benchmark details: VM based setup CPU: Intel(R) Xeon(R) Platinum 8380 CPU, 24 cores NIC: ConnectX-7 100GbE iperf3 and irq running on same CPU over a single receive queue Signed-off-by: Yoray Zack Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240603212219.1037656-14-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5df52e15f7d6..17acd0f3ca8e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1526,8 +1526,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 ts_cqe_to_dest_cqn[0x1]; u8 reserved_at_b3[0x6]; u8 go_back_n[0x1]; - u8 shampo[0x1]; - u8 reserved_at_bb[0x5]; + u8 reserved_at_ba[0x6]; u8 max_sgl_for_optimized_performance[0x8]; u8 log_max_cq_sz[0x8]; @@ -1744,7 +1743,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_280[0x10]; u8 max_wqe_sz_sq[0x10]; - u8 reserved_at_2a0[0x10]; + u8 reserved_at_2a0[0xb]; + u8 shampo[0x1]; + u8 reserved_at_2ac[0x4]; u8 max_wqe_sz_rq[0x10]; u8 max_flow_counter_31_16[0x10]; @@ -2017,7 +2018,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_250[0x10]; u8 reserved_at_260[0x120]; - u8 reserved_at_380[0x10]; + u8 reserved_at_380[0xb]; + u8 min_mkey_log_entity_size_fixed_buffer[0x5]; u8 ec_vf_vport_base[0x10]; u8 reserved_at_3a0[0x10]; @@ -2029,7 +2031,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 pcc_ifa2[0x1]; u8 reserved_at_3f1[0xf]; - u8 reserved_at_400[0x400]; + u8 reserved_at_400[0x1]; + u8 min_mkey_log_entity_size_fixed_buffer_valid[0x1]; + u8 reserved_at_402[0x1e]; + + u8 reserved_at_420[0x3e0]; }; enum mlx5_ifc_flow_destination_type { -- cgit v1.2.3 From f1180fd2a7c039691b64ebf404c746a74e40b7b0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 5 Jun 2024 07:13:39 +0000 Subject: mm/mm_init.c: not always search next deferred_init_pfn from very beginning In function deferred_init_memmap(), we call deferred_init_mem_pfn_range_in_zone() to get the next deferred_init_pfn. But we always search it from the very beginning. Since we save the index in i, we can leverage this to search from i next time. [rppt refine the comment] Signed-off-by: Wei Yang Link: https://lore.kernel.org/all/20240605071339.15330-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- include/linux/memblock.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6cf18dc2b4d0..45cac33334c8 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -299,25 +299,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, unsigned long *out_spfn, unsigned long *out_epfn); -/** - * for_each_free_mem_pfn_range_in_zone - iterate through zone specific free - * memblock areas - * @i: u64 used as loop variable - * @zone: zone in which all of the memory blocks reside - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * - * Walks over free (memory && !reserved) areas of memblock in a specific - * zone. Available once memblock and an empty zone is initialized. The main - * assumption is that the zone start, end, and pgdat have been associated. - * This way we can use the zone to determine NUMA node, and if a given part - * of the memblock is valid for the zone. - */ -#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end) \ - for (i = 0, \ - __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \ - i != U64_MAX; \ - __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) /** * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific -- cgit v1.2.3 From ababa16fd9bd0e2727a1c31c4fb68d6be053bddc Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 6 Jun 2024 11:42:38 +0200 Subject: irqchip/gic-v3: Enable non-coherent redistributors/ITSes ACPI probing The GIC architecture specification defines a set of registers for redistributors and ITSes that control the sharebility and cacheability attributes of redistributors/ITSes initiator ports on the interconnect (GICR_[V]PROPBASER, GICR_[V]PENDBASER, GITS_BASER). Architecturally the GIC provides a means to drive shareability and cacheability attributes signals but it is not mandatory for designs to wire up the corresponding interconnect signals that control the cacheability/shareability of transactions. Redistributors and ITSes interconnect ports can be connected to non-coherent interconnects that are not able to manage the shareability/cacheability attributes; this implicitly makes the redistributors and ITSes non-coherent observers. To enable non-coherent GIC designs on ACPI based systems, parse the MADT GICC/GICR/ITS subtables non-coherent flags to determine whether the respective components are non-coherent observers and force the shareability attributes to be programmed into the redistributors and ITSes registers. An ACPI global function (acpi_get_madt_revision()) is added to retrieve the MADT revision, in that it is essential to check the MADT revision before checking for flags that were added with MADT revision 7 so that if the kernel is booted with an ACPI MADT table with revision < 7 it skips parsing the newly added flags (that should be zeroed reserved values for MADT versions < 7 but they could turn out to be buggy and should be ignored). Signed-off-by: Lorenzo Pieralisi Signed-off-by: Thomas Gleixner Reviewed-by: Robin Murphy Acked-by: Marc Zyngier Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20240606094238.757649-2-lpieralisi@kernel.org --- include/linux/acpi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 28c3fb2bef0d..000d339e1596 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -279,6 +279,9 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) return phys_id == PHYS_CPUID_INVALID; } + +int __init acpi_get_madt_revision(void); + /* Validate the processor object's proc_id */ bool acpi_duplicate_processor_id(int proc_id); /* Processor _CTS control */ -- cgit v1.2.3 From 0ee14725471cea66e03e3cd4f4c582d759de502c Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 6 Jun 2024 15:46:09 +0100 Subject: mm/util: Swap kmemdup_array() arguments GCC 14.1 complains about the argument usage of kmemdup_array(): drivers/soc/tegra/fuse/fuse-tegra.c:130:65: error: 'kmemdup_array' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Werror=calloc-transposed-args] 130 | fuse->lookups = kmemdup_array(fuse->soc->lookups, sizeof(*fuse->lookups), | ^ drivers/soc/tegra/fuse/fuse-tegra.c:130:65: note: earlier argument should specify number of elements, later size of each element The annotation introduced by commit 7d78a7773355 ("string: Add additional __realloc_size() annotations for "dup" helpers") lets the compiler think that kmemdup_array() follows the same format as calloc(), with the number of elements preceding the size of one element. So we could simply swap the arguments to __realloc_size() to get rid of that warning, but it seems cleaner to instead have kmemdup_array() follow the same format as krealloc_array(), memdup_array_user(), calloc() etc. Fixes: 7d78a7773355 ("string: Add additional __realloc_size() annotations for "dup" helpers") Signed-off-by: Jean-Philippe Brucker Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240606144608.97817-2-jean-philippe@linaro.org Signed-off-by: Kees Cook --- include/linux/string.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 60168aa2af07..9edace076ddb 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -289,7 +289,7 @@ extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_si extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); -extern void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp) +extern void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) __realloc_size(2, 3); /* lib/argv_split.c */ -- cgit v1.2.3 From 1d5f0222944fe723b9bfaaeb27e368363644ccab Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 5 Jun 2024 16:26:45 -0400 Subject: ftrace: Declare function_trace_op in header to quiet sparse warning Sparse complains that function_trace_op is not static but is not declared in a header file. It is used only in assembly code. But add it to a header so that sparse no longer complains: kernel/trace/ftrace.c:99:19: warning: symbol 'function_trace_op' was not declared. Should it be static? Link: https://lore.kernel.org/linux-trace-kernel/20240605202708.289105647@goodmis.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9f61556a9491..4135dc171447 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1131,6 +1131,9 @@ extern void ftrace_graph_init_task(struct task_struct *t); extern void ftrace_graph_exit_task(struct task_struct *t); extern void ftrace_graph_init_idle_task(struct task_struct *t, int cpu); +/* Used by assembly, but to quiet sparse warnings */ +extern struct ftrace_ops *function_trace_op; + static inline void pause_graph_tracing(void) { atomic_inc(¤t->tracing_graph_pause); -- cgit v1.2.3 From c76494768761aef7630e7e0db820ba7b375964da Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 30 May 2024 11:07:19 -0700 Subject: linux/interrupt.h: allow "guard" notation to disable and reenable IRQ Drivers often need to first disable an interrupt, carry out some action, and then reenable the interrupt. Introduce support for the "guard" notation for this so that the following is possible: ... scoped_cond_guard(mutex_intr, return -EINTR, &data->sysfs_mutex) { guard(disable_irq)(&client->irq); error = elan_acquire_baseline(data); if (error) return error; } ... Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/ZljAV6HjkPSEhWSw@google.com Signed-off-by: Dmitry Torokhov --- include/linux/interrupt.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5c9bdd3ffccc..3a36e64119c8 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -235,6 +236,9 @@ extern void enable_percpu_irq(unsigned int irq, unsigned int type); extern bool irq_percpu_is_enabled(unsigned int irq); extern void irq_wake_thread(unsigned int irq, void *dev_id); +DEFINE_LOCK_GUARD_1(disable_irq, int, + disable_irq(*_T->lock), enable_irq(*_T->lock)) + extern void disable_nmi_nosync(unsigned int irq); extern void disable_percpu_nmi(unsigned int irq); extern void enable_nmi(unsigned int irq); -- cgit v1.2.3 From b4100947a8014e1876872546398275968f77e1ad Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 28 May 2024 16:07:10 -0500 Subject: crypto: ccp - align psp_platform_access_msg Align the whitespace so that future messages will also be better aligned. Acked-by: Tom Lendacky Signed-off-by: Mario Limonciello Signed-off-by: Herbert Xu --- include/linux/psp-platform-access.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index c1dc87fc536b..23893b33e48c 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -6,8 +6,8 @@ #include enum psp_platform_access_msg { - PSP_CMD_NONE = 0x0, - PSP_I2C_REQ_BUS_CMD = 0x64, + PSP_CMD_NONE = 0x0, + PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, PSP_DYNAMIC_BOOST_SET_UID, PSP_DYNAMIC_BOOST_GET_PARAMETER, -- cgit v1.2.3 From 82f9327f774c6e040ba63a887a85fa2e290a233a Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 28 May 2024 16:07:11 -0500 Subject: crypto: ccp - Add support for getting security attributes on some older systems Older systems will not populate the security attributes in the capabilities register. The PSP on these systems, however, does have a command to get the security attributes. Use this command during ccp startup to populate the attributes if they're missing. Closes: https://github.com/fwupd/fwupd/issues/5284 Closes: https://github.com/fwupd/fwupd/issues/5675 Closes: https://github.com/fwupd/fwupd/issues/6253 Closes: https://github.com/fwupd/fwupd/issues/7280 Closes: https://github.com/fwupd/fwupd/issues/6323 Closes: https://github.com/fwupd/fwupd/discussions/5433 Signed-off-by: Mario Limonciello Acked-by: Tom Lendacky Signed-off-by: Herbert Xu --- include/linux/psp-platform-access.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index 23893b33e48c..1504fb012c05 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -7,6 +7,7 @@ enum psp_platform_access_msg { PSP_CMD_NONE = 0x0, + PSP_CMD_HSTI_QUERY = 0x14, PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, PSP_DYNAMIC_BOOST_SET_UID, -- cgit v1.2.3 From f05eda16037f9363297561bd28f318a6d7833d35 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Fri, 8 Mar 2024 17:09:27 -0800 Subject: srcu: Add an API for a memory barrier after SRCU read lock To avoid redundant memory barriers, add smp_mb__after_srcu_read_lock() to pair with smp_mb__after_srcu_read_unlock() for use in paths that need to emit a memory barrier, but already do srcu_read_lock(), which includes a full memory barrier. Provide an API, e.g. as opposed to having callers document the behavior via a comment, as the full memory barrier provided by srcu_read_lock() is an implementation detail that shouldn't bleed into random subsystems. KVM will use smp_mb__after_srcu_read_lock() in it's VM-Exit path to ensure a memory barrier is emitted, which is necessary to ensure correctness of mixed memory types on CPUs that support self-snoop. Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Kevin Tian Signed-off-by: Yan Zhao [sean: massage changelog] Tested-by: Xiangfei Ma Tested-by: Yongwei Ma Reviewed-by: Paul E. McKenney --- include/linux/srcu.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 236610e4a8fa..1cb4527076de 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -343,6 +343,20 @@ static inline void smp_mb__after_srcu_read_unlock(void) /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */ } +/** + * smp_mb__after_srcu_read_lock - ensure full ordering after srcu_read_lock + * + * Converts the preceding srcu_read_lock into a two-way memory barrier. + * + * Call this after srcu_read_lock, to guarantee that all memory operations + * that occur after smp_mb__after_srcu_read_lock will appear to happen after + * the preceding srcu_read_lock. + */ +static inline void smp_mb__after_srcu_read_lock(void) +{ + /* __srcu_read_lock has smp_mb() internally so nothing to do here. */ +} + DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct, _T->idx = srcu_read_lock(_T->lock), srcu_read_unlock(_T->lock, _T->idx), -- cgit v1.2.3 From 6a79a4e187bdd17959ff3e811d5de65c066f89e6 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 6 Jun 2024 10:33:49 +0300 Subject: libfs: Introduce case-insensitive string comparison helper generic_ci_match can be used by case-insensitive filesystems to compare strings under lookup with dirents in a case-insensitive way. This function is currently reimplemented by each filesystem supporting casefolding, so this reduces code duplication in filesystem-specific code. [eugen.hristev@collabora.com: rework to first test the exact match, cleanup and add error message] Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Eugen Hristev Link: https://lore.kernel.org/r/20240606073353.47130-4-eugen.hristev@collabora.com Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0283cf366c2a..2b1eeca426a0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3351,6 +3351,10 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); extern void generic_set_sb_d_ops(struct super_block *sb); +extern int generic_ci_match(const struct inode *parent, + const struct qstr *name, + const struct qstr *folded_name, + const u8 *de_name, u32 de_name_len); static inline bool sb_has_encoding(const struct super_block *sb) { -- cgit v1.2.3 From 231035f18d6b80e5c28732a20872398116a54ecd Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Thu, 6 Jun 2024 16:52:15 +0800 Subject: workqueue: Increase worker desc's length to 32 Commit 31c89007285d ("workqueue.c: Increase workqueue name length") increased WQ_NAME_LEN from 24 to 32, but forget to increase WORKER_DESC_LEN, which would cause truncation when setting kworker's desc from workqueue_struct's name, process_one_work() for example. Fixes: 31c89007285d ("workqueue.c: Increase workqueue name length") Signed-off-by: Wenchao Hao CC: Audra Mitchell Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index fb3993894536..d9968bfc8eac 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -95,7 +95,7 @@ enum wq_misc_consts { WORK_BUSY_RUNNING = 1 << 1, /* maximum string length for set_worker_desc() */ - WORKER_DESC_LEN = 24, + WORKER_DESC_LEN = 32, }; /* Convenience constants - of type 'unsigned long', not 'enum'! */ -- cgit v1.2.3 From 11c63e57404e538c5df91f732e5d505860edb660 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Thu, 16 May 2024 12:25:31 +0200 Subject: firmware: add nowarn variant of request_firmware_nowait() Device drivers with optional firmware may still want to use the asynchronous firmware loading interface. To avoid printing a warning into the kernel log when the optional firmware is absent, add a nowarn variant of this interface. Signed-off-by: Lucas Stach Reviewed-by: Greg Kroah-Hartman Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240516102532.213874-1-l.stach@pengutronix.de Signed-off-by: Vinod Koul --- include/linux/firmware.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware.h b/include/linux/firmware.h index f026f8926d79..aae1b85ffc10 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -98,6 +98,10 @@ static inline bool firmware_request_builtin(struct firmware *fw, #if IS_REACHABLE(CONFIG_FW_LOADER) int request_firmware(const struct firmware **fw, const char *name, struct device *device); +int firmware_request_nowait_nowarn( + struct module *module, const char *name, + struct device *device, gfp_t gfp, void *context, + void (*cont)(const struct firmware *fw, void *context)); int firmware_request_nowarn(const struct firmware **fw, const char *name, struct device *device); int firmware_request_platform(const struct firmware **fw, const char *name, @@ -123,6 +127,14 @@ static inline int request_firmware(const struct firmware **fw, return -EINVAL; } +static inline int firmware_request_nowait_nowarn( + struct module *module, const char *name, + struct device *device, gfp_t gfp, void *context, + void (*cont)(const struct firmware *fw, void *context)) +{ + return -EINVAL; +} + static inline int firmware_request_nowarn(const struct firmware **fw, const char *name, struct device *device) -- cgit v1.2.3 From a31a0a3e90b442c1a414be24b062e27624a40a8e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 28 May 2024 11:47:16 -0700 Subject: thermal: intel: intel_soc_dts_thermal: Switch to new Intel CPU model defines New CPU #defines encode vendor and family as well as model. Signed-off-by: Tony Luck Acked-by: Rafael J. Wysocki Signed-off-by: Rafael J. Wysocki --- include/linux/platform_data/x86/soc.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/soc.h b/include/linux/platform_data/x86/soc.h index a5705189e2ac..f981907a5cb0 100644 --- a/include/linux/platform_data/x86/soc.h +++ b/include/linux/platform_data/x86/soc.h @@ -20,7 +20,7 @@ static inline bool soc_intel_is_##soc(void) \ { \ static const struct x86_cpu_id soc##_cpu_ids[] = { \ - X86_MATCH_INTEL_FAM6_MODEL(type, NULL), \ + X86_MATCH_VFM(type, NULL), \ {} \ }; \ const struct x86_cpu_id *id; \ @@ -31,11 +31,11 @@ static inline bool soc_intel_is_##soc(void) \ return false; \ } -SOC_INTEL_IS_CPU(byt, ATOM_SILVERMONT); -SOC_INTEL_IS_CPU(cht, ATOM_AIRMONT); -SOC_INTEL_IS_CPU(apl, ATOM_GOLDMONT); -SOC_INTEL_IS_CPU(glk, ATOM_GOLDMONT_PLUS); -SOC_INTEL_IS_CPU(cml, KABYLAKE_L); +SOC_INTEL_IS_CPU(byt, INTEL_ATOM_SILVERMONT); +SOC_INTEL_IS_CPU(cht, INTEL_ATOM_AIRMONT); +SOC_INTEL_IS_CPU(apl, INTEL_ATOM_GOLDMONT); +SOC_INTEL_IS_CPU(glk, INTEL_ATOM_GOLDMONT_PLUS); +SOC_INTEL_IS_CPU(cml, INTEL_KABYLAKE_L); #undef SOC_INTEL_IS_CPU -- cgit v1.2.3 From 195fb517ee25bfefde9c74ecd86348eccbd6d2e4 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Sun, 9 Jun 2024 17:39:24 -0700 Subject: cpu: Move CPU hotplug function declarations into their own header Avoid upcoming #include hell when wants to use lockdep_assert_cpus_held() and creates a #include loop that would break the build for arch/riscv. [ bp: s/cpu/CPU/g ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240610003927.341707-2-tony.luck@intel.com --- include/linux/cpu.h | 33 +-------------------------------- include/linux/cpuhplock.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 32 deletions(-) create mode 100644 include/linux/cpuhplock.h (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 861c3bfc5f17..a8926d0a28cd 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -18,6 +18,7 @@ #include #include #include +#include #include struct device; @@ -132,38 +133,6 @@ static inline int add_cpu(unsigned int cpu) { return 0;} #endif /* CONFIG_SMP */ extern const struct bus_type cpu_subsys; -extern int lockdep_is_cpus_held(void); - -#ifdef CONFIG_HOTPLUG_CPU -extern void cpus_write_lock(void); -extern void cpus_write_unlock(void); -extern void cpus_read_lock(void); -extern void cpus_read_unlock(void); -extern int cpus_read_trylock(void); -extern void lockdep_assert_cpus_held(void); -extern void cpu_hotplug_disable(void); -extern void cpu_hotplug_enable(void); -void clear_tasks_mm_cpumask(int cpu); -int remove_cpu(unsigned int cpu); -int cpu_device_down(struct device *dev); -extern void smp_shutdown_nonboot_cpus(unsigned int primary_cpu); - -#else /* CONFIG_HOTPLUG_CPU */ - -static inline void cpus_write_lock(void) { } -static inline void cpus_write_unlock(void) { } -static inline void cpus_read_lock(void) { } -static inline void cpus_read_unlock(void) { } -static inline int cpus_read_trylock(void) { return true; } -static inline void lockdep_assert_cpus_held(void) { } -static inline void cpu_hotplug_disable(void) { } -static inline void cpu_hotplug_enable(void) { } -static inline int remove_cpu(unsigned int cpu) { return -EPERM; } -static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { } -#endif /* !CONFIG_HOTPLUG_CPU */ - -DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock()) - #ifdef CONFIG_PM_SLEEP_SMP extern int freeze_secondary_cpus(int primary); extern void thaw_secondary_cpus(void); diff --git a/include/linux/cpuhplock.h b/include/linux/cpuhplock.h new file mode 100644 index 000000000000..386abc482264 --- /dev/null +++ b/include/linux/cpuhplock.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/cpuhplock.h - CPU hotplug locking + * + * Locking functions for CPU hotplug. + */ +#ifndef _LINUX_CPUHPLOCK_H_ +#define _LINUX_CPUHPLOCK_H_ + +#include +#include + +struct device; + +extern int lockdep_is_cpus_held(void); + +#ifdef CONFIG_HOTPLUG_CPU +extern void cpus_write_lock(void); +extern void cpus_write_unlock(void); +extern void cpus_read_lock(void); +extern void cpus_read_unlock(void); +extern int cpus_read_trylock(void); +extern void lockdep_assert_cpus_held(void); +extern void cpu_hotplug_disable(void); +extern void cpu_hotplug_enable(void); +void clear_tasks_mm_cpumask(int cpu); +int remove_cpu(unsigned int cpu); +int cpu_device_down(struct device *dev); +extern void smp_shutdown_nonboot_cpus(unsigned int primary_cpu); + +#else /* CONFIG_HOTPLUG_CPU */ + +static inline void cpus_write_lock(void) { } +static inline void cpus_write_unlock(void) { } +static inline void cpus_read_lock(void) { } +static inline void cpus_read_unlock(void) { } +static inline int cpus_read_trylock(void) { return true; } +static inline void lockdep_assert_cpus_held(void) { } +static inline void cpu_hotplug_disable(void) { } +static inline void cpu_hotplug_enable(void) { } +static inline int remove_cpu(unsigned int cpu) { return -EPERM; } +static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { } +#endif /* !CONFIG_HOTPLUG_CPU */ + +DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock()) + +#endif /* _LINUX_CPUHPLOCK_H_ */ -- cgit v1.2.3 From ddefcfdeb5a2238cbcb07b80dda9ac3136735b1e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Sun, 9 Jun 2024 17:39:25 -0700 Subject: cpu: Drop "extern" from function declarations in cpuhplock.h This file was created with a direct cut and paste from cpu.h so kept the legacy declaration style. But the Linux coding standard for function declarations in header files is to avoid use of "extern". Drop "extern" from all function declarations. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240610003927.341707-3-tony.luck@intel.com --- include/linux/cpuhplock.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhplock.h b/include/linux/cpuhplock.h index 386abc482264..431560bbd045 100644 --- a/include/linux/cpuhplock.h +++ b/include/linux/cpuhplock.h @@ -15,18 +15,18 @@ struct device; extern int lockdep_is_cpus_held(void); #ifdef CONFIG_HOTPLUG_CPU -extern void cpus_write_lock(void); -extern void cpus_write_unlock(void); -extern void cpus_read_lock(void); -extern void cpus_read_unlock(void); -extern int cpus_read_trylock(void); -extern void lockdep_assert_cpus_held(void); -extern void cpu_hotplug_disable(void); -extern void cpu_hotplug_enable(void); +void cpus_write_lock(void); +void cpus_write_unlock(void); +void cpus_read_lock(void); +void cpus_read_unlock(void); +int cpus_read_trylock(void); +void lockdep_assert_cpus_held(void); +void cpu_hotplug_disable(void); +void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); int remove_cpu(unsigned int cpu); int cpu_device_down(struct device *dev); -extern void smp_shutdown_nonboot_cpus(unsigned int primary_cpu); +void smp_shutdown_nonboot_cpus(unsigned int primary_cpu); #else /* CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From 685cb1674060c2cb1b9da051a12933c082b8e874 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Sun, 9 Jun 2024 17:39:26 -0700 Subject: cacheinfo: Add function to get cacheinfo for a given CPU and cache level Resctrl open codes a search for information about a given cache level in a couple of places (and more are on the way). Provide a new inline function get_cpu_cacheinfo_level() in to do the search and return a pointer to the cacheinfo structure. Add lockdep_assert_cpus_held() to enforce the comment that cpuhp lock must be held. Simplify the existing get_cpu_cacheinfo_id() by using this new function to do the search. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/r/20240610003927.341707-4-tony.luck@intel.com --- include/linux/cacheinfo.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 2cb15fe4fe12..3dde175f4108 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -3,6 +3,7 @@ #define _LINUX_CACHEINFO_H #include +#include #include #include @@ -113,23 +114,37 @@ int acpi_get_cache_info(unsigned int cpu, const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf); /* - * Get the id of the cache associated with @cpu at level @level. + * Get the cacheinfo structure for the cache associated with @cpu at + * level @level. * cpuhp lock must be held. */ -static inline int get_cpu_cacheinfo_id(int cpu, int level) +static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) { struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); int i; + lockdep_assert_cpus_held(); + for (i = 0; i < ci->num_leaves; i++) { if (ci->info_list[i].level == level) { if (ci->info_list[i].attributes & CACHE_ID) - return ci->info_list[i].id; - return -1; + return &ci->info_list[i]; + return NULL; } } - return -1; + return NULL; +} + +/* + * Get the id of the cache associated with @cpu at level @level. + * cpuhp lock must be held. + */ +static inline int get_cpu_cacheinfo_id(int cpu, int level) +{ + struct cacheinfo *ci = get_cpu_cacheinfo_level(cpu, level); + + return ci ? ci->id : -1; } #ifdef CONFIG_ARM64 -- cgit v1.2.3 From 97d833ceb27dc19f8777d63f90be4a27b5daeedf Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 6 Jun 2024 16:49:42 +0200 Subject: mlxsw: spectrum_acl_erp: Fix object nesting warning ACLs in Spectrum-2 and newer ASICs can reside in the algorithmic TCAM (A-TCAM) or in the ordinary circuit TCAM (C-TCAM). The former can contain more ACLs (i.e., tc filters), but the number of masks in each region (i.e., tc chain) is limited. In order to mitigate the effects of the above limitation, the device allows filters to share a single mask if their masks only differ in up to 8 consecutive bits. For example, dst_ip/25 can be represented using dst_ip/24 with a delta of 1 bit. The C-TCAM does not have a limit on the number of masks being used (and therefore does not support mask aggregation), but can contain a limited number of filters. The driver uses the "objagg" library to perform the mask aggregation by passing it objects that consist of the filter's mask and whether the filter is to be inserted into the A-TCAM or the C-TCAM since filters in different TCAMs cannot share a mask. The set of created objects is dependent on the insertion order of the filters and is not necessarily optimal. Therefore, the driver will periodically ask the library to compute a more optimal set ("hints") by looking at all the existing objects. When the library asks the driver whether two objects can be aggregated the driver only compares the provided masks and ignores the A-TCAM / C-TCAM indication. This is the right thing to do since the goal is to move as many filters as possible to the A-TCAM. The driver also forbids two identical masks from being aggregated since this can only happen if one was intentionally put in the C-TCAM to avoid a conflict in the A-TCAM. The above can result in the following set of hints: H1: {mask X, A-TCAM} -> H2: {mask Y, A-TCAM} // X is Y + delta H3: {mask Y, C-TCAM} -> H4: {mask Z, A-TCAM} // Y is Z + delta After getting the hints from the library the driver will start migrating filters from one region to another while consulting the computed hints and instructing the device to perform a lookup in both regions during the transition. Assuming a filter with mask X is being migrated into the A-TCAM in the new region, the hints lookup will return H1. Since H2 is the parent of H1, the library will try to find the object associated with it and create it if necessary in which case another hints lookup (recursive) will be performed. This hints lookup for {mask Y, A-TCAM} will either return H2 or H3 since the driver passes the library an object comparison function that ignores the A-TCAM / C-TCAM indication. This can eventually lead to nested objects which are not supported by the library [1]. Fix by removing the object comparison function from both the driver and the library as the driver was the only user. That way the lookup will only return exact matches. I do not have a reliable reproducer that can reproduce the issue in a timely manner, but before the fix the issue would reproduce in several minutes and with the fix it does not reproduce in over an hour. Note that the current usefulness of the hints is limited because they include the C-TCAM indication and represent aggregation that cannot actually happen. This will be addressed in net-next. [1] WARNING: CPU: 0 PID: 153 at lib/objagg.c:170 objagg_obj_parent_assign+0xb5/0xd0 Modules linked in: CPU: 0 PID: 153 Comm: kworker/0:18 Not tainted 6.9.0-rc6-custom-g70fbc2c1c38b #42 Hardware name: Mellanox Technologies Ltd. MSN3700C/VMOD0008, BIOS 5.11 10/10/2018 Workqueue: mlxsw_core mlxsw_sp_acl_tcam_vregion_rehash_work RIP: 0010:objagg_obj_parent_assign+0xb5/0xd0 [...] Call Trace: __objagg_obj_get+0x2bb/0x580 objagg_obj_get+0xe/0x80 mlxsw_sp_acl_erp_mask_get+0xb5/0xf0 mlxsw_sp_acl_atcam_entry_add+0xe8/0x3c0 mlxsw_sp_acl_tcam_entry_create+0x5e/0xa0 mlxsw_sp_acl_tcam_vchunk_migrate_one+0x16b/0x270 mlxsw_sp_acl_tcam_vregion_rehash_work+0xbe/0x510 process_one_work+0x151/0x370 Fixes: 9069a3817d82 ("lib: objagg: implement optimization hints assembly and use hints for object creation") Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Tested-by: Alexander Zubkov Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/objagg.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/objagg.h b/include/linux/objagg.h index 78021777df46..6df5b887dc54 100644 --- a/include/linux/objagg.h +++ b/include/linux/objagg.h @@ -8,7 +8,6 @@ struct objagg_ops { size_t obj_size; bool (*delta_check)(void *priv, const void *parent_obj, const void *obj); - int (*hints_obj_cmp)(const void *obj1, const void *obj2); void * (*delta_create)(void *priv, void *parent_obj, void *obj); void (*delta_destroy)(void *priv, void *delta_priv); void * (*root_create)(void *priv, void *obj, unsigned int root_id); -- cgit v1.2.3 From 0b7e448119428e1dcb854abb5855f66966fb82dc Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 28 May 2024 14:29:33 -0500 Subject: ACPI: utils: introduce acpi_get_local_u64_address() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ACPI _ADR is a 64-bit value. We changed the definitions in commit ca6f998cf9a2 ("ACPI: bus: change _ADR representation to 64 bits") but some helpers still assume the value is a 32-bit value. This patch adds a new helper to extract the full 64-bits. The existing 32-bit helper is kept for backwards-compatibility and cases where the _ADR is known to fit in a 32-bit value. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Reviewed-by: Bard Liao Acked-by: Rafael J. Wysocki Reviewed-by: Takashi Iwai Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240528192936.16180-2-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 28c3fb2bef0d..65e7177bcb02 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -761,6 +761,7 @@ static inline u64 acpi_arch_get_root_pointer(void) } #endif +int acpi_get_local_u64_address(acpi_handle handle, u64 *addr); int acpi_get_local_address(acpi_handle handle, u32 *addr); const char *acpi_get_subsystem_id(acpi_handle handle); -- cgit v1.2.3 From e289df82344fecc104ad8326b9ab6da612b9c899 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 31 May 2024 22:42:40 +0300 Subject: spi: Rework per message DMA mapped flag to be per transfer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The granularity of DMA mappings is transfer and moreover, the direction is also important as it can be unidirect. The current cur_msg_mapped flag doesn't fit well the DMA mapping and syncing calls and we have tons of checks around on top of it. So, instead of doing that rework the code to use per transfer per direction flag to show if it's DMA mapped or not. Signed-off-by: Andy Shevchenko Tested-by: Neil Armstrong # on SM8650-QRD Link: https://lore.kernel.org/r/20240531194723.1761567-9-andriy.shevchenko@linux.intel.com Reviewed-by: Serge Semin Tested-by: Nícolas F. R. A. Prado Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e8e1e798924f..b4a89db4c855 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -447,7 +447,6 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @cur_msg_need_completion: Flag used internally to opportunistically skip * the @cur_msg_completion. This flag is used to signal the context that * is running spi_finalize_current_message() that it needs to complete() - * @cur_msg_mapped: message has been mapped for DMA * @fallback: fallback to PIO if DMA transfer return failure with * SPI_TRANS_FAIL_NO_START. * @last_cs_mode_high: was (mode & SPI_CS_HIGH) true on the last call to set_cs. @@ -708,7 +707,6 @@ struct spi_controller { bool running; bool rt; bool auto_runtime_pm; - bool cur_msg_mapped; bool fallback; bool last_cs_mode_high; s8 last_cs[SPI_CS_CNT_MAX]; @@ -981,6 +979,8 @@ struct spi_res { * transfer this transfer. Set to 0 if the SPI bus driver does * not support it. * @transfer_list: transfers are sequenced through @spi_message.transfers + * @tx_sg_mapped: If true, the @tx_sg is mapped for DMA + * @rx_sg_mapped: If true, the @rx_sg is mapped for DMA * @tx_sg: Scatterlist for transmit, currently not for client use * @rx_sg: Scatterlist for receive, currently not for client use * @ptp_sts_word_pre: The word (subject to bits_per_word semantics) offset @@ -1077,10 +1077,13 @@ struct spi_transfer { #define SPI_TRANS_FAIL_IO BIT(1) u16 error; - dma_addr_t tx_dma; - dma_addr_t rx_dma; + bool tx_sg_mapped; + bool rx_sg_mapped; + struct sg_table tx_sg; struct sg_table rx_sg; + dma_addr_t tx_dma; + dma_addr_t rx_dma; unsigned dummy_data:1; unsigned cs_off:1; -- cgit v1.2.3 From 5fbf57a937f418fe204f9dbb7735e91984f4ee6a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 6 Jun 2024 12:29:06 -0700 Subject: net: netlink: remove the cb_mutex "injection" from netlink core Back in 2007, in commit af65bdfce98d ("[NETLINK]: Switch cb_lock spinlock to mutex and allow to override it") netlink core was extended to allow subsystems to replace the dump mutex lock with its own lock. The mechanism was used by rtnetlink to take rtnl_lock but it isn't sufficiently flexible for other users. Over the 17 years since it was added no other user appeared. Since rtnetlink needs conditional locking now, and doesn't use it either, axe this feature complete. Signed-off-by: Jakub Kicinski Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/netlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 5df7340d4dab..b332c2048c75 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -47,7 +47,6 @@ struct netlink_kernel_cfg { unsigned int groups; unsigned int flags; void (*input)(struct sk_buff *skb); - struct mutex *cb_mutex; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release) (struct sock *sk, unsigned long *groups); -- cgit v1.2.3 From 8a74e4eaa72c14f997df6d820effb6aac400d470 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 10 Jun 2024 10:20:53 +0200 Subject: PCI: switchtec: Make switchtec_class constant Now that the driver core allows for struct class to be in read-only memory, we should make all 'class' structures declared at build time placing them into read-only memory, instead of having to be dynamically allocated at runtime. Link: https://lore.kernel.org/r/2024061053-online-unwound-b173@gregkh Signed-off-by: Greg Kroah-Hartman Signed-off-by: Bjorn Helgaas Reviewed-by: Dave Jiang Reviewed-By: Logan Gunthorpe Cc: Kurt Schwemmer Cc: Jon Mason Cc: Allen Hubbe --- include/linux/switchtec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h index 8d8fac1626bd..cdb58d61c152 100644 --- a/include/linux/switchtec.h +++ b/include/linux/switchtec.h @@ -521,6 +521,6 @@ static inline struct switchtec_dev *to_stdev(struct device *dev) return container_of(dev, struct switchtec_dev, dev); } -extern struct class *switchtec_class; +extern const struct class switchtec_class; #endif -- cgit v1.2.3 From 1d4ce389da2b84e0e24aad5e83fe9c742adc3f99 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 7 Jun 2024 14:22:33 -0700 Subject: ice: add and use roundup_u64 instead of open coding equivalent In ice_ptp_cfg_clkout(), the ice driver needs to calculate the nearest next second of a current time value specified in nanoseconds. It implements this using div64_u64, because the time value is a u64. It could use div_u64 since NSEC_PER_SEC is smaller than 32-bits. Ideally this would be implemented directly with roundup(), but that can't work on all platforms due to a division which requires using the specific macros and functions due to platform restrictions, and to ensure that the most appropriate and fast instructions are used. The kernel doesn't currently provide any 64-bit equivalents for doing roundup. Attempting to use roundup() on a 32-bit platform will result in a link failure due to not having a direct 64-bit division. The closest equivalent for this is DIV64_U64_ROUND_UP, which does a division always rounding up. However, this only computes the division, and forces use of the div64_u64 in cases where the divisor is a 32bit value and could make use of div_u64. Introduce DIV_U64_ROUND_UP based on div_u64, and then use it to implement roundup_u64 which takes a u64 input value and a u32 rounding value. The name roundup_u64 matches the naming scheme of div_u64, and future patches could implement roundup64_u64 if they need to round by a multiple that is greater than 32-bits. Replace the logic in ice_ptp.c which does this equivalent with the newly added roundup_u64. Tested-by: Pucha Himasekhar Reddy Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20240607-next-2024-06-03-intel-next-batch-v3-2-d1470cee3347@intel.com Signed-off-by: Jakub Kicinski --- include/linux/math64.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/math64.h b/include/linux/math64.h index d34def7f9a8c..6aaccc1626ab 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -297,6 +297,19 @@ u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); #define DIV64_U64_ROUND_UP(ll, d) \ ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) +/** + * DIV_U64_ROUND_UP - unsigned 64bit divide with 32bit divisor rounded up + * @ll: unsigned 64bit dividend + * @d: unsigned 32bit divisor + * + * Divide unsigned 64bit dividend by unsigned 32bit divisor + * and round up. + * + * Return: dividend / divisor rounded up + */ +#define DIV_U64_ROUND_UP(ll, d) \ + ({ u32 _tmp = (d); div_u64((ll) + _tmp - 1, _tmp); }) + /** * DIV64_U64_ROUND_CLOSEST - unsigned 64bit divide with 64bit divisor rounded to nearest integer * @dividend: unsigned 64bit dividend @@ -342,4 +355,19 @@ u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); div_s64((__x - (__d / 2)), __d); \ } \ ) + +/** + * roundup_u64 - Round up a 64bit value to the next specified 32bit multiple + * @x: the value to up + * @y: 32bit multiple to round up to + * + * Rounds @x to the next multiple of @y. For 32bit @x values, see roundup and + * the faster round_up() for powers of 2. + * + * Return: rounded up value. + */ +static inline u64 roundup_u64(u64 x, u32 y) +{ + return DIV_U64_ROUND_UP(x, y) * y; +} #endif /* _LINUX_MATH64_H */ -- cgit v1.2.3 From 5f7fb89a115d53b4a10bf7ba2733e78df281e98d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 10 Jun 2024 23:09:36 -0400 Subject: function_graph: Everyone uses HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, remove it All architectures that implement function graph also implements HAVE_FUNCTION_GRAPH_RET_ADDR_PTR. Remove it, as it is no longer a differentiator. Link: https://lore.kernel.org/linux-trace-kernel/20240611031737.982047614@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Will Deacon Cc: Guo Ren Cc: Huacai Chen Cc: WANG Xuerui Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: "Naveen N. Rao" Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4135dc171447..845c2ab0bc1c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1071,9 +1071,7 @@ struct ftrace_ret_stack { #ifdef HAVE_FUNCTION_GRAPH_FP_TEST unsigned long fp; #endif -#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR unsigned long *retp; -#endif }; /* -- cgit v1.2.3 From f328a26eeb5380bc74e58cb9c3280a4908452df7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 3 Jun 2024 17:55:55 -0400 Subject: dlm: introduce DLM_LSFL_SOFTIRQ_SAFE Introduce a new external lockspace flag DLM_LSFL_SOFTIRQ_SAFE. A lockspace user will set this flag if it can handle dlm running the callback functions from softirq context. When not set, dlm will continue to run callback functions from the dlm_callback workqueue. The new lockspace flag cannot be used for user space lockspaces, so a uapi placeholder definition is used for the new flag value. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- include/linux/dlm.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dlm.h b/include/linux/dlm.h index c58c4f790c04..bacda9898f2b 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -35,6 +35,9 @@ struct dlm_lockspace_ops { int num_slots, int our_slot, uint32_t generation); }; +/* only relevant for kernel lockspaces, will be removed in future */ +#define DLM_LSFL_SOFTIRQ __DLM_LSFL_RESERVED0 + /* * dlm_new_lockspace * @@ -55,6 +58,11 @@ struct dlm_lockspace_ops { * used to select the directory node. Must be the same on all nodes. * DLM_LSFL_NEWEXCL * dlm_new_lockspace() should return -EEXIST if the lockspace exists. + * DLM_LSFL_SOFTIRQ + * dlm request callbacks (ast, bast) are softirq safe. Flag should be + * preferred by users. Will be default in some future. If set the + * strongest context for ast, bast callback is softirq as it avoids + * an additional context switch. * * lvblen: length of lvb in bytes. Must be multiple of 8. * dlm_new_lockspace() returns an error if this does not match @@ -121,7 +129,14 @@ int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force); * call. * * AST routines should not block (at least not for long), but may make - * any locking calls they please. + * any locking calls they please. If DLM_LSFL_SOFTIRQ for kernel + * users of dlm_new_lockspace() is passed the ast and bast callbacks + * can be processed in softirq context. Also some of the callback + * contexts are in the same context as the DLM lock request API, users + * must not hold locks while calling dlm lock request API and trying + * to acquire this lock in the callback again, this will end in a + * lock recursion. For newer implementation the DLM_LSFL_SOFTIRQ + * should be used. */ int dlm_lock(dlm_lockspace_t *lockspace, -- cgit v1.2.3 From 10b8e0fd3f7234a38db2c8d2c8dec0bd6eeede44 Mon Sep 17 00:00:00 2001 From: Amelie Delaunay Date: Fri, 31 May 2024 17:07:10 +0200 Subject: dmaengine: add channel device name to channel registration Channel device name is used for sysfs, but also by dmatest filter function. With dynamic channel registration, channels can be registered after dma controller registration. Users may want to have specific channel names. If name is NULL, the channel name relies on previous implementation, dmachan. Signed-off-by: Amelie Delaunay Link: https://lore.kernel.org/r/20240531150712.2503554-11-amelie.delaunay@foss.st.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 752dbde4cec1..73537fddbb52 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1575,7 +1575,8 @@ int dma_async_device_register(struct dma_device *device); int dmaenginem_async_device_register(struct dma_device *device); void dma_async_device_unregister(struct dma_device *device); int dma_async_device_channel_register(struct dma_device *device, - struct dma_chan *chan); + struct dma_chan *chan, + const char *name); void dma_async_device_channel_unregister(struct dma_device *device, struct dma_chan *chan); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); -- cgit v1.2.3 From 3ff1180a39fbc43ae69d4238e6922c57e3278910 Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Mon, 10 Jun 2024 08:53:13 -0500 Subject: gpiolib: Remove data-less gpiochip_add() function GPIO chips should be added with driver-private data associated with the chip. If none is needed, NULL can be used. All users already do this except one, fix that here. With no more users of the base gpiochip_add() we can drop this function so no more users show up later. Signed-off-by: Andrew Davis Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240610135313.142571-1-afd@ti.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 0032bb6e7d8f..6d31388dde0a 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -632,10 +632,6 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, devm_gpiochip_add_data_with_key(dev, gc, data, NULL, NULL) #endif /* CONFIG_LOCKDEP */ -static inline int gpiochip_add(struct gpio_chip *gc) -{ - return gpiochip_add_data(gc, NULL); -} void gpiochip_remove(struct gpio_chip *gc); int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc, void *data, struct lock_class_key *lock_key, -- cgit v1.2.3 From fbe4a7e881d4408bfabbb4fd538f10fd686cd8ab Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 6 May 2024 18:17:49 +0800 Subject: KVM: Setup empty IRQ routing when creating a VM Setup empty IRQ routing during VM creation so that x86 and s390 don't need to set empty/dummy IRQ routing during KVM_CREATE_IRQCHIP (in future patches). Initializing IRQ routing before there are any potential readers allows KVM to avoid the synchronize_srcu() in kvm_set_irq_routing(), which can introduces 20+ milliseconds of latency in the VM creation path. Ensuring that all VMs have non-NULL IRQ routing also hardens KVM against misbehaving userspace VMMs, e.g. RISC-V dynamically instantiates its interrupt controller, but doesn't override kvm_arch_intc_initialized() or kvm_arch_irqfd_allowed(), and so can likely reach kvm_irq_map_gsi() without fully initialized IRQ routing. Signed-off-by: Yi Wang Acked-by: Christian Borntraeger Link: https://lore.kernel.org/r/20240506101751.3145407-2-foxywang@tencent.com [sean: init refcount after IRQ routing, fix stub, massage changelog] Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c80fe03a1fa4..eaa70e9b1218 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2094,6 +2094,7 @@ int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, unsigned flags); +int kvm_init_irq_routing(struct kvm *kvm); int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); @@ -2103,6 +2104,11 @@ void kvm_free_irq_routing(struct kvm *kvm); static inline void kvm_free_irq_routing(struct kvm *kvm) {} +static inline int kvm_init_irq_routing(struct kvm *kvm) +{ + return 0; +} + #endif int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); -- cgit v1.2.3 From d1ae567fb8b559401a9f65290bbb0cef5e987bfe Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 21 May 2024 18:40:08 -0700 Subject: KVM: Add a flag to track if a loaded vCPU is scheduled out Add a kvm_vcpu.scheduled_out flag to track if a vCPU is in the process of being scheduled out (vCPU put path), or if the vCPU is being reloaded after being scheduled out (vCPU load path). In the short term, this will allow dropping kvm_arch_sched_in(), as arch code can query scheduled_out during kvm_arch_vcpu_load(). Longer term, scheduled_out opens up other potential optimizations, without creating subtle/brittle dependencies. E.g. it allows KVM to keep guest state (that is managed via kvm_arch_vcpu_{load,put}()) loaded across kvm_sched_{out,in}(), if KVM knows the state isn't accessed by the host kernel. Forcing arch code to coordinate between kvm_arch_sched_{in,out}() and kvm_arch_vcpu_{load,put}() is awkward, not reusable, and relies on the exact ordering of calls into arch code. Adding scheduled_out also obviates the need for a kvm_arch_sched_out() hook, e.g. if arch code needs to do something novel when putting vCPU state. And even if KVM never uses scheduled_out for anything beyond dropping kvm_arch_sched_in(), just being able to remove all of the arch stubs makes it worth adding the flag. Link: https://lore.kernel.org/all/20240430224431.490139-1-seanjc@google.com Cc: Oliver Upton Reviewed-by: Oliver Upton Acked-by: Kai Huang Link: https://lore.kernel.org/r/20240522014013.1672962-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index eaa70e9b1218..5e04c6cae34a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -380,6 +380,7 @@ struct kvm_vcpu { #endif bool preempted; bool ready; + bool scheduled_out; struct kvm_vcpu_arch arch; struct kvm_vcpu_stat stat; char stats_id[KVM_STATS_NAME_SIZE]; -- cgit v1.2.3 From 2a27c431400797e0044872283d1971aa372fcd3a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 21 May 2024 18:40:11 -0700 Subject: KVM: Delete the now unused kvm_arch_sched_in() Delete kvm_arch_sched_in() now that all implementations are nops. Reviewed-by: Bibo Mao Acked-by: Kai Huang Link: https://lore.kernel.org/r/20240522014013.1672962-5-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5e04c6cae34a..7b9d2633a931 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1495,8 +1495,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu); -void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); -- cgit v1.2.3 From 190fec72df4a5d4d98b1e783c333f471e5e5f344 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 12 Jun 2024 08:44:27 +0900 Subject: uprobe: Wire up uretprobe system call Wiring up uretprobe system call, which comes in following changes. We need to do the wiring before, because the uretprobe implementation needs the syscall number. Note at the moment uretprobe syscall is supported only for native 64-bit process. Link: https://lore.kernel.org/all/20240611112158.40795-3-jolsa@kernel.org/ Reviewed-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu (Google) Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Masami Hiramatsu (Google) --- include/linux/syscalls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 9104952d323d..494f5e0f61f7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -973,6 +973,8 @@ asmlinkage long sys_lsm_list_modules(u64 *ids, u32 *size, u32 flags); /* x86 */ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); +asmlinkage long sys_uretprobe(void); + /* pciconfig: alpha, arm, arm64, ia64, sparc */ asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, -- cgit v1.2.3 From ff474a78cef5cb5f32be52fe25b78441327a2e7c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 12 Jun 2024 08:44:28 +0900 Subject: uprobe: Add uretprobe syscall to speed up return probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding uretprobe syscall instead of trap to speed up return probe. At the moment the uretprobe setup/path is: - install entry uprobe - when the uprobe is hit, it overwrites probed function's return address on stack with address of the trampoline that contains breakpoint instruction - the breakpoint trap code handles the uretprobe consumers execution and jumps back to original return address This patch replaces the above trampoline's breakpoint instruction with new ureprobe syscall call. This syscall does exactly the same job as the trap with some more extra work: - syscall trampoline must save original value for rax/r11/rcx registers on stack - rax is set to syscall number and r11/rcx are changed and used by syscall instruction - the syscall code reads the original values of those registers and restore those values in task's pt_regs area - only caller from trampoline exposed in '[uprobes]' is allowed, the process will receive SIGILL signal otherwise Even with some extra work, using the uretprobes syscall shows speed improvement (compared to using standard breakpoint): On Intel (11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz) current: uretprobe-nop : 1.498 ± 0.000M/s uretprobe-push : 1.448 ± 0.001M/s uretprobe-ret : 0.816 ± 0.001M/s with the fix: uretprobe-nop : 1.969 ± 0.002M/s < 31% speed up uretprobe-push : 1.910 ± 0.000M/s < 31% speed up uretprobe-ret : 0.934 ± 0.000M/s < 14% speed up On Amd (AMD Ryzen 7 5700U) current: uretprobe-nop : 0.778 ± 0.001M/s uretprobe-push : 0.744 ± 0.001M/s uretprobe-ret : 0.540 ± 0.001M/s with the fix: uretprobe-nop : 0.860 ± 0.001M/s < 10% speed up uretprobe-push : 0.818 ± 0.001M/s < 10% speed up uretprobe-ret : 0.578 ± 0.000M/s < 7% speed up The performance test spawns a thread that runs loop which triggers uprobe with attached bpf program that increments the counter that gets printed in results above. The uprobe (and uretprobe) kind is determined by which instruction is being patched with breakpoint instruction. That's also important for uretprobes, because uprobe is installed for each uretprobe. The performance test is part of bpf selftests: tools/testing/selftests/bpf/run_bench_uprobes.sh Note at the moment uretprobe syscall is supported only for native 64-bit process, compat process still uses standard breakpoint. Note that when shadow stack is enabled the uretprobe syscall returns via iret, which is slower than return via sysret, but won't cause the shadow stack violation. Link: https://lore.kernel.org/all/20240611112158.40795-4-jolsa@kernel.org/ Suggested-by: Andrii Nakryiko Reviewed-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu (Google) Acked-by: Andrii Nakryiko Signed-off-by: Oleg Nesterov Signed-off-by: Jiri Olsa Signed-off-by: Masami Hiramatsu (Google) --- include/linux/uprobes.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index f46e0ca0169c..b503fafb7fb3 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -138,6 +138,9 @@ extern bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); +extern void uprobe_handle_trampoline(struct pt_regs *regs); +extern void *arch_uprobe_trampoline(unsigned long *psize); +extern unsigned long uprobe_get_trampoline_vaddr(void); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; -- cgit v1.2.3 From fa59dc2f6fc616fde3941f62ccd4adcaa21ccd47 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Fri, 7 Jun 2024 18:25:24 +0800 Subject: net: core,vrf: Change pcpu_dstat fields to u64_stats_t The pcpu_sw_netstats and pcpu_lstats structs both contain a set of u64_stats_t fields for individual stats, but pcpu_dstats uses u64s instead. Make this consistent by using u64_stats_t across all stats types. The per-cpu dstats are only used by the vrf driver at present, so update that driver as part of this change. Signed-off-by: Jeremy Kerr Reviewed-by: Simon Horman Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240607-dstats-v3-1-cc781fe116f7@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d20c6c99eb88..f148a01dd1d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2731,12 +2731,12 @@ struct pcpu_sw_netstats { } __aligned(4 * sizeof(u64)); struct pcpu_dstats { - u64 rx_packets; - u64 rx_bytes; - u64 rx_drops; - u64 tx_packets; - u64 tx_bytes; - u64 tx_drops; + u64_stats_t rx_packets; + u64_stats_t rx_bytes; + u64_stats_t rx_drops; + u64_stats_t tx_packets; + u64_stats_t tx_bytes; + u64_stats_t tx_drops; struct u64_stats_sync syncp; } __aligned(8 * sizeof(u64)); -- cgit v1.2.3 From 144ba8580bcb82b2686c3d1a043299d844b9a682 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Mon, 10 Jun 2024 10:34:26 +0200 Subject: net: pse-pd: Use EOPNOTSUPP error code instead of ENOTSUPP ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP as reported by checkpatch script. Fixes: 18ff0bcda6d1 ("ethtool: add interface to interact with Ethernet Power Equipment") Reviewed-by: Andrew Lunn Acked-by: Oleksij Rempel Signed-off-by: Kory Maincent Link: https://lore.kernel.org/r/20240610083426.740660-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 6d07c95dabb9..6eec24ffa866 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -167,14 +167,14 @@ static inline int pse_ethtool_get_status(struct pse_control *psec, struct netlink_ext_ack *extack, struct pse_control_status *status) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int pse_ethtool_set_config(struct pse_control *psec, struct netlink_ext_ack *extack, const struct pse_control_config *config) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline bool pse_has_podl(struct pse_control *psec) -- cgit v1.2.3 From 249ebf3f65f8530beb2cbfb91bff1d83ba88d23c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 5 Jun 2024 14:38:49 +0200 Subject: power: sequencing: implement the pwrseq core Implement the power sequencing subsystem allowing devices to share complex powering-up and down procedures. It's split into the consumer and provider parts but does not implement any new DT bindings so that the actual power sequencing is never revealed in the DT representation. Tested-by: Amit Pundir Tested-by: Neil Armstrong # on SM8550-QRD, SM8650-QRD & SM8650-HDK Tested-by: Caleb Connolly # OnePlus 8T Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240605123850.24857-2-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/pwrseq/consumer.h | 56 ++++++++++++++++++++++++++++++ include/linux/pwrseq/provider.h | 75 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 include/linux/pwrseq/consumer.h create mode 100644 include/linux/pwrseq/provider.h (limited to 'include/linux') diff --git a/include/linux/pwrseq/consumer.h b/include/linux/pwrseq/consumer.h new file mode 100644 index 000000000000..7d583b4f266e --- /dev/null +++ b/include/linux/pwrseq/consumer.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 Linaro Ltd. + */ + +#ifndef __POWER_SEQUENCING_CONSUMER_H__ +#define __POWER_SEQUENCING_CONSUMER_H__ + +#include + +struct device; +struct pwrseq_desc; + +#if IS_ENABLED(CONFIG_POWER_SEQUENCING) + +struct pwrseq_desc * __must_check +pwrseq_get(struct device *dev, const char *target); +void pwrseq_put(struct pwrseq_desc *desc); + +struct pwrseq_desc * __must_check +devm_pwrseq_get(struct device *dev, const char *target); + +int pwrseq_power_on(struct pwrseq_desc *desc); +int pwrseq_power_off(struct pwrseq_desc *desc); + +#else /* CONFIG_POWER_SEQUENCING */ + +static inline struct pwrseq_desc * __must_check +pwrseq_get(struct device *dev, const char *target) +{ + return ERR_PTR(-ENOSYS); +} + +static inline void pwrseq_put(struct pwrseq_desc *desc) +{ +} + +static inline struct pwrseq_desc * __must_check +devm_pwrseq_get(struct device *dev, const char *target) +{ + return ERR_PTR(-ENOSYS); +} + +static inline int pwrseq_power_on(struct pwrseq_desc *desc) +{ + return -ENOSYS; +} + +static inline int pwrseq_power_off(struct pwrseq_desc *desc) +{ + return -ENOSYS; +} + +#endif /* CONFIG_POWER_SEQUENCING */ + +#endif /* __POWER_SEQUENCING_CONSUMER_H__ */ diff --git a/include/linux/pwrseq/provider.h b/include/linux/pwrseq/provider.h new file mode 100644 index 000000000000..cbc3607cbfcf --- /dev/null +++ b/include/linux/pwrseq/provider.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 Linaro Ltd. + */ + +#ifndef __POWER_SEQUENCING_PROVIDER_H__ +#define __POWER_SEQUENCING_PROVIDER_H__ + +struct device; +struct module; +struct pwrseq_device; + +typedef int (*pwrseq_power_state_func)(struct pwrseq_device *); +typedef int (*pwrseq_match_func)(struct pwrseq_device *, struct device *); + +/** + * struct pwrseq_unit_data - Configuration of a single power sequencing + * unit. + * @name: Name of the unit. + * @deps: Units that must be enabled before this one and disabled after it + * in the order they come in this array. Must be NULL-terminated. + * @enable: Callback running the part of the power-on sequence provided by + * this unit. + * @disable: Callback running the part of the power-off sequence provided + * by this unit. + */ +struct pwrseq_unit_data { + const char *name; + const struct pwrseq_unit_data **deps; + pwrseq_power_state_func enable; + pwrseq_power_state_func disable; +}; + +/** + * struct pwrseq_target_data - Configuration of a power sequencing target. + * @name: Name of the target. + * @unit: Final unit that this target must reach in order to be considered + * enabled. + * @post_enable: Callback run after the target unit has been enabled, *after* + * the state lock has been released. It's useful for implementing + * boot-up delays without blocking other users from powering up + * using the same power sequencer. + */ +struct pwrseq_target_data { + const char *name; + const struct pwrseq_unit_data *unit; + pwrseq_power_state_func post_enable; +}; + +/** + * struct pwrseq_config - Configuration used for registering a new provider. + * @parent: Parent device for the sequencer. Must be set. + * @owner: Module providing this device. + * @drvdata: Private driver data. + * @match: Provider callback used to match the consumer device to the sequencer. + * @targets: Array of targets for this power sequencer. Must be NULL-terminated. + */ +struct pwrseq_config { + struct device *parent; + struct module *owner; + void *drvdata; + pwrseq_match_func match; + const struct pwrseq_target_data **targets; +}; + +struct pwrseq_device * +pwrseq_device_register(const struct pwrseq_config *config); +void pwrseq_device_unregister(struct pwrseq_device *pwrseq); +struct pwrseq_device * +devm_pwrseq_device_register(struct device *dev, + const struct pwrseq_config *config); + +void *pwrseq_device_get_drvdata(struct pwrseq_device *pwrseq); + +#endif /* __POWER_SEQUENCING_PROVIDER_H__ */ -- cgit v1.2.3 From 92a5df6687da755edd4bfe1acb714f00f56d2e06 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 May 2024 12:09:44 +0200 Subject: wifi: ieee80211: remove unused enum ieee80211_client_reg_power This has never been used, and it's really not directly representing the spec, so shouldn't have been here in the first place. Remove it. Reviewed-by: Miriam Rachel Korenblit Signed-off-by: Johannes Berg Link: https://msgid.link/20240523120945.32ed8fc1522d.Id4480d162e1921478e33d145890dc16c263b57bf@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 769008a51809..456a55bd6c6b 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2426,24 +2426,6 @@ enum ieee80211_ap_reg_power { IEEE80211_REG_AP_POWER_AFTER_LAST - 1, }; -/** - * enum ieee80211_client_reg_power - regulatory power for a client - * - * @IEEE80211_REG_UNSET_CLIENT: Client has no regulatory power mode - * @IEEE80211_REG_DEFAULT_CLIENT: Default Client - * @IEEE80211_REG_SUBORDINATE_CLIENT: Subordinate Client - * @IEEE80211_REG_CLIENT_POWER_AFTER_LAST: internal - * @IEEE80211_REG_CLIENT_POWER_MAX: maximum value - */ -enum ieee80211_client_reg_power { - IEEE80211_REG_UNSET_CLIENT, - IEEE80211_REG_DEFAULT_CLIENT, - IEEE80211_REG_SUBORDINATE_CLIENT, - IEEE80211_REG_CLIENT_POWER_AFTER_LAST, - IEEE80211_REG_CLIENT_POWER_MAX = - IEEE80211_REG_CLIENT_POWER_AFTER_LAST - 1, -}; - /* 802.11ax HE MAC capabilities */ #define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 #define IEEE80211_HE_MAC_CAP0_TWT_REQ 0x02 -- cgit v1.2.3 From 0a9314ad5f45aeb10326dce33c5d70b85f74ef2d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 May 2024 12:09:45 +0200 Subject: wifi: cfg80211: move enum ieee80211_ap_reg_power to cfg80211 This really shouldn't have been in ieee80211.h, since it doesn't directly represent the spec. Move it to cfg80211 rather than mac80211 since upcoming changes will use it there. Reviewed-by: Miriam Rachel Korenblit Signed-off-by: Johannes Berg Link: https://msgid.link/20240523120945.962b16c831cd.I5745962525b1b176c5b90d37b3720fc100eee406@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 456a55bd6c6b..30cef3b940eb 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2406,26 +2406,6 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, int mcs, bool ext_nss_bw_capable, unsigned int max_vht_nss); -/** - * enum ieee80211_ap_reg_power - regulatory power for a Access Point - * - * @IEEE80211_REG_UNSET_AP: Access Point has no regulatory power mode - * @IEEE80211_REG_LPI_AP: Indoor Access Point - * @IEEE80211_REG_SP_AP: Standard power Access Point - * @IEEE80211_REG_VLP_AP: Very low power Access Point - * @IEEE80211_REG_AP_POWER_AFTER_LAST: internal - * @IEEE80211_REG_AP_POWER_MAX: maximum value - */ -enum ieee80211_ap_reg_power { - IEEE80211_REG_UNSET_AP, - IEEE80211_REG_LPI_AP, - IEEE80211_REG_SP_AP, - IEEE80211_REG_VLP_AP, - IEEE80211_REG_AP_POWER_AFTER_LAST, - IEEE80211_REG_AP_POWER_MAX = - IEEE80211_REG_AP_POWER_AFTER_LAST - 1, -}; - /* 802.11ax HE MAC capabilities */ #define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 #define IEEE80211_HE_MAC_CAP0_TWT_REQ 0x02 -- cgit v1.2.3 From 4565d2652a37e438e4cd729e2a8dfeffe34c958c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 12 Jun 2024 10:20:17 +0200 Subject: PCI/pwrctl: Add PCI power control core code Some PCI devices must be powered-on before they can be detected on the bus. Introduce a simple framework reusing the existing PCI OF infrastructure. The way this works is: a DT node representing a PCI device connected to the port can be matched against its power control platform driver. If the match succeeds, the driver is responsible for powering-up the device and calling pci_pwrctl_device_set_ready() which will trigger a PCI bus rescan as well as subscribe to PCI bus notifications. When the device is detected and created, we'll make it consume the same DT node that the platform device did. When the device is bound, we'll create a device link between it and the parent power control device. Tested-by: Amit Pundir Tested-by: Neil Armstrong # on SM8550-QRD, SM8650-QRD & SM8650-HDK Tested-by: Caleb Connolly # OnePlus 8T Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20240612082019.19161-5-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/pci-pwrctl.h | 51 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 include/linux/pci-pwrctl.h (limited to 'include/linux') diff --git a/include/linux/pci-pwrctl.h b/include/linux/pci-pwrctl.h new file mode 100644 index 000000000000..45e9cfe740e4 --- /dev/null +++ b/include/linux/pci-pwrctl.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 Linaro Ltd. + */ + +#ifndef __PCI_PWRCTL_H__ +#define __PCI_PWRCTL_H__ + +#include + +struct device; +struct device_link; + +/* + * This is a simple framework for solving the issue of PCI devices that require + * certain resources (regulators, GPIOs, clocks) to be enabled before the + * device can actually be detected on the PCI bus. + * + * The idea is to reuse the platform bus to populate OF nodes describing the + * PCI device and its resources, let these platform devices probe and enable + * relevant resources and then trigger a rescan of the PCI bus allowing for the + * same device (with a second associated struct device) to be registered with + * the PCI subsystem. + * + * To preserve a correct hierarchy for PCI power management and device reset, + * we create a device link between the power control platform device (parent) + * and the supplied PCI device (child). + */ + +/** + * struct pci_pwrctl - PCI device power control context. + * @dev: Address of the power controlling device. + * + * An object of this type must be allocated by the PCI power control device and + * passed to the pwrctl subsystem to trigger a bus rescan and setup a device + * link with the device once it's up. + */ +struct pci_pwrctl { + struct device *dev; + + /* Private: don't use. */ + struct notifier_block nb; + struct device_link *link; +}; + +int pci_pwrctl_device_set_ready(struct pci_pwrctl *pwrctl); +void pci_pwrctl_device_unset_ready(struct pci_pwrctl *pwrctl); +int devm_pci_pwrctl_device_set_ready(struct device *dev, + struct pci_pwrctl *pwrctl); + +#endif /* __PCI_PWRCTL_H__ */ -- cgit v1.2.3 From 7180f8d91fcbf252de572d9ffacc945effed0060 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 11 Jun 2024 19:38:22 +0200 Subject: vfs: add rcu-based find_inode variants for iget ops This avoids one inode hash lock acquire in the common case on inode creation, in effect significantly reducing contention. On the stock kernel said lock is typically taken twice: 1. once to check if the inode happens to already be present 2. once to add it to the hash The back-to-back lock/unlock pattern is known to degrade performance significantly, which is further exacerbated if the hash is heavily populated (long chains to walk, extending hold time). Arguably hash sizing and hashing algo need to be revisited, but that's beyond the scope of this patch. With the acquire from step 1 eliminated with RCU lookup throughput increases significantly at the scale of 20 cores (benchmark results at the bottom). So happens the hash already supports RCU-based operation, but lookups on inode insertions didn't take advantage of it. This of course has its limits as the global lock is still a bottleneck. There was a patchset posted which introduced fine-grained locking[1] but it appears staled. Apart from that doubt was expressed whether a handrolled hash implementation is appropriate to begin with, suggesting replacement with rhashtables. Nobody committed to carrying [1] across the finish line or implementing anything better, thus the bandaid below. iget_locked consumers (notably ext4) get away without any changes because inode comparison method is built-in. iget5_locked consumers pass a custom callback. Since removal of locking adds more problems (inode can be changing) it's not safe to assume all filesystems happen to cope. Thus iget5_locked_rcu gets added, requiring manual conversion of interested filesystems. In order to reduce code duplication find_inode and find_inode_fast grow an argument indicating whether inode hash lock is held, which is passed down in case sleeping is necessary. They always rcu_read_lock, which is redundant but harmless. Doing it conditionally reduces readability for no real gain that I can see. RCU-alike restrictions were already put on callbacks due to the hash spinlock being held. Benchmarking: There is a real cache-busting workload scanning millions of files in parallel (it's a backup appliance), where the initial lookup is guaranteed to fail resulting in the two lock acquires on stock kernel (and one with the patch at hand). Implemented below is a synthetic benchmark providing the same behavior. [I shall note the workload is not running on Linux, instead it was causing trouble elsewhere. Benchmark below was used while addressing said problems and was found to adequately represent the real workload.] Total real time fluctuates by 1-2s. With 20 threads each walking a dedicated 1000 dirs * 1000 files directory tree to stat(2) on a 32 core + 24GB RAM vm: ext4 (needed mkfs.ext4 -N 24000000): before: 3.77s user 890.90s system 1939% cpu 46.118 total after: 3.24s user 397.73s system 1858% cpu 21.581 total (-53%) That's 20 million files to visit, while the machine can only cache about 15 million at a time (obtained from ext4_inode_cache object count in /proc/slabinfo). Since each terminal inode is only visited once per run this amounts to 0% hit ratio for the dentry cache and the hash table (there are however hits for the intermediate directories). On repeated runs the kernel caches the last ~15 mln, meaning there is ~5 mln of uncached inodes which are going to be visited first, evicting the previously cached state as it happens. Lack of hits can be trivially verified with bpftrace, like so: bpftrace -e 'kretprobe:find_inode_fast { @[kstack(), retval != 0] = count(); }'\ -c "/bin/sh walktrees /testfs 20" Best ran more than once. Expected results after "warmup": [snip] @[ __ext4_iget+275 ext4_lookup+224 __lookup_slow+130 walk_component+219 link_path_walk.part.0.constprop.0+614 path_lookupat+62 filename_lookup+204 vfs_statx+128 vfs_fstatat+131 __do_sys_newfstatat+38 do_syscall_64+87 entry_SYSCALL_64_after_hwframe+118 , 1]: 20000 @[ __ext4_iget+275 ext4_lookup+224 __lookup_slow+130 walk_component+219 path_lookupat+106 filename_lookup+204 vfs_statx+128 vfs_fstatat+131 __do_sys_newfstatat+38 do_syscall_64+87 entry_SYSCALL_64_after_hwframe+118 , 1]: 20000000 That is 20 million calls for the initial lookup and 20 million after allocating a new inode, all of them failing to return a value != 0 (i.e., they are returning NULL -- no match found). Of course aborting the benchmark in the middle and starting it again (or messing with the state in other ways) is going to alter these results. Benchmark can be found here: https://people.freebsd.org/~mjg/fstree.tgz [1] https://lore.kernel.org/all/20231206060629.2827226-1-david@fromorbit.com/ Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240611173824.535995-2-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0283cf366c2a..aac51da4f90c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3029,7 +3029,12 @@ extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); -extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); +struct inode *iget5_locked(struct super_block *, unsigned long, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *); +struct inode *iget5_locked_rcu(struct super_block *, unsigned long, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *); extern struct inode * iget_locked(struct super_block *, unsigned long); extern struct inode *find_inode_nowait(struct super_block *, unsigned long, -- cgit v1.2.3 From 3b9c181bcde8555ca81b2394c2dc2201cefc2dd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Roberto=20de=20Souza?= Date: Tue, 11 Jun 2024 10:47:15 -0700 Subject: devcoredump: Add dev_coredumpm_timeout() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add function to set a custom coredump timeout. For Xe driver usage, current 5 minutes timeout may be too short for users to search and understand what needs to be done to capture coredump to report bugs. We have plans to automate(distribute a udev script) it but at the end will be up to distros and users to pack it so having a option to increase the timeout is a safer option. v2: - replace dev_coredump_timeout_set() by dev_coredumpm_timeout() (Mukesh) v3: - make dev_coredumpm() static inline (Johannes) v5: - rename DEVCOREDUMP_TIMEOUT -> DEVCD_TIMEOUT to avoid redefinition in include/net/bluetooth/coredump.h v6: - fix definition of dev_coredumpm_timeout() when CONFIG_DEV_COREDUMP is disabled Cc: Rodrigo Vivi Cc: Mukesh Ojha Cc: Johannes Berg Cc: Jonathan Cavitt Reviewed-by: Rodrigo Vivi Reviewed-by: Jonathan Cavitt Signed-off-by: José Roberto de Souza Acked-by: Greg Kroah-Hartman Acked-by: Johannes Berg Link: https://patchwork.freedesktop.org/patch/msgid/20240611174716.72660-1-jose.souza@intel.com Signed-off-by: Rodrigo Vivi --- include/linux/devcoredump.h | 53 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/devcoredump.h b/include/linux/devcoredump.h index c8f7eb6cc191..377892604ff4 100644 --- a/include/linux/devcoredump.h +++ b/include/linux/devcoredump.h @@ -12,6 +12,9 @@ #include #include +/* if data isn't read by userspace after 5 minutes then delete it */ +#define DEVCD_TIMEOUT (HZ * 60 * 5) + /* * _devcd_free_sgtable - free all the memory of the given scatterlist table * (i.e. both pages and scatterlist instances) @@ -50,16 +53,17 @@ static inline void _devcd_free_sgtable(struct scatterlist *table) kfree(delete_iter); } - #ifdef CONFIG_DEV_COREDUMP void dev_coredumpv(struct device *dev, void *data, size_t datalen, gfp_t gfp); -void dev_coredumpm(struct device *dev, struct module *owner, - void *data, size_t datalen, gfp_t gfp, - ssize_t (*read)(char *buffer, loff_t offset, size_t count, - void *data, size_t datalen), - void (*free)(void *data)); +void dev_coredumpm_timeout(struct device *dev, struct module *owner, + void *data, size_t datalen, gfp_t gfp, + ssize_t (*read)(char *buffer, loff_t offset, + size_t count, void *data, + size_t datalen), + void (*free)(void *data), + unsigned long timeout); void dev_coredumpsg(struct device *dev, struct scatterlist *table, size_t datalen, gfp_t gfp); @@ -73,11 +77,13 @@ static inline void dev_coredumpv(struct device *dev, void *data, } static inline void -dev_coredumpm(struct device *dev, struct module *owner, - void *data, size_t datalen, gfp_t gfp, - ssize_t (*read)(char *buffer, loff_t offset, size_t count, - void *data, size_t datalen), - void (*free)(void *data)) +dev_coredumpm_timeout(struct device *dev, struct module *owner, + void *data, size_t datalen, gfp_t gfp, + ssize_t (*read)(char *buffer, loff_t offset, + size_t count, void *data, + size_t datalen), + void (*free)(void *data), + unsigned long timeout) { free(data); } @@ -92,4 +98,29 @@ static inline void dev_coredump_put(struct device *dev) } #endif /* CONFIG_DEV_COREDUMP */ +/** + * dev_coredumpm - create device coredump with read/free methods + * @dev: the struct device for the crashed device + * @owner: the module that contains the read/free functions, use %THIS_MODULE + * @data: data cookie for the @read/@free functions + * @datalen: length of the data + * @gfp: allocation flags + * @read: function to read from the given buffer + * @free: function to free the given buffer + * + * Creates a new device coredump for the given device. If a previous one hasn't + * been read yet, the new coredump is discarded. The data lifetime is determined + * by the device coredump framework and when it is no longer needed the @free + * function will be called to free the data. + */ +static inline void dev_coredumpm(struct device *dev, struct module *owner, + void *data, size_t datalen, gfp_t gfp, + ssize_t (*read)(char *buffer, loff_t offset, size_t count, + void *data, size_t datalen), + void (*free)(void *data)) +{ + dev_coredumpm_timeout(dev, owner, data, datalen, gfp, read, free, + DEVCD_TIMEOUT); +} + #endif /* __DEVCOREDUMP_H */ -- cgit v1.2.3 From e038ee6189842e9662d2fc59d09dbcf48350cf99 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Mon, 10 Jun 2024 16:41:44 +0530 Subject: block: unmap and free user mapped integrity via submitter The user mapped intergity is copied back and unpinned by bio_integrity_free which is a low-level routine. Do it via the submitter rather than doing it in the low-level block layer code, to split the submitter side from the consumer side of the bio. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20240610111144.14647-1-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index d5379548d684..818e93612947 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -731,6 +731,7 @@ static inline bool bioset_initialized(struct bio_set *bs) bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); +void bio_integrity_unmap_free_user(struct bio *bio); extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); extern bool bio_integrity_prep(struct bio *); @@ -807,6 +808,9 @@ static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, { return -EINVAL; } +static inline void bio_integrity_unmap_free_user(struct bio *bio) +{ +} #endif /* CONFIG_BLK_DEV_INTEGRITY */ -- cgit v1.2.3 From ec209ad86324de84ef66990f0e9df0851e45e054 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 12 Jun 2024 09:58:32 -0600 Subject: bpf: verifier: Relax caller requirements for kfunc projection type args Currently, if a kfunc accepts a projection type as an argument (eg struct __sk_buff *), the caller must exactly provide exactly the same type with provable provenance. However in practice, kfuncs that accept projection types _must_ cast to the underlying type before use b/c projection type layouts are completely made up. Thus, it is ok to relax the verifier rules around implicit conversions. We will use this functionality in the next commit when we align kfuncs to user-facing types. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/e2c025cb09ccfd4af1ec9e18284dc3cecff7514d.1718207789.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index f9e56fd12a9f..56d91daacdba 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -531,6 +531,7 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); +bool btf_is_projection_of(const char *pname, const char *tname); bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg); -- cgit v1.2.3 From cce4c40b960673f9e020835def310f1e89d3a940 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 12 Jun 2024 09:58:33 -0600 Subject: bpf: treewide: Align kfunc signatures to prog point-of-view Previously, kfunc declarations in bpf_kfuncs.h (and others) used "user facing" types for kfuncs prototypes while the actual kfunc definitions used "kernel facing" types. More specifically: bpf_dynptr vs bpf_dynptr_kern, __sk_buff vs sk_buff, and xdp_md vs xdp_buff. It wasn't an issue before, as the verifier allows aliased types. However, since we are now generating kfunc prototypes in vmlinux.h (in addition to keeping bpf_kfuncs.h around), this conflict creates compilation errors. Fix this conflict by using "user facing" types in kfunc definitions. This results in more casts, but otherwise has no additional runtime cost. Note, similar to 5b268d1ebcdc ("bpf: Have bpf_rdonly_cast() take a const pointer"), we also make kfuncs take const arguments where appropriate in order to make the kfunc more permissive. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/b58346a63a0e66bc9b7504da751b526b0b189a67.1718207789.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a834f4b761bc..f636b4998bf7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3265,8 +3265,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); -int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, - struct bpf_dynptr_kern *ptr); +int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, + struct bpf_dynptr *ptr); #else static inline bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, @@ -3288,8 +3288,8 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, { return 0; } -static inline int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, - struct bpf_dynptr_kern *ptr) +static inline int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, + struct bpf_dynptr *ptr) { return -EOPNOTSUPP; } -- cgit v1.2.3 From b975d3ee5962237c1e2f5d5aeeaaf0dc2173486c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 9 Jun 2024 00:10:39 +0200 Subject: net: add and use skb_get_hash_net Years ago flow dissector gained ability to delegate flow dissection to a bpf program, scoped per netns. Unfortunately, skb_get_hash() only gets an sk_buff argument instead of both net+skb. This means the flow dissector needs to obtain the netns pointer from somewhere else. The netns is derived from skb->dev, and if that is not available, from skb->sk. If neither is set, we hit a (benign) WARN_ON_ONCE(). Trying both dev and sk covers most cases, but not all, as recently reported by Christoph Paasch. In case of nf-generated tcp reset, both sk and dev are NULL: WARNING: .. net/core/flow_dissector.c:1104 skb_flow_dissect_flow_keys include/linux/skbuff.h:1536 [inline] skb_get_hash include/linux/skbuff.h:1578 [inline] nft_trace_init+0x7d/0x120 net/netfilter/nf_tables_trace.c:320 nft_do_chain+0xb26/0xb90 net/netfilter/nf_tables_core.c:268 nft_do_chain_ipv4+0x7a/0xa0 net/netfilter/nft_chain_filter.c:23 nf_hook_slow+0x57/0x160 net/netfilter/core.c:626 __ip_local_out+0x21d/0x260 net/ipv4/ip_output.c:118 ip_local_out+0x26/0x1e0 net/ipv4/ip_output.c:127 nf_send_reset+0x58c/0x700 net/ipv4/netfilter/nf_reject_ipv4.c:308 nft_reject_ipv4_eval+0x53/0x90 net/ipv4/netfilter/nft_reject_ipv4.c:30 [..] syzkaller did something like this: table inet filter { chain input { type filter hook input priority filter; policy accept; meta nftrace set 1 tcp dport 42 reject with tcp reset } chain output { type filter hook output priority filter; policy accept; # empty chain is enough } } ... then sends a tcp packet to port 42. Initial attempt to simply set skb->dev from nf_reject_ipv4 doesn't cover all cases: skbs generated via ipv4 igmp_send_report trigger similar splat. Moreover, Pablo Neira found that nft_hash.c uses __skb_get_hash_symmetric() which would trigger same warn splat for such skbs. Lets allow callers to pass the current netns explicitly. The nf_trace infrastructure is adjusted to use the new helper. __skb_get_hash_symmetric is handled in the next patch. Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/494 Reviewed-by: Willem de Bruijn Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240608221057.16070-2-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fe7d8dbef77e..6e78019f899a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1498,7 +1498,7 @@ __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4) __skb_set_hash(skb, hash, true, is_l4); } -void __skb_get_hash(struct sk_buff *skb); +void __skb_get_hash_net(const struct net *net, struct sk_buff *skb); u32 __skb_get_hash_symmetric(const struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, @@ -1578,10 +1578,18 @@ void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); +static inline __u32 skb_get_hash_net(const struct net *net, struct sk_buff *skb) +{ + if (!skb->l4_hash && !skb->sw_hash) + __skb_get_hash_net(net, skb); + + return skb->hash; +} + static inline __u32 skb_get_hash(struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) - __skb_get_hash(skb); + __skb_get_hash_net(NULL, skb); return skb->hash; } -- cgit v1.2.3 From d1dab4f71d372e00e2d34a9c32bf261623e3a95c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 9 Jun 2024 00:10:40 +0200 Subject: net: add and use __skb_get_hash_symmetric_net Similar to previous patch: apply same logic for __skb_get_hash_symmetric and let callers pass the netns to the dissector core. Existing function is turned into a wrapper to avoid adjusting all callers, nft_hash.c uses new function. Reviewed-by: Willem de Bruijn Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240608221057.16070-3-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6e78019f899a..813406a9bd6c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1498,8 +1498,14 @@ __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4) __skb_set_hash(skb, hash, true, is_l4); } +u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb); + +static inline u32 __skb_get_hash_symmetric(const struct sk_buff *skb) +{ + return __skb_get_hash_symmetric_net(NULL, skb); +} + void __skb_get_hash_net(const struct net *net, struct sk_buff *skb); -u32 __skb_get_hash_symmetric(const struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen); -- cgit v1.2.3 From b5c29fba72a6c950655d1cb0f6aa16b60dc83be7 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 28 May 2024 12:54:58 +0800 Subject: iommu: Make iommu_sva_domain_alloc() static iommu_sva_domain_alloc() is only called in iommu-sva.c, hence make it static. On the other hand, iommu_sva_domain_alloc() should not return NULL anymore after commit <80af5a452024> ("iommu: Add ops->domain_alloc_sva()"), the removal of inline code avoids potential confusion. Fixes: 80af5a452024 ("iommu: Add ops->domain_alloc_sva()") Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240528045458.81458-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 17b3f36ad843..1cd19b903354 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1527,8 +1527,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); -struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, - struct mm_struct *mm); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1553,12 +1551,6 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) } static inline void mm_pasid_drop(struct mm_struct *mm) {} - -static inline struct iommu_domain * -iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) -{ - return NULL; -} #endif /* CONFIG_IOMMU_SVA */ #ifdef CONFIG_IOMMU_IOPF -- cgit v1.2.3 From 163f10b2935362f0e8ef8d7fadd0b5aa33e9130f Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Wed, 12 Jun 2024 08:47:07 +0200 Subject: PCI: Add INTEL_HDA_PTL to pci_ids.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit More PCI ids for Intel audio. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Reviewed-by: Bard Liao Reviewed-by: Péter Ujfalusi Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20240612064709.51141-2-pierre-louis.bossart@linux.intel.com Signed-off-by: Takashi Iwai --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 942a587bb97e..0168c6a60148 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3112,6 +3112,7 @@ #define PCI_DEVICE_ID_INTEL_HDA_LNL_P 0xa828 #define PCI_DEVICE_ID_INTEL_S21152BB 0xb152 #define PCI_DEVICE_ID_INTEL_HDA_BMG 0xe2f7 +#define PCI_DEVICE_ID_INTEL_HDA_PTL 0xe428 #define PCI_DEVICE_ID_INTEL_HDA_CML_R 0xf0c8 #define PCI_DEVICE_ID_INTEL_HDA_RKL_S 0xf1c8 -- cgit v1.2.3 From ff985c759778986f55cbc557055fbeb84ee833eb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 11 Jun 2024 15:01:04 +0200 Subject: auxbus: make to_auxiliary_drv accept and return a constant pointer In the quest to make struct device constant, start by making to_auxiliary_drv() return a constant pointer so that drivers that call this can be fixed up before the driver core changes. As the return type previously was not constant, also fix up all callers that were assuming that the pointer was not going to be a constant one in order to not break the build. Cc: Dave Ertman Cc: Ira Weiny Cc: Rafael J. Wysocki Cc: Bingbu Cao Cc: Tianshu Qiu Cc: Mauro Carvalho Chehab Cc: Michael Chan Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Jesse Brandeburg Cc: Tony Nguyen Cc: Saeed Mahameed Cc: Leon Romanovsky Cc: Tariq Toukan Cc: Pierre-Louis Bossart Cc: Liam Girdwood Cc: Peter Ujfalusi Cc: Bard Liao Cc: Ranjani Sridharan Cc: Daniel Baluta Cc: Kai Vehmanen Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Richard Cochran Cc: linux-media@vger.kernel.org Cc: netdev@vger.kernel.org Cc: intel-wired-lan@lists.osuosl.org Cc: linux-rdma@vger.kernel.org Cc: sound-open-firmware@alsa-project.org Cc: linux-sound@vger.kernel.org Acked-by: Sakari Ailus # drivers/media/pci/intel/ipu6 Acked-by: Mark Brown Reviewed-by: Martin Habets Link: https://lore.kernel.org/r/20240611130103.3262749-7-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/auxiliary_bus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index de21d9d24a95..bdff7b85f2ae 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -203,7 +203,7 @@ static inline struct auxiliary_device *to_auxiliary_dev(struct device *dev) return container_of(dev, struct auxiliary_device, dev); } -static inline struct auxiliary_driver *to_auxiliary_drv(struct device_driver *drv) +static inline const struct auxiliary_driver *to_auxiliary_drv(const struct device_driver *drv) { return container_of(drv, struct auxiliary_driver, driver); } -- cgit v1.2.3 From 92424801261d1564a0bb759da3cf3ccd69fdf5a2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Jun 2024 13:53:08 +0200 Subject: bpf: Fix reg_set_min_max corruption of fake_reg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Juan reported that after doing some changes to buzzer [0] and implementing a new fuzzing strategy guided by coverage, they noticed the following in one of the probes: [...] 13: (79) r6 = *(u64 *)(r0 +0) ; R0=map_value(ks=4,vs=8) R6_w=scalar() 14: (b7) r0 = 0 ; R0_w=0 15: (b4) w0 = -1 ; R0_w=0xffffffff 16: (74) w0 >>= 1 ; R0_w=0x7fffffff 17: (5c) w6 &= w0 ; R0_w=0x7fffffff R6_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) 18: (44) w6 |= 2 ; R6_w=scalar(smin=umin=smin32=umin32=2,smax=umax=umax32=0x7fffffff,var_off=(0x2; 0x7ffffffd)) 19: (56) if w6 != 0x7ffffffd goto pc+1 REG INVARIANTS VIOLATION (true_reg2): range bounds violation u64=[0x7fffffff, 0x7ffffffd] s64=[0x7fffffff, 0x7ffffffd] u32=[0x7fffffff, 0x7ffffffd] s32=[0x7fffffff, 0x7ffffffd] var_off=(0x7fffffff, 0x0) REG INVARIANTS VIOLATION (false_reg1): range bounds violation u64=[0x7fffffff, 0x7ffffffd] s64=[0x7fffffff, 0x7ffffffd] u32=[0x7fffffff, 0x7ffffffd] s32=[0x7fffffff, 0x7ffffffd] var_off=(0x7fffffff, 0x0) REG INVARIANTS VIOLATION (false_reg2): const tnum out of sync with range bounds u64=[0x0, 0xffffffffffffffff] s64=[0x8000000000000000, 0x7fffffffffffffff] u32=[0x0, 0xffffffff] s32=[0x80000000, 0x7fffffff] var_off=(0x7fffffff, 0x0) 19: R6_w=0x7fffffff 20: (95) exit from 19 to 21: R0=0x7fffffff R6=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x7ffffffe,var_off=(0x2; 0x7ffffffd)) R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 21: R0=0x7fffffff R6=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x7ffffffe,var_off=(0x2; 0x7ffffffd)) R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 21: (14) w6 -= 2147483632 ; R6_w=scalar(smin=umin=umin32=2,smax=umax=0xffffffff,smin32=0x80000012,smax32=14,var_off=(0x2; 0xfffffffd)) 22: (76) if w6 s>= 0xe goto pc+1 ; R6_w=scalar(smin=umin=umin32=2,smax=umax=0xffffffff,smin32=0x80000012,smax32=13,var_off=(0x2; 0xfffffffd)) 23: (95) exit from 22 to 24: R0=0x7fffffff R6_w=14 R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 24: R0=0x7fffffff R6_w=14 R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 24: (14) w6 -= 14 ; R6_w=0 [...] What can be seen here is a register invariant violation on line 19. After the binary-or in line 18, the verifier knows that bit 2 is set but knows nothing about the rest of the content which was loaded from a map value, meaning, range is [2,0x7fffffff] with var_off=(0x2; 0x7ffffffd). When in line 19 the verifier analyzes the branch, it splits the register states in reg_set_min_max() into the registers of the true branch (true_reg1, true_reg2) and the registers of the false branch (false_reg1, false_reg2). Since the test is w6 != 0x7ffffffd, the src_reg is a known constant. Internally, the verifier creates a "fake" register initialized as scalar to the value of 0x7ffffffd, and then passes it onto reg_set_min_max(). Now, for line 19, it is mathematically impossible to take the false branch of this program, yet the verifier analyzes it. It is impossible because the second bit of r6 will be set due to the prior or operation and the constant in the condition has that bit unset (hex(fd) == binary(1111 1101). When the verifier first analyzes the false / fall-through branch, it will compute an intersection between the var_off of r6 and of the constant. This is because the verifier creates a "fake" register initialized to the value of the constant. The intersection result later refines both registers in regs_refine_cond_op(): [...] t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); reg2->var_off = tnum_with_subreg(reg2->var_off, t); [...] Since the verifier is analyzing the false branch of the conditional jump, reg1 is equal to false_reg1 and reg2 is equal to false_reg2, i.e. the reg2 is the "fake" register that was meant to hold a constant value. The resulting var_off of the intersection says that both registers now hold a known value of var_off=(0x7fffffff, 0x0) or in other words: this operation manages to make the verifier think that the "constant" value that was passed in the jump operation now holds a different value. Normally this would not be an issue since it should not influence the true branch, however, false_reg2 and true_reg2 are pointers to the same "fake" register. Meaning, the false branch can influence the results of the true branch. In line 24, the verifier assumes R6_w=0, but the actual runtime value in this case is 1. The fix is simply not passing in the same "fake" register location as inputs to reg_set_min_max(), but instead making a copy. Moving the fake_reg into the env also reduces stack consumption by 120 bytes. With this, the verifier successfully rejects invalid accesses from the test program. [0] https://github.com/google/buzzer Fixes: 67420501e868 ("bpf: generalize reg_set_min_max() to handle non-const register comparisons") Reported-by: Juan José López Jaimez Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/r/20240613115310.25383-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 50aa87f8d77f..e4070fb02b11 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -746,6 +746,8 @@ struct bpf_verifier_env { /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; u64 prev_log_pos, prev_insn_print_pos; + /* buffer used to temporary hold constants as scalar registers */ + struct bpf_reg_state fake_reg[2]; /* buffer used to generate temporary string representations, * e.g., in reg_type_str() to generate reg_type string */ -- cgit v1.2.3 From 9a95c5bfbf02a0a7f5983280fe284a0ff0836c34 Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Tue, 7 May 2024 01:25:41 +0000 Subject: ima: Avoid blocking in RCU read-side critical section A panic happens in ima_match_policy: BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 PGD 42f873067 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 5 PID: 1286325 Comm: kubeletmonit.sh Kdump: loaded Tainted: P Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 RIP: 0010:ima_match_policy+0x84/0x450 Code: 49 89 fc 41 89 cf 31 ed 89 44 24 14 eb 1c 44 39 7b 18 74 26 41 83 ff 05 74 20 48 8b 1b 48 3b 1d f2 b9 f4 00 0f 84 9c 01 00 00 <44> 85 73 10 74 ea 44 8b 6b 14 41 f6 c5 01 75 d4 41 f6 c5 02 74 0f RSP: 0018:ff71570009e07a80 EFLAGS: 00010207 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000200 RDX: ffffffffad8dc7c0 RSI: 0000000024924925 RDI: ff3e27850dea2000 RBP: 0000000000000000 R08: 0000000000000000 R09: ffffffffabfce739 R10: ff3e27810cc42400 R11: 0000000000000000 R12: ff3e2781825ef970 R13: 00000000ff3e2785 R14: 000000000000000c R15: 0000000000000001 FS: 00007f5195b51740(0000) GS:ff3e278b12d40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000000626d24002 CR4: 0000000000361ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ima_get_action+0x22/0x30 process_measurement+0xb0/0x830 ? page_add_file_rmap+0x15/0x170 ? alloc_set_pte+0x269/0x4c0 ? prep_new_page+0x81/0x140 ? simple_xattr_get+0x75/0xa0 ? selinux_file_open+0x9d/0xf0 ima_file_check+0x64/0x90 path_openat+0x571/0x1720 do_filp_open+0x9b/0x110 ? page_counter_try_charge+0x57/0xc0 ? files_cgroup_alloc_fd+0x38/0x60 ? __alloc_fd+0xd4/0x250 ? do_sys_open+0x1bd/0x250 do_sys_open+0x1bd/0x250 do_syscall_64+0x5d/0x1d0 entry_SYSCALL_64_after_hwframe+0x65/0xca Commit c7423dbdbc9e ("ima: Handle -ESTALE returned by ima_filter_rule_match()") introduced call to ima_lsm_copy_rule within a RCU read-side critical section which contains kmalloc with GFP_KERNEL. This implies a possible sleep and violates limitations of RCU read-side critical sections on non-PREEMPT systems. Sleeping within RCU read-side critical section might cause synchronize_rcu() returning early and break RCU protection, allowing a UAF to happen. The root cause of this issue could be described as follows: | Thread A | Thread B | | |ima_match_policy | | | rcu_read_lock | |ima_lsm_update_rule | | | synchronize_rcu | | | | kmalloc(GFP_KERNEL)| | | sleep | ==> synchronize_rcu returns early | kfree(entry) | | | | entry = entry->next| ==> UAF happens and entry now becomes NULL (or could be anything). | | entry->action | ==> Accessing entry might cause panic. To fix this issue, we are converting all kmalloc that is called within RCU read-side critical section to use GFP_ATOMIC. Fixes: c7423dbdbc9e ("ima: Handle -ESTALE returned by ima_filter_rule_match()") Cc: stable@vger.kernel.org Signed-off-by: GUO Zihua Acked-by: John Johansen Reviewed-by: Mimi Zohar Reviewed-by: Casey Schaufler [PM: fixed missing comment, long lines, !CONFIG_IMA_LSM_RULES case] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index f804b76cde44..44488b1ab9a9 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -413,7 +413,7 @@ LSM_HOOK(void, LSM_RET_VOID, key_post_create_or_update, struct key *keyring, #ifdef CONFIG_AUDIT LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr, - void **lsmrule) + void **lsmrule, gfp_t gfp) LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule) LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule) LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) diff --git a/include/linux/security.h b/include/linux/security.h index 21cf70346b33..de3af33e6ff5 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2048,7 +2048,8 @@ static inline void security_key_post_create_or_update(struct key *keyring, #ifdef CONFIG_AUDIT #ifdef CONFIG_SECURITY -int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule); +int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, + gfp_t gfp); int security_audit_rule_known(struct audit_krule *krule); int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule); void security_audit_rule_free(void *lsmrule); @@ -2056,7 +2057,7 @@ void security_audit_rule_free(void *lsmrule); #else static inline int security_audit_rule_init(u32 field, u32 op, char *rulestr, - void **lsmrule) + void **lsmrule, gfp_t gfp) { return 0; } -- cgit v1.2.3 From 79a947bd54348d4f63120aacc30505b3ad81fa59 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 6 Jun 2024 20:52:02 +0200 Subject: ACPI: NUMA: Consolidate header includes The header file acpi/acpi_numa.h is included whether CONFIG_ACPI is defined or not. Include it only once before the #ifdef/#else/#endif preprocessor directives and fix the following make includecheck warning: acpi/acpi_numa.h is included more than once Signed-off-by: Thorsten Blum Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 28c3fb2bef0d..bb18e7bf8826 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -24,6 +24,7 @@ struct irq_domain_ops; #define _LINUX #endif #include +#include #ifdef CONFIG_ACPI @@ -35,7 +36,6 @@ struct irq_domain_ops; #include #include -#include #include #include @@ -777,8 +777,6 @@ const char *acpi_get_subsystem_id(acpi_handle handle); #define acpi_dev_uid_match(adev, uid2) (adev && false) #define acpi_dev_hid_uid_match(adev, hid2, uid2) (adev && false) -#include - struct fwnode_handle; static inline bool acpi_dev_found(const char *hid) -- cgit v1.2.3 From d6161b7d8b33aa8756ab2f7eed8cb371c2e7e8ed Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 12 Jun 2024 15:42:29 +0200 Subject: video/logo: Remove linux_serial_image comments The last user of the serial_console ASCII image was removed in v2.1.115. Signed-off-by: Geert Uytterhoeven Signed-off-by: Helge Deller --- include/linux/linux_logo.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/linux_logo.h b/include/linux/linux_logo.h index d4d5b93efe84..e37699b7e839 100644 --- a/include/linux/linux_logo.h +++ b/include/linux/linux_logo.h @@ -10,9 +10,6 @@ * Copyright (C) 2001 Greg Banks * Copyright (C) 2001 Jan-Benedict Glaw * Copyright (C) 2003 Geert Uytterhoeven - * - * Serial_console ascii image can be any size, - * but should contain %s to display the version */ #include -- cgit v1.2.3 From f4a1254f2a076afb0edd473589bf40f9b4d36b41 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 14 Jun 2024 01:04:29 +0100 Subject: io_uring: fix cancellation overwriting req->flags Only the current owner of a request is allowed to write into req->flags. Hence, the cancellation path should never touch it. Add a new field instead of the flag, move it into the 3rd cache line because it should always be initialised. poll_refs can move further as polling is an involved process anyway. It's a minimal patch, in the future we can and should find a better place for it and remove now unused REQ_F_CANCEL_SEQ. Fixes: 521223d7c229f ("io_uring/cancel: don't default to setting req->work.cancel_seq") Cc: stable@vger.kernel.org Reported-by: Li Shi Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6827b129f8f0ad76fa9d1f0a773de938b240ffab.1718323430.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7a6b190c7da7..b48570eaa449 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -648,7 +648,7 @@ struct io_kiocb { struct io_rsrc_node *rsrc_node; atomic_t refs; - atomic_t poll_refs; + bool cancel_seq_set; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; @@ -657,6 +657,7 @@ struct io_kiocb { /* opcode allocated if it needs to store data for async defer */ void *async_data; /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ + atomic_t poll_refs; struct io_kiocb *link; /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; -- cgit v1.2.3 From 6b0d3355e5a58fd73529fa6f930a877caca3f75d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 26 May 2024 20:17:16 +0200 Subject: leds: class: Add flag to avoid automatic renaming of LED devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a mechanism for drivers to opt-out of the automatic device renaming on conflicts. Those drivers will provide their own conflict resolution. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20240526-cros_ec-kbd-led-framework-v3-2-ee577415a521@weissschuh.net Signed-off-by: Lee Jones --- include/linux/leds.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 6300313c46b7..36663ac6c58a 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -107,6 +107,7 @@ struct led_classdev { #define LED_BRIGHT_HW_CHANGED BIT(21) #define LED_RETAIN_AT_SHUTDOWN BIT(22) #define LED_INIT_DEFAULT_TRIGGER BIT(23) +#define LED_REJECT_NAME_CONFLICT BIT(24) /* set_brightness_work / blink_timer flags, atomic, private. */ unsigned long work_flags; -- cgit v1.2.3 From 146a06a0d225cae240065233fd168fb0b95a10ff Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 8 Jun 2024 11:01:13 +0200 Subject: HID: rename struct hid_bpf_ops into hid_ops Those operations are the ones from HID, not HID-BPF, and I'd like to reuse hid_bpf_ops as the user facing struct_ops API. Link: https://lore.kernel.org/r/20240608-hid_bpf_struct_ops-v3-1-6ac6ade58329@kernel.org Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index eec2592dec12..a66103618e6e 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -97,7 +97,7 @@ enum hid_bpf_prog_type { struct hid_report_enum; -struct hid_bpf_ops { +struct hid_ops { struct hid_report *(*hid_get_report)(struct hid_report_enum *report_enum, const u8 *data); int (*hid_hw_raw_request)(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, @@ -110,7 +110,7 @@ struct hid_bpf_ops { const struct bus_type *bus_type; }; -extern struct hid_bpf_ops *hid_bpf_ops; +extern struct hid_ops *hid_ops; struct hid_bpf_prog_list { u16 prog_idx[HID_BPF_MAX_PROGS_PER_DEV]; -- cgit v1.2.3 From ebc0d8093e8c97de459615438edefad1a4ac352c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 8 Jun 2024 11:01:15 +0200 Subject: HID: bpf: implement HID-BPF through bpf_struct_ops We do this implementation in several steps to not have the CI failing: - first (this patch), we add struct_ops while keeping the existing infra available - then we change the selftests, the examples and the existing in-tree HID-BPF programs - then we remove the existing trace points making old HID-BPF obsolete There are a few advantages of struct_ops over tracing: - compatibility with sleepable programs (for hid_hw_raw_request() in a later patch) - a lot simpler in the kernel: it's a simple rcu protected list - we can add more parameters to the function called without much trouble - the "attach" is now generic through BPF-core: the caller just needs to set hid_id and flags before calling __load(). - all the BPF tough part is not handled in BPF-core through generic processing - hid_bpf_ctx is now only writable where it needs be Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20240608-hid_bpf_struct_ops-v3-3-6ac6ade58329@kernel.org Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 61 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index a66103618e6e..c4f4ce10b7dd 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -65,11 +65,12 @@ struct hid_bpf_ctx { * @HID_BPF_FLAG_INSERT_HEAD: insert the given program before any other program * currently attached to the device. This doesn't * guarantee that this program will always be first - * @HID_BPF_FLAG_MAX: sentinel value, not to be used by the callers */ enum hid_bpf_attach_flags { HID_BPF_FLAG_NONE = 0, HID_BPF_FLAG_INSERT_HEAD = _BITUL(0), + + /* private: internal use only */ HID_BPF_FLAG_MAX, }; @@ -112,6 +113,60 @@ struct hid_ops { extern struct hid_ops *hid_ops; +/** + * struct hid_bpf_ops - A BPF struct_ops of callbacks allowing to attach HID-BPF + * programs to a HID device + * @hid_id: the HID uniq ID to attach to. This is writeable before ``load()``, and + * cannot be changed after + * @flags: flags used while attaching the struct_ops to the device. Currently only + * available value is %0 or ``BPF_F_BEFORE``. + * Writeable only before ``load()`` + */ +struct hid_bpf_ops { + /* hid_id needs to stay first so we can easily change it + * from userspace. + */ + int hid_id; + u32 flags; + + /* private: do not show up in the docs */ + struct list_head list; + + /* public: rest should show up in the docs */ + + /** + * @hid_device_event: called whenever an event is coming in from the device + * + * It has the following arguments: + * + * ``ctx``: The HID-BPF context as &struct hid_bpf_ctx + * + * Return: %0 on success and keep processing; a positive + * value to change the incoming size buffer; a negative + * error code to interrupt the processing of this event + * + * Context: Interrupt context. + */ + int (*hid_device_event)(struct hid_bpf_ctx *ctx, enum hid_report_type report_type); + + /** + * @hid_rdesc_fixup: called when the probe function parses the report descriptor + * of the HID device + * + * It has the following arguments: + * + * ``ctx``: The HID-BPF context as &struct hid_bpf_ctx + * + * Return: %0 on success and keep processing; a positive + * value to change the incoming size buffer; a negative + * error code to interrupt the processing of this device + */ + int (*hid_rdesc_fixup)(struct hid_bpf_ctx *ctx); + + /* private: do not show up in the docs */ + struct hid_device *hdev; +}; + struct hid_bpf_prog_list { u16 prog_idx[HID_BPF_MAX_PROGS_PER_DEV]; u8 prog_cnt; @@ -129,6 +184,10 @@ struct hid_bpf { bool destroyed; /* prevents the assignment of any progs */ spinlock_t progs_lock; /* protects RCU update of progs */ + + struct hid_bpf_ops *rdesc_ops; + struct list_head prog_list; + struct mutex prog_list_lock; /* protects prog_list update */ }; /* specific HID-BPF link when a program is attached to a device */ -- cgit v1.2.3 From 4a86220e046da009bef0948e9f51d1d26d68f93c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 8 Jun 2024 11:01:20 +0200 Subject: HID: bpf: remove tracing HID-BPF capability We can now rely on struct_ops as we cleared the users in-tree. Link: https://lore.kernel.org/r/20240608-hid_bpf_struct_ops-v3-8-6ac6ade58329@kernel.org Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 54 +------------------------------------------------ 1 file changed, 1 insertion(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index c4f4ce10b7dd..447b94aa99ab 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -4,7 +4,7 @@ #define __HID_BPF_H #include -#include +#include #include struct hid_device; @@ -24,11 +24,7 @@ struct hid_device; * * All of these fields are currently read-only. * - * @index: program index in the jump table. No special meaning (a smaller index - * doesn't mean the program will be executed before another program with - * a bigger index). * @hid: the ``struct hid_device`` representing the device itself - * @report_type: used for ``hid_bpf_device_event()`` * @allocated_size: Allocated size of data. * * This is how much memory is available and can be requested @@ -47,54 +43,21 @@ struct hid_device; * @retval: Return value of the previous program. */ struct hid_bpf_ctx { - __u32 index; const struct hid_device *hid; __u32 allocated_size; - enum hid_report_type report_type; union { __s32 retval; __s32 size; }; }; -/** - * enum hid_bpf_attach_flags - flags used when attaching a HIF-BPF program - * - * @HID_BPF_FLAG_NONE: no specific flag is used, the kernel choses where to - * insert the program - * @HID_BPF_FLAG_INSERT_HEAD: insert the given program before any other program - * currently attached to the device. This doesn't - * guarantee that this program will always be first - */ -enum hid_bpf_attach_flags { - HID_BPF_FLAG_NONE = 0, - HID_BPF_FLAG_INSERT_HEAD = _BITUL(0), - - /* private: internal use only */ - HID_BPF_FLAG_MAX, -}; - -/* Following functions are tracepoints that BPF programs can attach to */ -int hid_bpf_device_event(struct hid_bpf_ctx *ctx); -int hid_bpf_rdesc_fixup(struct hid_bpf_ctx *ctx); - /* * Below is HID internal */ -/* internal function to call eBPF programs, not to be used by anybody */ -int __hid_bpf_tail_call(struct hid_bpf_ctx *ctx); - #define HID_BPF_MAX_PROGS_PER_DEV 64 #define HID_BPF_FLAG_MASK (((HID_BPF_FLAG_MAX - 1) << 1) - 1) -/* types of HID programs to attach to */ -enum hid_bpf_prog_type { - HID_BPF_PROG_TYPE_UNDEF = -1, - HID_BPF_PROG_TYPE_DEVICE_EVENT, /* an event is emitted from the device */ - HID_BPF_PROG_TYPE_RDESC_FIXUP, - HID_BPF_PROG_TYPE_MAX, -}; struct hid_report_enum; @@ -167,11 +130,6 @@ struct hid_bpf_ops { struct hid_device *hdev; }; -struct hid_bpf_prog_list { - u16 prog_idx[HID_BPF_MAX_PROGS_PER_DEV]; - u8 prog_cnt; -}; - /* stored in each device */ struct hid_bpf { u8 *device_data; /* allocated when a bpf program of type @@ -179,23 +137,13 @@ struct hid_bpf { * to this HID device */ u32 allocated_data; - - struct hid_bpf_prog_list __rcu *progs[HID_BPF_PROG_TYPE_MAX]; /* attached BPF progs */ bool destroyed; /* prevents the assignment of any progs */ - spinlock_t progs_lock; /* protects RCU update of progs */ - struct hid_bpf_ops *rdesc_ops; struct list_head prog_list; struct mutex prog_list_lock; /* protects prog_list update */ }; -/* specific HID-BPF link when a program is attached to a device */ -struct hid_bpf_link { - struct bpf_link link; - int hid_table_index; -}; - #ifdef CONFIG_HID_BPF u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 *size, int interrupt); -- cgit v1.2.3 From c5958697a5fa29d3ba9332205a88725afe9ed912 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 8 Jun 2024 11:01:22 +0200 Subject: Documentation: HID: amend HID-BPF for struct_ops Now that we are using struct_ops, the docs need to be changed. Link: https://lore.kernel.org/r/20240608-hid_bpf_struct_ops-v3-10-6ac6ade58329@kernel.org Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index 447b94aa99ab..1b4cc1b2c31d 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -20,11 +20,9 @@ struct hid_device; * struct hid_bpf_ctx - User accessible data for all HID programs * * ``data`` is not directly accessible from the context. We need to issue - * a call to ``hid_bpf_get_data()`` in order to get a pointer to that field. + * a call to hid_bpf_get_data() in order to get a pointer to that field. * - * All of these fields are currently read-only. - * - * @hid: the ``struct hid_device`` representing the device itself + * @hid: the &struct hid_device representing the device itself * @allocated_size: Allocated size of data. * * This is how much memory is available and can be requested @@ -41,6 +39,8 @@ struct hid_device; * ``size`` must always be less or equal than ``allocated_size`` (it is enforced * once all BPF programs have been run). * @retval: Return value of the previous program. + * + * ``hid`` and ``allocated_size`` are read-only, ``size`` and ``retval`` are read-write. */ struct hid_bpf_ctx { const struct hid_device *hid; -- cgit v1.2.3 From 33c0fb85b571b0f1bbdbf466e770eebeb29e6f41 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 8 Jun 2024 11:01:28 +0200 Subject: HID: bpf: make part of struct hid_device writable It is useful to change the name, the phys and/or the uniq of a struct hid_device during .rdesc_fixup(). For example, hid-uclogic.ko changes the uniq to store the firmware version to differentiate between 2 devices sharing the same PID. In the same way, changing the device name is useful when the device export 3 nodes, all with the same name. Link: https://lore.kernel.org/r/20240608-hid_bpf_struct_ops-v3-16-6ac6ade58329@kernel.org Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index 1b4cc1b2c31d..65d7e0acc8c2 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -43,7 +43,7 @@ struct hid_device; * ``hid`` and ``allocated_size`` are read-only, ``size`` and ``retval`` are read-write. */ struct hid_bpf_ctx { - const struct hid_device *hid; + struct hid_device *hid; __u32 allocated_size; union { __s32 retval; -- cgit v1.2.3 From 5e5f2f92cccc29f356422d3cbc104f7f42430f22 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 14 Jun 2024 02:43:39 +0300 Subject: platform: arm64: add Lenovo Yoga C630 WOS EC driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lenovo Yoga C630 WOS is a laptop using Snapdragon 850 SoC. Like many laptops it uses an embedded controller (EC) to perform various platform operations, including, but not limited, to Type-C port control or power supply handlng. Add the driver for the EC, that creates devices for UCSI and power supply devices. Reviewed-by: Bryan O'Donoghue Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20240614-yoga-ec-driver-v7-2-9f0b9b40ae76@linaro.org [ij: added #include ] Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/lenovo-yoga-c630.h | 44 ++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 include/linux/platform_data/lenovo-yoga-c630.h (limited to 'include/linux') diff --git a/include/linux/platform_data/lenovo-yoga-c630.h b/include/linux/platform_data/lenovo-yoga-c630.h new file mode 100644 index 000000000000..5d1f9fb33cfc --- /dev/null +++ b/include/linux/platform_data/lenovo-yoga-c630.h @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022-2024, Linaro Ltd + * Authors: + * Bjorn Andersson + * Dmitry Baryshkov + */ + +#ifndef _LENOVO_YOGA_C630_DATA_H +#define _LENOVO_YOGA_C630_DATA_H + +struct yoga_c630_ec; +struct notifier_block; + +#define YOGA_C630_MOD_NAME "lenovo_yoga_c630" + +#define YOGA_C630_DEV_UCSI "ucsi" +#define YOGA_C630_DEV_PSY "psy" + +int yoga_c630_ec_read8(struct yoga_c630_ec *ec, u8 addr); +int yoga_c630_ec_read16(struct yoga_c630_ec *ec, u8 addr); + +int yoga_c630_ec_register_notify(struct yoga_c630_ec *ec, struct notifier_block *nb); +void yoga_c630_ec_unregister_notify(struct yoga_c630_ec *ec, struct notifier_block *nb); + +#define YOGA_C630_UCSI_WRITE_SIZE 8 +#define YOGA_C630_UCSI_CCI_SIZE 4 +#define YOGA_C630_UCSI_DATA_SIZE 16 +#define YOGA_C630_UCSI_READ_SIZE (YOGA_C630_UCSI_CCI_SIZE + YOGA_C630_UCSI_DATA_SIZE) + +u16 yoga_c630_ec_ucsi_get_version(struct yoga_c630_ec *ec); +int yoga_c630_ec_ucsi_write(struct yoga_c630_ec *ec, + const u8 req[YOGA_C630_UCSI_WRITE_SIZE]); +int yoga_c630_ec_ucsi_read(struct yoga_c630_ec *ec, + u8 resp[YOGA_C630_UCSI_READ_SIZE]); + +#define LENOVO_EC_EVENT_USB 0x20 +#define LENOVO_EC_EVENT_UCSI 0x21 +#define LENOVO_EC_EVENT_HPD 0x22 +#define LENOVO_EC_EVENT_BAT_STATUS 0x24 +#define LENOVO_EC_EVENT_BAT_INFO 0x25 +#define LENOVO_EC_EVENT_BAT_ADPT_STATUS 0x37 + +#endif -- cgit v1.2.3 From 1652b0bafeaa8281ca9a805d81e13d7647bd2f44 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:08 +0200 Subject: block: remove unused queue limits API Remove all APIs that are unused now that sd and sr have been converted to the atomic queue limits API. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: John Garry Reviewed-by: Nitesh Shetty Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-14-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 24c36929920b..ad995e7a7698 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -332,8 +332,6 @@ struct queue_limits { typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); -void disk_set_zoned(struct gendisk *disk); - #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); @@ -638,18 +636,6 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) return sector >> ilog2(disk->queue->limits.chunk_sectors); } -static inline void disk_set_max_open_zones(struct gendisk *disk, - unsigned int max_open_zones) -{ - disk->queue->limits.max_open_zones = max_open_zones; -} - -static inline void disk_set_max_active_zones(struct gendisk *disk, - unsigned int max_active_zones) -{ - disk->queue->limits.max_active_zones = max_active_zones; -} - static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { return bdev->bd_disk->queue->limits.max_open_zones; @@ -929,24 +915,14 @@ static inline void queue_limits_cancel_update(struct request_queue *q) /* * Access functions for manipulating queue properties */ -extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); void blk_queue_max_secure_erase_sectors(struct request_queue *q, unsigned int max_sectors); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, unsigned int max_write_same_sectors); -extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); -extern void blk_queue_max_zone_append_sectors(struct request_queue *q, - unsigned int max_zone_append_sectors); -extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); -void blk_queue_zone_write_granularity(struct request_queue *q, - unsigned int size); -extern void blk_queue_alignment_offset(struct request_queue *q, - unsigned int alignment); void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); -extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); -- cgit v1.2.3 From 73e3715ed14844067c5c598e72777641004a7f60 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:09 +0200 Subject: block: add special APIs for run-time disabling of discard and friends A few drivers optimistically try to support discard, write zeroes and secure erase and disable the features from the I/O completion handler if the hardware can't support them. This disable can't be done using the atomic queue limits API because the I/O completion handlers can't take sleeping locks or freeze the queue. Keep the existing clearing of the relevant field to zero, but replace the old blk_queue_max_* APIs with new disable APIs that force the value to 0. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: John Garry Reviewed-by: Nitesh Shetty Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-15-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ad995e7a7698..ac8e0cb2353a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -912,15 +912,31 @@ static inline void queue_limits_cancel_update(struct request_queue *q) mutex_unlock(&q->limits_lock); } +/* + * These helpers are for drivers that have sloppy feature negotiation and might + * have to disable DISCARD, WRITE_ZEROES or SECURE_DISCARD from the I/O + * completion handler when the device returned an indicator that the respective + * feature is not actually supported. They are racy and the driver needs to + * cope with that. Try to avoid this scheme if you can. + */ +static inline void blk_queue_disable_discard(struct request_queue *q) +{ + q->limits.max_discard_sectors = 0; +} + +static inline void blk_queue_disable_secure_erase(struct request_queue *q) +{ + q->limits.max_secure_erase_sectors = 0; +} + +static inline void blk_queue_disable_write_zeroes(struct request_queue *q) +{ + q->limits.max_write_zeroes_sectors = 0; +} + /* * Access functions for manipulating queue properties */ -void blk_queue_max_secure_erase_sectors(struct request_queue *q, - unsigned int max_sectors); -extern void blk_queue_max_discard_sectors(struct request_queue *q, - unsigned int max_discard_sectors); -extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, - unsigned int max_write_same_sectors); void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); -- cgit v1.2.3 From e9f5f44ad3725335d9c559c3c22cd3726152a7b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:15 +0200 Subject: block: remove the blk_integrity_profile structure Block layer integrity configuration is a bit complex right now, as it indirects through operation vectors for a simple two-dimensional configuration: a) the checksum type of none, ip checksum, crc, crc64 b) the presence or absence of a reference tag Remove the integrity profile, and instead add a separate csum_type flag which replaces the existing ip-checksum field and a new flag that indicates the presence of the reference tag. This removes up to two layers of indirect calls, remove the need to offload the no-op verification of non-PI metadata to a workqueue and generally simplifies the code. The downside is that block/t10-pi.c now has to be built into the kernel when CONFIG_BLK_DEV_INTEGRITY is supported. Given that both nvme and SCSI require t10-pi.ko, it is loaded for all usual configurations that enabled CONFIG_BLK_DEV_INTEGRITY already, though. Signed-off-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 35 ++++++++++------------------------- include/linux/blkdev.h | 9 ++++++++- include/linux/t10-pi.h | 8 -------- 3 files changed, 18 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 7428cb43952d..56ce1ae35580 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -10,7 +10,7 @@ enum blk_integrity_flags { BLK_INTEGRITY_VERIFY = 1 << 0, BLK_INTEGRITY_GENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, - BLK_INTEGRITY_IP_CHECKSUM = 1 << 3, + BLK_INTEGRITY_REF_TAG = 1 << 3, }; struct blk_integrity_iter { @@ -19,22 +19,10 @@ struct blk_integrity_iter { sector_t seed; unsigned int data_size; unsigned short interval; - unsigned char tuple_size; - unsigned char pi_offset; const char *disk_name; }; -typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); -typedef void (integrity_prepare_fn) (struct request *); -typedef void (integrity_complete_fn) (struct request *, unsigned int); - -struct blk_integrity_profile { - integrity_processing_fn *generate_fn; - integrity_processing_fn *verify_fn; - integrity_prepare_fn *prepare_fn; - integrity_complete_fn *complete_fn; - const char *name; -}; +const char *blk_integrity_profile_name(struct blk_integrity *bi); #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_integrity_register(struct gendisk *, struct blk_integrity *); @@ -44,14 +32,17 @@ int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); -static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +static inline bool +blk_integrity_queue_supports_integrity(struct request_queue *q) { - struct blk_integrity *bi = &disk->queue->integrity; + return q->integrity.tuple_size; +} - if (!bi->profile) +static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +{ + if (!blk_integrity_queue_supports_integrity(disk->queue)) return NULL; - - return bi; + return &disk->queue->integrity; } static inline struct blk_integrity * @@ -60,12 +51,6 @@ bdev_get_integrity(struct block_device *bdev) return blk_get_integrity(bdev->bd_disk); } -static inline bool -blk_integrity_queue_supports_integrity(struct request_queue *q) -{ - return q->integrity.profile; -} - static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ac8e0cb2353a..bdd33388e1ce 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -105,9 +105,16 @@ enum { struct disk_events; struct badblocks; +enum blk_integrity_checksum { + BLK_INTEGRITY_CSUM_NONE = 0, + BLK_INTEGRITY_CSUM_IP = 1, + BLK_INTEGRITY_CSUM_CRC = 2, + BLK_INTEGRITY_CSUM_CRC64 = 3, +} __packed ; + struct blk_integrity { - const struct blk_integrity_profile *profile; unsigned char flags; + enum blk_integrity_checksum csum_type; unsigned char tuple_size; unsigned char pi_offset; unsigned char interval_exp; diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 248f4ac95642..d2bafb76badf 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -48,11 +48,6 @@ static inline u32 t10_pi_ref_tag(struct request *rq) return blk_rq_pos(rq) >> (shift - SECTOR_SHIFT) & 0xffffffff; } -extern const struct blk_integrity_profile t10_pi_type1_crc; -extern const struct blk_integrity_profile t10_pi_type1_ip; -extern const struct blk_integrity_profile t10_pi_type3_crc; -extern const struct blk_integrity_profile t10_pi_type3_ip; - struct crc64_pi_tuple { __be64 guard_tag; __be16 app_tag; @@ -79,7 +74,4 @@ static inline u64 ext_pi_ref_tag(struct request *rq) return lower_48_bits(blk_rq_pos(rq) >> (shift - SECTOR_SHIFT)); } -extern const struct blk_integrity_profile ext_pi_type1_crc64; -extern const struct blk_integrity_profile ext_pi_type3_crc64; - #endif -- cgit v1.2.3 From 3c3e85ddffae93eba1a257eb6939bf5dc1e93b9e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:20 +0200 Subject: block: bypass the STABLE_WRITES flag for protection information Currently registering a checksum-enabled (aka PI) integrity profile sets the QUEUE_FLAG_STABLE_WRITE flag, and unregistering it clears the flag. This can incorrectly clear the flag when the driver requires stable writes even without PI, e.g. in case of iSCSI or NVMe/TCP with data digest enabled. Fix this by looking at the csum_type directly in bdev_stable_writes and not setting the queue flag. Also remove the blk_queue_stable_writes helper as the only user in nvme wants to only look at the actual QUEUE_FLAG_STABLE_WRITE flag as it inherits the integrity configuration by other means. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240613084839.1044015-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bdd33388e1ce..f9089750919c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,8 +571,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) -#define blk_queue_stable_writes(q) \ - test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_zone_resetall(q) \ @@ -1300,8 +1298,14 @@ static inline bool bdev_synchronous(struct block_device *bdev) static inline bool bdev_stable_writes(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_STABLE_WRITES, - &bdev_get_queue(bdev)->queue_flags); + struct request_queue *q = bdev_get_queue(bdev); + +#ifdef CONFIG_BLK_DEV_INTEGRITY + /* BLK_INTEGRITY_CSUM_NONE is not available in blkdev.h */ + if (q->integrity.csum_type != 0) + return true; +#endif + return test_bit(QUEUE_FLAG_STABLE_WRITES, &q->queue_flags); } static inline bool bdev_write_cache(struct block_device *bdev) -- cgit v1.2.3 From 9f4aa46f2a7401025d8561495cf8740f773310fc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:21 +0200 Subject: block: invert the BLK_INTEGRITY_{GENERATE,VERIFY} flags Invert the flags so that user set values will be able to persist revalidating the integrity information once we switch the integrity information to queue_limits. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-12-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 56ce1ae35580..bafa01d4e7f9 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -7,8 +7,8 @@ struct request; enum blk_integrity_flags { - BLK_INTEGRITY_VERIFY = 1 << 0, - BLK_INTEGRITY_GENERATE = 1 << 1, + BLK_INTEGRITY_NOVERIFY = 1 << 0, + BLK_INTEGRITY_NOGENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, }; -- cgit v1.2.3 From c6e56cf6b2e79a463af21286ba951714ed20828c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:22 +0200 Subject: block: move integrity information into queue_limits Move the integrity information into the queue limits so that it can be set atomically with other queue limits, and that the sysfs changes to the read_verify and write_generate flags are properly synchronized. This also allows to provide a more useful helper to stack the integrity fields, although it still is separate from the main stacking function as not all stackable devices want to inherit the integrity settings. Even with that it greatly simplifies the code in md and dm. Note that the integrity field is moved as-is into the queue limits. While there are good arguments for removing the separate blk_integrity structure, this would cause a lot of churn and might better be done at a later time if desired. However the integrity field in the queue_limits structure is now unconditional so that various ifdefs can be avoided or replaced with IS_ENABLED(). Given that tiny size of it that seems like a worthwhile trade off. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-13-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 27 +++++++++++---------------- include/linux/blkdev.h | 12 ++++-------- include/linux/t10-pi.h | 12 ++---------- 3 files changed, 17 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index bafa01d4e7f9..d201140d77a3 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -11,6 +11,7 @@ enum blk_integrity_flags { BLK_INTEGRITY_NOGENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, + BLK_INTEGRITY_STACKED = 1 << 4, }; struct blk_integrity_iter { @@ -23,11 +24,15 @@ struct blk_integrity_iter { }; const char *blk_integrity_profile_name(struct blk_integrity *bi); +bool queue_limits_stack_integrity(struct queue_limits *t, + struct queue_limits *b); +static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, + struct block_device *bdev) +{ + return queue_limits_stack_integrity(t, &bdev->bd_disk->queue->limits); +} #ifdef CONFIG_BLK_DEV_INTEGRITY -void blk_integrity_register(struct gendisk *, struct blk_integrity *); -void blk_integrity_unregister(struct gendisk *); -int blk_integrity_compare(struct gendisk *, struct gendisk *); int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); @@ -35,14 +40,14 @@ int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) { - return q->integrity.tuple_size; + return q->limits.integrity.tuple_size; } static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { if (!blk_integrity_queue_supports_integrity(disk->queue)) return NULL; - return &disk->queue->integrity; + return &disk->queue->limits.integrity; } static inline struct blk_integrity * @@ -119,17 +124,6 @@ blk_integrity_queue_supports_integrity(struct request_queue *q) { return false; } -static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) -{ - return 0; -} -static inline void blk_integrity_register(struct gendisk *d, - struct blk_integrity *b) -{ -} -static inline void blk_integrity_unregister(struct gendisk *d) -{ -} static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { @@ -157,4 +151,5 @@ static inline struct bio_vec *rq_integrity_vec(struct request *rq) return NULL; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ + #endif /* _LINUX_BLK_INTEGRITY_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f9089750919c..0c247a716885 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -334,6 +334,8 @@ struct queue_limits { * due to possible offsets. */ unsigned int dma_alignment; + + struct blk_integrity integrity; }; typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, @@ -419,10 +421,6 @@ struct request_queue { struct queue_limits limits; -#ifdef CONFIG_BLK_DEV_INTEGRITY - struct blk_integrity integrity; -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - #ifdef CONFIG_PM struct device *dev; enum rpm_status rpm_status; @@ -1300,11 +1298,9 @@ static inline bool bdev_stable_writes(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -#ifdef CONFIG_BLK_DEV_INTEGRITY - /* BLK_INTEGRITY_CSUM_NONE is not available in blkdev.h */ - if (q->integrity.csum_type != 0) + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + q->limits.integrity.csum_type != BLK_INTEGRITY_CSUM_NONE) return true; -#endif return test_bit(QUEUE_FLAG_STABLE_WRITES, &q->queue_flags); } diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index d2bafb76badf..1773610010eb 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -39,12 +39,8 @@ struct t10_pi_tuple { static inline u32 t10_pi_ref_tag(struct request *rq) { - unsigned int shift = ilog2(queue_logical_block_size(rq->q)); + unsigned int shift = rq->q->limits.integrity.interval_exp; -#ifdef CONFIG_BLK_DEV_INTEGRITY - if (rq->q->integrity.interval_exp) - shift = rq->q->integrity.interval_exp; -#endif return blk_rq_pos(rq) >> (shift - SECTOR_SHIFT) & 0xffffffff; } @@ -65,12 +61,8 @@ static inline u64 lower_48_bits(u64 n) static inline u64 ext_pi_ref_tag(struct request *rq) { - unsigned int shift = ilog2(queue_logical_block_size(rq->q)); + unsigned int shift = rq->q->limits.integrity.interval_exp; -#ifdef CONFIG_BLK_DEV_INTEGRITY - if (rq->q->integrity.interval_exp) - shift = rq->q->integrity.interval_exp; -#endif return lower_48_bits(blk_rq_pos(rq) >> (shift - SECTOR_SHIFT)); } -- cgit v1.2.3 From 0a5d3258d7c97295a89d22e54733b54aacb62562 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 3 Jun 2024 22:23:15 -0700 Subject: compiler_types.h: Define __retain for __attribute__((__retain__)) Some code includes the __used macro to prevent functions and data from being optimized out. This macro implements __attribute__((__used__)), which operates at the compiler and IR-level, and so still allows a linker to remove objects intended to be kept. Compilers supporting __attribute__((__retain__)) can address this gap by setting the flag SHF_GNU_RETAIN on the section of a function/variable, indicating to the linker the object should be retained. This attribute is available since gcc 11, clang 13, and binutils 2.36. Provide a __retain macro implementing __attribute__((__retain__)), whose first user will be the '__bpf_kfunc' tag. [ Additional remark from discussion: Why is CONFIG_LTO_CLANG added here? The __used macro permits garbage collection at section level, so CLANG_LTO_CLANG without CONFIG_LD_DEAD_CODE_DATA_ELIMINATION should not change final section dynamics? The conditional guard was included to ensure consistent behaviour between __retain and other features forcing split sections. In particular, the same guard is used in vmlinux.lds.h to merge split sections where needed. For example, using __retain in LLVM builds without CONFIG_LTO was failing CI tests on kernel-patches/bpf because the kernel didn't boot properly. And in further testing, the kernel had no issues loading BPF kfunc modules with such split sections, so the module (partial) linking scripts were left alone. ] Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Cc: Yonghong Song Link: https://lore.kernel.org/bpf/ZlmGoT9KiYLZd91S@krava/T/ Link: https://lore.kernel.org/bpf/b31bca5a5e6765a0f32cc8c19b1d9cdbfaa822b5.1717477560.git.Tony.Ambardar@gmail.com --- include/linux/compiler_types.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 93600de3800b..f14c275950b5 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -143,6 +143,29 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __preserve_most #endif +/* + * Annotating a function/variable with __retain tells the compiler to place + * the object in its own section and set the flag SHF_GNU_RETAIN. This flag + * instructs the linker to retain the object during garbage-cleanup or LTO + * phases. + * + * Note that the __used macro is also used to prevent functions or data + * being optimized out, but operates at the compiler/IR-level and may still + * allow unintended removal of objects during linking. + * + * Optional: only supported since gcc >= 11, clang >= 13 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-retain-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#retain + */ +#if __has_attribute(__retain__) && \ + (defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || \ + defined(CONFIG_LTO_CLANG)) +# define __retain __attribute__((__retain__)) +#else +# define __retain +#endif + /* Compiler specific macros. */ #ifdef __clang__ #include -- cgit v1.2.3 From 7bdcedd5c8fb88e7176b93812b139eca5fe0aa46 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 3 Jun 2024 22:23:16 -0700 Subject: bpf: Harden __bpf_kfunc tag against linker kfunc removal BPF kfuncs are often not directly referenced and may be inadvertently removed by optimization steps during kernel builds, thus the __bpf_kfunc tag mitigates against this removal by including the __used macro. However, this macro alone does not prevent removal during linking, and may still yield build warnings (e.g. on mips64el): [...] LD vmlinux BTFIDS vmlinux WARN: resolve_btfids: unresolved symbol bpf_verify_pkcs7_signature WARN: resolve_btfids: unresolved symbol bpf_lookup_user_key WARN: resolve_btfids: unresolved symbol bpf_lookup_system_key WARN: resolve_btfids: unresolved symbol bpf_key_put WARN: resolve_btfids: unresolved symbol bpf_iter_task_next WARN: resolve_btfids: unresolved symbol bpf_iter_css_task_new WARN: resolve_btfids: unresolved symbol bpf_get_file_xattr WARN: resolve_btfids: unresolved symbol bpf_ct_insert_entry WARN: resolve_btfids: unresolved symbol bpf_cgroup_release WARN: resolve_btfids: unresolved symbol bpf_cgroup_from_id WARN: resolve_btfids: unresolved symbol bpf_cgroup_acquire WARN: resolve_btfids: unresolved symbol bpf_arena_free_pages NM System.map SORTTAB vmlinux OBJCOPY vmlinux.32 [...] Update the __bpf_kfunc tag to better guard against linker optimization by including the new __retain compiler macro, which fixes the warnings above. Verify the __retain macro with readelf by checking object flags for 'R': $ readelf -Wa kernel/trace/bpf_trace.o Section Headers: [Nr] Name Type Address Off Size ES Flg Lk Inf Al [...] [178] .text.bpf_key_put PROGBITS 00000000 6420 0050 00 AXR 0 0 8 [...] Key to Flags: [...] R (retain), D (mbind), p (processor specific) Fixes: 57e7c169cd6a ("bpf: Add __bpf_kfunc tag for marking kernel functions as kfuncs") Reported-by: kernel test robot Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Tested-by: Jiri Olsa Reviewed-by: Jiri Olsa Cc: Yonghong Song Closes: https://lore.kernel.org/r/202401211357.OCX9yllM-lkp@intel.com/ Link: https://lore.kernel.org/bpf/ZlmGoT9KiYLZd91S@krava/T/ Link: https://lore.kernel.org/bpf/e9c64e9b5c073dabd457ff45128aabcab7630098.1717477560.git.Tony.Ambardar@gmail.com --- include/linux/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index f9e56fd12a9f..7c3e40c3295e 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -82,7 +82,7 @@ * as to avoid issues such as the compiler inlining or eliding either a static * kfunc, or a global kfunc in an LTO build. */ -#define __bpf_kfunc __used noinline +#define __bpf_kfunc __used __retain noinline #define __bpf_kfunc_start_defs() \ __diag_push(); \ -- cgit v1.2.3 From 98d7ca374ba4b39e7535613d40e159f09ca14da2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 12 Jun 2024 18:38:13 -0700 Subject: bpf: Track delta between "linked" registers. Compilers can generate the code r1 = r2 r1 += 0x1 if r2 < 1000 goto ... use knowledge of r2 range in subsequent r1 operations So remember constant delta between r2 and r1 and update r1 after 'if' condition. Unfortunately LLVM still uses this pattern for loops with 'can_loop' construct: for (i = 0; i < 1000 && can_loop; i++) The "undo" pass was introduced in LLVM https://reviews.llvm.org/D121937 to prevent this optimization, but it cannot cover all cases. Instead of fighting middle end optimizer in BPF backend teach the verifier about this pattern. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240613013815.953-3-alexei.starovoitov@gmail.com --- include/linux/bpf_verifier.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 50aa87f8d77f..2b54e25d2364 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -73,7 +73,10 @@ enum bpf_iter_state { struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; - /* Fixed part of pointer offset, pointer types only */ + /* + * Fixed part of pointer offset, pointer types only. + * Or constant delta between "linked" scalars with the same ID. + */ s32 off; union { /* valid when type == PTR_TO_PACKET */ @@ -167,6 +170,13 @@ struct bpf_reg_state { * Similarly to dynptrs, we use ID to track "belonging" of a reference * to a specific instance of bpf_iter. */ + /* + * Upper bit of ID is used to remember relationship between "linked" + * registers. Example: + * r1 = r2; both will have r1->id == r2->id == N + * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->off == 10 + */ +#define BPF_ADD_CONST (1U << 31) u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned * from a pointer-cast helper, bpf_sk_fullsock() and -- cgit v1.2.3 From 894376385a2d80a96816449e4991587a9a5ef0dd Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Thu, 13 Jun 2024 13:20:33 +0000 Subject: KVM: arm64: Add support for FFA_PARTITION_INFO_GET Handle the FFA_PARTITION_INFO_GET host call inside the pKVM hypervisor and copy the response message back to the host buffers. Signed-off-by: Sebastian Ene Reviewed-by: Sudeep Holla Tested-by: Sudeep Holla Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240613132035.1070360-3-sebastianene@google.com Signed-off-by: Oliver Upton --- include/linux/arm_ffa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index c82d56768101..c6d18f50f671 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -212,6 +212,9 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; } extern const struct bus_type ffa_bus_type; +/* The FF-A 1.0 partition structure lacks the uuid[4] */ +#define FFA_1_0_PARTITON_INFO_SZ (8) + /* FFA transport related */ struct ffa_partition_info { u16 id; -- cgit v1.2.3 From e575d3a6dd22123888defb622b1742aa2d45b942 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Fri, 14 Jun 2024 00:00:31 +0300 Subject: net/mlx5: Correct TASR typo into TSAR TSAR is the correct spelling (Transmit Scheduling ARbiter). Signed-off-by: Cosmin Ratiu Reviewed-by: Gal Pressman Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240613210036.1125203-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 17acd0f3ca8e..466dcda40bb5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3914,7 +3914,7 @@ enum { }; enum { - ELEMENT_TYPE_CAP_MASK_TASR = 1 << 0, + ELEMENT_TYPE_CAP_MASK_TSAR = 1 << 0, ELEMENT_TYPE_CAP_MASK_VPORT = 1 << 1, ELEMENT_TYPE_CAP_MASK_VPORT_TC = 1 << 2, ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC = 1 << 3, -- cgit v1.2.3 From 296eaab825060d0f25e89b2f85ab9fc26128cea8 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Fri, 14 Jun 2024 00:00:36 +0300 Subject: net/mlx5e: Support SWP-mode offload L4 csum calculation Calculate the pseudo-header checksum for both IPSec transport mode and IPSec tunnel mode for mlx5 devices that do not implement a pure hardware checksum offload for L4 checksum calculation. Introduce a capability bit that identifies such mlx5 devices. Signed-off-by: Rahul Rameshbabu Reviewed-by: Gal Pressman Reviewed-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240613210036.1125203-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 466dcda40bb5..66b921c81c0f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1093,7 +1093,8 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 tunnel_stateless_ip_over_ip_tx[0x1]; u8 reserved_at_2e[0x2]; u8 max_vxlan_udp_ports[0x8]; - u8 reserved_at_38[0x6]; + u8 swp_csum_l4_partial[0x1]; + u8 reserved_at_39[0x5]; u8 max_geneve_opt_len[0x1]; u8 tunnel_stateless_geneve_rx[0x1]; -- cgit v1.2.3 From 6c3282a6b296385bee2c383442c39f507b0d51dd Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 13 Jun 2024 11:36:06 +0100 Subject: net: stmmac: add select_pcs() platform method Allow platform drivers to provide their logic to select an appropriate PCS. Tested-by: Romain Gantois Reviewed-by: Romain Gantois Signed-off-by: Russell King (Oracle) Link: https://lore.kernel.org/r/E1sHhoM-00Fesu-8E@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 8f0f156d50d3..9c54f82901a1 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -13,7 +13,7 @@ #define __STMMAC_PLATFORM_DATA #include -#include +#include #define MTL_MAX_RX_QUEUES 8 #define MTL_MAX_TX_QUEUES 8 @@ -271,6 +271,8 @@ struct plat_stmmacenet_data { void (*dump_debug_regs)(void *priv); int (*pcs_init)(struct stmmac_priv *priv); void (*pcs_exit)(struct stmmac_priv *priv); + struct phylink_pcs *(*select_pcs)(struct stmmac_priv *priv, + phy_interface_t interface); void *bsp_priv; struct clk *stmmac_clk; struct clk *pclk; -- cgit v1.2.3 From 384a746bb55960aa5ffb3a67de08f11fc2f51042 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 5 Jun 2024 11:17:10 +0200 Subject: Revert "mm: init_mlocked_on_free_v3" There was insufficient review and no agreement that this is the right approach. There are serious flaws with the implementation that make processes using mlock() not even work with simple fork() [1] and we get reliable crashes when rebooting. Further, simply because we might be unmapping a single PTE of a large mlocked folio, we shouldn't zero out the whole folio. ... especially because the code can also *corrupt* urelated memory because kernel_init_pages(page, folio_nr_pages(folio)); Could end up writing outside of the actual folio if we work with a tail page. Let's revert it. Once there is agreement that this is the right approach, the issues were fixed and there was reasonable review and proper testing, we can consider it again. [1] https://lkml.kernel.org/r/4da9da2f-73e4-45fd-b62f-a8a513314057@redhat.com Link: https://lkml.kernel.org/r/20240605091710.38961-1-david@redhat.com Fixes: ba42b524a040 ("mm: init_mlocked_on_free_v3") Signed-off-by: David Hildenbrand Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/lkml/20240528151340.4282-1-00107082@163.com/ Reported-by: Lance Yang Closes: https://lkml.kernel.org/r/20240601140917.43562-1-ioworker0@gmail.com Acked-by: Lance Yang Cc: York Jasper Niebuhr Cc: Matthew Wilcox (Oracle) Cc: Kees Cook Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9849dfda44d4..9a5652c5fadd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3776,14 +3776,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, - &init_on_free); -} - -DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free); -static inline bool want_init_mlocked_on_free(void) -{ - return static_branch_maybe(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, - &init_mlocked_on_free); + &init_on_free); } extern bool _debug_pagealloc_enabled_early; -- cgit v1.2.3 From a273559e9eb68cb58c57803d76a1622b8324a878 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Jun 2024 16:38:40 -0700 Subject: lib/alloc_tag: fix RCU imbalance in pgalloc_tag_get() put_page_tag_ref() should be called only when get_page_tag_ref() returns a valid reference because only in that case get_page_tag_ref() enters RCU read section while put_page_tag_ref() will call rcu_read_unlock() even if the provided reference is NULL. Fix pgalloc_tag_get() which does not follow this rule causing RCU imbalance. Add a warning in put_page_tag_ref() to catch any future mistakes. Link: https://lkml.kernel.org/r/20240601233840.617458-1-surenb@google.com Fixes: cc92eba1c88b ("mm: fix non-compound multi-order memory accounting in __free_pages") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202405271029.6d2f9c4c-lkp@intel.com Acked-by: Vlastimil Babka Cc: Kent Overstreet Cc: Kees Cook Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 86ba5d33e43b..9cacadbd61f8 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -37,6 +37,9 @@ static inline union codetag_ref *get_page_tag_ref(struct page *page) static inline void put_page_tag_ref(union codetag_ref *ref) { + if (WARN_ON(!ref)) + return; + page_ext_put(page_ext_from_codetag_ref(ref)); } @@ -102,9 +105,11 @@ static inline struct alloc_tag *pgalloc_tag_get(struct page *page) union codetag_ref *ref = get_page_tag_ref(page); alloc_tag_sub_check(ref); - if (ref && ref->ct) - tag = ct_to_alloc_tag(ref->ct); - put_page_tag_ref(ref); + if (ref) { + if (ref->ct) + tag = ct_to_alloc_tag(ref->ct); + put_page_tag_ref(ref); + } } return tag; -- cgit v1.2.3 From 6a50c9b512f7734bc356f4bd47885a6f7c98491a Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Fri, 7 Jun 2024 17:40:48 +0800 Subject: mm: huge_memory: fix misused mapping_large_folio_support() for anon folios When I did a large folios split test, a WARNING "[ 5059.122759][ T166] Cannot split file folio to non-0 order" was triggered. But the test cases are only for anonmous folios. while mapping_large_folio_support() is only reasonable for page cache folios. In split_huge_page_to_list_to_order(), the folio passed to mapping_large_folio_support() maybe anonmous folio. The folio_test_anon() check is missing. So the split of the anonmous THP is failed. This is also the same for shmem_mapping(). We'd better add a check for both. But the shmem_mapping() in __split_huge_page() is not involved, as for anonmous folios, the end parameter is set to -1, so (head[i].index >= end) is always false. shmem_mapping() is not called. Also add a VM_WARN_ON_ONCE() in mapping_large_folio_support() for anon mapping, So we can detect the wrong use more easily. THP folios maybe exist in the pagecache even the file system doesn't support large folio, it is because when CONFIG_TRANSPARENT_HUGEPAGE is enabled, khugepaged will try to collapse read-only file-backed pages to THP. But the mapping does not actually support multi order large folios properly. Using /sys/kernel/debug/split_huge_pages to verify this, with this patch, large anon THP is successfully split and the warning is ceased. Link: https://lkml.kernel.org/r/202406071740485174hcFl7jRxncsHDtI-Pz-o@zte.com.cn Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Reviewed-by: Barry Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand Signed-off-by: Ran Xiaokai Cc: Michal Hocko Cc: xu xin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ee633712bba0..59f1df0cde5a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -381,6 +381,10 @@ static inline void mapping_set_large_folios(struct address_space *mapping) */ static inline bool mapping_large_folio_support(struct address_space *mapping) { + /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ + VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + "Anonymous mapping always supports large folio"); + return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } -- cgit v1.2.3 From 01c8f9806bde438ca1c8cbbc439f0a14a6694f6c Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Tue, 11 Jun 2024 15:32:29 +0200 Subject: kcov: don't lose track of remote references during softirqs In kcov_remote_start()/kcov_remote_stop(), we swap the previous KCOV metadata of the current task into a per-CPU variable. However, the kcov_mode_enabled(mode) check is not sufficient in the case of remote KCOV coverage: current->kcov_mode always remains KCOV_MODE_DISABLED for remote KCOV objects. If the original task that has invoked the KCOV_REMOTE_ENABLE ioctl happens to get interrupted and kcov_remote_start() is called, it ultimately leads to kcov_remote_stop() NOT restoring the original KCOV reference. So when the task exits, all registered remote KCOV handles remain active forever. The most uncomfortable effect (at least for syzkaller) is that the bug prevents the reuse of the same /sys/kernel/debug/kcov descriptor. If we obtain it in the parent process and then e.g. drop some capabilities and continuously fork to execute individual programs, at some point current->kcov of the forked process is lost, kcov_task_exit() takes no action, and all KCOV_REMOTE_ENABLE ioctls calls from subsequent forks fail. And, yes, the efficiency is also affected if we keep on losing remote kcov objects. a) kcov_remote_map keeps on growing forever. b) (If I'm not mistaken), we're also not freeing the memory referenced by kcov->area. Fix it by introducing a special kcov_mode that is assigned to the task that owns a KCOV remote object. It makes kcov_mode_enabled() return true and yet does not trigger coverage collection in __sanitizer_cov_trace_pc() and write_comp_data(). [nogikh@google.com: replace WRITE_ONCE() with an ordinary assignment] Link: https://lkml.kernel.org/r/20240614171221.2837584-1-nogikh@google.com Link: https://lkml.kernel.org/r/20240611133229.527822-1-nogikh@google.com Fixes: 5ff3b30ab57d ("kcov: collect coverage from interrupts") Signed-off-by: Aleksandr Nogikh Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Arnd Bergmann Cc: Marco Elver Cc: Signed-off-by: Andrew Morton --- include/linux/kcov.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kcov.h b/include/linux/kcov.h index b851ba415e03..3b479a3d235a 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -21,6 +21,8 @@ enum kcov_mode { KCOV_MODE_TRACE_PC = 2, /* Collecting comparison operands mode. */ KCOV_MODE_TRACE_CMP = 3, + /* The process owns a KCOV remote reference. */ + KCOV_MODE_REMOTE = 4, }; #define KCOV_IN_CTXSW (1 << 30) -- cgit v1.2.3 From 4604b3888f614a353c7afa17bdc8e7393bb1f9a1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Jun 2024 21:51:33 +0300 Subject: hwrng: core - Remove list.h from the hw_random.h The 'struct list' type is defined in types.h, no need to include list.h for that. Signed-off-by: Andy Shevchenko Signed-off-by: Herbert Xu --- include/linux/hw_random.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index 136e9842120e..b424555753b1 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -13,9 +13,8 @@ #define LINUX_HWRANDOM_H_ #include -#include -#include #include +#include /** * struct hwrng - Hardware Random Number Generator driver -- cgit v1.2.3 From 8043832e2a123fd9372007a29192f2f3ba328cd6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Fri, 14 Jun 2024 11:05:43 +0300 Subject: memblock: use numa_valid_node() helper to check for invalid node ID Introduce numa_valid_node(nid) that verifies that nid is a valid node ID and use that instead of comparing nid parameter with either NUMA_NO_NODE or MAX_NUMNODES. This makes the checks for valid node IDs consistent and more robust and allows to get rid of multiple WARNings. Suggested-by: Linus Torvalds Signed-off-by: Mike Rapoport (IBM) --- include/linux/numa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/numa.h b/include/linux/numa.h index 1d43371fafd2..eb19503604fe 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -15,6 +15,11 @@ #define NUMA_NO_NODE (-1) #define NUMA_NO_MEMBLK (-1) +static inline bool numa_valid_node(int nid) +{ + return nid >= 0 && nid < MAX_NUMNODES; +} + /* optionally keep NUMA memory info available post init */ #ifdef CONFIG_NUMA_KEEP_MEMINFO #define __initdata_or_meminfo -- cgit v1.2.3 From d98995b4bf981519dde4af0a081c393d62474039 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Jun 2024 13:26:37 +0300 Subject: net/mlx5: Reimplement write combining test The test of write combining was added before in mlx5_ib driver. It opens UD QP and posts NOP WQEs, and uses BlueFlame doorbell. When BlueFlame is used, WQEs get written directly to a PCI BAR of the device (in addition to memory) so that the device handles them without having to access memory. In this test, the WQEs written in memory are different from the ones written to the BlueFlame which request CQE update. By checking the completion reports posted on CQ, we can know if BlueFlame succeeds or not. The write combining must be supported if BlueFlame succeeds as its register is written using write combining. This patch reimplements the test in the same way, but using a pair of SQ and CQ only. It is moved to mlx5_core as a general feature used by both mlx5_core and mlx5_ib. Besides, save write combine test result of the PCI function, so that its thousands of child functions such as SF can query without paying the time and resource penalty by itself. The test function is called only after failing to get the cached result. With this enhancement, all thousands of SFs of the PF attached to same driver no longer need to perform WC check explicitly, which is already done in the system. This saves several commands per SF, thereby speeds up SF creation and also saves completion EQ creation. Signed-off-by: Jianbo Liu Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/4ff5a8cc4c5b5b0d98397baa45a5019bcdbf096e.1717409369.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 779cfdf2e9d6..0d31f77396fc 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -766,6 +766,12 @@ struct mlx5_hca_cap { u32 max[MLX5_UN_SZ_DW(hca_cap_union)]; }; +enum mlx5_wc_state { + MLX5_WC_STATE_UNINITIALIZED, + MLX5_WC_STATE_UNSUPPORTED, + MLX5_WC_STATE_SUPPORTED, +}; + struct mlx5_core_dev { struct device *device; enum mlx5_coredev_type coredev_type; @@ -824,6 +830,9 @@ struct mlx5_core_dev { #endif u64 num_ipsec_offloads; struct mlx5_sd *sd; + enum mlx5_wc_state wc_state; + /* sync write combining state */ + struct mutex wc_state_lock; }; struct mlx5_db { @@ -1375,4 +1384,6 @@ static inline bool mlx5_is_macsec_roce_supported(struct mlx5_core_dev *mdev) enum { MLX5_OCTWORD = 16, }; + +bool mlx5_wc_support_get(struct mlx5_core_dev *mdev); #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From b339e0a39dc37726712b9f0485d78fe4306d1667 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 13 Jun 2024 21:00:04 +0300 Subject: RDMA/mlx5: Add Qcounters req_transport_retries_exceeded/req_rnr_retries_exceeded The req_transport_retries_exceeded counter shows the number of times requester detected transport retries exceed error. The req_rnr_retries_exceeded counter show the number of times the requester detected RNR NAKs retries exceed error. Signed-off-by: Patrisious Haddad Link: https://lore.kernel.org/r/250466af94f4989d638fab168e246035530e912f.1718301543.git.leon@kernel.org Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5df52e15f7d6..09d9d87d62c6 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5629,7 +5629,11 @@ struct mlx5_ifc_query_q_counter_out_bits { u8 local_ack_timeout_err[0x20]; - u8 reserved_at_320[0xa0]; + u8 reserved_at_320[0x60]; + + u8 req_rnr_retries_exceeded[0x20]; + + u8 reserved_at_3a0[0x20]; u8 resp_local_length_error[0x20]; -- cgit v1.2.3 From 81cc927d9c5eefd4a1b08e16b0ab2263f36d03f7 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 23 May 2024 17:45:17 -0400 Subject: io_uring: Drop per-ctx dummy_ubuf Commit 19a63c402170 ("io_uring/rsrc: keep one global dummy_ubuf") replaced it with a global static object but this stayed behind. Fixes: 19a63c402170 ("io_uring/rsrc: keep one global dummy_ubuf") Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20240523214517.31803-1-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b48570eaa449..93c9044ec3fe 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -373,7 +373,6 @@ struct io_ring_ctx { struct io_restriction restrictions; /* slow path rsrc auxilary data, used by update/register */ - struct io_mapped_ubuf *dummy_ubuf; struct io_rsrc_data *file_data; struct io_rsrc_data *buf_data; -- cgit v1.2.3 From 200f3abd14db55f9aadcb74f4e7a678f1c469ba1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Jun 2024 11:51:19 -0600 Subject: io_uring/eventfd: move eventfd handling to separate file This is pretty nicely abstracted already, but let's move it to a separate file rather than have it in the main io_uring file. With that, we can also move the io_ev_fd struct and enum out of global scope. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 93c9044ec3fe..850e30be9322 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -211,14 +211,6 @@ struct io_submit_state { struct blk_plug plug; }; -struct io_ev_fd { - struct eventfd_ctx *cq_ev_fd; - unsigned int eventfd_async: 1; - struct rcu_head rcu; - atomic_t refs; - atomic_t ops; -}; - struct io_alloc_cache { void **entries; unsigned int nr_cached; -- cgit v1.2.3 From 3474d1b93f897ab33ce160e759afd47d5f412de4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Jun 2024 19:28:27 +0000 Subject: io_uring/io-wq: make io_wq_work flags atomic The work flags can be set/accessed from different tasks, both the originator of the request, and the io-wq workers. While modifications aren't concurrent, it still makes KMSAN unhappy. There's no real downside to just making the flag reading/manipulation use proper atomics here. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 850e30be9322..1052a68fd68d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -50,7 +50,7 @@ struct io_wq_work_list { struct io_wq_work { struct io_wq_work_node list; - unsigned flags; + atomic_t flags; /* place it here instead of io_kiocb as it fills padding and saves 4B */ int cancel_seq; }; -- cgit v1.2.3 From c3042a5403ef2be622023fcc3b11fc1aa08ba7fa Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 14 Jun 2024 09:03:44 +0000 Subject: block: Drop locking annotation for limits_lock Currently compiling block/blk-settings.c with C=1 gives the following warning: block/blk-settings.c:262:9: warning: context imbalance in 'queue_limits_commit_update' - wrong count at exit request_queue.limits_lock is a mutex. Sparse locking annotation for mutexes are currently not supported - see [0] - so drop that locking annotation. [0] https://lore.kernel.org/lkml/cover.1579893447.git.jbi.octave@gmail.com/T/#mbb8bda6c0a7ca7ce19f46df976a8e3b489745488 Fixes: d690cb8ae14bd ("block: add an API to atomically update queue limits") Signed-off-by: John Garry Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240614090345.655716-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0c247a716885..83af6ac5aa48 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -893,7 +893,6 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset, */ static inline struct queue_limits queue_limits_start_update(struct request_queue *q) - __acquires(q->limits_lock) { mutex_lock(&q->limits_lock); return q->limits; -- cgit v1.2.3 From 1c75adb22d49ca9389333ca5e6939052a7203111 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 13 Jun 2024 22:59:09 +0200 Subject: ASoC: SOF: mediatek: Constify struct mtk_adsp_ipc_ops 'struct mtk_adsp_ipc_ops' is not modified in these drivers. Constifying this structure moves some data to a read-only section, so increase overall security. In order to do it, "struct mtk_adsp_ipc" also needs to be adjusted to this new const qualifier. On a x86_64, with allmodconfig: Before: ====== text data bss dec hex filename 15533 2383 0 17916 45fc sound/soc/sof/mediatek/mt8195/mt8195.o After: ===== text data bss dec hex filename 15557 2367 0 17924 4604 sound/soc/sof/mediatek/mt8195/mt8195.o Signed-off-by: Christophe JAILLET Link: https://msgid.link/r/a45d6b2b5ec040ea0fc78fca662c2dca3f13a49f.1718312321.git.christophe.jaillet@wanadoo.fr Signed-off-by: Mark Brown --- include/linux/firmware/mediatek/mtk-adsp-ipc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firmware/mediatek/mtk-adsp-ipc.h b/include/linux/firmware/mediatek/mtk-adsp-ipc.h index 5b1d16fa3f56..6e86799a7dc4 100644 --- a/include/linux/firmware/mediatek/mtk-adsp-ipc.h +++ b/include/linux/firmware/mediatek/mtk-adsp-ipc.h @@ -40,7 +40,7 @@ struct mtk_adsp_chan { struct mtk_adsp_ipc { struct mtk_adsp_chan chans[MTK_ADSP_MBOX_NUM]; struct device *dev; - struct mtk_adsp_ipc_ops *ops; + const struct mtk_adsp_ipc_ops *ops; void *private_data; }; -- cgit v1.2.3 From f22b4b55edb507a2b30981e133b66b642be4d13f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 13 Jun 2024 14:33:16 -0700 Subject: net: make for_each_netdev_dump() a little more bug-proof I find the behavior of xa_for_each_start() slightly counter-intuitive. It doesn't end the iteration by making the index point after the last element. IOW calling xa_for_each_start() again after it "finished" will run the body of the loop for the last valid element, instead of doing nothing. This works fine for netlink dumps if they terminate correctly (i.e. coalesce or carefully handle NLM_DONE), but as we keep getting reminded legacy dumps are unlikely to go away. Fixing this generically at the xa_for_each_start() level seems hard - there is no index reserved for "end of iteration". ifindexes are 31b wide, tho, and iterator is ulong so for for_each_netdev_dump() it's safe to go to the next element. Signed-off-by: Jakub Kicinski Reviewed-by: Przemek Kitszel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f148a01dd1d1..85111502cf8f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3021,7 +3021,8 @@ int call_netdevice_notifiers_info(unsigned long val, #define net_device_entry(lh) list_entry(lh, struct net_device, dev_list) #define for_each_netdev_dump(net, d, ifindex) \ - xa_for_each_start(&(net)->dev_by_index, (ifindex), (d), (ifindex)) + for (; (d = xa_find(&(net)->dev_by_index, &ifindex, \ + ULONG_MAX, XA_PRESENT)); ifindex++) static inline struct net_device *next_net_device(struct net_device *dev) { -- cgit v1.2.3 From a43b9ec091b1b1924ea18883a715e5aadba2543e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 29 May 2024 10:19:20 -0700 Subject: peci, hwmon: Switch to new Intel CPU model defines Update peci subsystem to use the same vendor-family-model combined definition that core x86 code uses. Signed-off-by: Tony Luck Acked-by: Guenter Roeck Reviewed-by: Iwona Winiarska Link: https://lore.kernel.org/r/20240529171920.62571-1-tony.luck@intel.com Signed-off-by: Iwona Winiarska --- include/linux/peci-cpu.h | 24 ++++++++++++++++++++++++ include/linux/peci.h | 6 ++---- 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/peci-cpu.h b/include/linux/peci-cpu.h index ff8ae9c26c80..601cdd086bf6 100644 --- a/include/linux/peci-cpu.h +++ b/include/linux/peci-cpu.h @@ -6,6 +6,30 @@ #include +/* Copied from x86 */ +#define X86_VENDOR_INTEL 0 + +/* Copied from x86 */ +#define VFM_MODEL_BIT 0 +#define VFM_FAMILY_BIT 8 +#define VFM_VENDOR_BIT 16 +#define VFM_RSVD_BIT 24 + +#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT) +#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT) +#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT) + +#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT) +#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT) +#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT) + +#define VFM_MAKE(_vendor, _family, _model) ( \ + ((_model) << VFM_MODEL_BIT) | \ + ((_family) << VFM_FAMILY_BIT) | \ + ((_vendor) << VFM_VENDOR_BIT) \ +) +/* End of copied code */ + #include "../../arch/x86/include/asm/intel-family.h" #define PECI_PCS_PKG_ID 0 /* Package Identifier Read */ diff --git a/include/linux/peci.h b/include/linux/peci.h index 90e241458ef6..3e0bc37591d6 100644 --- a/include/linux/peci.h +++ b/include/linux/peci.h @@ -59,8 +59,7 @@ static inline struct peci_controller *to_peci_controller(void *d) * struct peci_device - PECI device * @dev: device object to register PECI device to the device model * @info: PECI device characteristics - * @info.family: device family - * @info.model: device model + * @info.x86_vfm: device vendor-family-model * @info.peci_revision: PECI revision supported by the PECI device * @info.socket_id: the socket ID represented by the PECI device * @addr: address used on the PECI bus connected to the parent controller @@ -73,8 +72,7 @@ static inline struct peci_controller *to_peci_controller(void *d) struct peci_device { struct device dev; struct { - u16 family; - u8 model; + u32 x86_vfm; u8 peci_revision; u8 socket_id; } info; -- cgit v1.2.3 From f45a6051d582f613f4abc383ae238661f7813302 Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Mon, 25 Mar 2024 18:38:10 +0200 Subject: cpu/hotplug: Fix typo in comment Signed-off-by: Costa Shulyupin Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240325163810.669459-1-costa.shul@redhat.com --- include/linux/cpuhotplug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 7a5785f405b6..7f6c820c12eb 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -27,7 +27,7 @@ * startup callbacks sequentially from CPUHP_OFFLINE + 1 to CPUHP_ONLINE * during a CPU online operation. During a CPU offline operation the * installed teardown callbacks are invoked in the reverse order from - * CPU_ONLINE - 1 down to CPUHP_OFFLINE. + * CPUHP_ONLINE - 1 down to CPUHP_OFFLINE. * * The state space has three sections: PREPARE, STARTING and ONLINE. * -- cgit v1.2.3 From 299d623f5c9ab48e53255cf6b510627f1ef26dfe Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 14 Jun 2024 19:32:03 +0200 Subject: irqdomain: Introduce irq_domain_instantiate() The existing irq_domain_add_*() functions used to instantiate an IRQ domain are wrappers built on top of __irq_domain_add() and describe the domain properties using a bunch of parameters. Adding more parameters and wrappers to hide new parameters in the existing code lead to more and more code without any relevant value and without any flexibility. Introduce irq_domain_instantiate() where the interrupt domain properties are given using a irq_domain_info structure instead of the bunch of parameters to allow flexibility and easy evolution. irq_domain_instantiate() performs the same operation as the one done by __irq_domain_add(). For compatibility reason with existing code, keep __irq_domain_add() but convert it to irq_domain_instantiate(). [ tglx: Fixed up struct initializer coding style ] Suggested-by: Thomas Gleixner Signed-off-by: Herve Codina Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240614173232.1184015-3-herve.codina@bootlin.com --- include/linux/irqdomain.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 21ecf582a0fe..ab8939c8724d 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -257,6 +257,27 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) } void irq_domain_free_fwnode(struct fwnode_handle *fwnode); +/** + * struct irq_domain_info - Domain information structure + * @fwnode: firmware node for the interrupt controller + * @size: Size of linear map; 0 for radix mapping only + * @hwirq_max: Maximum number of interrupts supported by controller + * @direct_max: Maximum value of direct maps; + * Use ~0 for no limit; 0 for no direct mapping + * @ops: Domain operation callbacks + * @host_data: Controller private data pointer + */ +struct irq_domain_info { + struct fwnode_handle *fwnode; + unsigned int size; + irq_hw_number_t hwirq_max; + int direct_max; + const struct irq_domain_ops *ops; + void *host_data; +}; + +struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info); + struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, -- cgit v1.2.3 From 922ac2cf9fe444c4aff165b9f7e158a9b651647d Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 14 Jun 2024 19:32:05 +0200 Subject: irqdomain: Constify parameter in is_fwnode_irqchip() The fwnode parameter has no reason to be a pointer to an un-const struct fwnode_handle. Indeed, struct fwnode_handle is not supposed to be modified by the function. Be consistent with other function performing the same kind of operation such as is_of_node(), is_acpi_device_node() or is_software_node(): constify the fwnode parameter. Signed-off-by: Herve Codina Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240614173232.1184015-5-herve.codina@bootlin.com --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index ab8939c8724d..a3b43e357009 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -314,7 +314,7 @@ static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) extern const struct fwnode_operations irqchip_fwnode_ops; -static inline bool is_fwnode_irqchip(struct fwnode_handle *fwnode) +static inline bool is_fwnode_irqchip(const struct fwnode_handle *fwnode) { return fwnode && fwnode->ops == &irqchip_fwnode_ops; } -- cgit v1.2.3 From 757398541c30a5e898169763b43f08dab71ea3bd Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 14 Jun 2024 19:32:08 +0200 Subject: irqdomain: Handle additional domain flags in irq_domain_instantiate() In order to use irq_domain_instantiate() from several places such as irq_domain_create_hierarchy(), irq_domain_instantiate() needs to handle additional domain flags. Add the required infrastructure. Suggested-by: Thomas Gleixner Signed-off-by: Herve Codina Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240614173232.1184015-8-herve.codina@bootlin.com --- include/linux/irqdomain.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index a3b43e357009..4683b66eded9 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -260,6 +260,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode); /** * struct irq_domain_info - Domain information structure * @fwnode: firmware node for the interrupt controller + * @domain_flags: Additional flags to add to the domain flags * @size: Size of linear map; 0 for radix mapping only * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; @@ -269,6 +270,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode); */ struct irq_domain_info { struct fwnode_handle *fwnode; + unsigned int domain_flags; unsigned int size; irq_hw_number_t hwirq_max; int direct_max; -- cgit v1.2.3 From 419e3778ff295c00aa158d9f2854a70b47ba1136 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 14 Jun 2024 19:32:09 +0200 Subject: irqdomain: Handle domain hierarchy parent in irq_domain_instantiate() To use irq_domain_instantiate() from irq_domain_create_hierarchy(), irq_domain_instantiate() needs to handle the domain hierarchy parent. Add the required functionality. Signed-off-by: Herve Codina Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240614173232.1184015-9-herve.codina@bootlin.com --- include/linux/irqdomain.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 4683b66eded9..e52fd5e5494c 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -276,6 +276,12 @@ struct irq_domain_info { int direct_max; const struct irq_domain_ops *ops; void *host_data; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + /** + * @parent: Pointer to the parent irq domain used in a hierarchy domain + */ + struct irq_domain *parent; +#endif }; struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info); -- cgit v1.2.3 From 0b21add71bd9cfa2bd6677a0300e15fd4c4b84ed Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 14 Jun 2024 19:32:12 +0200 Subject: irqdomain: Handle domain bus token in irq_domain_create() irq_domain_update_bus_token() is the only way to set the domain bus token. This is sub-optimal as irq_domain_update_bus_token() can be called only once the domain is created and needs to revert some operations, change the domain name and redo the operations. In order to avoid this revert/change/redo sequence, take the domain bus into account token during the domain creation. Suggested-by: Thomas Gleixner Signed-off-by: Herve Codina Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240614173232.1184015-12-herve.codina@bootlin.com --- include/linux/irqdomain.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e52fd5e5494c..52bed23e5c61 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -265,6 +265,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode); * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; * Use ~0 for no limit; 0 for no direct mapping + * @bus_token: Domain bus token * @ops: Domain operation callbacks * @host_data: Controller private data pointer */ @@ -274,6 +275,7 @@ struct irq_domain_info { unsigned int size; irq_hw_number_t hwirq_max; int direct_max; + enum irq_domain_bus_token bus_token; const struct irq_domain_ops *ops; void *host_data; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY -- cgit v1.2.3 From 44b68de9b8e3dfde12308e8567548799d7ded0de Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 14 Jun 2024 19:32:13 +0200 Subject: irqdomain: Introduce init() and exit() hooks The current API does not allow additional initialization before the domain is published. This can lead to a race condition between consumers and supplier as a domain can be available for consumers before being fully ready. Introduce the init() hook to allow additional initialization before plublishing the domain. Also introduce the exit() hook to revert operations done in init() on domain removal. Suggested-by: Thomas Gleixner Signed-off-by: Herve Codina Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240614173232.1184015-13-herve.codina@bootlin.com --- include/linux/irqdomain.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 52bed23e5c61..2c927edc4d43 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -141,6 +141,7 @@ struct irq_domain_chip_generic; * purposes related to the irq domain. * @parent: Pointer to parent irq_domain to support hierarchy irq_domains * @msi_parent_ops: Pointer to MSI parent domain methods for per device domain init + * @exit: Function called when the domain is destroyed * * Revmap data, used internally by the irq domain code: * @revmap_size: Size of the linear map table @revmap[] @@ -169,6 +170,7 @@ struct irq_domain { #ifdef CONFIG_GENERIC_MSI_IRQ const struct msi_parent_ops *msi_parent_ops; #endif + void (*exit)(struct irq_domain *d); /* reverse map data. The linear map gets appended to the irq_domain */ irq_hw_number_t hwirq_max; @@ -268,6 +270,10 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode); * @bus_token: Domain bus token * @ops: Domain operation callbacks * @host_data: Controller private data pointer + * @init: Function called when the domain is created. + * Allow to do some additional domain initialisation. + * @exit: Function called when the domain is destroyed. + * Allow to do some additional cleanup operation. */ struct irq_domain_info { struct fwnode_handle *fwnode; @@ -284,6 +290,8 @@ struct irq_domain_info { */ struct irq_domain *parent; #endif + int (*init)(struct irq_domain *d); + void (*exit)(struct irq_domain *d); }; struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info); -- cgit v1.2.3 From e25f553a92973eaf59ff3a00fe7f61ab01b2877f Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 14 Jun 2024 19:32:14 +0200 Subject: genirq/generic_chip: Introduce irq_domain_{alloc,remove}_generic_chips() The existing __irq_alloc_domain_generic_chips() uses a bunch of parameters to describe the generic chips that need to be allocated. Adding more parameters and wrappers to hide new parameters in the existing code leads to more and more code without any relevant values and without any flexibility. Introduce irq_domain_alloc_generic_chips() where the generic chips description is done using the irq_domain_chip_generic_info structure instead of the bunch of parameters to allow flexibility and easy evolution. Also introduce irq_domain_remove_generic_chips() to revert the operations done by irq_domain_alloc_generic_chips(). Suggested-by: Thomas Gleixner Signed-off-by: Herve Codina Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240614173232.1184015-14-herve.codina@bootlin.com --- include/linux/irq.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index a217e1029c1d..58264b236cbf 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1117,6 +1117,27 @@ struct irq_domain_chip_generic { struct irq_chip_generic *gc[]; }; +/** + * struct irq_domain_chip_generic_info - Generic chip information structure + * @name: Name of the generic interrupt chip + * @handler: Interrupt handler used by the generic interrupt chip + * @irqs_per_chip: Number of interrupts each chip handles (max 32) + * @num_ct: Number of irq_chip_type instances associated with each + * chip + * @irq_flags_to_clear: IRQ_* bits to clear in the mapping function + * @irq_flags_to_set: IRQ_* bits to set in the mapping function + * @gc_flags: Generic chip specific setup flags + */ +struct irq_domain_chip_generic_info { + const char *name; + irq_flow_handler_t handler; + unsigned int irqs_per_chip; + unsigned int num_ct; + unsigned int irq_flags_to_clear; + unsigned int irq_flags_to_set; + enum irq_gc_flags gc_flags; +}; + /* Generic chip callback functions */ void irq_gc_noop(struct irq_data *d); void irq_gc_mask_disable_reg(struct irq_data *d); @@ -1153,6 +1174,10 @@ int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); +int irq_domain_alloc_generic_chips(struct irq_domain *d, + const struct irq_domain_chip_generic_info *info); +void irq_domain_remove_generic_chips(struct irq_domain *d); + int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, int num_ct, const char *name, irq_flow_handler_t handler, -- cgit v1.2.3 From fea922ee9f8ffd3c2a8e8dfbc68de42905a3982a Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 14 Jun 2024 19:32:15 +0200 Subject: genirq/generic_chip: Introduce init() and exit() hooks Most of generic chip drivers need to perform some more additional initializations on the generic chips allocated before they can be fully ready. These additional initializations need to be performed before the IRQ domain is published to avoid a race condition between IRQ consumers and suppliers. Introduce the init() hook to perform these initializations at the right place just after the generic chip creation. Also introduce the exit() hook to allow reverting operations done by the init() hook just before the generic chip is destroyed. Suggested-by: Thomas Gleixner Signed-off-by: Herve Codina Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240614173232.1184015-15-herve.codina@bootlin.com --- include/linux/irq.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 58264b236cbf..712bcee56efc 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1106,6 +1106,7 @@ enum irq_gc_flags { * @irq_flags_to_set: IRQ* flags to set on irq setup * @irq_flags_to_clear: IRQ* flags to clear on irq setup * @gc_flags: Generic chip specific setup flags + * @exit: Function called on each chip when they are destroyed. * @gc: Array of pointers to generic interrupt chips */ struct irq_domain_chip_generic { @@ -1114,6 +1115,7 @@ struct irq_domain_chip_generic { unsigned int irq_flags_to_clear; unsigned int irq_flags_to_set; enum irq_gc_flags gc_flags; + void (*exit)(struct irq_chip_generic *gc); struct irq_chip_generic *gc[]; }; @@ -1127,6 +1129,10 @@ struct irq_domain_chip_generic { * @irq_flags_to_clear: IRQ_* bits to clear in the mapping function * @irq_flags_to_set: IRQ_* bits to set in the mapping function * @gc_flags: Generic chip specific setup flags + * @init: Function called on each chip when they are created. + * Allow to do some additional chip initialisation. + * @exit: Function called on each chip when they are destroyed. + * Allow to do some chip cleanup operation. */ struct irq_domain_chip_generic_info { const char *name; @@ -1136,6 +1142,8 @@ struct irq_domain_chip_generic_info { unsigned int irq_flags_to_clear; unsigned int irq_flags_to_set; enum irq_gc_flags gc_flags; + int (*init)(struct irq_chip_generic *gc); + void (*exit)(struct irq_chip_generic *gc); }; /* Generic chip callback functions */ -- cgit v1.2.3 From e6f67ce32e8e6dcbadf42dc435fbc9002cabf1f9 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 14 Jun 2024 19:32:16 +0200 Subject: irqdomain: Add support for generic irq chips creation before publishing a domain The current API functions create an irq_domain and also publish this newly created to domain. Once an irq_domain is published, consumers can request IRQ in order to use them. Some interrupt controller drivers have to perform some more operations with the created irq_domain in order to have it ready to be used. For instance: - Allocate generic irq chips with irq_alloc_domain_generic_chips() - Retrieve the generic irq chips with irq_get_domain_generic_chip() - Initialize retrieved chips: set register base address and offsets, set several hooks such as irq_mask, irq_unmask, ... With the newly introduced irq_domain_alloc_generic_chips(), an interrupt controller driver can use the irq_domain_chip_generic_info structure and set the init() hook to perform its generic chips initialization. In order to avoid a window where the domain is published but not yet ready to be used, handle the generic chip creation (i.e the irq_domain_alloc_generic_chips() call) before the domain is published. Suggested-by: Thomas Gleixner Signed-off-by: Herve Codina Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240614173232.1184015-16-herve.codina@bootlin.com --- include/linux/irqdomain.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 2c927edc4d43..5540b22a755c 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -210,6 +210,9 @@ enum { /* Irq domain is a MSI device domain */ IRQ_DOMAIN_FLAG_MSI_DEVICE = (1 << 9), + /* Irq domain must destroy generic chips when removed */ + IRQ_DOMAIN_FLAG_DESTROY_GC = (1 << 10), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the @@ -259,6 +262,9 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) } void irq_domain_free_fwnode(struct fwnode_handle *fwnode); + +struct irq_domain_chip_generic_info; + /** * struct irq_domain_info - Domain information structure * @fwnode: firmware node for the interrupt controller @@ -270,6 +276,8 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode); * @bus_token: Domain bus token * @ops: Domain operation callbacks * @host_data: Controller private data pointer + * @dgc_info: Geneneric chip information structure pointer used to + * create generic chips for the domain if not NULL. * @init: Function called when the domain is created. * Allow to do some additional domain initialisation. * @exit: Function called when the domain is destroyed. @@ -290,6 +298,7 @@ struct irq_domain_info { */ struct irq_domain *parent; #endif + struct irq_domain_chip_generic_info *dgc_info; int (*init)(struct irq_domain *d); void (*exit)(struct irq_domain *d); }; -- cgit v1.2.3 From 0c5b29a6dc7b463b6072da8cef43800008728ff3 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 14 Jun 2024 19:32:17 +0200 Subject: irqdomain: Add a resource managed version of irq_domain_instantiate() Add a devres version of irq_domain_instantiate(). Signed-off-by: Herve Codina Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240614173232.1184015-17-herve.codina@bootlin.com --- include/linux/irqdomain.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 5540b22a755c..8820317582c4 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -304,6 +304,8 @@ struct irq_domain_info { }; struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info); +struct irq_domain *devm_irq_domain_instantiate(struct device *dev, + const struct irq_domain_info *info); struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, irq_hw_number_t hwirq_max, int direct_max, -- cgit v1.2.3 From 7c53626cd11820a11f9cb2c54c02d47fc062a265 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 14 Jun 2024 19:32:18 +0200 Subject: irqdomain: Convert __irq_domain_add() wrappers to irq_domain_instantiate() __irq_domain_add() wrappers use directly __irq_domain_add(). With the introduction of irq_domain_instantiate(), __irq_domain_add() becomes obsolete. In order to fully remove __irq_domain_add(), convert wrappers to irq_domain_instantiate() [ tglx: Fixup struct initializers ] Signed-off-by: Herve Codina Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240614173232.1184015-18-herve.codina@bootlin.com --- include/linux/irqdomain.h | 58 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 8820317582c4..33a968fbdda2 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -400,7 +400,17 @@ static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_no const struct irq_domain_ops *ops, void *host_data) { - return __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data); + struct irq_domain_info info = { + .fwnode = of_node_to_fwnode(of_node), + .size = size, + .hwirq_max = size, + .ops = ops, + .host_data = host_data, + }; + struct irq_domain *d; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? NULL : d; } #ifdef CONFIG_IRQ_DOMAIN_NOMAP @@ -409,7 +419,17 @@ static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod const struct irq_domain_ops *ops, void *host_data) { - return __irq_domain_add(of_node_to_fwnode(of_node), 0, max_irq, max_irq, ops, host_data); + struct irq_domain_info info = { + .fwnode = of_node_to_fwnode(of_node), + .hwirq_max = max_irq, + .direct_max = max_irq, + .ops = ops, + .host_data = host_data, + }; + struct irq_domain *d; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? NULL : d; } extern unsigned int irq_create_direct_mapping(struct irq_domain *host); @@ -419,7 +439,16 @@ static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node const struct irq_domain_ops *ops, void *host_data) { - return __irq_domain_add(of_node_to_fwnode(of_node), 0, ~0, 0, ops, host_data); + struct irq_domain_info info = { + .fwnode = of_node_to_fwnode(of_node), + .hwirq_max = ~0U, + .ops = ops, + .host_data = host_data, + }; + struct irq_domain *d; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? NULL : d; } static inline struct irq_domain *irq_domain_create_linear(struct fwnode_handle *fwnode, @@ -427,14 +456,33 @@ static inline struct irq_domain *irq_domain_create_linear(struct fwnode_handle * const struct irq_domain_ops *ops, void *host_data) { - return __irq_domain_add(fwnode, size, size, 0, ops, host_data); + struct irq_domain_info info = { + .fwnode = fwnode, + .size = size, + .hwirq_max = size, + .ops = ops, + .host_data = host_data, + }; + struct irq_domain *d; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? NULL : d; } static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fwnode, const struct irq_domain_ops *ops, void *host_data) { - return __irq_domain_add(fwnode, 0, ~0, 0, ops, host_data); + struct irq_domain_info info = { + .fwnode = fwnode, + .hwirq_max = ~0, + .ops = ops, + .host_data = host_data, + }; + struct irq_domain *d; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? NULL : d; } extern void irq_domain_remove(struct irq_domain *host); -- cgit v1.2.3 From 0b4b172b760efabf8a77ea17644d333fbb444d39 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 14 Jun 2024 19:32:21 +0200 Subject: irqdomain: Remove __irq_domain_add() __irq_domain_add() has been replaced by irq_domain_instanciate() and so, it is no more used. Simply remove it. Signed-off-by: Herve Codina Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240614173232.1184015-21-herve.codina@bootlin.com --- include/linux/irqdomain.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 33a968fbdda2..02cd486ac354 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -184,7 +184,7 @@ enum { /* Irq domain is hierarchical */ IRQ_DOMAIN_FLAG_HIERARCHY = (1 << 0), - /* Irq domain name was allocated in __irq_domain_add() */ + /* Irq domain name was allocated internally */ IRQ_DOMAIN_NAME_ALLOCATED = (1 << 1), /* Irq domain is an IPI domain with virq per cpu */ @@ -307,10 +307,6 @@ struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info); struct irq_domain *devm_irq_domain_instantiate(struct device *dev, const struct irq_domain_info *info); -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, - irq_hw_number_t hwirq_max, int direct_max, - const struct irq_domain_ops *ops, - void *host_data); struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, -- cgit v1.2.3 From 1037e4c53e851682ff8d1ab656567a4d5a333c93 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 14 Jun 2024 12:58:48 +0300 Subject: cpu/hotplug: Add support for declaring CPU offlining not supported The ACPI MADT mailbox wakeup method doesn't allow to offline a CPU after it has been woken up. Currently, offlining is prevented based on the confidential computing attribute which is set for Intel TDX. But TDX is not the only possible user of the wake up method. The MADT wakeup can be implemented outside of a confidential computing environment. Offline support is a property of the wakeup method, not the CoCo implementation. Introduce cpu_hotplug_disable_offlining() that can be called to indicate that CPU offlining should be disabled. This function is going to replace CC_ATTR_HOTPLUG_DISABLED for ACPI MADT wakeup method. Signed-off-by: Kirill A. Shutemov Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Thomas Gleixner Tested-by: Tao Liu Link: https://lore.kernel.org/r/20240614095904.1345461-4-kirill.shutemov@linux.intel.com --- include/linux/cpuhplock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuhplock.h b/include/linux/cpuhplock.h index 431560bbd045..f7aa20f62b87 100644 --- a/include/linux/cpuhplock.h +++ b/include/linux/cpuhplock.h @@ -21,6 +21,7 @@ void cpus_read_lock(void); void cpus_read_unlock(void); int cpus_read_trylock(void); void lockdep_assert_cpus_held(void); +void cpu_hotplug_disable_offlining(void); void cpu_hotplug_disable(void); void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); @@ -36,6 +37,7 @@ static inline void cpus_read_lock(void) { } static inline void cpus_read_unlock(void) { } static inline int cpus_read_trylock(void) { return true; } static inline void lockdep_assert_cpus_held(void) { } +static inline void cpu_hotplug_disable_offlining(void) { } static inline void cpu_hotplug_disable(void) { } static inline void cpu_hotplug_enable(void) { } static inline int remove_cpu(unsigned int cpu) { return -EPERM; } -- cgit v1.2.3 From 66e48e491d1e1a0f243ebfcb9639b23de1a5db5e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 14 Jun 2024 12:58:49 +0300 Subject: cpu/hotplug, x86/acpi: Disable CPU offlining for ACPI MADT wakeup ACPI MADT doesn't allow to offline a CPU after it has been woken up. Currently, CPU hotplug is prevented based on the confidential computing attribute which is set for Intel TDX. But TDX is not the only possible user of the wake up method. Any platform that uses ACPI MADT wakeup method cannot offline CPU. Disable CPU offlining on ACPI MADT wakeup enumeration. This has no visible effects for users: currently, TDX guest is the only platform that uses the ACPI MADT wakeup method. Signed-off-by: Kirill A. Shutemov Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Tested-by: Tao Liu Link: https://lore.kernel.org/r/20240614095904.1345461-5-kirill.shutemov@linux.intel.com --- include/linux/cc_platform.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index 60693a145894..caa4b4430634 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -81,16 +81,6 @@ enum cc_attr { */ CC_ATTR_GUEST_SEV_SNP, - /** - * @CC_ATTR_HOTPLUG_DISABLED: Hotplug is not supported or disabled. - * - * The platform/OS is running as a guest/virtual machine does not - * support CPU hotplug feature. - * - * Examples include TDX Guest. - */ - CC_ATTR_HOTPLUG_DISABLED, - /** * @CC_ATTR_HOST_SEV_SNP: AMD SNP enabled on the host. * -- cgit v1.2.3 From 614dc0fb76327dbd81abd4612fbc2e4ba8f205e6 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 5 Jun 2024 10:18:52 -0500 Subject: sev-guest: configfs-tsm: Allow the privlevel_floor attribute to be updated With the introduction of an SVSM, Linux will be running at a non-zero VMPL. Any request for an attestation report at a higher privilege VMPL than what Linux is currently running will result in an error. Allow for the privlevel_floor attribute to be updated dynamically. [ bp: Trim commit message. ] Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/5a736be9384aebd98a0b7c929660f8a97cbdc366.1717600736.git.thomas.lendacky@amd.com --- include/linux/tsm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tsm.h b/include/linux/tsm.h index de8324a2223c..50c5769657d8 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -54,7 +54,7 @@ struct tsm_report { */ struct tsm_ops { const char *name; - const unsigned int privlevel_floor; + unsigned int privlevel_floor; int (*report_new)(struct tsm_report *report, void *data); }; -- cgit v1.2.3 From 0e6a35b93745bb2d8b921fd0520ef730489d41a2 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 5 Jun 2024 10:18:53 -0500 Subject: fs/configfs: Add a callback to determine attribute visibility In order to support dynamic decisions as to whether an attribute should be created, add a callback that returns a bool to indicate whether the attribute should be displayed. If no callback is registered, the attribute is displayed by default. Co-developed-by: Dan Williams Signed-off-by: Dan Williams Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/e555c8740a263fab9f83b2cbb44da1af49a2813c.1717600736.git.thomas.lendacky@amd.com --- include/linux/configfs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 2606711adb18..c771e9d0d0b9 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -216,6 +216,9 @@ struct configfs_group_operations { struct config_group *(*make_group)(struct config_group *group, const char *name); void (*disconnect_notify)(struct config_group *group, struct config_item *item); void (*drop_item)(struct config_group *group, struct config_item *item); + bool (*is_visible)(struct config_item *item, struct configfs_attribute *attr, int n); + bool (*is_bin_visible)(struct config_item *item, struct configfs_bin_attribute *attr, + int n); }; struct configfs_subsystem { -- cgit v1.2.3 From 20dfee95936413708701eb151f419597fdd9d948 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 5 Jun 2024 10:18:54 -0500 Subject: x86/sev: Take advantage of configfs visibility support in TSM The TSM attestation report support provides multiple configfs attribute types (both for standard and binary attributes) to allow for additional attributes to be displayed for SNP as compared to TDX. With the ability to hide attributes via configfs, consolidate the multiple attribute groups into a single standard attribute group and a single binary attribute group. Modify the TDX support to hide the attributes that were previously "hidden" as a result of registering the selective attribute groups. Co-developed-by: Dan Williams Signed-off-by: Dan Williams Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://lore.kernel.org/r/8873c45d0c8abc35aaf01d7833a55788a6905727.1717600736.git.thomas.lendacky@amd.com --- include/linux/tsm.h | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tsm.h b/include/linux/tsm.h index 50c5769657d8..30d9d270b446 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -42,12 +42,40 @@ struct tsm_report { u8 *auxblob; }; +/** + * enum tsm_attr_index - index used to reference report attributes + * @TSM_REPORT_GENERATION: index of the report generation number attribute + * @TSM_REPORT_PROVIDER: index of the provider name attribute + * @TSM_REPORT_PRIVLEVEL: index of the desired privilege level attribute + * @TSM_REPORT_PRIVLEVEL_FLOOR: index of the minimum allowed privileg level attribute + */ +enum tsm_attr_index { + TSM_REPORT_GENERATION, + TSM_REPORT_PROVIDER, + TSM_REPORT_PRIVLEVEL, + TSM_REPORT_PRIVLEVEL_FLOOR, +}; + +/** + * enum tsm_bin_attr_index - index used to reference binary report attributes + * @TSM_REPORT_INBLOB: index of the binary report input attribute + * @TSM_REPORT_OUTBLOB: index of the binary report output attribute + * @TSM_REPORT_AUXBLOB: index of the binary auxiliary data attribute + */ +enum tsm_bin_attr_index { + TSM_REPORT_INBLOB, + TSM_REPORT_OUTBLOB, + TSM_REPORT_AUXBLOB, +}; + /** * struct tsm_ops - attributes and operations for tsm instances * @name: tsm id reflected in /sys/kernel/config/tsm/report/$report/provider * @privlevel_floor: convey base privlevel for nested scenarios * @report_new: Populate @report with the report blob and auxblob * (optional), return 0 on successful population, or -errno otherwise + * @report_attr_visible: show or hide a report attribute entry + * @report_bin_attr_visible: show or hide a report binary attribute entry * * Implementation specific ops, only one is expected to be registered at * a time i.e. only one of "sev-guest", "tdx-guest", etc. @@ -56,14 +84,10 @@ struct tsm_ops { const char *name; unsigned int privlevel_floor; int (*report_new)(struct tsm_report *report, void *data); + bool (*report_attr_visible)(int n); + bool (*report_bin_attr_visible)(int n); }; -extern const struct config_item_type tsm_report_default_type; - -/* publish @privlevel, @privlevel_floor, and @auxblob attributes */ -extern const struct config_item_type tsm_report_extra_type; - -int tsm_register(const struct tsm_ops *ops, void *priv, - const struct config_item_type *type); +int tsm_register(const struct tsm_ops *ops, void *priv); int tsm_unregister(const struct tsm_ops *ops); #endif /* __TSM_H */ -- cgit v1.2.3 From 627dc671518b7f004ce04c45e8711f8dca94a57c Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 5 Jun 2024 10:18:55 -0500 Subject: x86/sev: Extend the config-fs attestation support for an SVSM When an SVSM is present, the guest can also request attestation reports from it. These SVSM attestation reports can be used to attest the SVSM and any services running within the SVSM. Extend the config-fs attestation support to provide such. This involves creating four new config-fs attributes: - 'service-provider' (input) This attribute is used to determine whether the attestation request should be sent to the specified service provider or to the SEV firmware. The SVSM service provider is represented by the value 'svsm'. - 'service_guid' (input) Used for requesting the attestation of a single service within the service provider. A null GUID implies that the SVSM_ATTEST_SERVICES call should be used to request the attestation report. A non-null GUID implies that the SVSM_ATTEST_SINGLE_SERVICE call should be used. - 'service_manifest_version' (input) Used with the SVSM_ATTEST_SINGLE_SERVICE call, the service version represents a specific service manifest version be used for the attestation report. - 'manifestblob' (output) Used to return the service manifest associated with the attestation report. Only display these new attributes when running under an SVSM. [ bp: Massage. - s/svsm_attestation_call/svsm_attest_call/g ] Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/965015dce3c76bb8724839d50c5dea4e4b5d598f.1717600736.git.thomas.lendacky@amd.com --- include/linux/tsm.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tsm.h b/include/linux/tsm.h index 30d9d270b446..11b0c525be30 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -4,6 +4,7 @@ #include #include +#include #define TSM_INBLOB_MAX 64 #define TSM_OUTBLOB_MAX SZ_32K @@ -19,11 +20,17 @@ * @privlevel: optional privilege level to associate with @outblob * @inblob_len: sizeof @inblob * @inblob: arbitrary input data + * @service_provider: optional name of where to obtain the tsm report blob + * @service_guid: optional service-provider service guid to attest + * @service_manifest_version: optional service-provider service manifest version requested */ struct tsm_desc { unsigned int privlevel; size_t inblob_len; u8 inblob[TSM_INBLOB_MAX]; + char *service_provider; + guid_t service_guid; + unsigned int service_manifest_version; }; /** @@ -33,6 +40,8 @@ struct tsm_desc { * @outblob: generated evidence to provider to the attestation agent * @auxblob_len: sizeof(@auxblob) * @auxblob: (optional) auxiliary data to the report (e.g. certificate data) + * @manifestblob_len: sizeof(@manifestblob) + * @manifestblob: (optional) manifest data associated with the report */ struct tsm_report { struct tsm_desc desc; @@ -40,6 +49,8 @@ struct tsm_report { u8 *outblob; size_t auxblob_len; u8 *auxblob; + size_t manifestblob_len; + u8 *manifestblob; }; /** @@ -48,12 +59,18 @@ struct tsm_report { * @TSM_REPORT_PROVIDER: index of the provider name attribute * @TSM_REPORT_PRIVLEVEL: index of the desired privilege level attribute * @TSM_REPORT_PRIVLEVEL_FLOOR: index of the minimum allowed privileg level attribute + * @TSM_REPORT_SERVICE_PROVIDER: index of the service provider identifier attribute + * @TSM_REPORT_SERVICE_GUID: index of the service GUID attribute + * @TSM_REPORT_SERVICE_MANIFEST_VER: index of the service manifest version attribute */ enum tsm_attr_index { TSM_REPORT_GENERATION, TSM_REPORT_PROVIDER, TSM_REPORT_PRIVLEVEL, TSM_REPORT_PRIVLEVEL_FLOOR, + TSM_REPORT_SERVICE_PROVIDER, + TSM_REPORT_SERVICE_GUID, + TSM_REPORT_SERVICE_MANIFEST_VER, }; /** @@ -61,11 +78,13 @@ enum tsm_attr_index { * @TSM_REPORT_INBLOB: index of the binary report input attribute * @TSM_REPORT_OUTBLOB: index of the binary report output attribute * @TSM_REPORT_AUXBLOB: index of the binary auxiliary data attribute + * @TSM_REPORT_MANIFESTBLOB: index of the binary manifest data attribute */ enum tsm_bin_attr_index { TSM_REPORT_INBLOB, TSM_REPORT_OUTBLOB, TSM_REPORT_AUXBLOB, + TSM_REPORT_MANIFESTBLOB, }; /** -- cgit v1.2.3 From 8cb2dbf94e44bcde4cff0223f2f900f8fb9083a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Jun 2024 20:31:31 +0200 Subject: irqdomain: Make build work for CONFIG_GENERIC_IRQ_CHIP=n ld: kernel/irq/irqdomain.o: in function `irq_domain_instantiate': kernel/irq/irqdomain.c:296:(.text+0x10dd): undefined reference to `irq_domain_alloc_generic_chips' ld: kernel/irq/irqdomain.c:313:(.text+0x1218): undefined reference to `irq_domain_remove_generic_chips' ld: kernel/irq/irqdomain.o: in function `irq_domain_remove': kernel/irq/irqdomain.c:349:(.text+0x1ddf): undefined reference to `irq_domain_remove_generic_chips' Provide the required stubs. Fixes: e6f67ce32e8e ("irqdomain: Add support for generic irq chips creation before publishing a domain") Reported-by: Borislav Betkov Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 712bcee56efc..1f5dbf1f92c9 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1182,9 +1182,19 @@ int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); +#ifdef CONFIG_GENERIC_IRQ_CHIP int irq_domain_alloc_generic_chips(struct irq_domain *d, const struct irq_domain_chip_generic_info *info); void irq_domain_remove_generic_chips(struct irq_domain *d); +#else +static inline int +irq_domain_alloc_generic_chips(struct irq_domain *d, + const struct irq_domain_chip_generic_info *info) +{ + return -EINVAL; +} +static inline void irq_domain_remove_generic_chips(struct irq_domain *d) { } +#endif /* CONFIG_GENERIC_IRQ_CHIP */ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, int num_ct, const char *name, -- cgit v1.2.3 From efb459303dd5dd6e198a0d58322dc04c3356dc23 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 12 Jun 2024 17:04:02 +0200 Subject: net: Move dev_set_hwtstamp_phylib to net/core/dev.h This declaration was added to the header to be called from ethtool. ethtool is separated from core for code organization but it is not really a separate entity, it controls very core things. As ethtool is an internal stuff it is not wise to have it in netdevice.h. Move the declaration to net/core/dev.h instead. Remove the EXPORT_SYMBOL_GPL call as ethtool can not be built as a module. Reviewed-by: Willem de Bruijn Signed-off-by: Kory Maincent Link: https://lore.kernel.org/r/20240612-feature_ptp_netnext-v15-2-b2a086257b63@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 85111502cf8f..c83b390191d4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3904,9 +3904,6 @@ int generic_hwtstamp_get_lower(struct net_device *dev, int generic_hwtstamp_set_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg, struct netlink_ext_ack *extack); -int dev_set_hwtstamp_phylib(struct net_device *dev, - struct kernel_hwtstamp_config *cfg, - struct netlink_ext_ack *extack); int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata); unsigned int dev_get_flags(const struct net_device *); int __dev_change_flags(struct net_device *dev, unsigned int flags, -- cgit v1.2.3 From 777b8afb8179155353ec14b1d8153122410aba29 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 15 Jun 2024 20:00:27 +0800 Subject: net: phy: introduce core support for phy-mode = "10g-qxgmii" 10G-QXGMII is a MAC-to-PHY interface defined by the USXGMII multiport specification. It uses the same signaling as USXGMII, but it multiplexes 4 ports over the link, resulting in a maximum speed of 2.5G per port. Some in-tree SoCs like the NXP LS1028A use "usxgmii" when they mean either the single-port USXGMII or the quad-port 10G-QXGMII variant, and they could get away just fine with that thus far. But there is a need to distinguish between the 2 as far as SerDes drivers are concerned. Signed-off-by: Vladimir Oltean Signed-off-by: Luo Jie Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Signed-off-by: Paolo Abeni --- include/linux/phy.h | 4 ++++ include/linux/phylink.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index e6e83304558e..205fccfc0f60 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -128,6 +128,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII * @PHY_INTERFACE_MODE_1000BASEKX: 1000Base-KX - with Clause 73 AN + * @PHY_INTERFACE_MODE_10G_QXGMII: 10G-QXGMII - 4 ports over 10G USXGMII * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY. @@ -168,6 +169,7 @@ typedef enum { PHY_INTERFACE_MODE_10GKR, PHY_INTERFACE_MODE_QUSGMII, PHY_INTERFACE_MODE_1000BASEKX, + PHY_INTERFACE_MODE_10G_QXGMII, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -289,6 +291,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "100base-x"; case PHY_INTERFACE_MODE_QUSGMII: return "qusgmii"; + case PHY_INTERFACE_MODE_10G_QXGMII: + return "10g-qxgmii"; default: return "unknown"; } diff --git a/include/linux/phylink.h b/include/linux/phylink.h index a30a692acc32..2381e07429a2 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -654,6 +654,7 @@ static inline int phylink_get_link_timer_ns(phy_interface_t interface) case PHY_INTERFACE_MODE_SGMII: case PHY_INTERFACE_MODE_QSGMII: case PHY_INTERFACE_MODE_USXGMII: + case PHY_INTERFACE_MODE_10G_QXGMII: return 1600000; case PHY_INTERFACE_MODE_1000BASEX: -- cgit v1.2.3 From c2bc958b2b03e361f14df99983bc64a39a7323a3 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 17 Jun 2024 13:06:27 +0200 Subject: fbdev: vesafb: Detect VGA compatibility from screen info's VESA attributes Test the vesa_attributes field in struct screen_info for compatibility with VGA hardware. Vesafb currently tests bit 1 in screen_info's capabilities field which indicates a 64-bit lfb address and is unrelated to VGA compatibility. Section 4.4 of the Vesa VBE 2.0 specifications defines that bit 5 in the mode's attributes field signals VGA compatibility. The mode is compatible with VGA hardware if the bit is clear. In that case, the driver can access VGA state of the VBE's underlying hardware. The vesafb driver uses this feature to program the color LUT in palette modes. Without, colors might be incorrect. The problem got introduced in commit 89ec4c238e7a ("[PATCH] vesafb: Fix incorrect logo colors in x86_64"). It incorrectly stores the mode attributes in the screen_info's capabilities field and updates vesafb accordingly. Later, commit 5e8ddcbe8692 ("Video mode probing support for the new x86 setup code") fixed the screen_info, but did not update vesafb. Color output still tends to work, because bit 1 in capabilities is usually 0. Besides fixing the bug in vesafb, this commit introduces a helper that reads the correct bit from screen_info. Signed-off-by: Thomas Zimmermann Fixes: 5e8ddcbe8692 ("Video mode probing support for the new x86 setup code") Reviewed-by: Javier Martinez Canillas Cc: # v2.6.23+ Signed-off-by: Helge Deller --- include/linux/screen_info.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h index 75303c126285..6a4a3cec4638 100644 --- a/include/linux/screen_info.h +++ b/include/linux/screen_info.h @@ -49,6 +49,16 @@ static inline u64 __screen_info_lfb_size(const struct screen_info *si, unsigned return lfb_size; } +static inline bool __screen_info_vbe_mode_nonvga(const struct screen_info *si) +{ + /* + * VESA modes typically run on VGA hardware. Set bit 5 signals that this + * is not the case. Drivers can then not make use of VGA resources. See + * Sec 4.4 of the VBE 2.0 spec. + */ + return si->vesa_attributes & BIT(5); +} + static inline unsigned int __screen_info_video_type(unsigned int type) { switch (type) { -- cgit v1.2.3 From d6a711a898672dd873aab3844f754a3ca40723a5 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Tue, 18 Jun 2024 15:29:51 +0200 Subject: spi: Fix OCTAL mode support Add OCTAL mode support. Issue detected using "--octal" spidev_test's option. Signed-off-by: Patrice Chotard Link: https://msgid.link/r/20240618132951.2743935-4-patrice.chotard@foss.st.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e8e1e798924f..98fdef6e28f2 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1085,12 +1085,13 @@ struct spi_transfer { unsigned dummy_data:1; unsigned cs_off:1; unsigned cs_change:1; - unsigned tx_nbits:3; - unsigned rx_nbits:3; + unsigned tx_nbits:4; + unsigned rx_nbits:4; unsigned timestamped:1; #define SPI_NBITS_SINGLE 0x01 /* 1-bit transfer */ #define SPI_NBITS_DUAL 0x02 /* 2-bit transfer */ #define SPI_NBITS_QUAD 0x04 /* 4-bit transfer */ +#define SPI_NBITS_OCTAL 0x08 /* 8-bit transfer */ u8 bits_per_word; struct spi_delay delay; struct spi_delay cs_change_delay; -- cgit v1.2.3 From 702eb71fd6501b3566283f8c96d7ccc6ddd662e9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Jun 2024 18:23:00 +0200 Subject: fsnotify: Do not generate events for O_PATH file descriptors Currently we will not generate FS_OPEN events for O_PATH file descriptors but we will generate FS_CLOSE events for them. This is asymmetry is confusing. Arguably no fsnotify events should be generated for O_PATH file descriptors as they cannot be used to access or modify file content, they are just convenient handles to file objects like paths. So fix the asymmetry by stopping to generate FS_CLOSE for O_PATH file descriptors. Cc: Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20240617162303.1596-1-jack@suse.cz Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- include/linux/fsnotify.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 4da80e92f804..278620e063ab 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -112,7 +112,13 @@ static inline int fsnotify_file(struct file *file, __u32 mask) { const struct path *path; - if (file->f_mode & FMODE_NONOTIFY) + /* + * FMODE_NONOTIFY are fds generated by fanotify itself which should not + * generate new events. We also don't want to generate events for + * FMODE_PATH fds (involves open & close events) as they are just + * handle creation / destruction events and not "real" file events. + */ + if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH)) return 0; path = &file->f_path; -- cgit v1.2.3 From a6816314af5749cd88944bfdceb270c627cdf348 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 3 May 2024 11:17:32 -0700 Subject: KVM: Introduce vcpu->wants_to_run Introduce vcpu->wants_to_run to indicate when a vCPU is in its core run loop, i.e. when the vCPU is running the KVM_RUN ioctl and immediate_exit was not set. Replace all references to vcpu->run->immediate_exit with !vcpu->wants_to_run to avoid TOCTOU races with userspace. For example, a malicious userspace could invoked KVM_RUN with immediate_exit=true and then after KVM reads it to set wants_to_run=false, flip it to false. This would result in the vCPU running in KVM_RUN with wants_to_run=false. This wouldn't cause any real bugs today but is a dangerous landmine. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240503181734.1467938-2-dmatlack@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7b9d2633a931..d72ced3e74d1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -378,6 +378,7 @@ struct kvm_vcpu { bool dy_eligible; } spin_loop; #endif + bool wants_to_run; bool preempted; bool ready; bool scheduled_out; -- cgit v1.2.3 From 395e73bd8d35fa9b8505e44f63a2a10abde09cce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 9 Jun 2024 20:12:19 -0700 Subject: srcu: Add NUM_ACTIVE_SRCU_POLL_OLDSTATE This commit adds NUM_ACTIVE_SRCU_POLL_OLDSTATE, which gives the maximum number of distinct return values from get_state_synchronize_rcu() that can, at a given point in time, correspond to not-completed SRCU grace periods. Reported-by: Kent Overstreet Closes: https://lore.kernel.org/all/irycqy4sinjdgm2hkyix2bffunpcmuwgeufsx6nlljvqme3wiu@ify3zdnrmzph/ Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 236610e4a8fa..f664cba7a80c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -61,6 +61,10 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); +// Maximum number of unsigned long values corresponding to +// not-yet-completed SRCU grace periods. +#define NUM_ACTIVE_SRCU_POLL_OLDSTATE 2 + #ifdef CONFIG_NEED_SRCU_NMI_SAFE int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp); -- cgit v1.2.3 From e206f33e2c0774276a0497fe538472e12016a362 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Jun 2024 13:26:44 -0700 Subject: srcu: Fill out polled grace-period APIs This commit adds the get_completed_synchronize_srcu() and the same_state_synchronize_srcu() functions. The first returns a cookie that is always interpreted as corresponding to an expired grace period. The second does an equality comparison of a pair of cookies. Signed-off-by: Paul E. McKenney Cc: Kent Overstreet --- include/linux/srcu.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f664cba7a80c..6f6cb5fc1242 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -57,6 +57,20 @@ void cleanup_srcu_struct(struct srcu_struct *ssp); int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); void synchronize_srcu(struct srcu_struct *ssp); + +#define SRCU_GET_STATE_COMPLETED 0x1 + +/** + * get_completed_synchronize_srcu - Return a pre-completed polled state cookie + * + * Returns a value that poll_state_synchronize_srcu() will always treat + * as a cookie whose grace period has already completed. + */ +static inline unsigned long get_completed_synchronize_srcu(void) +{ + return SRCU_GET_STATE_COMPLETED; +} + unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); @@ -65,6 +79,23 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); // not-yet-completed SRCU grace periods. #define NUM_ACTIVE_SRCU_POLL_OLDSTATE 2 +/** + * same_state_synchronize_srcu - Are two old-state values identical? + * @oldstate1: First old-state value. + * @oldstate2: Second old-state value. + * + * The two old-state values must have been obtained from either + * get_state_synchronize_srcu(), start_poll_synchronize_srcu(), or + * get_completed_synchronize_srcu(). Returns @true if the two values are + * identical and @false otherwise. This allows structures whose lifetimes + * are tracked by old-state values to push these values to a list header, + * allowing those structures to be slightly smaller. + */ +static inline bool same_state_synchronize_srcu(unsigned long oldstate1, unsigned long oldstate2) +{ + return oldstate1 == oldstate2; +} + #ifdef CONFIG_NEED_SRCU_NMI_SAFE int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp); -- cgit v1.2.3 From 5c563ee90a22d3295bcd6217e3ecd7bf9f4d9d48 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Fri, 24 May 2024 11:58:28 -0700 Subject: cpumask: introduce assign_cpu() macro Now that assign_bit() is a thin macro wrapper around set_bit() and clear_bit(), we can use it in cpumask API and drop duplicating implementations of set_cpu_xxx() helpers with no additional overhead. Bloat-o-meter reports almost 2k less of generated code for allyesconfig, mostly in kernel/cpu.c: add/remove: 2/4 grow/shrink: 3/4 up/down: 498/-2228 (-1730) Reviewed-by: Alexander Lobakin Signed-off-by: Yury Norov --- include/linux/cpumask.h | 40 ++++++---------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 23686bed441d..18410acdbc9e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1083,44 +1083,16 @@ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); -static inline void -set_cpu_possible(unsigned int cpu, bool possible) -{ - if (possible) - cpumask_set_cpu(cpu, &__cpu_possible_mask); - else - cpumask_clear_cpu(cpu, &__cpu_possible_mask); -} +#define assign_cpu(cpu, mask, val) \ + assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) -static inline void -set_cpu_present(unsigned int cpu, bool present) -{ - if (present) - cpumask_set_cpu(cpu, &__cpu_present_mask); - else - cpumask_clear_cpu(cpu, &__cpu_present_mask); -} +#define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) +#define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) +#define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) +#define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) void set_cpu_online(unsigned int cpu, bool online); -static inline void -set_cpu_active(unsigned int cpu, bool active) -{ - if (active) - cpumask_set_cpu(cpu, &__cpu_active_mask); - else - cpumask_clear_cpu(cpu, &__cpu_active_mask); -} - -static inline void -set_cpu_dying(unsigned int cpu, bool dying) -{ - if (dying) - cpumask_set_cpu(cpu, &__cpu_dying_mask); - else - cpumask_clear_cpu(cpu, &__cpu_dying_mask); -} - /** * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap -- cgit v1.2.3 From e0eeb938adb0367de4b0946125a06142d8de7d37 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 11 Jun 2024 15:38:12 +0300 Subject: bitops: Add a comment explaining the double underscore macros Linus Walleij pointed out that a new comer might be confused about the difference between set_bit() and __set_bit(). Add a comment explaining the difference. Link: https://lore.kernel.org/all/CACRpkdZFPG_YLici-BmYfk9HZ36f4WavCN3JNotkk8cPgCODCg@mail.gmail.com/ Signed-off-by: Dan Carpenter Reviewed-by: Linus Walleij Signed-off-by: Yury Norov --- include/linux/bitops.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 46d4bdc634c0..ba35bbf07798 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -47,12 +47,17 @@ extern unsigned long __sw_hweight64(__u64 w); __builtin_constant_p(*(const unsigned long *)(addr))) ? \ const##op(nr, addr) : op(nr, addr)) +/* + * The following macros are non-atomic versions of their non-underscored + * counterparts. + */ #define __set_bit(nr, addr) bitop(___set_bit, nr, addr) #define __clear_bit(nr, addr) bitop(___clear_bit, nr, addr) #define __change_bit(nr, addr) bitop(___change_bit, nr, addr) #define __test_and_set_bit(nr, addr) bitop(___test_and_set_bit, nr, addr) #define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) #define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) + #define test_bit(nr, addr) bitop(_test_bit, nr, addr) #define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr) -- cgit v1.2.3 From 1e4c64b71c9bf230b25fde12cbcceacfdc8b3332 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 13 Jun 2024 11:55:07 -0400 Subject: mm/memblock: Add "reserve_mem" to reserved named memory at boot up In order to allow for requesting a memory region that can be used for things like pstore on multiple machines where the memory layout is not the same, add a new option to the kernel command line called "reserve_mem". The format is: reserve_mem=nn:align:name Where it will find nn amount of memory at the given alignment of align. The name field is to allow another subsystem to retrieve where the memory was found. For example: reserve_mem=12M:4096:oops ramoops.mem_name=oops Where ramoops.mem_name will tell ramoops that memory was reserved for it via the reserve_mem option and it can find it by calling: if (reserve_mem_find_by_name("oops", &start, &size)) { // start holds the start address and size holds the size given This is typically used for systems that do not wipe the RAM, and this command line will try to reserve the same physical memory on soft reboots. Note, it is not guaranteed to be the same location. For example, if KASLR places the kernel at the location of where the RAM reservation was from a previous boot, the new reservation will be at a different location. Any subsystem using this feature must add a way to verify that the contents of the physical memory is from a previous boot, as there may be cases where the memory will not be located at the same location. Not all systems may work either. There could be bit flips if the reboot goes through the BIOS. Using kexec to reboot the machine is likely to have better results in such cases. Link: https://lore.kernel.org/all/ZjJVnZUX3NZiGW6q@kernel.org/ Suggested-by: Mike Rapoport Tested-by: Guilherme G. Piccoli Signed-off-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20240613155527.437020271@goodmis.org Signed-off-by: Mike Rapoport (IBM) --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9849dfda44d4..077fb589b88a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4263,4 +4263,6 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn) void vma_pgtable_walk_begin(struct vm_area_struct *vma); void vma_pgtable_walk_end(struct vm_area_struct *vma); +int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); + #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From ba8de796baf4bdc03530774fb284fe3c97875566 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Mon, 17 Jun 2024 11:09:09 -0700 Subject: net: introduce sk_skb_reason_drop function Long used destructors kfree_skb and kfree_skb_reason do not pass receiving socket to packet drop tracepoints trace_kfree_skb. This makes it hard to track packet drops of a certain netns (container) or a socket (user application). The naming of these destructors are also not consistent with most sk/skb operating functions, i.e. functions named "sk_xxx" or "skb_xxx". Introduce a new functions sk_skb_reason_drop as drop-in replacement for kfree_skb_reason on local receiving path. Callers can now pass receiving sockets to the tracepoints. kfree_skb and kfree_skb_reason are still usable but they are now just inline helpers that call sk_skb_reason_drop. Note it is not feasible to do the same to consume_skb. Packets not dropped can flow through multiple receive handlers, and have multiple receiving sockets. Leave it untouched for now. Suggested-by: Eric Dumazet Signed-off-by: Yan Zhai Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/skbuff.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 813406a9bd6c..f4cda3fbdb75 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1251,8 +1251,14 @@ static inline bool skb_data_unref(const struct sk_buff *skb, return true; } -void __fix_address -kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); +void __fix_address sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason reason); + +static inline void +kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +{ + sk_skb_reason_drop(NULL, skb, reason); +} /** * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason -- cgit v1.2.3 From dc2e77979412d289df9049d8c693761db8602867 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 14 Jun 2024 12:30:44 -0400 Subject: net: Split a __sys_bind helper for io_uring io_uring holds a reference to the file and maintains a sockaddr_storage address. Similarly to what was done to __sys_connect_file, split an internal helper for __sys_bind in preparation to supporting an io_uring bind command. Reviewed-by: Jens Axboe Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Kuniyuki Iwashima Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240614163047.31581-1-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/socket.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 89d16b90370b..b3000f49e9f5 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -442,6 +442,8 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, extern int __sys_socket(int family, int type, int protocol); extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); +extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, + int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, -- cgit v1.2.3 From bb6aaf736680f0f3c2e6281735c47c64e2042819 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 14 Jun 2024 12:30:45 -0400 Subject: net: Split a __sys_listen helper for io_uring io_uring holds a reference to the file and maintains a sockaddr_storage address. Similarly to what was done to __sys_connect_file, split an internal helper for __sys_listen in preparation to support an io_uring listen command. Reviewed-by: Jens Axboe Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Kuniyuki Iwashima Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240614163047.31581-2-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/socket.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index b3000f49e9f5..c1f16cdab677 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -449,6 +449,7 @@ extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); +extern int __sys_listen_socket(struct socket *sock, int backlog); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, -- cgit v1.2.3 From 1122c0c1cc71f740fa4d5f14f239194e06a1d5e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:40 +0200 Subject: block: move cache control settings out of queue->flags Move the cache control settings into the queue_limits so that the flags can be set atomically with the device queue frozen. Add new features and flags field for the driver set flags, and internal (usually sysfs-controlled) flags in the block layer. Note that we'll eventually remove enough field from queue_limits to bring it back to the previous size. The disable flag is inverted compared to the previous meaning, which means it now survives a rescan, similar to the max_sectors and max_discard_sectors user limits. The FLUSH and FUA flags are now inherited by blk_stack_limits, which simplified the code in dm a lot, but also causes a slight behavior change in that dm-switch and dm-unstripe now advertise a write cache despite setting num_flush_bios to 0. The I/O path will handle this gracefully, but as far as I can tell the lack of num_flush_bios and thus flush support is a pre-existing data integrity bug in those targets that really needs fixing, after which a non-zero num_flush_bios should be required in dm for targets that map to underlying devices. Signed-off-by: Christoph Hellwig Acked-by: Ulf Hansson Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-14-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0c247a716885..acdfe5122faa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -282,6 +282,28 @@ static inline bool blk_op_is_passthrough(blk_opf_t op) return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; } +/* flags set by the driver in queue_limits.features */ +enum { + /* supports a volatile write cache */ + BLK_FEAT_WRITE_CACHE = (1u << 0), + + /* supports passing on the FUA bit */ + BLK_FEAT_FUA = (1u << 1), +}; + +/* + * Flags automatically inherited when stacking limits. + */ +#define BLK_FEAT_INHERIT_MASK \ + (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA) + + +/* internal flags in queue_limits.flags */ +enum { + /* do not send FLUSH or FUA command despite advertised write cache */ + BLK_FLAGS_WRITE_CACHE_DISABLED = (1u << 31), +}; + /* * BLK_BOUNCE_NONE: never bounce (default) * BLK_BOUNCE_HIGH: bounce all highmem pages @@ -292,6 +314,8 @@ enum blk_bounce { }; struct queue_limits { + unsigned int features; + unsigned int flags; enum blk_bounce bounce; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; @@ -536,12 +560,9 @@ struct request_queue { #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ -#define QUEUE_FLAG_HW_WC 13 /* Write back caching supported */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ -#define QUEUE_FLAG_WC 17 /* Write back caching */ -#define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ #define QUEUE_FLAG_DAX 19 /* device supports DAX */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ @@ -951,7 +972,6 @@ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); -extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); struct blk_independent_access_ranges * disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); @@ -1304,14 +1324,20 @@ static inline bool bdev_stable_writes(struct block_device *bdev) return test_bit(QUEUE_FLAG_STABLE_WRITES, &q->queue_flags); } +static inline bool blk_queue_write_cache(struct request_queue *q) +{ + return (q->limits.features & BLK_FEAT_WRITE_CACHE) && + !(q->limits.flags & BLK_FLAGS_WRITE_CACHE_DISABLED); +} + static inline bool bdev_write_cache(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags); + return blk_queue_write_cache(bdev_get_queue(bdev)); } static inline bool bdev_fua(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags); + return bdev_get_queue(bdev)->limits.features & BLK_FEAT_FUA; } static inline bool bdev_nowait(struct block_device *bdev) -- cgit v1.2.3 From bd4a633b6f7c3c6b6ebc1a07317643270e751a94 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:41 +0200 Subject: block: move the nonrot flag to queue_limits Move the nonrot flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Use the chance to switch to defaulting to non-rotational and require the driver to opt into rotational, which matches the polarity of the sysfs interface. For the z2ram, ps3vram, 2x memstick, ubiblock and dcssblk the new rotational flag is not set as they clearly are not rotational despite this being a behavior change. There are some other drivers that unconditionally set the rotational flag to keep the existing behavior as they arguably can be used on rotational devices even if that is probably not their main use today (e.g. virtio_blk and drbd). The flag is automatically inherited in blk_stack_limits matching the existing behavior in dm and md. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-15-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index acdfe5122faa..988e3248cffe 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -289,14 +289,16 @@ enum { /* supports passing on the FUA bit */ BLK_FEAT_FUA = (1u << 1), + + /* rotational device (hard drive or floppy) */ + BLK_FEAT_ROTATIONAL = (1u << 2), }; /* * Flags automatically inherited when stacking limits. */ #define BLK_FEAT_INHERIT_MASK \ - (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA) - + (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL) /* internal flags in queue_limits.flags */ enum { @@ -553,8 +555,6 @@ struct request_queue { #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ -#define QUEUE_FLAG_NONROT 6 /* non-rotational device (SSD) */ -#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ @@ -589,7 +589,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) -#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) +#define blk_queue_nonrot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_zone_resetall(q) \ -- cgit v1.2.3 From 39a9f1c334f9f27b3b3e6d0005c10ed667268346 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:42 +0200 Subject: block: move the add_random flag to queue_limits Move the add_random flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Note that this also removes code from dm to clear the flag based on the underlying devices, which can't be reached as dm devices will always start out without the flag set. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-16-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 988e3248cffe..cf1bbf566b2b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -292,6 +292,9 @@ enum { /* rotational device (hard drive or floppy) */ BLK_FEAT_ROTATIONAL = (1u << 2), + + /* contributes to the random number pool */ + BLK_FEAT_ADD_RANDOM = (1u << 3), }; /* @@ -557,7 +560,6 @@ struct request_queue { #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ -#define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ @@ -591,7 +593,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) -#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_zone_resetall(q) \ test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) -- cgit v1.2.3 From cdb2497918cc2929691408bac87b58433b45b6d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:43 +0200 Subject: block: move the io_stat flag setting to queue_limits Move the io_stat flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Simplify md and dm to set the flag unconditionally instead of avoiding setting a simple flag for cases where it already is set by other means, which is a bit pointless. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-17-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cf1bbf566b2b..5fafb2f95fd1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -295,6 +295,9 @@ enum { /* contributes to the random number pool */ BLK_FEAT_ADD_RANDOM = (1u << 3), + + /* do disk/partitions IO accounting */ + BLK_FEAT_IO_STAT = (1u << 4), }; /* @@ -558,7 +561,6 @@ struct request_queue { #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ -#define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ @@ -577,8 +579,7 @@ struct request_queue { #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ #define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/ -#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \ - (1UL << QUEUE_FLAG_SAME_COMP) | \ +#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_SAME_COMP) | \ (1UL << QUEUE_FLAG_NOWAIT)) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); @@ -592,7 +593,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) -#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) +#define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_zone_resetall(q) \ test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) -- cgit v1.2.3 From 1a02f3a73f8c670eddeb44bf52a75ae7f67cfc11 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:44 +0200 Subject: block: move the stable_writes flag to queue_limits Move the stable_writes flag into the queue_limits feature field so that it can be set atomically with the queue frozen. The flag is now inherited by blk_stack_limits, which greatly simplifies the code in dm, and fixed md which previously did not pass on the flag set on lower devices. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-18-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5fafb2f95fd1..8936eb6ba609 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -298,13 +298,17 @@ enum { /* do disk/partitions IO accounting */ BLK_FEAT_IO_STAT = (1u << 4), + + /* don't modify data until writeback is done */ + BLK_FEAT_STABLE_WRITES = (1u << 5), }; /* * Flags automatically inherited when stacking limits. */ #define BLK_FEAT_INHERIT_MASK \ - (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL) + (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ + BLK_FEAT_STABLE_WRITES) /* internal flags in queue_limits.flags */ enum { @@ -565,7 +569,6 @@ struct request_queue { #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ -#define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ #define QUEUE_FLAG_DAX 19 /* device supports DAX */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ @@ -1323,7 +1326,7 @@ static inline bool bdev_stable_writes(struct block_device *bdev) if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && q->limits.integrity.csum_type != BLK_INTEGRITY_CSUM_NONE) return true; - return test_bit(QUEUE_FLAG_STABLE_WRITES, &q->queue_flags); + return q->limits.features & BLK_FEAT_STABLE_WRITES; } static inline bool blk_queue_write_cache(struct request_queue *q) -- cgit v1.2.3 From aadd5c59c910427c0464c217d5ed588ff14e2502 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:45 +0200 Subject: block: move the synchronous flag to queue_limits Move the synchronous flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-19-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8936eb6ba609..cee7b44a1425 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -301,6 +301,9 @@ enum { /* don't modify data until writeback is done */ BLK_FEAT_STABLE_WRITES = (1u << 5), + + /* always completes in submit context */ + BLK_FEAT_SYNCHRONOUS = (1u << 6), }; /* @@ -566,7 +569,6 @@ struct request_queue { #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ -#define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ @@ -1315,8 +1317,7 @@ static inline bool bdev_nonrot(struct block_device *bdev) static inline bool bdev_synchronous(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_SYNCHRONOUS, - &bdev_get_queue(bdev)->queue_flags); + return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; } static inline bool bdev_stable_writes(struct block_device *bdev) -- cgit v1.2.3 From f76af42f8bf13d2620084f305f01691de9238fc7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:46 +0200 Subject: block: move the nowait flag to queue_limits Move the nowait flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Stacking drivers are simplified in that they now can simply set the flag, and blk_stack_limits will clear it when the features is not supported by any of the underlying devices. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-20-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cee7b44a1425..f3d4519d609d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -304,6 +304,9 @@ enum { /* always completes in submit context */ BLK_FEAT_SYNCHRONOUS = (1u << 6), + + /* supports REQ_NOWAIT */ + BLK_FEAT_NOWAIT = (1u << 7), }; /* @@ -580,12 +583,10 @@ struct request_queue { #define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ -#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ #define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/ -#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_SAME_COMP) | \ - (1UL << QUEUE_FLAG_NOWAIT)) +#define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); @@ -1348,7 +1349,7 @@ static inline bool bdev_fua(struct block_device *bdev) static inline bool bdev_nowait(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags); + return bdev->bd_disk->queue->limits.features & BLK_FEAT_NOWAIT; } static inline bool bdev_is_zoned(struct block_device *bdev) -- cgit v1.2.3 From f467fee48da4500786e145489787b37adae317c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:47 +0200 Subject: block: move the dax flag to queue_limits Move the dax flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-21-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f3d4519d609d..7022e06a3dd9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -307,6 +307,9 @@ enum { /* supports REQ_NOWAIT */ BLK_FEAT_NOWAIT = (1u << 7), + + /* supports DAX */ + BLK_FEAT_DAX = (1u << 8), }; /* @@ -575,7 +578,6 @@ struct request_queue { #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ -#define QUEUE_FLAG_DAX 19 /* device supports DAX */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ @@ -602,7 +604,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_zone_resetall(q) \ test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) -#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) +#define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) #define blk_queue_pci_p2pdma(q) \ test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) #ifdef CONFIG_BLK_RQ_ALLOC_TIME -- cgit v1.2.3 From 8023e144f9d6e35f8786937e2f0c2fea0aba6dbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:48 +0200 Subject: block: move the poll flag to queue_limits Move the poll flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Stacking drivers are simplified in that they now can simply set the flag, and blk_stack_limits will clear it when the features is not supported by any of the underlying devices. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-22-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7022e06a3dd9..cd27b66cbacc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -310,6 +310,9 @@ enum { /* supports DAX */ BLK_FEAT_DAX = (1u << 8), + + /* supports I/O polling */ + BLK_FEAT_POLL = (1u << 9), }; /* @@ -577,7 +580,6 @@ struct request_queue { #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ -#define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ -- cgit v1.2.3 From b1fc937a55f5735b98d9dceae5bb6ba262501f56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:49 +0200 Subject: block: move the zoned flag into the features field Move the zoned flags into the features field to reclaim a little bit of space. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-23-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cd27b66cbacc..bdc30c1fb1b5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -313,6 +313,9 @@ enum { /* supports I/O polling */ BLK_FEAT_POLL = (1u << 9), + + /* is a zoned device */ + BLK_FEAT_ZONED = (1u << 10), }; /* @@ -320,7 +323,7 @@ enum { */ #define BLK_FEAT_INHERIT_MASK \ (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ - BLK_FEAT_STABLE_WRITES) + BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED) /* internal flags in queue_limits.flags */ enum { @@ -372,7 +375,6 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; - bool zoned; unsigned int max_open_zones; unsigned int max_active_zones; @@ -654,7 +656,8 @@ static inline enum rpm_status queue_rpm_status(struct request_queue *q) static inline bool blk_queue_is_zoned(struct request_queue *q) { - return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && q->limits.zoned; + return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + (q->limits.features & BLK_FEAT_ZONED); } #ifdef CONFIG_BLK_DEV_ZONED -- cgit v1.2.3 From a52758a39768f441e468a41da6c15a59d6d6011a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:50 +0200 Subject: block: move the zone_resetall flag to queue_limits Move the zone_resetall flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-24-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bdc30c1fb1b5..1077cb8d8fd8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -316,6 +316,9 @@ enum { /* is a zoned device */ BLK_FEAT_ZONED = (1u << 10), + + /* supports Zone Reset All */ + BLK_FEAT_ZONE_RESETALL = (1u << 11), }; /* @@ -586,7 +589,6 @@ struct request_queue { #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ -#define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ @@ -607,7 +609,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_nonrot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_zone_resetall(q) \ - test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) + ((q)->limits.features & BLK_FEAT_ZONE_RESETALL) #define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) #define blk_queue_pci_p2pdma(q) \ test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) -- cgit v1.2.3 From 9c1e42e3c876c66796eda23e79836a4d92613a61 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:51 +0200 Subject: block: move the pci_p2pdma flag to queue_limits Move the pci_p2pdma flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-25-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1077cb8d8fd8..ab0f7dfba556 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -319,6 +319,9 @@ enum { /* supports Zone Reset All */ BLK_FEAT_ZONE_RESETALL = (1u << 11), + + /* supports PCI(e) p2p requests */ + BLK_FEAT_PCI_P2PDMA = (1u << 12), }; /* @@ -588,7 +591,6 @@ struct request_queue { #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ -#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ @@ -611,8 +613,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_zone_resetall(q) \ ((q)->limits.features & BLK_FEAT_ZONE_RESETALL) #define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) -#define blk_queue_pci_p2pdma(q) \ - test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) +#define blk_queue_pci_p2pdma(q) ((q)->limits.features & BLK_FEAT_PCI_P2PDMA) #ifdef CONFIG_BLK_RQ_ALLOC_TIME #define blk_queue_rq_alloc_time(q) \ test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags) -- cgit v1.2.3 From 8c8f5c85b20d0a7dc0ab9b2a17318130d69ceb5a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:52 +0200 Subject: block: move the skip_tagset_quiesce flag to queue_limits Move the skip_tagset_quiesce flag into the queue_limits feature field so that it can be set atomically with the queue frozen. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-26-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ab0f7dfba556..2c433ebf6f20 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -322,6 +322,9 @@ enum { /* supports PCI(e) p2p requests */ BLK_FEAT_PCI_P2PDMA = (1u << 12), + + /* skip this queue in blk_mq_(un)quiesce_tagset */ + BLK_FEAT_SKIP_TAGSET_QUIESCE = (1u << 13), }; /* @@ -594,7 +597,6 @@ struct request_queue { #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ -#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/ #define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) @@ -629,7 +631,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) #define blk_queue_skip_tagset_quiesce(q) \ - test_bit(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, &(q)->queue_flags) + ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit v1.2.3 From 339d3948c07b4aa2940aeb874294a7d6782cec16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jun 2024 08:04:53 +0200 Subject: block: move the bounce flag into the features field Move the bounce flag into the features field to reclaim a little bit of space. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240617060532.127975-27-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2c433ebf6f20..e96ba7b97288 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -325,6 +325,9 @@ enum { /* skip this queue in blk_mq_(un)quiesce_tagset */ BLK_FEAT_SKIP_TAGSET_QUIESCE = (1u << 13), + + /* bounce all highmem pages */ + BLK_FEAT_BOUNCE_HIGH = (1u << 14), }; /* @@ -332,7 +335,7 @@ enum { */ #define BLK_FEAT_INHERIT_MASK \ (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ - BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED) + BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH) /* internal flags in queue_limits.flags */ enum { @@ -352,7 +355,6 @@ enum blk_bounce { struct queue_limits { unsigned int features; unsigned int flags; - enum blk_bounce bounce; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; -- cgit v1.2.3 From b9578c49456340ca4d3c7ddbaca054ffc2b51bc1 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 6 Jun 2024 14:02:13 +1200 Subject: dma-buf/heaps: Correct the types of fd_flags and heap_flags dma_heap_allocation_data defines the UAPI as follows: struct dma_heap_allocation_data { __u64 len; __u32 fd; __u32 fd_flags; __u64 heap_flags; }; But dma heaps are casting both fd_flags and heap_flags into unsigned long. This patch makes dma heaps - cma heap and system heap have consistent types with UAPI. Signed-off-by: Barry Song Acked-by: John Stultz Reviewed-by: Carlos Llamas Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20240606020213.49854-1-21cnbao@gmail.com --- include/linux/dma-heap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h index 0c05561cad6e..064bad725061 100644 --- a/include/linux/dma-heap.h +++ b/include/linux/dma-heap.h @@ -23,8 +23,8 @@ struct dma_heap; struct dma_heap_ops { struct dma_buf *(*allocate)(struct dma_heap *heap, unsigned long len, - unsigned long fd_flags, - unsigned long heap_flags); + u32 fd_flags, + u64 heap_flags); }; /** -- cgit v1.2.3 From 2003e483a81cc235e29f77da3f6b256cb4b348e7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Jun 2024 13:31:05 -0700 Subject: fortify: Do not special-case 0-sized destinations All fake flexible arrays should have been removed now, so remove the special casing that was avoiding checking them. If a destination claims to be 0 sized, believe it. This is especially important for cases where __counted_by is in use and may have a 0 element count. Link: https://lore.kernel.org/r/20240619203105.work.747-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 7e0f340bf363..0d99bf11d260 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -601,11 +601,7 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, /* * Warn when writing beyond destination field size. * - * We must ignore p_size_field == 0 for existing 0-element - * fake flexible arrays, until they are all converted to - * proper flexible arrays. - * - * The implementation of __builtin_*object_size() behaves + * Note the implementation of __builtin_*object_size() behaves * like sizeof() when not directly referencing a flexible * array member, which means there will be many bounds checks * that will appear at run-time, without a way for them to be @@ -613,7 +609,7 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, * is specifically the flexible array member). * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 */ - if (p_size_field != 0 && p_size_field != SIZE_MAX && + if (p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) return true; -- cgit v1.2.3 From 269e974e664207cc45f83b579565ba73de1b75dc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 14 Jun 2024 11:41:02 +0200 Subject: driver core: make [device_]driver_attach take a const * Change device_driver_attach() and driver_attach() to take a const * to struct device driver as neither of them modify the structure at all. Also, for some odd reason, drivers/dma/idxd/compat.c had a duplicate external reference to device_driver_attach(), so remove that to fix up the build, it should never have had that there in the first place. Cc: Rafael J. Wysocki Cc: Fenghua Yu Cc: Dave Jiang Cc: Vinod Koul Cc: Andy Shevchenko Cc: Petr Tesarik Cc: Alexander Lobakin Cc: dmaengine@vger.kernel.org Link: https://lore.kernel.org/r/2024061401-rasping-manger-c385@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 3a630942f358..34eb20f5966f 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1177,12 +1177,12 @@ static inline void *dev_get_platdata(const struct device *dev) * Manual binding of a device to driver. See drivers/base/bus.c * for information on use. */ -int __must_check device_driver_attach(struct device_driver *drv, +int __must_check device_driver_attach(const struct device_driver *drv, struct device *dev); int __must_check device_bind_driver(struct device *dev); void device_release_driver(struct device *dev); int __must_check device_attach(struct device *dev); -int __must_check driver_attach(struct device_driver *drv); +int __must_check driver_attach(const struct device_driver *drv); void device_initial_probe(struct device *dev); int __must_check device_reprobe(struct device *dev); -- cgit v1.2.3 From f6860b6069b92559f5cdb65f48e2d82051eaebca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Jun 2024 17:45:33 +0200 Subject: block: remove the unused blk_bounce enum The enum has been replaced with the BLK_FEAT_BOUNCE_HIGH flag. Reported-by: Keith Busch Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240619154623.450048-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e96ba7b97288..f7d275e3fb2c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -343,15 +343,6 @@ enum { BLK_FLAGS_WRITE_CACHE_DISABLED = (1u << 31), }; -/* - * BLK_BOUNCE_NONE: never bounce (default) - * BLK_BOUNCE_HIGH: bounce all highmem pages - */ -enum blk_bounce { - BLK_BOUNCE_NONE, - BLK_BOUNCE_HIGH, -}; - struct queue_limits { unsigned int features; unsigned int flags; -- cgit v1.2.3 From bae1c74316b86c67c95658c3a0cd312cec9aad77 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Jun 2024 17:45:35 +0200 Subject: block: renumber and rename the cache disabled flag Start with the first bit, and drop the plural-S from the name. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240619154623.450048-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f7d275e3fb2c..713a98b6dbba 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -339,8 +339,8 @@ enum { /* internal flags in queue_limits.flags */ enum { - /* do not send FLUSH or FUA command despite advertised write cache */ - BLK_FLAGS_WRITE_CACHE_DISABLED = (1u << 31), + /* do not send FLUSH/FUA commands despite advertising a write cache */ + BLK_FLAG_WRITE_CACHE_DISABLED = (1u << 0), }; struct queue_limits { @@ -1339,7 +1339,7 @@ static inline bool bdev_stable_writes(struct block_device *bdev) static inline bool blk_queue_write_cache(struct request_queue *q) { return (q->limits.features & BLK_FEAT_WRITE_CACHE) && - !(q->limits.flags & BLK_FLAGS_WRITE_CACHE_DISABLED); + !(q->limits.flags & BLK_FLAG_WRITE_CACHE_DISABLED); } static inline bool bdev_write_cache(struct block_device *bdev) -- cgit v1.2.3 From 5543217be468268dfedf504f4969771b9a377353 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Jun 2024 17:45:36 +0200 Subject: block: move the misaligned flag into the features field Move the misaligned flags into the features field to reclaim a little bit of space. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240619154623.450048-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 713a98b6dbba..7ad2b1240fc0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -341,6 +341,9 @@ enum { enum { /* do not send FLUSH/FUA commands despite advertising a write cache */ BLK_FLAG_WRITE_CACHE_DISABLED = (1u << 0), + + /* I/O topology is misaligned */ + BLK_FEAT_MISALIGNED = (1u << 1), }; struct queue_limits { @@ -374,7 +377,6 @@ struct queue_limits { unsigned short max_integrity_segments; unsigned short max_discard_segments; - unsigned char misaligned; unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; unsigned int max_open_zones; -- cgit v1.2.3 From 4cac3d3a712b5c76d462b29b73b9e58c0b6d9946 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Jun 2024 17:45:37 +0200 Subject: block: remove the discard_alignment flag queue_limits.discard_alignment is never read except in the places where it is stacked into another limit. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240619154623.450048-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7ad2b1240fc0..86410ce41bf6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -377,7 +377,6 @@ struct queue_limits { unsigned short max_integrity_segments; unsigned short max_discard_segments; - unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; unsigned int max_open_zones; unsigned int max_active_zones; -- cgit v1.2.3 From 7d4dec525f5fd555037486af4d02dd3682655ba1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Jun 2024 17:45:38 +0200 Subject: block: move the raid_partial_stripes_expensive flag into the features field Move the raid_partial_stripes_expensive flags into the features field to reclaim a little bit of space. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240619154623.450048-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 86410ce41bf6..1fa2b148c206 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -328,6 +328,9 @@ enum { /* bounce all highmem pages */ BLK_FEAT_BOUNCE_HIGH = (1u << 14), + + /* undocumented magic for bcache */ + BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE = (1u << 15), }; /* @@ -335,7 +338,8 @@ enum { */ #define BLK_FEAT_INHERIT_MASK \ (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ - BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH) + BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH | \ + BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) /* internal flags in queue_limits.flags */ enum { @@ -377,7 +381,6 @@ struct queue_limits { unsigned short max_integrity_segments; unsigned short max_discard_segments; - unsigned char raid_partial_stripes_expensive; unsigned int max_open_zones; unsigned int max_active_zones; -- cgit v1.2.3 From f70167a7a6e7e8a6911f3a216dc044cbfe7c1983 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 20 Jun 2024 12:53:51 +0000 Subject: block: Generalize chunk_sectors support as boundary support The purpose of the chunk_sectors limit is to ensure that a mergeble request fits within the boundary of the chunck_sector value. Such a feature will be useful for other request_queue boundary limits, so generalize the chunk_sectors merge code. This idea was proposed by Hannes Reinecke. Reviewed-by: Martin K. Petersen Signed-off-by: John Garry Reviewed-by: Hannes Reinecke Acked-by: Darrick J. Wong Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20240620125359.2684798-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0e8253c1507a..fb7d4c21bba8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -907,14 +907,15 @@ static inline bool bio_straddles_zones(struct bio *bio) } /* - * Return how much of the chunk is left to be used for I/O at a given offset. + * Return how much within the boundary is left to be used for I/O at a given + * offset. */ -static inline unsigned int blk_chunk_sectors_left(sector_t offset, - unsigned int chunk_sectors) +static inline unsigned int blk_boundary_sectors_left(sector_t offset, + unsigned int boundary_sectors) { - if (unlikely(!is_power_of_2(chunk_sectors))) - return chunk_sectors - sector_div(offset, chunk_sectors); - return chunk_sectors - (offset & (chunk_sectors - 1)); + if (unlikely(!is_power_of_2(boundary_sectors))) + return boundary_sectors - sector_div(offset, boundary_sectors); + return boundary_sectors - (offset & (boundary_sectors - 1)); } /** -- cgit v1.2.3 From c34fc6f26ab86d03a2d47446f42b6cd492dfdc56 Mon Sep 17 00:00:00 2001 From: Prasad Singamsetty Date: Thu, 20 Jun 2024 12:53:52 +0000 Subject: fs: Initial atomic write support An atomic write is a write issued with torn-write protection, meaning that for a power failure or any other hardware failure, all or none of the data from the write will be stored, but never a mix of old and new data. Userspace may add flag RWF_ATOMIC to pwritev2() to indicate that the write is to be issued with torn-write prevention, according to special alignment and length rules. For any syscall interface utilizing struct iocb, add IOCB_ATOMIC for iocb->ki_flags field to indicate the same. A call to statx will give the relevant atomic write info for a file: - atomic_write_unit_min - atomic_write_unit_max - atomic_write_segments_max Both min and max values must be a power-of-2. Applications can avail of atomic write feature by ensuring that the total length of a write is a power-of-2 in size and also sized between atomic_write_unit_min and atomic_write_unit_max, inclusive. Applications must ensure that the write is at a naturally-aligned offset in the file wrt the total write length. The value in atomic_write_segments_max indicates the upper limit for IOV_ITER iovcnt. Add file mode flag FMODE_CAN_ATOMIC_WRITE, so files which do not have the flag set will have RWF_ATOMIC rejected and not just ignored. Add a type argument to kiocb_set_rw_flags() to allows reads which have RWF_ATOMIC set to be rejected. Helper function generic_atomic_write_valid() can be used by FSes to verify compliant writes. There we check for iov_iter type is for ubuf, which implies iovcnt==1 for pwritev2(), which is an initial restriction for atomic_write_segments_max. Initially the only user will be bdev file operations write handler. We will rely on the block BIO submission path to ensure write sizes are compliant for the bdev, so we don't need to check atomic writes sizes yet. Signed-off-by: Prasad Singamsetty jpg: merge into single patch and much rewrite Acked-by: Darrick J. Wong Reviewed-by: Martin K. Petersen Signed-off-by: John Garry Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20240620125359.2684798-4-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0283cf366c2a..e049414bef7d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -125,8 +125,10 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_EXEC ((__force fmode_t)(1 << 5)) /* File writes are restricted (block device specific) */ #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6)) +/* File supports atomic writes */ +#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7)) -/* FMODE_* bits 7 to 8 */ +/* FMODE_* bit 8 */ /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) @@ -317,6 +319,7 @@ struct readahead_control; #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND +#define IOCB_ATOMIC (__force int) RWF_ATOMIC /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -351,6 +354,7 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ + { IOCB_ATOMIC, "ATOMIC"}, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -3403,7 +3407,8 @@ static inline int iocb_flags(struct file *file) return res; } -static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, + int rw_type) { int kiocb_flags = 0; @@ -3422,6 +3427,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return -EOPNOTSUPP; kiocb_flags |= IOCB_NOIO; } + if (flags & RWF_ATOMIC) { + if (rw_type != WRITE) + return -EOPNOTSUPP; + if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; @@ -3613,4 +3624,6 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos); + #endif /* _LINUX_FS_H */ -- cgit v1.2.3 From 0f9ca80fa4f9670ba09721e4e36b8baf086a500c Mon Sep 17 00:00:00 2001 From: Prasad Singamsetty Date: Thu, 20 Jun 2024 12:53:53 +0000 Subject: fs: Add initial atomic write support info to statx Extend statx system call to return additional info for atomic write support support for a file. Helper function generic_fill_statx_atomic_writes() can be used by FSes to fill in the relevant statx fields. For now atomic_write_segments_max will always be 1, otherwise some rules would need to be imposed on iovec length and alignment, which we don't want now. Signed-off-by: Prasad Singamsetty jpg: relocate bdev support to another patch Reviewed-by: Darrick J. Wong Reviewed-by: Martin K. Petersen Signed-off-by: John Garry Acked-by: Darrick J. Wong Link: https://lore.kernel.org/r/20240620125359.2684798-5-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 3 +++ include/linux/stat.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e049414bef7d..db26b4a70c62 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3235,6 +3235,9 @@ extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); +void generic_fill_statx_atomic_writes(struct kstat *stat, + unsigned int unit_min, + unsigned int unit_max); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); diff --git a/include/linux/stat.h b/include/linux/stat.h index bf92441dbad2..3d900c86981c 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -54,6 +54,9 @@ struct kstat { u32 dio_offset_align; u64 change_cookie; u64 subvol; + u32 atomic_write_unit_min; + u32 atomic_write_unit_max; + u32 atomic_write_segments_max; }; /* These definitions are internal to the kernel for now. Mainly used by nfsd. */ -- cgit v1.2.3 From 9da3d1e912f3953196e66991d75208cde3e845e1 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 20 Jun 2024 12:53:54 +0000 Subject: block: Add core atomic write support Add atomic write support, as follows: - add helper functions to get request_queue atomic write limits - report request_queue atomic write support limits to sysfs and update Doc - support to safely merge atomic writes - deal with splitting atomic writes - misc helper functions - add a per-request atomic write flag New request_queue limits are added, as follows: - atomic_write_hw_max is set by the block driver and is the maximum length of an atomic write which the device may support. It is not necessarily a power-of-2. - atomic_write_max_sectors is derived from atomic_write_hw_max_sectors and max_hw_sectors. It is always a power-of-2. Atomic writes may be merged, and atomic_write_max_sectors would be the limit on a merged atomic write request size. This value is not capped at max_sectors, as the value in max_sectors can be controlled from userspace, and it would only cause trouble if userspace could limit atomic_write_unit_max_bytes and the other atomic write limits. - atomic_write_hw_unit_{min,max} are set by the block driver and are the min/max length of an atomic write unit which the device may support. They both must be a power-of-2. Typically atomic_write_hw_unit_max will hold the same value as atomic_write_hw_max. - atomic_write_unit_{min,max} are derived from atomic_write_hw_unit_{min,max}, max_hw_sectors, and block core limits. Both min and max values must be a power-of-2. - atomic_write_hw_boundary is set by the block driver. If non-zero, it indicates an LBA space boundary at which an atomic write straddles no longer is atomically executed by the disk. The value must be a power-of-2. Note that it would be acceptable to enforce a rule that atomic_write_hw_boundary_sectors is a multiple of atomic_write_hw_unit_max, but the resultant code would be more complicated. All atomic writes limits are by default set 0 to indicate no atomic write support. Even though it is assumed by Linux that a logical block can always be atomically written, we ignore this as it is not of particular interest. Stacked devices are just not supported either for now. An atomic write must always be submitted to the block driver as part of a single request. As such, only a single BIO must be submitted to the block layer for an atomic write. When a single atomic write BIO is submitted, it cannot be split. As such, atomic_write_unit_{max, min}_bytes are limited by the maximum guaranteed BIO size which will not be required to be split. This max size is calculated by request_queue max segments and the number of bvecs a BIO can fit, BIO_MAX_VECS. Currently we rely on userspace issuing a write with iovcnt=1 for pwritev2() - as such, we can rely on each segment containing PAGE_SIZE of data, apart from the first+last, which each can fit logical block size of data. The first+last will be LBS length/aligned as we rely on direct IO alignment rules also. New sysfs files are added to report the following atomic write limits: - atomic_write_unit_max_bytes - same as atomic_write_unit_max_sectors in bytes - atomic_write_unit_min_bytes - same as atomic_write_unit_min_sectors in bytes - atomic_write_boundary_bytes - same as atomic_write_hw_boundary_sectors in bytes - atomic_write_max_bytes - same as atomic_write_max_sectors in bytes Atomic writes may only be merged with other atomic writes and only under the following conditions: - total resultant request length <= atomic_write_max_bytes - the merged write does not straddle a boundary Helper function bdev_can_atomic_write() is added to indicate whether atomic writes may be issued to a bdev. If a bdev is a partition, the partition start must be aligned with both atomic_write_unit_min_sectors and atomic_write_hw_boundary_sectors. FSes will rely on the block layer to validate that an atomic write BIO submitted will be of valid size, so add blk_validate_atomic_write_op_size() for this purpose. Userspace expects an atomic write which is of invalid size to be rejected with -EINVAL, so add BLK_STS_INVAL for this. Also use BLK_STS_INVAL for when a BIO needs to be split, as this should mean an invalid size BIO. Flag REQ_ATOMIC is used for indicating an atomic write. Co-developed-by: Himanshu Madhani Signed-off-by: Himanshu Madhani Reviewed-by: Martin K. Petersen Signed-off-by: John Garry Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20240620125359.2684798-6-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 8 ++++++- include/linux/blkdev.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 781c4500491b..632edd71f8c6 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -162,6 +162,11 @@ typedef u16 blk_short_t; */ #define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17) +/* + * Invalid size or alignment. + */ +#define BLK_STS_INVAL ((__force blk_status_t)19) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with @@ -370,7 +375,7 @@ enum req_flag_bits { __REQ_SWAP, /* swap I/O */ __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ - + __REQ_ATOMIC, /* for atomic write operations */ /* * Command specific flags, keep last: */ @@ -402,6 +407,7 @@ enum req_flag_bits { #define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP) #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) +#define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fb7d4c21bba8..4816f3b1d528 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -377,6 +377,16 @@ struct queue_limits { unsigned int discard_alignment; unsigned int zone_write_granularity; + /* atomic write limits */ + unsigned int atomic_write_hw_max; + unsigned int atomic_write_max_sectors; + unsigned int atomic_write_hw_boundary; + unsigned int atomic_write_boundary_sectors; + unsigned int atomic_write_hw_unit_min; + unsigned int atomic_write_unit_min; + unsigned int atomic_write_hw_unit_max; + unsigned int atomic_write_unit_max; + unsigned short max_segments; unsigned short max_integrity_segments; unsigned short max_discard_segments; @@ -1403,6 +1413,30 @@ static inline int queue_dma_alignment(const struct request_queue *q) return q ? q->limits.dma_alignment : 511; } +static inline unsigned int +queue_atomic_write_unit_max_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_unit_max; +} + +static inline unsigned int +queue_atomic_write_unit_min_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_unit_min; +} + +static inline unsigned int +queue_atomic_write_boundary_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_boundary_sectors << SECTOR_SHIFT; +} + +static inline unsigned int +queue_atomic_write_max_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_max_sectors << SECTOR_SHIFT; +} + static inline unsigned int bdev_dma_alignment(struct block_device *bdev) { return queue_dma_alignment(bdev_get_queue(bdev)); @@ -1644,6 +1678,27 @@ struct io_comp_batch { void (*complete)(struct io_comp_batch *); }; +static inline bool bdev_can_atomic_write(struct block_device *bdev) +{ + struct request_queue *bd_queue = bdev->bd_queue; + struct queue_limits *limits = &bd_queue->limits; + + if (!limits->atomic_write_unit_min) + return false; + + if (bdev_is_partition(bdev)) { + sector_t bd_start_sect = bdev->bd_start_sect; + unsigned int alignment = + max(limits->atomic_write_unit_min, + limits->atomic_write_hw_boundary); + + if (!IS_ALIGNED(bd_start_sect, alignment >> SECTOR_SHIFT)) + return false; + } + + return true; +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 9abcfbd235f59fb5b6379e5bc0231dad831ebace Mon Sep 17 00:00:00 2001 From: Prasad Singamsetty Date: Thu, 20 Jun 2024 12:53:55 +0000 Subject: block: Add atomic write support for statx Extend statx system call to return additional info for atomic write support support if the specified file is a block device. Reviewed-by: Martin K. Petersen Signed-off-by: Prasad Singamsetty Signed-off-by: John Garry Reviewed-by: Keith Busch Acked-by: Darrick J. Wong Reviewed-by: Darrick J. Wong Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240620125359.2684798-7-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4816f3b1d528..2800231046a9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1638,7 +1638,8 @@ int sync_blockdev(struct block_device *bdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); -void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); +void bdev_statx(struct inode *backing_inode, struct kstat *stat, + u32 request_mask); void printk_all_partitions(void); int __init early_lookup_bdev(const char *pathname, dev_t *dev); #else @@ -1656,7 +1657,8 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) static inline void sync_bdevs(bool wait) { } -static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) +static inline void bdev_statx(struct inode *backing_inode, struct kstat *stat, + u32 request_mask) { } static inline void printk_all_partitions(void) -- cgit v1.2.3 From 9919c5c98cb25dbf7e76aadb9beab55a2a25f830 Mon Sep 17 00:00:00 2001 From: Rafael Passos Date: Fri, 14 Jun 2024 23:24:08 -0300 Subject: bpf: remove unused parameter in bpf_jit_binary_pack_finalize Fixes a compiler warning. the bpf_jit_binary_pack_finalize function was taking an extra bpf_prog parameter that went unused. This removves it and updates the callers accordingly. Signed-off-by: Rafael Passos Link: https://lore.kernel.org/r/20240615022641.210320-2-rafael@rcpassos.me Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index b02aea291b7e..dd41a93f06b2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1129,8 +1129,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, struct bpf_binary_header **rw_hdr, u8 **rw_image, bpf_jit_fill_hole_t bpf_fill_ill_insns); -int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, - struct bpf_binary_header *ro_header, +int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); -- cgit v1.2.3 From ab224b9ef7c4eaa752752455ea79bd7022209d5d Mon Sep 17 00:00:00 2001 From: Rafael Passos Date: Fri, 14 Jun 2024 23:24:09 -0300 Subject: bpf: remove unused parameter in __bpf_free_used_btfs Fixes a compiler warning. The __bpf_free_used_btfs function was taking an extra unused struct bpf_prog_aux *aux param Signed-off-by: Rafael Passos Link: https://lore.kernel.org/r/20240615022641.210320-3-rafael@rcpassos.me Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f636b4998bf7..960780ef04e1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2933,8 +2933,7 @@ bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) return ret; } -void __bpf_free_used_btfs(struct bpf_prog_aux *aux, - struct btf_mod_pair *used_btfs, u32 len); +void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len); static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) -- cgit v1.2.3 From 158ed777e330e9bf6bd592daaf1e860d965ec8b5 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 30 Apr 2024 11:43:16 +0100 Subject: firmware: qcom: scm: Add gpu_init_regs call This will used by drm/msm to initialize GPU registers that Qualcomm's firmware doesn't make writeable to the kernel. Reviewed-by: Dmitry Baryshkov Signed-off-by: Connor Abbott Reviewed-by: Konrad Dybcio Acked-by: Bjorn Andersson Link: https://lore.kernel.org/r/20240430-a750-raytracing-v3-2-7f57c5ac082d@gmail.com Signed-off-by: Bjorn Andersson --- include/linux/firmware/qcom/qcom_scm.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h index aaa19f93ac43..a221a643dc12 100644 --- a/include/linux/firmware/qcom/qcom_scm.h +++ b/include/linux/firmware/qcom/qcom_scm.h @@ -115,6 +115,29 @@ int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, int qcom_scm_lmh_profile_change(u32 profile_id); bool qcom_scm_lmh_dcvsh_available(void); +/* + * Request TZ to program set of access controlled registers necessary + * irrespective of any features + */ +#define QCOM_SCM_GPU_ALWAYS_EN_REQ BIT(0) +/* + * Request TZ to program BCL id to access controlled register when BCL is + * enabled + */ +#define QCOM_SCM_GPU_BCL_EN_REQ BIT(1) +/* + * Request TZ to program set of access controlled register for CLX feature + * when enabled + */ +#define QCOM_SCM_GPU_CLX_EN_REQ BIT(2) +/* + * Request TZ to program tsense ids to access controlled registers for reading + * gpu temperature sensors + */ +#define QCOM_SCM_GPU_TSENSE_EN_REQ BIT(3) + +int qcom_scm_gpu_init_regs(u32 gpu_req); + #ifdef CONFIG_QCOM_QSEECOM int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id); -- cgit v1.2.3 From 9267997fa7aa0b597e8b32cb3fdfe91be1d35a83 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 5 Jun 2024 22:10:14 +0200 Subject: soc: qcom: Move some socinfo defines to the header In preparation for parsing the chip "feature code" (FC) and "product code" (PC) (essentially the parameters that let us conclusively characterize the sillicon we're running on, including various speed bins), move the socinfo version defines to the public header. Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240605-topic-smem_speedbin-v2-1-8989d7e3d176@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/socinfo.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h index e78777bb0f4a..10e0a4c287f4 100644 --- a/include/linux/soc/qcom/socinfo.h +++ b/include/linux/soc/qcom/socinfo.h @@ -12,6 +12,14 @@ #define SMEM_SOCINFO_BUILD_ID_LENGTH 32 #define SMEM_SOCINFO_CHIP_ID_LENGTH 32 +/* + * SoC version type with major number in the upper 16 bits and minor + * number in the lower 16 bits. + */ +#define SOCINFO_MAJOR(ver) (((ver) >> 16) & 0xffff) +#define SOCINFO_MINOR(ver) ((ver) & 0xffff) +#define SOCINFO_VERSION(maj, min) ((((maj) & 0xffff) << 16)|((min) & 0xffff)) + /* Socinfo SMEM item structure */ struct socinfo { __le32 fmt; -- cgit v1.2.3 From 81bbb2b891174da9301fc0d4fe9622bd4cb6a995 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 5 Jun 2024 22:10:15 +0200 Subject: soc: qcom: smem: Add a feature code getter Recent (SM8550+ ish) Qualcomm SoCs have a new mechanism for precisely identifying the specific SKU and the precise speed bin (in the general meaning of this word, anyway): a pair of values called Product Code and Feature Code. Based on this information, we can deduce the available frequencies for things such as Adreno. In the case of Adreno specifically, Pcode is useless for non-prototype SoCs. Introduce a getter for the feature code and export it. Reviewed-by: Dmitry Baryshkov Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240605-topic-smem_speedbin-v2-2-8989d7e3d176@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/smem.h | 1 + include/linux/soc/qcom/socinfo.h | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/smem.h b/include/linux/soc/qcom/smem.h index a36a3b9d4929..0943bf419e11 100644 --- a/include/linux/soc/qcom/smem.h +++ b/include/linux/soc/qcom/smem.h @@ -13,5 +13,6 @@ int qcom_smem_get_free_space(unsigned host); phys_addr_t qcom_smem_virt_to_phys(void *p); int qcom_smem_get_soc_id(u32 *id); +int qcom_smem_get_feature_code(u32 *code); #endif diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h index 10e0a4c287f4..608950443eee 100644 --- a/include/linux/soc/qcom/socinfo.h +++ b/include/linux/soc/qcom/socinfo.h @@ -3,6 +3,8 @@ #ifndef __QCOM_SOCINFO_H__ #define __QCOM_SOCINFO_H__ +#include + /* * SMEM item id, used to acquire handles to respective * SMEM region. @@ -82,4 +84,28 @@ struct socinfo { __le32 boot_core; }; +/* Internal feature codes */ +enum qcom_socinfo_feature_code { + /* External feature codes */ + SOCINFO_FC_UNKNOWN = 0x0, + SOCINFO_FC_AA, + SOCINFO_FC_AB, + SOCINFO_FC_AC, + SOCINFO_FC_AD, + SOCINFO_FC_AE, + SOCINFO_FC_AF, + SOCINFO_FC_AG, + SOCINFO_FC_AH, +}; + +/* Internal feature codes */ +/* Valid values: 0 <= n <= 0xf */ +#define SOCINFO_FC_Yn(n) (0xf1 + (n)) +#define SOCINFO_FC_INT_MAX SOCINFO_FC_Yn(0xf) + +/* Product codes */ +#define SOCINFO_PC_UNKNOWN 0 +#define SOCINFO_PCn(n) ((n) + 1) +#define SOCINFO_PC_RESERVE (BIT(31) - 1) + #endif -- cgit v1.2.3 From 5878853fc9389e7d0988d4b465a415cf96fd14fa Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Thu, 20 Jun 2024 14:27:20 +0200 Subject: dmaengine: Add API function dmaengine_prep_peripheral_dma_vec() This function can be used to initiate a scatter-gather DMA transfer, where the address and size of each segment is located in one entry of the dma_vec array. The major difference with dmaengine_prep_slave_sg() is that it supports specifying the lengths of each DMA transfer; as trying to override the length of the transfer with dmaengine_prep_slave_sg() is a very tedious process. The introduction of a new API function is also justified by the fact that scatterlists are on their way out. Note that dmaengine_prep_interleaved_dma() is not helpful either in that case, as it assumes that the address of each segment will be higher than the one of the previous segment, which we just cannot guarantee in case of a scatter-gather transfer. Signed-off-by: Paul Cercueil Co-developed-by: Nuno Sa Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240620122726.41232-2-paul@crapouillou.net Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 752dbde4cec1..9fc03068cabc 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -160,6 +160,16 @@ struct dma_interleaved_template { struct data_chunk sgl[]; }; +/** + * struct dma_vec - DMA vector + * @addr: Bus address of the start of the vector + * @len: Length in bytes of the DMA vector + */ +struct dma_vec { + dma_addr_t addr; + size_t len; +}; + /** * enum dma_ctrl_flags - DMA flags to augment operation preparation, * control completion, and communicate status. @@ -910,6 +920,10 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)( struct dma_chan *chan, unsigned long flags); + struct dma_async_tx_descriptor *(*device_prep_peripheral_dma_vec)( + struct dma_chan *chan, const struct dma_vec *vecs, + size_t nents, enum dma_transfer_direction direction, + unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_slave_sg)( struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, enum dma_transfer_direction direction, @@ -973,6 +987,25 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_single( dir, flags, NULL); } +/** + * dmaengine_prep_peripheral_dma_vec() - Prepare a DMA scatter-gather descriptor + * @chan: The channel to be used for this descriptor + * @vecs: The array of DMA vectors that should be transferred + * @nents: The number of DMA vectors in the array + * @dir: Specifies the direction of the data transfer + * @flags: DMA engine flags + */ +static inline struct dma_async_tx_descriptor *dmaengine_prep_peripheral_dma_vec( + struct dma_chan *chan, const struct dma_vec *vecs, size_t nents, + enum dma_transfer_direction dir, unsigned long flags) +{ + if (!chan || !chan->device || !chan->device->device_prep_peripheral_dma_vec) + return NULL; + + return chan->device->device_prep_peripheral_dma_vec(chan, vecs, nents, + dir, flags); +} + static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_sg( struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, enum dma_transfer_direction dir, unsigned long flags) -- cgit v1.2.3 From 5f2e950755c76c6fc20fc4ebd8355671fbbd7ac0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 13 Jun 2024 16:48:36 +0200 Subject: leds: core: Introduce led_get_color_name() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is similar to the existing led_colors[] array but is safer to use and usable by everyone. Getting string representations of color ids is useful for drivers which are handling color IDs anyways, for example for the multicolor API. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20240613-cros_ec-led-v3-1-500b50f41e0f@weissschuh.net Signed-off-by: Lee Jones --- include/linux/leds.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 6300313c46b7..dedea965afbf 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -427,6 +427,16 @@ void led_sysfs_enable(struct led_classdev *led_cdev); int led_compose_name(struct device *dev, struct led_init_data *init_data, char *led_classdev_name); +/** + * led_get_color_name - get string representation of color ID + * @color_id: The LED_COLOR_ID_* constant + * + * Get the string name of a LED_COLOR_ID_* constant. + * + * Returns: A string constant or NULL on an invalid ID. + */ +const char *led_get_color_name(u8 color_id); + /** * led_sysfs_is_disabled - check if LED sysfs interface is disabled * @led_cdev: the LED to query -- cgit v1.2.3 From 1cbf347288702af0fe8667c0ce760afbe982a2f1 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 14 Jun 2024 11:14:18 +0300 Subject: i2c: Add nop fwnode operations Add nop variants of i2c_find_device_by_fwnode(), i2c_find_adapter_by_fwnode() and i2c_get_adapter_by_fwnode() for use without CONFIG_I2C. Signed-off-by: Sakari Ailus Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 9709537370ee..424acb98c7c2 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -960,8 +960,6 @@ int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr); #define builtin_i2c_driver(__i2c_driver) \ builtin_driver(__i2c_driver, i2c_add_driver) -#endif /* I2C */ - /* must call put_device() when done with returned i2c_client device */ struct i2c_client *i2c_find_device_by_fwnode(struct fwnode_handle *fwnode); @@ -971,6 +969,28 @@ struct i2c_adapter *i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode); /* must call i2c_put_adapter() when done with returned i2c_adapter device */ struct i2c_adapter *i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode); +#else /* I2C */ + +static inline struct i2c_client * +i2c_find_device_by_fwnode(struct fwnode_handle *fwnode) +{ + return NULL; +} + +static inline struct i2c_adapter * +i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode) +{ + return NULL; +} + +static inline struct i2c_adapter * +i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode) +{ + return NULL; +} + +#endif /* !I2C */ + #if IS_ENABLED(CONFIG_OF) /* must call put_device() when done with returned i2c_client device */ static inline struct i2c_client *of_find_i2c_device_by_node(struct device_node *node) -- cgit v1.2.3 From 6ae0092ca7adfce79336c6525cb0da561ecd4f04 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 14 Jun 2024 14:16:05 -0700 Subject: thermal: intel: intel_tcc: Add model checks for temperature registers The register MSR_TEMPERATURE_TARGET is not architectural. Its fields may be defined differently for each processor model. TCC_OFFSET is an example of such case. Despite being specified as architectural, the registers IA32_[PACKAGE]_ THERM_STATUS have become model-specific: in recent processors, the digital temperature readout uses bits [23:16] whereas the Intel Software Developer's manual specifies bits [22:16]. Create an array of processor models and their bitmasks for TCC_OFFSET and the digital temperature readout fields. Do not include recent processors. Instead, use the bitmasks of these recent processors as default. Use these model-specific bitmasks when reading TCC_OFFSET or the temperature sensors. Initialize a model-specific data structure during subsys_initcall() to have it ready when thermal drivers are loaded. Expose the new interface intel_tcc_get_offset_mask(). The intel_tcc_cooling driver will use it. Reviewed-by: Zhang Rui Signed-off-by: Ricardo Neri Link: https://patch.msgid.link/20240614211606.5896-2-ricardo.neri-calderon@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/intel_tcc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/intel_tcc.h b/include/linux/intel_tcc.h index 8ff8eabb4a98..fa788817acfc 100644 --- a/include/linux/intel_tcc.h +++ b/include/linux/intel_tcc.h @@ -14,5 +14,6 @@ int intel_tcc_get_tjmax(int cpu); int intel_tcc_get_offset(int cpu); int intel_tcc_set_offset(int cpu, int offset); int intel_tcc_get_temp(int cpu, int *temp, bool pkg); +u32 intel_tcc_get_offset_mask(void); #endif /* __INTEL_TCC_H__ */ -- cgit v1.2.3 From b6cfe2287df6e26d685af8a8a96ed1bf87bdde28 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 21 Jun 2024 12:15:05 +0900 Subject: block: Define bdev_nr_zones() as an inline function There is no need for bdev_nr_zones() to be an exported function calculating the number of zones of a block device. Instead, given that all callers use this helper with a fully initialized block device that has a gendisk, we can redefine this function as an inline helper in blkdev.h. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240621031506.759397-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2800231046a9..1078a7d51295 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -673,7 +673,6 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) } #ifdef CONFIG_BLK_DEV_ZONED -unsigned int bdev_nr_zones(struct block_device *bdev); static inline unsigned int disk_nr_zones(struct gendisk *disk) { @@ -687,6 +686,11 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) return sector >> ilog2(disk->queue->limits.chunk_sectors); } +static inline unsigned int bdev_nr_zones(struct block_device *bdev) +{ + return disk_nr_zones(bdev->bd_disk); +} + static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { return bdev->bd_disk->queue->limits.max_open_zones; -- cgit v1.2.3 From caaf7101c01a91a882d3da2f566579dda692367d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 21 Jun 2024 12:15:06 +0900 Subject: block: Cleanup block device zone helpers There is no need to conditionally define on CONFIG_BLK_DEV_ZONED the inline helper functions bdev_nr_zones(), bdev_max_open_zones(), bdev_max_active_zones() and disk_zone_no() as these function will return the correct valu in all cases (zoned device or not, including when CONFIG_BLK_DEV_ZONED is not set). Furthermore, disk_nr_zones() definition can be simplified as disk->nr_zones is always 0 for regular block devices. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240621031506.759397-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 44 ++++++++++++-------------------------------- 1 file changed, 12 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1078a7d51295..e89003360c17 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -673,11 +673,21 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) } #ifdef CONFIG_BLK_DEV_ZONED - static inline unsigned int disk_nr_zones(struct gendisk *disk) { - return blk_queue_is_zoned(disk->queue) ? disk->nr_zones : 0; + return disk->nr_zones; +} +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); +#else /* CONFIG_BLK_DEV_ZONED */ +static inline unsigned int disk_nr_zones(struct gendisk *disk) +{ + return 0; +} +static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) +{ + return false; } +#endif /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) { @@ -701,36 +711,6 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) return bdev->bd_disk->queue->limits.max_active_zones; } -bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); -#else /* CONFIG_BLK_DEV_ZONED */ -static inline unsigned int bdev_nr_zones(struct block_device *bdev) -{ - return 0; -} - -static inline unsigned int disk_nr_zones(struct gendisk *disk) -{ - return 0; -} -static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) -{ - return 0; -} -static inline unsigned int bdev_max_open_zones(struct block_device *bdev) -{ - return 0; -} - -static inline unsigned int bdev_max_active_zones(struct block_device *bdev) -{ - return 0; -} -static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) -{ - return false; -} -#endif /* CONFIG_BLK_DEV_ZONED */ - static inline unsigned int blk_queue_depth(struct request_queue *q) { if (q->queue_depth) -- cgit v1.2.3 From f80a55fa90fa76d01e3fffaa5d0413e522ab9a00 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 17 Jun 2024 09:27:27 +0200 Subject: nvme: fixup comment for nvme RDMA Provider Type PRTYPE is the provider type, not the QP service type. Fixes: eb793e2c9286 ("nvme.h: add NVMe over Fabrics definitions") Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 425573202295..69ac2abf8acf 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -87,8 +87,8 @@ enum { NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ }; -/* RDMA QP Service Type codes for Discovery Log Page entry TSAS - * RDMA_QPTYPE field +/* RDMA Provider Type codes for Discovery Log Page entry TSAS + * RDMA_PRTYPE field */ enum { NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */ -- cgit v1.2.3 From 0f1f5803920d2a6b88bee950914fd37421e17170 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 17 Jun 2024 09:27:28 +0200 Subject: nvmet: make 'tsas' attribute idempotent for RDMA The RDMA transport defines values for TSAS, but it cannot be changed as we only support the 'connected' mode. So to avoid errors during reconfiguration we should allow to write the current value. Fixes: 3f123494db72 ("nvmet: make TCP sectype settable via configfs") Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 69ac2abf8acf..c693ac344ec0 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -85,6 +85,7 @@ enum { enum { NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */ NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ + NVMF_RDMA_QPTYPE_INVALID = 0xff, }; /* RDMA Provider Type codes for Discovery Log Page entry TSAS @@ -110,6 +111,7 @@ enum { NVMF_TCP_SECTYPE_NONE = 0, /* No Security */ NVMF_TCP_SECTYPE_TLS12 = 1, /* TLSv1.2, NVMe-oF 1.1 and NVMe-TCP 3.6.1.1 */ NVMF_TCP_SECTYPE_TLS13 = 2, /* TLSv1.3, NVMe-oF 1.1 and NVMe-TCP 3.6.1.1 */ + NVMF_TCP_SECTYPE_INVALID = 0xff, }; #define NVME_AQ_DEPTH 32 -- cgit v1.2.3 From 9403408e122628a6dc7154a95716f00dae3747c7 Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Mon, 17 Jun 2024 17:16:15 +0100 Subject: tick: Remove unnused tick_nohz_get_idle_calls() The function returns the idle calls counter for the current cpu and therefore usually isn't what the caller wants. It is unnused since commit 466a2b42d676 ("cpufreq: schedutil: Use idle_calls counter of the remote CPU") Signed-off-by: Christian Loehle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240617161615.49309-1-christian.loehle@arm.com --- include/linux/tick.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index 4924a33700b7..72744638c5b0 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -139,7 +139,6 @@ extern void tick_nohz_irq_exit(void); extern bool tick_nohz_idle_got_tick(void); extern ktime_t tick_nohz_get_next_hrtimer(void); extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next); -extern unsigned long tick_nohz_get_idle_calls(void); extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); -- cgit v1.2.3 From 4df13a6871d9e97aeeef72244e9a954c5cf11f54 Mon Sep 17 00:00:00 2001 From: Ben Segal Date: Wed, 19 Jun 2024 13:58:46 +0200 Subject: vfio/pci: Support 8-byte PCI loads and stores Many PCI adapters can benefit or even require full 64bit read and write access to their registers. In order to enable work on user-space drivers for these devices add two new variations vfio_pci_core_io{read|write}64 of the existing access methods when the architecture supports 64-bit ioreads and iowrites. Signed-off-by: Ben Segal Co-developed-by: Gerd Bayer Signed-off-by: Gerd Bayer Link: https://lore.kernel.org/r/20240619115847.1344875-3-gbayer@linux.ibm.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index f87067438ed4..7b45f70f84c3 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -155,5 +155,8 @@ int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \ VFIO_IOREAD_DECLATION(8) VFIO_IOREAD_DECLATION(16) VFIO_IOREAD_DECLATION(32) +#ifdef ioread64 +VFIO_IOREAD_DECLATION(64) +#endif #endif /* VFIO_PCI_CORE_H */ -- cgit v1.2.3 From abe8103da3c59415b790fa260a6676c7bf252a65 Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Wed, 19 Jun 2024 13:58:47 +0200 Subject: vfio/pci: Fix typo in macro to declare accessors Correct spelling of DECLA[RA]TION Suggested-by: Ramesh Thomas Signed-off-by: Gerd Bayer Link: https://lore.kernel.org/r/20240619115847.1344875-4-gbayer@linux.ibm.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 7b45f70f84c3..fbb472dd99b3 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -137,26 +137,26 @@ bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, loff_t *buf_offset, size_t *intersect_count, size_t *register_offset); -#define VFIO_IOWRITE_DECLATION(size) \ +#define VFIO_IOWRITE_DECLARATION(size) \ int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev, \ bool test_mem, u##size val, void __iomem *io); -VFIO_IOWRITE_DECLATION(8) -VFIO_IOWRITE_DECLATION(16) -VFIO_IOWRITE_DECLATION(32) +VFIO_IOWRITE_DECLARATION(8) +VFIO_IOWRITE_DECLARATION(16) +VFIO_IOWRITE_DECLARATION(32) #ifdef iowrite64 -VFIO_IOWRITE_DECLATION(64) +VFIO_IOWRITE_DECLARATION(64) #endif -#define VFIO_IOREAD_DECLATION(size) \ +#define VFIO_IOREAD_DECLARATION(size) \ int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \ bool test_mem, u##size *val, void __iomem *io); -VFIO_IOREAD_DECLATION(8) -VFIO_IOREAD_DECLATION(16) -VFIO_IOREAD_DECLATION(32) +VFIO_IOREAD_DECLARATION(8) +VFIO_IOREAD_DECLARATION(16) +VFIO_IOREAD_DECLARATION(32) #ifdef ioread64 -VFIO_IOREAD_DECLATION(64) +VFIO_IOREAD_DECLARATION(64) #endif #endif /* VFIO_PCI_CORE_H */ -- cgit v1.2.3 From 90c3e2bc9ecbf3210d661fadee4c947d34a88ceb Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 30 Apr 2024 11:43:16 +0100 Subject: firmware: qcom_scm: Add gpu_init_regs call This will used by drm/msm to initialize GPU registers that Qualcomm's firmware doesn't make writeable to the kernel. Reviewed-by: Dmitry Baryshkov Signed-off-by: Connor Abbott Reviewed-by: Konrad Dybcio Acked-by: Bjorn Andersson Patchwork: https://patchwork.freedesktop.org/patch/592039/ Signed-off-by: Rob Clark --- include/linux/firmware/qcom/qcom_scm.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h index aaa19f93ac43..a221a643dc12 100644 --- a/include/linux/firmware/qcom/qcom_scm.h +++ b/include/linux/firmware/qcom/qcom_scm.h @@ -115,6 +115,29 @@ int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, int qcom_scm_lmh_profile_change(u32 profile_id); bool qcom_scm_lmh_dcvsh_available(void); +/* + * Request TZ to program set of access controlled registers necessary + * irrespective of any features + */ +#define QCOM_SCM_GPU_ALWAYS_EN_REQ BIT(0) +/* + * Request TZ to program BCL id to access controlled register when BCL is + * enabled + */ +#define QCOM_SCM_GPU_BCL_EN_REQ BIT(1) +/* + * Request TZ to program set of access controlled register for CLX feature + * when enabled + */ +#define QCOM_SCM_GPU_CLX_EN_REQ BIT(2) +/* + * Request TZ to program tsense ids to access controlled registers for reading + * gpu temperature sensors + */ +#define QCOM_SCM_GPU_TSENSE_EN_REQ BIT(3) + +int qcom_scm_gpu_init_regs(u32 gpu_req); + #ifdef CONFIG_QCOM_QSEECOM int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id); -- cgit v1.2.3 From d4e48e3dd45017abdd69a19285d197de897ef44f Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 20 Jun 2024 10:17:29 +0100 Subject: module, bpf: Store BTF base pointer in struct module ...as this will allow split BTF modules with a base BTF representation (rather than the full vmlinux BTF at time of BTF encoding) to resolve their references to kernel types in a way that is more resilient to small changes in kernel types. This will allow modules that are not built every time the kernel is to provide more resilient BTF, rather than have it invalidated every time BTF ids for core kernel types change. Fields are ordered to avoid holes in struct module. Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Reviewed-by: Luis Chamberlain Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240620091733.1967885-3-alan.maguire@oracle.com --- include/linux/module.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index ffa1c603163c..b79d926cae8a 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -509,7 +509,9 @@ struct module { #endif #ifdef CONFIG_DEBUG_INFO_BTF_MODULES unsigned int btf_data_size; + unsigned int btf_base_data_size; void *btf_data; + void *btf_base_data; #endif #ifdef CONFIG_JUMP_LABEL struct jump_entry *jump_entries; -- cgit v1.2.3 From 8646db238997df36c6ad71a9d7e0b52ceee221b2 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 20 Jun 2024 10:17:31 +0100 Subject: libbpf,bpf: Share BTF relocate-related code with kernel Share relocation implementation with the kernel. As part of this, we also need the type/string iteration functions so also share btf_iter.c file. Relocation code in kernel and userspace is identical save for the impementation of the reparenting of split BTF to the relocated base BTF and retrieval of the BTF header from "struct btf"; these small functions need separate user-space and kernel implementations for the separate "struct btf"s they operate upon. One other wrinkle on the kernel side is we have to map .BTF.ids in modules as they were generated with the type ids used at BTF encoding time. btf_relocate() optionally returns an array mapping from old BTF ids to relocated ids, so we use that to fix up these references where needed for kfuncs. Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240620091733.1967885-5-alan.maguire@oracle.com --- include/linux/btf.h | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 56d91daacdba..d199fa17abb4 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -140,6 +140,7 @@ extern const struct file_operations btf_fops; const char *btf_get_name(const struct btf *btf); void btf_get(struct btf *btf); void btf_put(struct btf *btf); +const struct btf_header *btf_header(const struct btf *btf); int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz); struct btf *btf_get_by_fd(int fd); int btf_get_info_by_fd(const struct btf *btf, @@ -212,8 +213,10 @@ int btf_get_fd_by_id(u32 id); u32 btf_obj_id(const struct btf *btf); bool btf_is_kernel(const struct btf *btf); bool btf_is_module(const struct btf *btf); +bool btf_is_vmlinux(const struct btf *btf); struct module *btf_try_get_module(const struct btf *btf); u32 btf_nr_types(const struct btf *btf); +struct btf *btf_base_btf(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); @@ -339,6 +342,11 @@ static inline u8 btf_int_offset(const struct btf_type *t) return BTF_INT_OFFSET(*(u32 *)(t + 1)); } +static inline __u8 btf_int_bits(const struct btf_type *t) +{ + return BTF_INT_BITS(*(__u32 *)(t + 1)); +} + static inline bool btf_type_is_scalar(const struct btf_type *t) { return btf_type_is_int(t) || btf_type_is_enum(t); @@ -478,6 +486,11 @@ static inline struct btf_param *btf_params(const struct btf_type *t) return (struct btf_param *)(t + 1); } +static inline struct btf_decl_tag *btf_decl_tag(const struct btf_type *t) +{ + return (struct btf_decl_tag *)(t + 1); +} + static inline int btf_id_cmp_func(const void *a, const void *b) { const int *pa = a, *pb = b; @@ -515,9 +528,38 @@ static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf * } #endif +enum btf_field_iter_kind { + BTF_FIELD_ITER_IDS, + BTF_FIELD_ITER_STRS, +}; + +struct btf_field_desc { + /* once-per-type offsets */ + int t_off_cnt, t_offs[2]; + /* member struct size, or zero, if no members */ + int m_sz; + /* repeated per-member offsets */ + int m_off_cnt, m_offs[1]; +}; + +struct btf_field_iter { + struct btf_field_desc desc; + void *p; + int m_idx; + int off_idx; + int vlen; +}; + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); +void btf_set_base_btf(struct btf *btf, const struct btf *base_btf); +int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **map_ids); +int btf_field_iter_init(struct btf_field_iter *it, struct btf_type *t, + enum btf_field_iter_kind iter_kind); +__u32 *btf_field_iter_next(struct btf_field_iter *it); + const char *btf_name_by_offset(const struct btf *btf, u32 offset); +const char *btf_str_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id, @@ -544,6 +586,28 @@ static inline const struct btf_type *btf_type_by_id(const struct btf *btf, { return NULL; } + +static inline void btf_set_base_btf(struct btf *btf, const struct btf *base_btf) +{ +} + +static inline int btf_relocate(void *log, struct btf *btf, const struct btf *base_btf, + __u32 **map_ids) +{ + return -EOPNOTSUPP; +} + +static inline int btf_field_iter_init(struct btf_field_iter *it, struct btf_type *t, + enum btf_field_iter_kind iter_kind) +{ + return -EOPNOTSUPP; +} + +static inline __u32 *btf_field_iter_next(struct btf_field_iter *it) +{ + return NULL; +} + static inline const char *btf_name_by_offset(const struct btf *btf, u32 offset) { -- cgit v1.2.3 From d4a0055fdc22381fa256e345095e88d134e354c5 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 21 Jun 2024 15:51:31 -0500 Subject: spi: add devm_spi_optimize_message() helper This adds a new helper function devm_spi_optimize_message() that automatically registers spi_unoptimize_message() to be called when the device is removed. Signed-off-by: David Lechner Link: https://patch.msgid.link/20240621-devm_spi_optimize_message-v1-2-3f9dcba6e95e@baylibre.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e8e1e798924f..e6e5978f7564 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1268,6 +1268,8 @@ static inline void spi_message_free(struct spi_message *m) extern int spi_optimize_message(struct spi_device *spi, struct spi_message *msg); extern void spi_unoptimize_message(struct spi_message *msg); +extern int devm_spi_optimize_message(struct device *dev, struct spi_device *spi, + struct spi_message *msg); extern int spi_setup(struct spi_device *spi); extern int spi_async(struct spi_device *spi, struct spi_message *message); -- cgit v1.2.3 From 06efa5f30c28eaf237247ca8c4cb46eb62cb6bd9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Jun 2024 21:38:58 -0400 Subject: closures: closure_get_not_zero(), closure_return_sync() Provide new primitives for solving a lifetime issue with bcachefs btree_trans objects. closure_sync_return(): like closure_sync(), wait synchronously for any outstanding gets. like closure_return, the closure is considered "finished" and the ref left at 0. closure_get_not_zero(): get a ref on a closure if it's alive, i.e. the ref is not zero. Signed-off-by: Kent Overstreet --- include/linux/closure.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/closure.h b/include/linux/closure.h index 99155df162d0..59b8c06b11ff 100644 --- a/include/linux/closure.h +++ b/include/linux/closure.h @@ -284,6 +284,21 @@ static inline void closure_get(struct closure *cl) #endif } +/** + * closure_get_not_zero + */ +static inline bool closure_get_not_zero(struct closure *cl) +{ + unsigned old = atomic_read(&cl->remaining); + do { + if (!(old & CLOSURE_REMAINING_MASK)) + return false; + + } while (!atomic_try_cmpxchg_acquire(&cl->remaining, &old, old + 1)); + + return true; +} + /** * closure_init - Initialize a closure, setting the refcount to 1 * @cl: closure to initialize @@ -310,6 +325,12 @@ static inline void closure_init_stack(struct closure *cl) atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); } +static inline void closure_init_stack_release(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +} + /** * closure_wake_up - wake up all closures on a wait list, * with memory barrier @@ -355,6 +376,8 @@ do { \ */ #define closure_return(_cl) continue_at((_cl), NULL, NULL) +void closure_return_sync(struct closure *cl); + /** * continue_at_nobarrier - jump to another function without barrier * -- cgit v1.2.3 From 467cfe945656df044c8cf9121e5cdbe5b977b497 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 19 Feb 2024 13:43:55 +0200 Subject: accel/habanalabs/gaudi2: align embedded specs headers Align embedded headers to latest release. Reviewed-by: Tomer Tayar Signed-off-by: Ofir Bitton --- include/linux/habanalabs/cpucp_if.h | 10 ++++++++-- include/linux/habanalabs/hl_boot_if.h | 29 ++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index f316c8d0f3fc..1ac1d68193e3 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -42,6 +42,12 @@ enum eq_event_id { EQ_EVENT_PWR_BRK_ENTRY, EQ_EVENT_PWR_BRK_EXIT, EQ_EVENT_HEARTBEAT, + EQ_EVENT_CPLD_RESET_REASON, + EQ_EVENT_CPLD_SHUTDOWN, + EQ_EVENT_POWER_EVT_START, + EQ_EVENT_POWER_EVT_END, + EQ_EVENT_THERMAL_EVT_START, + EQ_EVENT_THERMAL_EVT_END, }; /* @@ -1165,7 +1171,7 @@ struct cpucp_security_info { struct cpucp_info { struct cpucp_sensor sensors[CPUCP_MAX_SENSORS]; __u8 kernel_version[VERSION_MAX_LEN]; - __le32 reserved; + __le32 reserved1; __le32 card_type; __le32 card_location; __le32 cpld_version; @@ -1187,7 +1193,7 @@ struct cpucp_info { __u8 substrate_version; __u8 eq_health_check_supported; struct cpucp_security_info sec_info; - __le32 fw_hbm_region_size; + __le32 reserved2; __u8 pll_map[PLL_MAP_LEN]; __le64 mme_binning_mask; __u8 fw_os_version[VERSION_MAX_LEN]; diff --git a/include/linux/habanalabs/hl_boot_if.h b/include/linux/habanalabs/hl_boot_if.h index 93366d5621fd..d2a9fc96424b 100644 --- a/include/linux/habanalabs/hl_boot_if.h +++ b/include/linux/habanalabs/hl_boot_if.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * Copyright 2018-2020 HabanaLabs, Ltd. + * Copyright 2018-2023 HabanaLabs, Ltd. * All Rights Reserved. * */ @@ -49,7 +49,6 @@ enum cpu_boot_err { #define CPU_BOOT_ERR_FATAL_MASK \ ((1 << CPU_BOOT_ERR_DRAM_INIT_FAIL) | \ (1 << CPU_BOOT_ERR_PLL_FAIL) | \ - (1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL) | \ (1 << CPU_BOOT_ERR_BINNING_FAIL) | \ (1 << CPU_BOOT_ERR_DRAM_SKIPPED) | \ (1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) | \ @@ -194,6 +193,8 @@ enum cpu_boot_dev_sts { CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN = 24, CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN = 25, CPU_BOOT_DEV_STS_MAP_HWMON_EN = 26, + CPU_BOOT_DEV_STS_NIC_MEM_CLEAR_EN = 27, + CPU_BOOT_DEV_STS_MMU_PGTBL_DRAM_EN = 28, CPU_BOOT_DEV_STS_ENABLED = 31, CPU_BOOT_DEV_STS_SCND_EN = 63, CPU_BOOT_DEV_STS_LAST = 64 /* we have 2 registers of 32 bits */ @@ -331,6 +332,17 @@ enum cpu_boot_dev_sts { * HWMON enum mapping to cpucp enums. * Initialized in: linux * + * CPU_BOOT_DEV_STS0_NIC_MEM_CLEAR_EN + * If set, means f/w supports nic hbm memory clear and + * tmr,txs hbm memory init. + * Initialized in: zephyr-mgmt + * + * CPU_BOOT_DEV_STS_MMU_PGTBL_DRAM_EN + * MMU page tables are located in DRAM. + * F/W initializes security settings for MMU + * page tables to reside in DRAM. + * Initialized in: zephyr-mgmt + * * CPU_BOOT_DEV_STS0_ENABLED Device status register enabled. * This is a main indication that the * running FW populates the device status @@ -367,6 +379,8 @@ enum cpu_boot_dev_sts { #define CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN (1 << CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN) #define CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN (1 << CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN) #define CPU_BOOT_DEV_STS0_MAP_HWMON_EN (1 << CPU_BOOT_DEV_STS_MAP_HWMON_EN) +#define CPU_BOOT_DEV_STS0_NIC_MEM_CLEAR_EN (1 << CPU_BOOT_DEV_STS_NIC_MEM_CLEAR_EN) +#define CPU_BOOT_DEV_STS0_MMU_PGTBL_DRAM_EN (1 << CPU_BOOT_DEV_STS_MMU_PGTBL_DRAM_EN) #define CPU_BOOT_DEV_STS0_ENABLED (1 << CPU_BOOT_DEV_STS_ENABLED) #define CPU_BOOT_DEV_STS1_ENABLED (1 << CPU_BOOT_DEV_STS_ENABLED) @@ -450,11 +464,11 @@ struct cpu_dyn_regs { __le32 gic_dma_core_irq_ctrl; __le32 gic_host_halt_irq; __le32 gic_host_ints_irq; - __le32 gic_host_soft_rst_irq; + __le32 reserved0; __le32 gic_rot_qm_irq_ctrl; - __le32 cpu_rst_status; + __le32 reserved1; __le32 eng_arc_irq_ctrl; - __le32 reserved1[20]; /* reserve for future use */ + __le32 reserved2[20]; /* reserve for future use */ }; /* TODO: remove the desc magic after the code is updated to use message */ @@ -551,8 +565,9 @@ enum lkd_fw_ascii_msg_lvls { LKD_FW_ASCII_MSG_DBG = 3, }; -#define LKD_FW_ASCII_MSG_MAX_LEN 128 -#define LKD_FW_ASCII_MSG_MAX 4 /* consider ABI when changing */ +#define LKD_FW_ASCII_MSG_MAX_LEN 128 +#define LKD_FW_ASCII_MSG_MAX 4 /* consider ABI when changing */ +#define LKD_FW_ASCII_MSG_MIN_DESC_VERSION 3 struct lkd_fw_ascii_msg { __u8 valid; -- cgit v1.2.3 From c2a27584ff3b7f17da3ed2abfa3956f9d376e3da Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 11 Mar 2024 18:31:10 +0200 Subject: accel/habanalabs: separate nonce from max_size in cpucp_packet struct In struct cpucp_packet both nonce and data_max_size members are in an union overlapping each other. This is a problem as they both are used in attestation and info_signed packets. The solution here is to move the nonce member to a different union under the same structure. Signed-off-by: Dani Liberman Reviewed-by: Ofir Bitton Signed-off-by: Ofir Bitton --- include/linux/habanalabs/cpucp_if.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index 1ac1d68193e3..0913415243e8 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -859,9 +859,6 @@ struct cpucp_packet { * result cannot be used to hold general purpose data. */ __le32 status_mask; - - /* random, used once number, for security packets */ - __le32 nonce; }; union { @@ -870,6 +867,9 @@ struct cpucp_packet { /* For Generic packet sub index */ __le32 pkt_subidx; + + /* random, used once number, for security packets */ + __le32 nonce; }; }; -- cgit v1.2.3 From cebb64f9335b073cc2877fed16c613c56bed82bc Mon Sep 17 00:00:00 2001 From: Vitaly Margolin Date: Mon, 27 May 2024 17:25:37 +0300 Subject: accel/habanalabs: add cpld ts cpld_timestamp cpucp Add cpld_timestamp field to cpucp_info structure and return cpld timestamp as part of cpld version Signed-off-by: Vitaly Margolin Reviewed-by: Ofir Bitton Signed-off-by: Ofir Bitton --- include/linux/habanalabs/cpucp_if.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index 0913415243e8..1ed17887f1a8 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -1146,6 +1146,7 @@ struct cpucp_security_info { * (0 = fully functional, 1 = lower-half is not functional, * 2 = upper-half is not functional) * @sec_info: security information + * @cpld_timestamp: CPLD programmed F/W timestamp. * @pll_map: Bit map of supported PLLs for current ASIC version. * @mme_binning_mask: MME binning mask, * bits [0:6] <==> dcore0 mme fma @@ -1193,7 +1194,7 @@ struct cpucp_info { __u8 substrate_version; __u8 eq_health_check_supported; struct cpucp_security_info sec_info; - __le32 reserved2; + __le32 cpld_timestamp; __u8 pll_map[PLL_MAP_LEN]; __le64 mme_binning_mask; __u8 fw_os_version[VERSION_MAX_LEN]; -- cgit v1.2.3 From c4548eee537ed7671170eb370c6c8db12dcfd3e8 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Tue, 16 Apr 2024 19:01:42 +0300 Subject: accel/habanalabs: dump the EQ entries headers on EQ heartbeat failure Add a dump of the EQ entries headers upon a EQ heartbeat failure. Signed-off-by: Tomer Tayar Reviewed-by: Ofir Bitton Signed-off-by: Ofir Bitton --- include/linux/habanalabs/cpucp_if.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index 1ed17887f1a8..7ed3fdd55dda 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -397,6 +397,9 @@ struct hl_eq_entry { #define EQ_CTL_READY_SHIFT 31 #define EQ_CTL_READY_MASK 0x80000000 +#define EQ_CTL_EVENT_MODE_SHIFT 28 +#define EQ_CTL_EVENT_MODE_MASK 0x70000000 + #define EQ_CTL_EVENT_TYPE_SHIFT 16 #define EQ_CTL_EVENT_TYPE_MASK 0x0FFF0000 -- cgit v1.2.3 From 84f5a7b67b61bfeb0a939ddc5eca8586cae101de Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 27 May 2024 14:54:52 +0200 Subject: firmware: qcom: add a dedicated TrustZone buffer allocator We have several SCM calls that require passing buffers to the TrustZone on top of the SMC core which allocates memory for calls that require more than 4 arguments. Currently every user does their own thing which leads to code duplication. Many users call dma_alloc_coherent() for every call which is terribly unperformant (speed- and size-wise). Provide a set of library functions for creating and managing pools of memory which is suitable for sharing with the TrustZone, that is: page-aligned, contiguous and non-cachable as well as provides a way of mapping of kernel virtual addresses to physical space. Make the allocator ready for extending with additional modes of operation which will allow us to support the SHM bridge safety mechanism once all users convert. Signed-off-by: Bartosz Golaszewski Reviewed-by: Andrew Halaney Tested-by: Andrew Halaney # sc8280xp-lenovo-thinkpad-x13s Tested-by: Deepti Jaggi #sa8775p-ride Reviewed-by: Elliot Berman Link: https://lore.kernel.org/r/20240527-shm-bridge-v10-2-ce7afaa58d3a@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/firmware/qcom/qcom_tzmem.h | 56 ++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 include/linux/firmware/qcom/qcom_tzmem.h (limited to 'include/linux') diff --git a/include/linux/firmware/qcom/qcom_tzmem.h b/include/linux/firmware/qcom/qcom_tzmem.h new file mode 100644 index 000000000000..b83b63a0c049 --- /dev/null +++ b/include/linux/firmware/qcom/qcom_tzmem.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023-2024 Linaro Ltd. + */ + +#ifndef __QCOM_TZMEM_H +#define __QCOM_TZMEM_H + +#include +#include +#include + +struct device; +struct qcom_tzmem_pool; + +/** + * enum qcom_tzmem_policy - Policy for pool growth. + */ +enum qcom_tzmem_policy { + /**< Static pool, never grow above initial size. */ + QCOM_TZMEM_POLICY_STATIC = 1, + /**< When out of memory, add increment * current size of memory. */ + QCOM_TZMEM_POLICY_MULTIPLIER, + /**< When out of memory add as much as is needed until max_size. */ + QCOM_TZMEM_POLICY_ON_DEMAND, +}; + +/** + * struct qcom_tzmem_pool_config - TZ memory pool configuration. + * @initial_size: Number of bytes to allocate for the pool during its creation. + * @policy: Pool size growth policy. + * @increment: Used with policies that allow pool growth. + * @max_size: Size above which the pool will never grow. + */ +struct qcom_tzmem_pool_config { + size_t initial_size; + enum qcom_tzmem_policy policy; + size_t increment; + size_t max_size; +}; + +struct qcom_tzmem_pool * +qcom_tzmem_pool_new(const struct qcom_tzmem_pool_config *config); +void qcom_tzmem_pool_free(struct qcom_tzmem_pool *pool); +struct qcom_tzmem_pool * +devm_qcom_tzmem_pool_new(struct device *dev, + const struct qcom_tzmem_pool_config *config); + +void *qcom_tzmem_alloc(struct qcom_tzmem_pool *pool, size_t size, gfp_t gfp); +void qcom_tzmem_free(void *ptr); + +DEFINE_FREE(qcom_tzmem, void *, if (_T) qcom_tzmem_free(_T)) + +phys_addr_t qcom_tzmem_to_phys(void *ptr); + +#endif /* __QCOM_TZMEM */ -- cgit v1.2.3 From 6612103ec35af6058bb85ab24dae28e119b3c055 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 27 May 2024 14:54:59 +0200 Subject: firmware: qcom: qseecom: convert to using the TZ allocator Drop the DMA mapping operations from qcom_scm_qseecom_app_send() and convert all users of it in the qseecom module to using the TZ allocator for creating SCM call buffers. As this is largely a module separate from the SCM driver, let's use a separate memory pool. Set the initial size to 4K and - if we run out - add twice the current amount to the pool. Signed-off-by: Bartosz Golaszewski Reviewed-by: Elliot Berman Reviewed-by: Amirreza Zarrabi Link: https://lore.kernel.org/r/20240527-shm-bridge-v10-9-ce7afaa58d3a@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/firmware/qcom/qcom_qseecom.h | 8 ++++---- include/linux/firmware/qcom/qcom_scm.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/qcom/qcom_qseecom.h b/include/linux/firmware/qcom/qcom_qseecom.h index 366243ee9609..1dc5b3b50aa9 100644 --- a/include/linux/firmware/qcom/qcom_qseecom.h +++ b/include/linux/firmware/qcom/qcom_qseecom.h @@ -73,9 +73,9 @@ static inline void qseecom_dma_free(struct qseecom_client *client, size_t size, /** * qcom_qseecom_app_send() - Send to and receive data from a given QSEE app. * @client: The QSEECOM client associated with the target app. - * @req: DMA address of the request buffer sent to the app. + * @req: Request buffer sent to the app (must be TZ memory). * @req_size: Size of the request buffer. - * @rsp: DMA address of the response buffer, written to by the app. + * @rsp: Response buffer, written to by the app (must be TZ memory). * @rsp_size: Size of the response buffer. * * Sends a request to the QSEE app associated with the given client and read @@ -90,8 +90,8 @@ static inline void qseecom_dma_free(struct qseecom_client *client, size_t size, * Return: Zero on success, nonzero on failure. */ static inline int qcom_qseecom_app_send(struct qseecom_client *client, - dma_addr_t req, size_t req_size, - dma_addr_t rsp, size_t rsp_size) + void *req, size_t req_size, + void *rsp, size_t rsp_size) { return qcom_scm_qseecom_app_send(client->app_id, req, req_size, rsp, rsp_size); } diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h index a221a643dc12..77be72d40200 100644 --- a/include/linux/firmware/qcom/qcom_scm.h +++ b/include/linux/firmware/qcom/qcom_scm.h @@ -141,8 +141,8 @@ int qcom_scm_gpu_init_regs(u32 gpu_req); #ifdef CONFIG_QCOM_QSEECOM int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id); -int qcom_scm_qseecom_app_send(u32 app_id, dma_addr_t req, size_t req_size, - dma_addr_t rsp, size_t rsp_size); +int qcom_scm_qseecom_app_send(u32 app_id, void *req, size_t req_size, + void *rsp, size_t rsp_size); #else /* CONFIG_QCOM_QSEECOM */ @@ -152,8 +152,8 @@ static inline int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id) } static inline int qcom_scm_qseecom_app_send(u32 app_id, - dma_addr_t req, size_t req_size, - dma_addr_t rsp, size_t rsp_size) + void *req, size_t req_size, + void *rsp, size_t rsp_size) { return -EINVAL; } -- cgit v1.2.3 From 178e19c0df1b1b27668fc6ca43b25a03eda01dad Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 27 May 2024 14:55:00 +0200 Subject: firmware: qcom: scm: add support for SHM bridge operations SHM Bridge is a safety mechanism allowing to limit the amount of memory shared between the kernel and the TrustZone to regions explicitly marked as such. Add low-level primitives for enabling SHM bridge support as well as creating and destroying SHM bridges to qcom-scm. Signed-off-by: Bartosz Golaszewski Acked-by: Andrew Halaney Tested-by: Andrew Halaney # sc8280xp-lenovo-thinkpad-x13s Tested-by: Deepti Jaggi #sa8775p-ride Reviewed-by: Elliot Berman Link: https://lore.kernel.org/r/20240527-shm-bridge-v10-10-ce7afaa58d3a@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/firmware/qcom/qcom_scm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h index 77be72d40200..9f14976399ab 100644 --- a/include/linux/firmware/qcom/qcom_scm.h +++ b/include/linux/firmware/qcom/qcom_scm.h @@ -138,6 +138,12 @@ bool qcom_scm_lmh_dcvsh_available(void); int qcom_scm_gpu_init_regs(u32 gpu_req); +int qcom_scm_shm_bridge_enable(void); +int qcom_scm_shm_bridge_create(struct device *dev, u64 pfn_and_ns_perm_flags, + u64 ipfn_and_s_perm_flags, u64 size_and_flags, + u64 ns_vmids, u64 *handle); +int qcom_scm_shm_bridge_delete(struct device *dev, u64 handle); + #ifdef CONFIG_QCOM_QSEECOM int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id); -- cgit v1.2.3 From 2f50304e9efb69604040feadc13f9590be8cd391 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Tue, 4 Jun 2024 18:05:13 +0100 Subject: serial: sh-sci: Add support for RZ/V2H(P) SoC Add serial support for RZ/V2H(P) SoC with earlycon. The SCIF interface in the Renesas RZ/V2H(P) is similar to that available in the RZ/G2L (R9A07G044) SoC, with the following differences: - RZ/V2H(P) SoC has three additional interrupts: one for Tx end/Rx ready and two for Rx and Tx buffer full, all of which are edge-triggered. - RZ/V2H(P) supports asynchronous mode, whereas RZ/G2L supports both synchronous and asynchronous modes. - There are differences in the configuration of certain registers such as SCSMR, SCFCR, and SCSPTR between the two SoCs. To handle these differences on RZ/V2H(P) SoC SCIx_RZV2H_SCIF_REGTYPE is added. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240604170513.522631-6-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_sci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index 1c89611e0e06..0f2f50b8d28e 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -37,6 +37,7 @@ enum { SCIx_SH7705_SCIF_REGTYPE, SCIx_HSCIF_REGTYPE, SCIx_RZ_SCIFA_REGTYPE, + SCIx_RZV2H_SCIF_REGTYPE, SCIx_NR_REGTYPES, }; -- cgit v1.2.3 From c5603e2a621dac10c5e21cc430848ebcfa6c7e01 Mon Sep 17 00:00:00 2001 From: Doug Brown Date: Thu, 6 Jun 2024 12:56:31 -0700 Subject: Revert "serial: core: only stop transmit when HW fifo is empty" This reverts commit 7bfb915a597a301abb892f620fe5c283a9fdbd77. This commit broke pxa and omap-serial, because it inhibited them from calling stop_tx() if their TX FIFOs weren't completely empty. This resulted in these two drivers hanging during transmits because the TX interrupt would stay enabled, and a new TX interrupt would never fire. Cc: stable@vger.kernel.org Fixes: 7bfb915a597a ("serial: core: only stop transmit when HW fifo is empty") Signed-off-by: Doug Brown Link: https://lore.kernel.org/r/20240606195632.173255-2-doug@schmorgal.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 8cb65f50e830..3fb9a29e025f 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -811,8 +811,7 @@ enum UART_TX_FLAGS { if (pending < WAKEUP_CHARS) { \ uart_write_wakeup(__port); \ \ - if (!((flags) & UART_TX_NOSTOP) && pending == 0 && \ - __port->ops->tx_empty(__port)) \ + if (!((flags) & UART_TX_NOSTOP) && pending == 0) \ __port->ops->stop_tx(__port); \ } \ \ -- cgit v1.2.3 From 9bb43b9e8d9a288a214e9b17acc9e46fda3977cf Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Thu, 6 Jun 2024 12:56:32 -0700 Subject: serial: core: introduce uart_port_tx_limited_flags() Analogue to uart_port_tx_flags() introduced in commit 3ee07964d407 ("serial: core: introduce uart_port_tx_flags()"), add a _flags variant for uart_port_tx_limited(). Fixes: d11cc8c3c4b6 ("tty: serial: use uart_port_tx_limited()") Cc: stable@vger.kernel.org Signed-off-by: Jonas Gorski Signed-off-by: Doug Brown Link: https://lore.kernel.org/r/20240606195632.173255-3-doug@schmorgal.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 3fb9a29e025f..aea25eef9a1a 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -850,6 +850,24 @@ enum UART_TX_FLAGS { __count--); \ }) +/** + * uart_port_tx_limited_flags -- transmit helper for uart_port with count limiting with flags + * @port: uart port + * @ch: variable to store a character to be written to the HW + * @flags: %UART_TX_NOSTOP or similar + * @count: a limit of characters to send + * @tx_ready: can HW accept more data function + * @put_char: function to write a character + * @tx_done: function to call after the loop is done + * + * See uart_port_tx_limited() for more details. + */ +#define uart_port_tx_limited_flags(port, ch, flags, count, tx_ready, put_char, tx_done) ({ \ + unsigned int __count = (count); \ + __uart_port_tx(port, ch, flags, tx_ready, put_char, tx_done, __count, \ + __count--); \ +}) + /** * uart_port_tx -- transmit helper for uart_port * @port: uart port -- cgit v1.2.3 From 50cf5f3842af3135b88b041890e7e12a74425fcb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 6 Jun 2024 12:25:01 -0600 Subject: io_uring/msg_ring: add an alloc cache for io_kiocb entries With slab accounting, allocating and freeing memory has considerable overhead. Add a basic alloc cache for the io_kiocb allocations that msg_ring needs to do. Unlike other caches, this one is used by the sender, grabbing it from the remote ring. When the remote ring gets the posted completion, it'll free it locally. Hence it is separately locked, using ctx->msg_lock. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 1052a68fd68d..ede42dce1506 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -397,6 +397,9 @@ struct io_ring_ctx { struct callback_head poll_wq_task_work; struct list_head defer_list; + struct io_alloc_cache msg_cache; + spinlock_t msg_lock; + #ifdef CONFIG_NET_RX_BUSY_POLL struct list_head napi_list; /* track busy poll napi_id */ spinlock_t napi_lock; /* napi_list lock */ -- cgit v1.2.3 From 89f415b99050fbf7bcf3fca0cde30c09b62265eb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 20 Jun 2024 17:39:45 +0200 Subject: mfd: tmio: Remove obsolete .set_clk_div() callback Commit bef64d2908e825c5 ("mmc: remove tmio_mmc driver") removed the last user of the .set_clk_div() callback in the tmio_mmc_data structure. Signed-off-by: Geert Uytterhoeven Reviewed-by: Wolfram Sang Tested-by: Wolfram Sang Reviewed-by: Lad Prabhakar Acked-by: Lee Jones Link: https://lore.kernel.org/r/e0fa98f138a7b2836128178f8b3a757978517307.1718897545.git.geert+renesas@glider.be Signed-off-by: Ulf Hansson --- include/linux/mfd/tmio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index eace8ea6cda0..aca74ac1ff69 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -101,7 +101,6 @@ struct tmio_mmc_data { unsigned int max_blk_count; unsigned short max_segs; void (*set_pwr)(struct platform_device *host, int state); - void (*set_clk_div)(struct platform_device *host, int state); }; /* -- cgit v1.2.3 From f86937afb446d056e9e385da05536eb26483874c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 20 Jun 2024 17:39:46 +0200 Subject: mmc: tmio: Remove obsolete .set_pwr() callback() Commit ca78476e4888f1f1 ("mfd: Remove toshiba tmio drivers") removed the last users of the .set_pwr() callback in the tmio_mmc_data structure. Remove the callback, and all related infrastructure. Signed-off-by: Geert Uytterhoeven Acked-by: Lee Jones Reviewed-by: Wolfram Sang Tested-by: Wolfram Sang Reviewed-by: Lad Prabhakar Link: https://lore.kernel.org/r/fbbc13ddd19df2c40933ffa3b82fb14841bf1d4c.1718897545.git.geert+renesas@glider.be --- include/linux/mfd/tmio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index aca74ac1ff69..8c09d14a3a28 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -100,7 +100,6 @@ struct tmio_mmc_data { dma_addr_t dma_rx_offset; unsigned int max_blk_count; unsigned short max_segs; - void (*set_pwr)(struct platform_device *host, int state); }; /* -- cgit v1.2.3 From 4b8e88e563b5f666446d002ad0dc1e6e8e7102b0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jun 2024 11:34:09 +0200 Subject: ftruncate: pass a signed offset The old ftruncate() syscall, using the 32-bit off_t misses a sign extension when called in compat mode on 64-bit architectures. As a result, passing a negative length accidentally succeeds in truncating to file size between 2GiB and 4GiB. Changing the type of the compat syscall to the signed compat_off_t changes the behavior so it instead returns -EINVAL. The native entry point, the truncate() syscall and the corresponding loff_t based variants are all correct already and do not suffer from this mistake. Fixes: 3f6d078d4acc ("fix compat truncate/ftruncate") Reviewed-by: Christian Brauner Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann --- include/linux/compat.h | 2 +- include/linux/syscalls.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 233f61ec8afc..56cebaff0c91 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -608,7 +608,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf); asmlinkage long compat_sys_truncate(const char __user *, compat_off_t); -asmlinkage long compat_sys_ftruncate(unsigned int, compat_ulong_t); +asmlinkage long compat_sys_ftruncate(unsigned int, compat_off_t); /* No generic prototype for truncate64, ftruncate64, fallocate */ asmlinkage long compat_sys_openat(int dfd, const char __user *filename, int flags, umode_t mode); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 9104952d323d..ba9337709878 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -418,7 +418,7 @@ asmlinkage long sys_listmount(const struct mnt_id_req __user *req, u64 __user *mnt_ids, size_t nr_mnt_ids, unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); -asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length); +asmlinkage long sys_ftruncate(unsigned int fd, off_t length); #if BITS_PER_LONG == 32 asmlinkage long sys_truncate64(const char __user *path, loff_t length); asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length); -- cgit v1.2.3 From 118d777c4cb40f44e7b675955fd2da8b22f83913 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 17 Jun 2024 12:18:37 +0100 Subject: wordpart.h: Add REPEAT_BYTE_U32() In some cases it's necessary to replicate a byte across a u32 value, for which REPEAT_BYTE() would be helpful. Currently this requires explicit masking of the result to avoid sparse warnings, as e.g. (u32)REPEAT_BYTE(0xa0)) ... will result in a warning: cast truncates bits from constant value (a0a0a0a0a0a0a0a0 becomes a0a0a0a0) Add a new REPEAT_BYTE_U32() which does the necessary masking internally, so that we don't need to duplicate this for every usage. Signed-off-by: Mark Rutland Cc: Alexandru Elisei Cc: Marc Zyngier Cc: Thomas Gleixner Cc: Will Deacon Link: https://lore.kernel.org/r/20240617111841.2529370-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas Acked-by: Thomas Gleixner --- include/linux/wordpart.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wordpart.h b/include/linux/wordpart.h index 4ca1ba66d2f0..5a7b97bb7c95 100644 --- a/include/linux/wordpart.h +++ b/include/linux/wordpart.h @@ -39,6 +39,14 @@ */ #define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) +/** + * REPEAT_BYTE_U32 - repeat the value @x multiple times as a u32 value + * @x: value to repeat + * + * NOTE: @x is not checked for > 0xff; larger values produce odd results. + */ +#define REPEAT_BYTE_U32(x) lower_32_bits(REPEAT_BYTE(x)) + /* Set bits in the first 'n' bytes when loaded from memory */ #ifdef __LITTLE_ENDIAN # define aligned_byte_mask(n) ((1UL << 8*(n))-1) -- cgit v1.2.3 From a6156e70316b322b61739468e465bfb1345b7ae2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 17 Jun 2024 12:18:39 +0100 Subject: irqchip/gic-v3: Make distributor priorities variables In subsequent patches the GICv3 driver will choose the regular interrupt priority at boot time. In preparation for using dynamic priorities, place the priorities in variables and update the code to pass these as parameters. Users of GICD_INT_DEF_PRI_X4 are modified to replicate the priority byte using REPEAT_BYTE_U32(). There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Alexandru Elisei Cc: Marc Zyngier Cc: Thomas Gleixner Cc: Will Deacon Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://lore.kernel.org/r/20240617111841.2529370-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas Acked-by: Thomas Gleixner --- include/linux/irqchip/arm-gic-common.h | 4 ---- include/linux/irqchip/arm-gic-v3.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-common.h b/include/linux/irqchip/arm-gic-common.h index 1177f3a1aed5..fc0246cc05ac 100644 --- a/include/linux/irqchip/arm-gic-common.h +++ b/include/linux/irqchip/arm-gic-common.h @@ -10,10 +10,6 @@ #include #define GICD_INT_DEF_PRI 0xa0 -#define GICD_INT_DEF_PRI_X4 ((GICD_INT_DEF_PRI << 24) |\ - (GICD_INT_DEF_PRI << 16) |\ - (GICD_INT_DEF_PRI << 8) |\ - GICD_INT_DEF_PRI) struct irq_domain; struct fwnode_handle; diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 728691365464..70c0948f978e 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -638,7 +638,7 @@ struct fwnode_handle; int __init its_lpi_memreserve_init(void); int its_cpu_init(void); int its_init(struct fwnode_handle *handle, struct rdists *rdists, - struct irq_domain *domain); + struct irq_domain *domain, u8 irq_prio); int mbi_init(struct fwnode_handle *fwnode, struct irq_domain *parent); static inline bool gic_enable_sre(void) -- cgit v1.2.3 From 18fdb6348c480fb646799f621204f674815f05e7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 17 Jun 2024 12:18:41 +0100 Subject: arm64: irqchip/gic-v3: Select priorities at boot time The distributor and PMR/RPR can present different views of the interrupt priority space dependent upon the values of GICD_CTLR.DS and SCR_EL3.FIQ. Currently we treat the distributor's view of the priority space as canonical, and when the two differ we change the way we handle values in the PMR/RPR, using the `gic_nonsecure_priorities` static key to decide what to do. This approach works, but it's sub-optimal. When using pseudo-NMI we manipulate the distributor rarely, and we manipulate the PMR/RPR registers very frequently in code spread out throughout the kernel (e.g. local_irq_{save,restore}()). It would be nicer if we could use fixed values for the PMR/RPR, and dynamically choose the values programmed into the distributor. This patch changes the GICv3 driver and arm64 code accordingly. PMR values are chosen at compile time, and the GICv3 driver determines the appropriate values to program into the distributor at boot time. This removes the need for the `gic_nonsecure_priorities` static key and results in smaller and better generated code for saving/restoring the irqflags. Before this patch, local_irq_disable() compiles to: | 0000000000000000 : | 0: d503201f nop | 4: d50343df msr daifset, #0x3 | 8: d65f03c0 ret | c: d503201f nop | 10: d2800c00 mov x0, #0x60 // #96 | 14: d5184600 msr icc_pmr_el1, x0 | 18: d65f03c0 ret | 1c: d2801400 mov x0, #0xa0 // #160 | 20: 17fffffd b 14 After this patch, local_irq_disable() compiles to: | 0000000000000000 : | 0: d503201f nop | 4: d50343df msr daifset, #0x3 | 8: d65f03c0 ret | c: d2801800 mov x0, #0xc0 // #192 | 10: d5184600 msr icc_pmr_el1, x0 | 14: d65f03c0 ret ... with 3 fewer instructions per call. For defconfig + CONFIG_PSEUDO_NMI=y, this results in a minor saving of ~4K of text, and will make it easier to make further improvements to the way we manipulate irqflags and DAIF bits. Signed-off-by: Mark Rutland Cc: Alexandru Elisei Cc: Marc Zyngier Cc: Thomas Gleixner Cc: Will Deacon Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://lore.kernel.org/r/20240617111841.2529370-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas Acked-by: Thomas Gleixner --- include/linux/irqchip/arm-gic-v3-prio.h | 52 +++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 include/linux/irqchip/arm-gic-v3-prio.h (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3-prio.h b/include/linux/irqchip/arm-gic-v3-prio.h new file mode 100644 index 000000000000..44157c9abb78 --- /dev/null +++ b/include/linux/irqchip/arm-gic-v3-prio.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __LINUX_IRQCHIP_ARM_GIC_V3_PRIO_H +#define __LINUX_IRQCHIP_ARM_GIC_V3_PRIO_H + +/* + * GIC priorities from the view of the PMR/RPR. + * + * These values are chosen to be valid in either the absolute priority space or + * the NS view of the priority space. The value programmed into the distributor + * and ITS will be chosen at boot time such that these values appear in the + * PMR/RPR. + * + * GICV3_PRIO_UNMASKED is the PMR view of the priority to use to permit both + * IRQs and pseudo-NMIs. + * + * GICV3_PRIO_IRQ is the PMR view of the priority of regular interrupts. This + * can be written to the PMR to mask regular IRQs. + * + * GICV3_PRIO_NMI is the PMR view of the priority of pseudo-NMIs. This can be + * written to the PMR to mask pseudo-NMIs. + * + * On arm64 some code sections either automatically switch back to PSR.I or + * explicitly require to not use priority masking. If bit GICV3_PRIO_PSR_I_SET + * is included in the priority mask, it indicates that PSR.I should be set and + * interrupt disabling temporarily does not rely on IRQ priorities. + */ +#define GICV3_PRIO_UNMASKED 0xe0 +#define GICV3_PRIO_IRQ 0xc0 +#define GICV3_PRIO_NMI 0x80 + +#define GICV3_PRIO_PSR_I_SET (1 << 4) + +#ifndef __ASSEMBLER__ + +#define __gicv3_prio_to_ns(p) (0xff & ((p) << 1)) +#define __gicv3_ns_to_prio(ns) (0x80 | ((ns) >> 1)) + +#define __gicv3_prio_valid_ns(p) \ + (__gicv3_ns_to_prio(__gicv3_prio_to_ns(p)) == (p)) + +static_assert(__gicv3_prio_valid_ns(GICV3_PRIO_NMI)); +static_assert(__gicv3_prio_valid_ns(GICV3_PRIO_IRQ)); + +static_assert(GICV3_PRIO_NMI < GICV3_PRIO_IRQ); +static_assert(GICV3_PRIO_IRQ < GICV3_PRIO_UNMASKED); + +static_assert(GICV3_PRIO_IRQ < (GICV3_PRIO_IRQ | GICV3_PRIO_PSR_I_SET)); + +#endif /* __ASSEMBLER */ + +#endif /* __LINUX_IRQCHIP_ARM_GIC_V3_PRIO_H */ -- cgit v1.2.3 From 12046f8c77e0ed6d41beabde0edbb729499c970b Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 24 Jun 2024 19:31:15 +0200 Subject: platform/x86: wmi: Add driver_override support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for forcing the WMI driver core to bind a certain WMI driver to a WMI device. This will be necessary to support generic WMI drivers without an ID table Reviewed-by: Hans de Goede Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20240624173116.31314-2-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 63cca3b58d6d..3275470b5531 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -16,12 +16,16 @@ * struct wmi_device - WMI device structure * @dev: Device associated with this WMI device * @setable: True for devices implementing the Set Control Method + * @driver_override: Driver name to force a match; do not set directly, + * because core frees it; use driver_set_override() to + * set or clear it. * * This represents WMI devices discovered by the WMI driver core. */ struct wmi_device { struct device dev; bool setable; + const char *driver_override; }; /** -- cgit v1.2.3 From 44348870de4b8f292f97b84583a298d66fbaf738 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jun 2024 19:38:35 +0200 Subject: block: fix the blk_queue_nonrot polarity Take care of the inverse polarity of the BLK_FEAT_ROTATIONAL flag vs the old nonrot helper. Fixes: bd4a633b6f7c ("block: move the nonrot flag to queue_limits") Reported-by: kernel test robot Reported-by: Keith Busch Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240624173835.76753-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e89003360c17..b2f1362c4681 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -617,7 +617,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) -#define blk_queue_nonrot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) +#define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL)) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_zone_resetall(q) \ ((q)->limits.features & BLK_FEAT_ZONE_RESETALL) -- cgit v1.2.3 From 4a24bbabc826eaba6f94a9741712117b8e2b0587 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 7 Jun 2024 16:40:14 +0800 Subject: slab: delete useless RED_INACTIVE and RED_ACTIVE These seem useless since we use the SLUB_RED_INACTIVE and SLUB_RED_ACTIVE, so just delete them, no functional change. Reviewed-by: Vlastimil Babka Signed-off-by: Chengming Zhou Reviewed-by: Christoph Lameter (Ampere) Signed-off-by: Vlastimil Babka --- include/linux/poison.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index 1f0ee2459f2a..9c1a035af97c 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -38,11 +38,8 @@ * Magic nums for obj red zoning. * Placed in the first word before and the first word after an obj. */ -#define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */ -#define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */ - -#define SLUB_RED_INACTIVE 0xbb -#define SLUB_RED_ACTIVE 0xcc +#define SLUB_RED_INACTIVE 0xbb /* when obj is inactive */ +#define SLUB_RED_ACTIVE 0xcc /* when obj is active */ /* ...and for poisoning */ #define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ -- cgit v1.2.3 From d89a5c6705998ddc42b104f8eabd3c4b9e8fde08 Mon Sep 17 00:00:00 2001 From: Weiwen Hu Date: Mon, 3 Jun 2024 20:57:00 +0800 Subject: nvme: fix status magic numbers Replaced some magic numbers about SC and SCT with enum and macro. Signed-off-by: Weiwen Hu Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 425573202295..c8d7fdd095a1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1846,6 +1846,7 @@ enum { /* * Generic Command Status: */ + NVME_SCT_GENERIC = 0x0, NVME_SC_SUCCESS = 0x0, NVME_SC_INVALID_OPCODE = 0x1, NVME_SC_INVALID_FIELD = 0x2, @@ -1893,6 +1894,7 @@ enum { /* * Command Specific Status: */ + NVME_SCT_COMMAND_SPECIFIC = 0x100, NVME_SC_CQ_INVALID = 0x100, NVME_SC_QID_INVALID = 0x101, NVME_SC_QUEUE_SIZE = 0x102, @@ -1966,6 +1968,7 @@ enum { /* * Media and Data Integrity Errors: */ + NVME_SCT_MEDIA_ERROR = 0x200, NVME_SC_WRITE_FAULT = 0x280, NVME_SC_READ_ERROR = 0x281, NVME_SC_GUARD_CHECK = 0x282, @@ -1978,6 +1981,7 @@ enum { /* * Path-related Errors: */ + NVME_SCT_PATH = 0x300, NVME_SC_INTERNAL_PATH_ERROR = 0x300, NVME_SC_ANA_PERSISTENT_LOSS = 0x301, NVME_SC_ANA_INACCESSIBLE = 0x302, @@ -1986,11 +1990,17 @@ enum { NVME_SC_HOST_PATH_ERROR = 0x370, NVME_SC_HOST_ABORTED_CMD = 0x371, - NVME_SC_CRD = 0x1800, + NVME_SC_MASK = 0x00ff, /* Status Code */ + NVME_SCT_MASK = 0x0700, /* Status Code Type */ + NVME_SCT_SC_MASK = NVME_SCT_MASK | NVME_SC_MASK, + + NVME_SC_CRD = 0x1800, /* Command Retry Delayed */ NVME_SC_MORE = 0x2000, - NVME_SC_DNR = 0x4000, + NVME_SC_DNR = 0x4000, /* Do Not Retry */ }; +#define NVME_SCT(status) ((status) >> 8 & 7) + struct nvme_completion { /* * Used by Admin and Fabrics commands to return data: -- cgit v1.2.3 From dd0b0a4a2c5d7209457dc172997d1243ad269cfa Mon Sep 17 00:00:00 2001 From: Weiwen Hu Date: Mon, 3 Jun 2024 20:57:01 +0800 Subject: nvme: rename CDR/MORE/DNR to NVME_STATUS_* CDR/MORE/DNR fields are not belonging to SC in the NVMe spec, rename them to NVME_STATUS_* to avoid confusion. Signed-off-by: Weiwen Hu Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c8d7fdd095a1..27faae34245d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1994,9 +1994,9 @@ enum { NVME_SCT_MASK = 0x0700, /* Status Code Type */ NVME_SCT_SC_MASK = NVME_SCT_MASK | NVME_SC_MASK, - NVME_SC_CRD = 0x1800, /* Command Retry Delayed */ - NVME_SC_MORE = 0x2000, - NVME_SC_DNR = 0x4000, /* Do Not Retry */ + NVME_STATUS_CRD = 0x1800, /* Command Retry Delayed */ + NVME_STATUS_MORE = 0x2000, + NVME_STATUS_DNR = 0x4000, /* Do Not Retry */ }; #define NVME_SCT(status) ((status) >> 8 & 7) -- cgit v1.2.3 From 99032e9dbabcfc64bf71e94a1b662c4bfe534ed1 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 May 2024 07:15:23 +0200 Subject: nvmet-fc: implement host_traddr() Implement callback to display the host transport address by adding a callback 'host_traddr' for nvmet_fc_target_template. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: James Smart Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- include/linux/nvme-fc-driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 4109f1bd6128..89ea1ebd975a 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -920,6 +920,9 @@ struct nvmet_fc_target_port { * further references to hosthandle. * Entrypoint is Mandatory if the lldd calls nvmet_fc_invalidate_host(). * + * @host_traddr: called by the transport to retrieve the node name and + * port name of the host port address. + * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -975,6 +978,7 @@ struct nvmet_fc_target_template { void (*ls_abort)(struct nvmet_fc_target_port *targetport, void *hosthandle, struct nvmefc_ls_req *lsreq); void (*host_release)(void *hosthandle); + int (*host_traddr)(void *hosthandle, u64 *wwnn, u64 *wwpn); u32 max_hw_queues; u16 max_sgl_segments; -- cgit v1.2.3 From 07e4fd4c0592ad57fe4c5033393d30362dbfaa5d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 20 Jun 2024 15:21:51 +0200 Subject: locking/local_lock: Introduce guard definition for local_lock. Introduce lock guard definition for local_lock_t. There are no users yet. Acked-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20240620132727.660738-2-bigeasy@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/local_lock.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index e55010fa7329..82366a37f447 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -51,4 +51,15 @@ #define local_unlock_irqrestore(lock, flags) \ __local_unlock_irqrestore(lock, flags) +DEFINE_GUARD(local_lock, local_lock_t __percpu*, + local_lock(_T), + local_unlock(_T)) +DEFINE_GUARD(local_lock_irq, local_lock_t __percpu*, + local_lock_irq(_T), + local_unlock_irq(_T)) +DEFINE_LOCK_GUARD_1(local_lock_irqsave, local_lock_t __percpu, + local_lock_irqsave(_T->lock, _T->flags), + local_unlock_irqrestore(_T->lock, _T->flags), + unsigned long flags) + #endif -- cgit v1.2.3 From c5bcab7558220bbad8bd4863576afd1b340ce29f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 20 Jun 2024 15:21:52 +0200 Subject: locking/local_lock: Add local nested BH locking infrastructure. Add local_lock_nested_bh() locking. It is based on local_lock_t and the naming follows the preempt_disable_nested() example. For !PREEMPT_RT + !LOCKDEP it is a per-CPU annotation for locking assumptions based on local_bh_disable(). The macro is optimized away during compilation. For !PREEMPT_RT + LOCKDEP the local_lock_nested_bh() is reduced to the usual lock-acquire plus lockdep_assert_in_softirq() - ensuring that BH is disabled. For PREEMPT_RT local_lock_nested_bh() acquires the specified per-CPU lock. It does not disable CPU migration because it relies on local_bh_disable() disabling CPU migration. With LOCKDEP it performans the usual lockdep checks as with !PREEMPT_RT. Due to include hell the softirq check has been moved spinlock.c. The intention is to use this locking in places where locking of a per-CPU variable relies on BH being disabled. Instead of treating disabled bottom halves as a big per-CPU lock, PREEMPT_RT can use this to reduce the locking scope to what actually needs protecting. A side effect is that it also documents the protection scope of the per-CPU variables. Acked-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20240620132727.660738-3-bigeasy@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/local_lock.h | 10 ++++++++++ include/linux/local_lock_internal.h | 31 +++++++++++++++++++++++++++++++ include/linux/lockdep.h | 3 +++ 3 files changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 82366a37f447..091dc0b6bdfb 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -62,4 +62,14 @@ DEFINE_LOCK_GUARD_1(local_lock_irqsave, local_lock_t __percpu, local_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) +#define local_lock_nested_bh(_lock) \ + __local_lock_nested_bh(_lock) + +#define local_unlock_nested_bh(_lock) \ + __local_unlock_nested_bh(_lock) + +DEFINE_GUARD(local_lock_nested_bh, local_lock_t __percpu*, + local_lock_nested_bh(_T), + local_unlock_nested_bh(_T)) + #endif diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 975e33b793a7..8dd71fbbb6d2 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -62,6 +62,17 @@ do { \ local_lock_debug_init(lock); \ } while (0) +#define __spinlock_nested_bh_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ + lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, \ + 0, LD_WAIT_CONFIG, LD_WAIT_INV, \ + LD_LOCK_NORMAL); \ + local_lock_debug_init(lock); \ +} while (0) + #define __local_lock(lock) \ do { \ preempt_disable(); \ @@ -98,6 +109,15 @@ do { \ local_irq_restore(flags); \ } while (0) +#define __local_lock_nested_bh(lock) \ + do { \ + lockdep_assert_in_softirq(); \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_unlock_nested_bh(lock) \ + local_lock_release(this_cpu_ptr(lock)) + #else /* !CONFIG_PREEMPT_RT */ /* @@ -138,4 +158,15 @@ typedef spinlock_t local_lock_t; #define __local_unlock_irqrestore(lock, flags) __local_unlock(lock) +#define __local_lock_nested_bh(lock) \ +do { \ + lockdep_assert_in_softirq_func(); \ + spin_lock(this_cpu_ptr(lock)); \ +} while (0) + +#define __local_unlock_nested_bh(lock) \ +do { \ + spin_unlock(this_cpu_ptr((lock))); \ +} while (0) + #endif /* CONFIG_PREEMPT_RT */ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 08b0d1d9d78b..3f5a551579cc 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -600,6 +600,8 @@ do { \ (!in_softirq() || in_irq() || in_nmi())); \ } while (0) +extern void lockdep_assert_in_softirq_func(void); + #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) @@ -613,6 +615,7 @@ do { \ # define lockdep_assert_preemption_enabled() do { } while (0) # define lockdep_assert_preemption_disabled() do { } while (0) # define lockdep_assert_in_softirq() do { } while (0) +# define lockdep_assert_in_softirq_func() do { } while (0) #endif #ifdef CONFIG_PROVE_RAW_LOCK_NESTING -- cgit v1.2.3 From ecefbc09e8ee768ae85b7bb7a1de8c8287397d68 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 20 Jun 2024 15:21:58 +0200 Subject: net: softnet_data: Make xmit per task. Softirq is preemptible on PREEMPT_RT. Without a per-CPU lock in local_bh_disable() there is no guarantee that only one device is transmitting at a time. With preemption and multiple senders it is possible that the per-CPU `recursion' counter gets incremented by different threads and exceeds XMIT_RECURSION_LIMIT leading to a false positive recursion alert. The `more' member is subject to similar problems if set by one thread for one driver and wrongly used by another driver within another thread. Instead of adding a lock to protect the per-CPU variable it is simpler to make xmit per-task. Sending and receiving skbs happens always in thread context anyway. Having a lock to protected the per-CPU counter would block/ serialize two sending threads needlessly. It would also require a recursive lock to ensure that the owner can increment the counter further. Make the softnet_data.xmit a task_struct member on PREEMPT_RT. Add needed wrapper. Cc: Ben Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Mel Gorman Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20240620132727.660738-9-bigeasy@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 42 +++++++++++++++++++++++++++++++----------- include/linux/netdevice_xmit.h | 13 +++++++++++++ include/linux/sched.h | 5 ++++- 3 files changed, 48 insertions(+), 12 deletions(-) create mode 100644 include/linux/netdevice_xmit.h (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c83b390191d4..f6fc9066147d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -43,6 +43,7 @@ #include #include +#include #include #include #include @@ -3223,13 +3224,7 @@ struct softnet_data { struct sk_buff_head xfrm_backlog; #endif /* written and read only by owning cpu: */ - struct { - u16 recursion; - u8 more; -#ifdef CONFIG_NET_EGRESS - u8 skip_txqueue; -#endif - } xmit; + struct netdev_xmit xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, * and only read by other cpus. Worth using a cache line. @@ -3257,10 +3252,18 @@ struct softnet_data { DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +#ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) { return this_cpu_read(softnet_data.xmit.recursion); } +#else +static inline int dev_recursion_level(void) +{ + return current->net_xmit.recursion; +} + +#endif void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); @@ -4872,18 +4875,35 @@ static inline ktime_t netdev_get_tstamp(struct net_device *dev, return hwtstamps->hwtstamp; } -static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, - struct sk_buff *skb, struct net_device *dev, - bool more) +#ifndef CONFIG_PREEMPT_RT +static inline void netdev_xmit_set_more(bool more) { __this_cpu_write(softnet_data.xmit.more, more); - return ops->ndo_start_xmit(skb, dev); } static inline bool netdev_xmit_more(void) { return __this_cpu_read(softnet_data.xmit.more); } +#else +static inline void netdev_xmit_set_more(bool more) +{ + current->net_xmit.more = more; +} + +static inline bool netdev_xmit_more(void) +{ + return current->net_xmit.more; +} +#endif + +static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, + struct sk_buff *skb, struct net_device *dev, + bool more) +{ + netdev_xmit_set_more(more); + return ops->ndo_start_xmit(skb, dev); +} static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h new file mode 100644 index 000000000000..38325e070296 --- /dev/null +++ b/include/linux/netdevice_xmit.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_NETDEVICE_XMIT_H +#define _LINUX_NETDEVICE_XMIT_H + +struct netdev_xmit { + u16 recursion; + u8 more; +#ifdef CONFIG_NET_EGRESS + u8 skip_txqueue; +#endif +}; + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 61591ac6eab6..5187486c2522 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -975,7 +976,9 @@ struct task_struct { /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif - +#ifdef CONFIG_PREEMPT_RT + struct netdev_xmit net_xmit; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; -- cgit v1.2.3 From b22800f9d3b142bf2550dd47ff738b9feedc1093 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 20 Jun 2024 15:22:00 +0200 Subject: dev: Use nested-BH locking for softnet_data.process_queue. softnet_data::process_queue is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. softnet_data::input_queue_head can be updated lockless. This is fine because this value is only update CPU local by the local backlog_napi thread. Add a local_lock_t to softnet_data and use local_lock_nested_bh() for locking of process_queue. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20240620132727.660738-11-bigeasy@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f6fc9066147d..4e81660b4462 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3202,6 +3202,7 @@ static inline bool dev_has_header(const struct net_device *dev) struct softnet_data { struct list_head poll_list; struct sk_buff_head process_queue; + local_lock_t process_queue_bh_lock; /* stats */ unsigned int processed; -- cgit v1.2.3 From 401cb7dae8130fd34eb84648e02ab4c506df7d5e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 20 Jun 2024 15:22:04 +0200 Subject: net: Reference bpf_redirect_info via task_struct on PREEMPT_RT. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XDP redirect process is two staged: - bpf_prog_run_xdp() is invoked to run a eBPF program which inspects the packet and makes decisions. While doing that, the per-CPU variable bpf_redirect_info is used. - Afterwards xdp_do_redirect() is invoked and accesses bpf_redirect_info and it may also access other per-CPU variables like xskmap_flush_list. At the very end of the NAPI callback, xdp_do_flush() is invoked which does not access bpf_redirect_info but will touch the individual per-CPU lists. The per-CPU variables are only used in the NAPI callback hence disabling bottom halves is the only protection mechanism. Users from preemptible context (like cpu_map_kthread_run()) explicitly disable bottom halves for protections reasons. Without locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. PREEMPT_RT has forced-threaded interrupts enabled and every NAPI-callback runs in a thread. If each thread has its own data structure then locking can be avoided. Create a struct bpf_net_context which contains struct bpf_redirect_info. Define the variable on stack, use bpf_net_ctx_set() to save a pointer to it, bpf_net_ctx_clear() removes it again. The bpf_net_ctx_set() may nest. For instance a function can be used from within NET_RX_SOFTIRQ/ net_rx_action which uses bpf_net_ctx_set() and NET_TX_SOFTIRQ which does not. Therefore only the first invocations updates the pointer. Use bpf_net_ctx_get_ri() as a wrapper to retrieve the current struct bpf_redirect_info. The returned data structure is zero initialized to ensure nothing is leaked from stack. This is done on first usage of the struct. bpf_net_ctx_set() sets bpf_redirect_info::kern_flags to 0 to note that initialisation is required. First invocation of bpf_net_ctx_get_ri() will memset() the data structure and update bpf_redirect_info::kern_flags. bpf_redirect_info::nh is excluded from memset because it is only used once BPF_F_NEIGH is set which also sets the nh member. The kern_flags is moved past nh to exclude it from memset. The pointer to bpf_net_context is saved task's task_struct. Using always the bpf_net_context approach has the advantage that there is almost zero differences between PREEMPT_RT and non-PREEMPT_RT builds. Cc: Andrii Nakryiko Cc: Eduard Zingerman Cc: Hao Luo Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Martin KaFai Lau Cc: Song Liu Cc: Stanislav Fomichev Cc: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20240620132727.660738-15-bigeasy@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/filter.h | 56 +++++++++++++++++++++++++++++++++++++++++--------- include/linux/sched.h | 3 +++ 2 files changed, 49 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index b02aea291b7e..0a7f6e4a00b6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -733,21 +733,59 @@ struct bpf_nh_params { }; }; +/* flags for bpf_redirect_info kern_flags */ +#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ +#define BPF_RI_F_RI_INIT BIT(1) + struct bpf_redirect_info { u64 tgt_index; void *tgt_value; struct bpf_map *map; u32 flags; - u32 kern_flags; u32 map_id; enum bpf_map_type map_type; struct bpf_nh_params nh; + u32 kern_flags; }; -DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +struct bpf_net_context { + struct bpf_redirect_info ri; +}; -/* flags for bpf_redirect_info kern_flags */ -#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ +static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx) +{ + struct task_struct *tsk = current; + + if (tsk->bpf_net_context != NULL) + return NULL; + bpf_net_ctx->ri.kern_flags = 0; + + tsk->bpf_net_context = bpf_net_ctx; + return bpf_net_ctx; +} + +static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx) +{ + if (bpf_net_ctx) + current->bpf_net_context = NULL; +} + +static inline struct bpf_net_context *bpf_net_ctx_get(void) +{ + return current->bpf_net_context; +} + +static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void) +{ + struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + + if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) { + memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh)); + bpf_net_ctx->ri.kern_flags |= BPF_RI_F_RI_INIT; + } + + return &bpf_net_ctx->ri; +} /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, @@ -1018,25 +1056,23 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); -void bpf_clear_redirect_map(struct bpf_map *map); - static inline bool xdp_return_frame_no_direct(void) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_set_return_frame_no_direct(void) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_clear_return_frame_no_direct(void) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; } @@ -1592,7 +1628,7 @@ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 inde u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; /* Lower bits of the flags are used as return code on lookup failure */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5187486c2522..5ff5e65a4627 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -54,6 +54,7 @@ struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; +struct bpf_net_context; struct capture_control; struct cfs_rq; struct fs_struct; @@ -1509,6 +1510,8 @@ struct task_struct { /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif + /* Used by BPF for per-TASK xdp storage */ + struct bpf_net_context *bpf_net_context; #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; -- cgit v1.2.3 From 3f9fe37d9e16a6cfd5f4d1f536686ea71db3196f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 20 Jun 2024 15:22:05 +0200 Subject: net: Move per-CPU flush-lists to bpf_net_context on PREEMPT_RT. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-CPU flush lists, which are accessed from within the NAPI callback (xdp_do_flush() for instance), are per-CPU. There are subject to the same problem as struct bpf_redirect_info. Add the per-CPU lists cpu_map_flush_list, dev_map_flush_list and xskmap_map_flush_list to struct bpf_net_context. Add wrappers for the access. The lists initialized on first usage (similar to bpf_net_ctx_get_ri()). Cc: "Björn Töpel" Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Eduard Zingerman Cc: Hao Luo Cc: Jiri Olsa Cc: John Fastabend Cc: Jonathan Lemon Cc: KP Singh Cc: Maciej Fijalkowski Cc: Magnus Karlsson Cc: Martin KaFai Lau Cc: Song Liu Cc: Stanislav Fomichev Cc: Yonghong Song Acked-by: Jesper Dangaard Brouer Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20240620132727.660738-16-bigeasy@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/filter.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 0a7f6e4a00b6..c0349522de8f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -736,6 +736,9 @@ struct bpf_nh_params { /* flags for bpf_redirect_info kern_flags */ #define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ #define BPF_RI_F_RI_INIT BIT(1) +#define BPF_RI_F_CPU_MAP_INIT BIT(2) +#define BPF_RI_F_DEV_MAP_INIT BIT(3) +#define BPF_RI_F_XSK_MAP_INIT BIT(4) struct bpf_redirect_info { u64 tgt_index; @@ -750,6 +753,9 @@ struct bpf_redirect_info { struct bpf_net_context { struct bpf_redirect_info ri; + struct list_head cpu_map_flush_list; + struct list_head dev_map_flush_list; + struct list_head xskmap_map_flush_list; }; static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx) @@ -787,6 +793,42 @@ static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void) return &bpf_net_ctx->ri; } +static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void) +{ + struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + + if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)) { + INIT_LIST_HEAD(&bpf_net_ctx->cpu_map_flush_list); + bpf_net_ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT; + } + + return &bpf_net_ctx->cpu_map_flush_list; +} + +static inline struct list_head *bpf_net_ctx_get_dev_flush_list(void) +{ + struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + + if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_DEV_MAP_INIT)) { + INIT_LIST_HEAD(&bpf_net_ctx->dev_map_flush_list); + bpf_net_ctx->ri.kern_flags |= BPF_RI_F_DEV_MAP_INIT; + } + + return &bpf_net_ctx->dev_map_flush_list; +} + +static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void) +{ + struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + + if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_XSK_MAP_INIT)) { + INIT_LIST_HEAD(&bpf_net_ctx->xskmap_map_flush_list); + bpf_net_ctx->ri.kern_flags |= BPF_RI_F_XSK_MAP_INIT; + } + + return &bpf_net_ctx->xskmap_map_flush_list; +} + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) -- cgit v1.2.3 From 26b97668e5339434e5df8ddc7b1898a37a350112 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 24 Jun 2024 13:47:22 -0600 Subject: io_uring: remove dead struct io_submit_state member When the intermediate CQE aux cache got removed, any usage of the this member went away. As it isn't used anymore, kill it. Fixes: 902ce82c2aa1 ("io_uring: get rid of intermediate aux cqe caches") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b48570eaa449..7abdc0927124 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -207,7 +207,6 @@ struct io_submit_state { bool need_plug; bool cq_flush; unsigned short submit_nr; - unsigned int cqes_count; struct blk_plug plug; }; -- cgit v1.2.3 From 399ab86ea55039f9d0a5f621a68cb4631f796f37 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Fri, 14 Jun 2024 23:20:14 +0000 Subject: /proc/pid/smaps: add mseal info for vma MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add sl in /proc/pid/smaps to indicate vma is sealed Link: https://lkml.kernel.org/r/20240614232014.806352-2-jeffxu@google.com Fixes: 8be7258aad44 ("mseal: add mseal syscall") Signed-off-by: Jeff Xu Acked-by: David Hildenbrand Cc: Adhemerval Zanella Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: Kees Cook Cc: Randy Dunlap Cc: Stephen Röttger Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9a5652c5fadd..eb7c96d24ac0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -406,6 +406,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_ALLOW_ANY_UNCACHED VM_NONE #endif +#ifdef CONFIG_64BIT +/* VM is sealed, in vm_flags */ +#define VM_SEALED _BITUL(63) +#endif + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) -- cgit v1.2.3 From ff202303c398ed56386ca4954154de9a96eb732a Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 7 Jun 2024 13:29:53 -0700 Subject: mm: convert page type macros to enum Changing PG_slab from a page flag to a page type in commit 46df8e73a4a3 ("mm: free up PG_slab") in has the unintended consequence of removing the PG_slab constant from kernel debuginfo. The commit does add the value to the vmcoreinfo note, which allows debuggers to find the value without hardcoding it. However it's most flexible to continue representing the constant with an enum. To that end, convert the page type fields into an enum. Debuggers will now be able to detect that PG_slab's type has changed from enum pageflags to enum pagetype. Link: https://lkml.kernel.org/r/20240607202954.1198180-1-stephen.s.brennan@oracle.com Fixes: 46df8e73a4a3 ("mm: free up PG_slab") Signed-off-by: Stephen Brennan Acked-by: Vlastimil Babka Cc: David Hildenbrand Cc: Hao Ge Cc: Matthew Wilcox (Oracle) Cc: Omar Sandoval Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 104078afe0b1..b9e914e1face 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -944,15 +944,18 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) * mistaken for a page type value. */ -#define PAGE_TYPE_BASE 0xf0000000 -/* Reserve 0x0000007f to catch underflows of _mapcount */ -#define PAGE_MAPCOUNT_RESERVE -128 -#define PG_buddy 0x00000080 -#define PG_offline 0x00000100 -#define PG_table 0x00000200 -#define PG_guard 0x00000400 -#define PG_hugetlb 0x00000800 -#define PG_slab 0x00001000 +enum pagetype { + PG_buddy = 0x00000080, + PG_offline = 0x00000100, + PG_table = 0x00000200, + PG_guard = 0x00000400, + PG_hugetlb = 0x00000800, + PG_slab = 0x00001000, + + PAGE_TYPE_BASE = 0xf0000000, + /* Reserve 0x0000007f to catch underflows of _mapcount */ + PAGE_MAPCOUNT_RESERVE = -128, +}; #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) -- cgit v1.2.3 From bf14ed81f571f8dba31cd72ab2e50fbcc877cc31 Mon Sep 17 00:00:00 2001 From: yangge Date: Thu, 20 Jun 2024 08:59:50 +0800 Subject: mm/page_alloc: Separate THP PCP into movable and non-movable categories Since commit 5d0a661d808f ("mm/page_alloc: use only one PCP list for THP-sized allocations") no longer differentiates the migration type of pages in THP-sized PCP list, it's possible that non-movable allocation requests may get a CMA page from the list, in some cases, it's not acceptable. If a large number of CMA memory are configured in system (for example, the CMA memory accounts for 50% of the system memory), starting a virtual machine with device passthrough will get stuck. During starting the virtual machine, it will call pin_user_pages_remote(..., FOLL_LONGTERM, ...) to pin memory. Normally if a page is present and in CMA area, pin_user_pages_remote() will migrate the page from CMA area to non-CMA area because of FOLL_LONGTERM flag. But if non-movable allocation requests return CMA memory, migrate_longterm_unpinnable_pages() will migrate a CMA page to another CMA page, which will fail to pass the check in check_and_migrate_movable_pages() and cause migration endless. Call trace: pin_user_pages_remote --__gup_longterm_locked // endless loops in this function ----_get_user_pages_locked ----check_and_migrate_movable_pages ------migrate_longterm_unpinnable_pages --------alloc_migration_target This problem will also have a negative impact on CMA itself. For example, when CMA is borrowed by THP, and we need to reclaim it through cma_alloc() or dma_alloc_coherent(), we must move those pages out to ensure CMA's users can retrieve that contigous memory. Currently, CMA's memory is occupied by non-movable pages, meaning we can't relocate them. As a result, cma_alloc() is more likely to fail. To fix the problem above, we add one PCP list for THP, which will not introduce a new cacheline for struct per_cpu_pages. THP will have 2 PCP lists, one PCP list is used by MOVABLE allocation, and the other PCP list is used by UNMOVABLE allocation. MOVABLE allocation contains GPF_MOVABLE, and UNMOVABLE allocation contains GFP_UNMOVABLE and GFP_RECLAIMABLE. Link: https://lkml.kernel.org/r/1718845190-4456-1-git-send-email-yangge1116@126.com Fixes: 5d0a661d808f ("mm/page_alloc: use only one PCP list for THP-sized allocations") Signed-off-by: yangge Cc: Baolin Wang Cc: Barry Song <21cnbao@gmail.com> Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8f9c9590a42c..586a8f0104d7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -654,13 +654,12 @@ enum zone_watermarks { }; /* - * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list - * for THP which will usually be GFP_MOVABLE. Even if it is another type, - * it should not contribute to serious fragmentation causing THP allocation - * failures. + * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists + * are added for THP. One PCP list is used by GPF_MOVABLE, and the other PCP list + * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define NR_PCP_THP 1 +#define NR_PCP_THP 2 #else #define NR_PCP_THP 0 #endif -- cgit v1.2.3 From cf28d7716e0c777f593948eea109dc273047dac7 Mon Sep 17 00:00:00 2001 From: Wei-Hsin Yeh Date: Sun, 12 May 2024 13:42:11 +0800 Subject: include/linux/jhash.h: fix typos Drop one '-' to adhere to coding style. Replace 'arbitray' with 'arbitrary'. Link: https://lkml.kernel.org/r/20240512054211.24726-1-weihsinyeh168@gmail.com Signed-off-by: Wei-Hsin Yeh Signed-off-by: Andrew Morton --- include/linux/jhash.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jhash.h b/include/linux/jhash.h index ab7f8c152b89..fa26a2dd3b52 100644 --- a/include/linux/jhash.h +++ b/include/linux/jhash.h @@ -31,7 +31,7 @@ /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */ #define jhash_mask(n) (jhash_size(n)-1) -/* __jhash_mix -- mix 3 32-bit values reversibly. */ +/* __jhash_mix - mix 3 32-bit values reversibly. */ #define __jhash_mix(a, b, c) \ { \ a -= c; a ^= rol32(c, 4); c += b; \ @@ -60,7 +60,7 @@ /* jhash - hash an arbitrary key * @k: sequence of bytes as key * @length: the length of the key - * @initval: the previous hash, or an arbitray value + * @initval: the previous hash, or an arbitrary value * * The generic version, hashes an arbitrary sequence of bytes. * No alignment or length assumptions are made about the input key. @@ -110,7 +110,7 @@ static inline u32 jhash(const void *key, u32 length, u32 initval) /* jhash2 - hash an array of u32's * @k: the key which must be an array of u32's * @length: the number of u32's in the key - * @initval: the previous hash, or an arbitray value + * @initval: the previous hash, or an arbitrary value * * Returns the hash value of the key. */ -- cgit v1.2.3 From 873ce25766019dd017c1a3a10c19ac3fc4bf24aa Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 24 May 2024 23:29:46 +0800 Subject: lib min_heap: add type safe interface Implement a type-safe interface for min_heap using strong type pointers instead of void * in the data field. This change includes adding small macro wrappers around functions, enabling the use of __minheap_cast and __minheap_obj_size macros for type casting and obtaining element size. This implementation removes the necessity of passing element size in min_heap_callbacks. Additionally, introduce the MIN_HEAP_PREALLOCATED macro for preallocating some elements. Link: https://lkml.kernel.org/ioyfizrzq7w7mjrqcadtzsfgpuntowtjdw5pgn4qhvsdp4mqqg@nrlek5vmisbu Link: https://lkml.kernel.org/r/20240524152958.919343-5-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Bagas Sanjaya Cc: Brian Foster Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kent Overstreet Cc: Mark Rutland Cc: Matthew Sakai Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 79 +++++++++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index d52daf45861b..92830f41642a 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -7,45 +7,53 @@ #include /** - * struct min_heap - Data structure to hold a min-heap. - * @data: Start of array holding the heap elements. + * Data structure to hold a min-heap. * @nr: Number of elements currently in the heap. * @size: Maximum number of elements that can be held in current storage. + * @data: Pointer to the start of array holding the heap elements. + * @preallocated: Start of the static preallocated array holding the heap elements. */ -struct min_heap { - void *data; - int nr; - int size; -}; +#define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \ +struct _name { \ + int nr; \ + int size; \ + _type *data; \ + _type preallocated[_nr]; \ +} + +#define DEFINE_MIN_HEAP(_type, _name) MIN_HEAP_PREALLOCATED(_type, _name, 0) + +typedef DEFINE_MIN_HEAP(char, min_heap_char) min_heap_char; + +#define __minheap_cast(_heap) (typeof((_heap)->data[0]) *) +#define __minheap_obj_size(_heap) sizeof((_heap)->data[0]) /** * struct min_heap_callbacks - Data/functions to customise the min_heap. - * @elem_size: The nr of each element in bytes. * @less: Partial order function for this heap. * @swp: Swap elements function. */ struct min_heap_callbacks { - int elem_size; bool (*less)(const void *lhs, const void *rhs); void (*swp)(void *lhs, void *rhs); }; /* Sift the element at pos down the heap. */ static __always_inline -void min_heapify(struct min_heap *heap, int pos, +void __min_heapify(min_heap_char *heap, int pos, size_t elem_size, const struct min_heap_callbacks *func) { void *left, *right; void *data = heap->data; - void *root = data + pos * func->elem_size; + void *root = data + pos * elem_size; int i = pos, j; /* Find the sift-down path all the way to the leaves. */ for (;;) { if (i * 2 + 2 >= heap->nr) break; - left = data + (i * 2 + 1) * func->elem_size; - right = data + (i * 2 + 2) * func->elem_size; + left = data + (i * 2 + 1) * elem_size; + right = data + (i * 2 + 2) * elem_size; i = func->less(left, right) ? i * 2 + 1 : i * 2 + 2; } @@ -54,31 +62,37 @@ void min_heapify(struct min_heap *heap, int pos, i = i * 2 + 1; /* Backtrack to the correct location. */ - while (i != pos && func->less(root, data + i * func->elem_size)) + while (i != pos && func->less(root, data + i * elem_size)) i = (i - 1) / 2; /* Shift the element into its correct place. */ j = i; while (i != pos) { i = (i - 1) / 2; - func->swp(data + i * func->elem_size, data + j * func->elem_size); + func->swp(data + i * elem_size, data + j * elem_size); } } +#define min_heapify(_heap, _pos, _func) \ + __min_heapify((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func) + /* Floyd's approach to heapification that is O(nr). */ static __always_inline -void min_heapify_all(struct min_heap *heap, +void __min_heapify_all(min_heap_char *heap, size_t elem_size, const struct min_heap_callbacks *func) { int i; for (i = heap->nr / 2 - 1; i >= 0; i--) - min_heapify(heap, i, func); + __min_heapify(heap, i, elem_size, func); } +#define min_heapify_all(_heap, _func) \ + __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func) + /* Remove minimum element from the heap, O(log2(nr)). */ static __always_inline -void min_heap_pop(struct min_heap *heap, +void __min_heap_pop(min_heap_char *heap, size_t elem_size, const struct min_heap_callbacks *func) { void *data = heap->data; @@ -88,27 +102,33 @@ void min_heap_pop(struct min_heap *heap, /* Place last element at the root (position 0) and then sift down. */ heap->nr--; - memcpy(data, data + (heap->nr * func->elem_size), func->elem_size); - min_heapify(heap, 0, func); + memcpy(data, data + (heap->nr * elem_size), elem_size); + __min_heapify(heap, 0, elem_size, func); } +#define min_heap_pop(_heap, _func) \ + __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func) + /* * Remove the minimum element and then push the given element. The * implementation performs 1 sift (O(log2(nr))) and is therefore more * efficient than a pop followed by a push that does 2. */ static __always_inline -void min_heap_pop_push(struct min_heap *heap, - const void *element, +void __min_heap_pop_push(min_heap_char *heap, + const void *element, size_t elem_size, const struct min_heap_callbacks *func) { - memcpy(heap->data, element, func->elem_size); - min_heapify(heap, 0, func); + memcpy(heap->data, element, elem_size); + __min_heapify(heap, 0, elem_size, func); } +#define min_heap_pop_push(_heap, _element, _func) \ + __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func) + /* Push an element on to the heap, O(log2(nr)). */ static __always_inline -void min_heap_push(struct min_heap *heap, const void *element, +void __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, const struct min_heap_callbacks *func) { void *data = heap->data; @@ -120,17 +140,20 @@ void min_heap_push(struct min_heap *heap, const void *element, /* Place at the end of data. */ pos = heap->nr; - memcpy(data + (pos * func->elem_size), element, func->elem_size); + memcpy(data + (pos * elem_size), element, elem_size); heap->nr++; /* Sift child at pos up. */ for (; pos > 0; pos = (pos - 1) / 2) { - child = data + (pos * func->elem_size); - parent = data + ((pos - 1) / 2) * func->elem_size; + child = data + (pos * elem_size); + parent = data + ((pos - 1) / 2) * elem_size; if (func->less(parent, child)) break; func->swp(parent, child); } } +#define min_heap_push(_heap, _element, _func) \ + __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func) + #endif /* _LINUX_MIN_HEAP_H */ -- cgit v1.2.3 From e146683ccff9af2d26b70a9ea4c87f5a86eff0c4 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 24 May 2024 23:29:47 +0800 Subject: lib min_heap: add min_heap_init() Add min_heap_init() for initializing heap with data, nr, and size. Link: https://lkml.kernel.org/r/20240524152958.919343-6-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Bagas Sanjaya Cc: Brian Foster Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kent Overstreet Cc: Mark Rutland Cc: Matthew Sakai Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 92830f41642a..b384a4ea2afa 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -38,6 +38,21 @@ struct min_heap_callbacks { void (*swp)(void *lhs, void *rhs); }; +/* Initialize a min-heap. */ +static __always_inline +void __min_heap_init(min_heap_char *heap, void *data, int size) +{ + heap->nr = 0; + heap->size = size; + if (data) + heap->data = data; + else + heap->data = heap->preallocated; +} + +#define min_heap_init(_heap, _data, _size) \ + __min_heap_init((min_heap_char *)_heap, _data, _size) + /* Sift the element at pos down the heap. */ static __always_inline void __min_heapify(min_heap_char *heap, int pos, size_t elem_size, -- cgit v1.2.3 From 0562d54ddc3dc195f338bd8e816dd8ff154f44ad Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 24 May 2024 23:29:48 +0800 Subject: lib min_heap: add min_heap_peek() Add min_heap_peek() to retrieve a pointer to the smallest element. The pointer is cast to the appropriate type of heap elements. Link: https://lkml.kernel.org/r/20240524152958.919343-7-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Bagas Sanjaya Cc: Brian Foster Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kent Overstreet Cc: Mark Rutland Cc: Matthew Sakai Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index b384a4ea2afa..d9c4ae7ad0cc 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -53,6 +53,16 @@ void __min_heap_init(min_heap_char *heap, void *data, int size) #define min_heap_init(_heap, _data, _size) \ __min_heap_init((min_heap_char *)_heap, _data, _size) +/* Get the minimum element from the heap. */ +static __always_inline +void *__min_heap_peek(struct min_heap_char *heap) +{ + return heap->nr ? heap->data : NULL; +} + +#define min_heap_peek(_heap) \ + (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) + /* Sift the element at pos down the heap. */ static __always_inline void __min_heapify(min_heap_char *heap, int pos, size_t elem_size, -- cgit v1.2.3 From b9d720e65a72c9754faf689e0859a5632853f4bc Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 24 May 2024 23:29:49 +0800 Subject: lib min_heap: add min_heap_full() Add min_heap_full() which returns a boolean value indicating whether the heap has reached its maximum capacity. Link: https://lkml.kernel.org/r/20240524152958.919343-8-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Bagas Sanjaya Cc: Brian Foster Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kent Overstreet Cc: Mark Rutland Cc: Matthew Sakai Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index d9c4ae7ad0cc..f41898c05f5a 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -63,6 +63,16 @@ void *__min_heap_peek(struct min_heap_char *heap) #define min_heap_peek(_heap) \ (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) +/* Check if the heap is full. */ +static __always_inline +bool __min_heap_full(min_heap_char *heap) +{ + return heap->nr == heap->size; +} + +#define min_heap_full(_heap) \ + __min_heap_full((min_heap_char *)_heap) + /* Sift the element at pos down the heap. */ static __always_inline void __min_heapify(min_heap_char *heap, int pos, size_t elem_size, -- cgit v1.2.3 From 267607e87599509a6a39a5f7dd3959365e58af27 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 24 May 2024 23:29:50 +0800 Subject: lib min_heap: add args for min_heap_callbacks Add a third parameter 'args' for the 'less' and 'swp' functions in the 'struct min_heap_callbacks'. This additional parameter allows these comparison and swap functions to handle extra arguments when necessary. Link: https://lkml.kernel.org/r/20240524152958.919343-9-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Bagas Sanjaya Cc: Brian Foster Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kent Overstreet Cc: Mark Rutland Cc: Matthew Sakai Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 51 ++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index f41898c05f5a..4acd0f4b3faf 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -34,8 +34,8 @@ typedef DEFINE_MIN_HEAP(char, min_heap_char) min_heap_char; * @swp: Swap elements function. */ struct min_heap_callbacks { - bool (*less)(const void *lhs, const void *rhs); - void (*swp)(void *lhs, void *rhs); + bool (*less)(const void *lhs, const void *rhs, void *args); + void (*swp)(void *lhs, void *rhs, void *args); }; /* Initialize a min-heap. */ @@ -76,7 +76,7 @@ bool __min_heap_full(min_heap_char *heap) /* Sift the element at pos down the heap. */ static __always_inline void __min_heapify(min_heap_char *heap, int pos, size_t elem_size, - const struct min_heap_callbacks *func) + const struct min_heap_callbacks *func, void *args) { void *left, *right; void *data = heap->data; @@ -89,7 +89,7 @@ void __min_heapify(min_heap_char *heap, int pos, size_t elem_size, break; left = data + (i * 2 + 1) * elem_size; right = data + (i * 2 + 2) * elem_size; - i = func->less(left, right) ? i * 2 + 1 : i * 2 + 2; + i = func->less(left, right, args) ? i * 2 + 1 : i * 2 + 2; } /* Special case for the last leaf with no sibling. */ @@ -97,38 +97,38 @@ void __min_heapify(min_heap_char *heap, int pos, size_t elem_size, i = i * 2 + 1; /* Backtrack to the correct location. */ - while (i != pos && func->less(root, data + i * elem_size)) + while (i != pos && func->less(root, data + i * elem_size, args)) i = (i - 1) / 2; /* Shift the element into its correct place. */ j = i; while (i != pos) { i = (i - 1) / 2; - func->swp(data + i * elem_size, data + j * elem_size); + func->swp(data + i * elem_size, data + j * elem_size, args); } } -#define min_heapify(_heap, _pos, _func) \ - __min_heapify((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func) +#define min_heapify(_heap, _pos, _func, _args) \ + __min_heapify((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) /* Floyd's approach to heapification that is O(nr). */ static __always_inline void __min_heapify_all(min_heap_char *heap, size_t elem_size, - const struct min_heap_callbacks *func) + const struct min_heap_callbacks *func, void *args) { int i; for (i = heap->nr / 2 - 1; i >= 0; i--) - __min_heapify(heap, i, elem_size, func); + __min_heapify(heap, i, elem_size, func, args); } -#define min_heapify_all(_heap, _func) \ - __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func) +#define min_heapify_all(_heap, _func, _args) \ + __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) /* Remove minimum element from the heap, O(log2(nr)). */ static __always_inline void __min_heap_pop(min_heap_char *heap, size_t elem_size, - const struct min_heap_callbacks *func) + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; @@ -138,11 +138,11 @@ void __min_heap_pop(min_heap_char *heap, size_t elem_size, /* Place last element at the root (position 0) and then sift down. */ heap->nr--; memcpy(data, data + (heap->nr * elem_size), elem_size); - __min_heapify(heap, 0, elem_size, func); + __min_heapify(heap, 0, elem_size, func, args); } -#define min_heap_pop(_heap, _func) \ - __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func) +#define min_heap_pop(_heap, _func, _args) \ + __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) /* * Remove the minimum element and then push the given element. The @@ -152,19 +152,20 @@ void __min_heap_pop(min_heap_char *heap, size_t elem_size, static __always_inline void __min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size, - const struct min_heap_callbacks *func) + const struct min_heap_callbacks *func, + void *args) { memcpy(heap->data, element, elem_size); - __min_heapify(heap, 0, elem_size, func); + __min_heapify(heap, 0, elem_size, func, args); } -#define min_heap_pop_push(_heap, _element, _func) \ - __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func) +#define min_heap_pop_push(_heap, _element, _func, _args) \ + __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) /* Push an element on to the heap, O(log2(nr)). */ static __always_inline void __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, - const struct min_heap_callbacks *func) + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; void *child, *parent; @@ -182,13 +183,13 @@ void __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, for (; pos > 0; pos = (pos - 1) / 2) { child = data + (pos * elem_size); parent = data + ((pos - 1) / 2) * elem_size; - if (func->less(parent, child)) + if (func->less(parent, child, args)) break; - func->swp(parent, child); + func->swp(parent, child, args); } } -#define min_heap_push(_heap, _element, _func) \ - __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func) +#define min_heap_push(_heap, _element, _func, _args) \ + __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) #endif /* _LINUX_MIN_HEAP_H */ -- cgit v1.2.3 From eaa0bc7119247942200b9209d40ffd86ace347d4 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 24 May 2024 23:29:51 +0800 Subject: lib min_heap: add min_heap_sift_up() Add min_heap_sift_up() to sift up the element at index 'idx' in the heap. Link: https://lkml.kernel.org/r/20240524152958.919343-10-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Bagas Sanjaya Cc: Brian Foster Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kent Overstreet Cc: Mark Rutland Cc: Matthew Sakai Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 4acd0f4b3faf..ddd0186afb77 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -111,6 +111,26 @@ void __min_heapify(min_heap_char *heap, int pos, size_t elem_size, #define min_heapify(_heap, _pos, _func, _args) \ __min_heapify((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) +/* Sift up ith element from the heap, O(log2(nr)). */ +static __always_inline +void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + void *data = heap->data; + size_t parent; + + while (idx) { + parent = (idx - 1) / 2; + if (func->less(data + parent * elem_size, data + idx * elem_size, args)) + break; + func->swp(data + parent * elem_size, data + idx * elem_size, args); + idx = parent; + } +} + +#define min_heap_sift_up(_heap, _idx, _func, _args) \ + __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) + /* Floyd's approach to heapification that is O(nr). */ static __always_inline void __min_heapify_all(min_heap_char *heap, size_t elem_size, -- cgit v1.2.3 From 420f171031207a13c83560b2ebb32ccc3c8ed6df Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 24 May 2024 23:29:52 +0800 Subject: lib min_heap: add min_heap_del() Add min_heap_del() to delete the element at index 'idx' in the heap. Link: https://lkml.kernel.org/r/20240524152958.919343-11-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Bagas Sanjaya Cc: Brian Foster Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kent Overstreet Cc: Mark Rutland Cc: Matthew Sakai Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index ddd0186afb77..98e838195e7e 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -212,4 +212,28 @@ void __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, #define min_heap_push(_heap, _element, _func, _args) \ __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) +/* Remove ith element from the heap, O(log2(nr)). */ +static __always_inline +bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + void *data = heap->data; + + if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap")) + return false; + + /* Place last element at the root (position 0) and then sift down. */ + heap->nr--; + if (idx == heap->nr) + return true; + func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args); + __min_heap_sift_up(heap, elem_size, idx, func, args); + __min_heapify(heap, idx, elem_size, func, args); + + return true; +} + +#define min_heap_del(_heap, _idx, _func, _args) \ + __min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) + #endif /* _LINUX_MIN_HEAP_H */ -- cgit v1.2.3 From 2eb637c649a3d89c5483dc851ce98d56717a36d2 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 24 May 2024 23:29:53 +0800 Subject: lib min_heap: update min_heap_push() and min_heap_pop() to return bool values Modify the min_heap_push() and min_heap_pop() to return a boolean value. They now return false when the operation fails and true when it succeeds. Link: https://lkml.kernel.org/r/20240524152958.919343-12-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Bagas Sanjaya Cc: Brian Foster Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kent Overstreet Cc: Mark Rutland Cc: Matthew Sakai Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 98e838195e7e..3410a5f907ad 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -147,18 +147,20 @@ void __min_heapify_all(min_heap_char *heap, size_t elem_size, /* Remove minimum element from the heap, O(log2(nr)). */ static __always_inline -void __min_heap_pop(min_heap_char *heap, size_t elem_size, +bool __min_heap_pop(min_heap_char *heap, size_t elem_size, const struct min_heap_callbacks *func, void *args) { void *data = heap->data; if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap")) - return; + return false; /* Place last element at the root (position 0) and then sift down. */ heap->nr--; memcpy(data, data + (heap->nr * elem_size), elem_size); __min_heapify(heap, 0, elem_size, func, args); + + return true; } #define min_heap_pop(_heap, _func, _args) \ @@ -184,7 +186,7 @@ void __min_heap_pop_push(min_heap_char *heap, /* Push an element on to the heap, O(log2(nr)). */ static __always_inline -void __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, +bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, const struct min_heap_callbacks *func, void *args) { void *data = heap->data; @@ -192,7 +194,7 @@ void __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, int pos; if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap")) - return; + return false; /* Place at the end of data. */ pos = heap->nr; @@ -207,6 +209,8 @@ void __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, break; func->swp(parent, child, args); } + + return true; } #define min_heap_push(_heap, _element, _func, _args) \ -- cgit v1.2.3 From bfe3127180418f1b569999954cb653b9926f75ae Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 24 May 2024 23:29:54 +0800 Subject: lib min_heap: rename min_heapify() to min_heap_sift_down() After adding min_heap_sift_up(), the naming convention has been adjusted to maintain consistency with the min_heap_sift_up(). Consequently, min_heapify() has been renamed to min_heap_sift_down(). Link: https://lkml.kernel.org/CAP-5=fVcBAxt8Mw72=NCJPRJfjDaJcqk4rjbadgouAEAHz_q1A@mail.gmail.com Link: https://lkml.kernel.org/r/20240524152958.919343-13-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Bagas Sanjaya Cc: Brian Foster Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kent Overstreet Cc: Mark Rutland Cc: Matthew Sakai Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 3410a5f907ad..0baee5787247 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -75,7 +75,7 @@ bool __min_heap_full(min_heap_char *heap) /* Sift the element at pos down the heap. */ static __always_inline -void __min_heapify(min_heap_char *heap, int pos, size_t elem_size, +void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, const struct min_heap_callbacks *func, void *args) { void *left, *right; @@ -108,8 +108,8 @@ void __min_heapify(min_heap_char *heap, int pos, size_t elem_size, } } -#define min_heapify(_heap, _pos, _func, _args) \ - __min_heapify((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) +#define min_heap_sift_down(_heap, _pos, _func, _args) \ + __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) /* Sift up ith element from the heap, O(log2(nr)). */ static __always_inline @@ -139,7 +139,7 @@ void __min_heapify_all(min_heap_char *heap, size_t elem_size, int i; for (i = heap->nr / 2 - 1; i >= 0; i--) - __min_heapify(heap, i, elem_size, func, args); + __min_heap_sift_down(heap, i, elem_size, func, args); } #define min_heapify_all(_heap, _func, _args) \ @@ -158,7 +158,7 @@ bool __min_heap_pop(min_heap_char *heap, size_t elem_size, /* Place last element at the root (position 0) and then sift down. */ heap->nr--; memcpy(data, data + (heap->nr * elem_size), elem_size); - __min_heapify(heap, 0, elem_size, func, args); + __min_heap_sift_down(heap, 0, elem_size, func, args); return true; } @@ -178,7 +178,7 @@ void __min_heap_pop_push(min_heap_char *heap, void *args) { memcpy(heap->data, element, elem_size); - __min_heapify(heap, 0, elem_size, func, args); + __min_heap_sift_down(heap, 0, elem_size, func, args); } #define min_heap_pop_push(_heap, _element, _func, _args) \ @@ -232,7 +232,7 @@ bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, return true; func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args); __min_heap_sift_up(heap, elem_size, idx, func, args); - __min_heapify(heap, idx, elem_size, func, args); + __min_heap_sift_down(heap, idx, elem_size, func, args); return true; } -- cgit v1.2.3 From e596930fc78b97b71df80df0fb1cbf2efb9fc6e4 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 24 May 2024 23:29:55 +0800 Subject: lib min_heap: update min_heap_push() to use min_heap_sift_up() Update min_heap_push() to use min_heap_sift_up() rather than its origin inline version. Link: https://lkml.kernel.org/r/20240524152958.919343-14-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Bagas Sanjaya Cc: Brian Foster Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kent Overstreet Cc: Mark Rutland Cc: Matthew Sakai Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 0baee5787247..43a7b9dcf15e 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -190,7 +190,6 @@ bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, const struct min_heap_callbacks *func, void *args) { void *data = heap->data; - void *child, *parent; int pos; if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap")) @@ -202,13 +201,7 @@ bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, heap->nr++; /* Sift child at pos up. */ - for (; pos > 0; pos = (pos - 1) / 2) { - child = data + (pos * elem_size); - parent = data + ((pos - 1) / 2) * elem_size; - if (func->less(parent, child, args)) - break; - func->swp(parent, child, args); - } + __min_heap_sift_up(heap, elem_size, pos, func, args); return true; } -- cgit v1.2.3 From 7c45d8282660cf4ce011d96bf918df50b7481b3c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 27 May 2024 17:56:44 -0700 Subject: sched: avoid using ilog2() in sched.h indirectly via cpumask.h path includes the ilog2.h header to calculate ilog2(TASK_REPORT_MAX). The following patches drops sched.h dependency on cpumask.h, and to have a successful build, the header has to be included explicitly. sched.h is a frequently included header, and it's better to keep the dependency list as small as possible. So, instead of including ilog2.h for a single BUILD_BUG_ON() check, the same check may be implemented by taking exponent of the other part of equation. Link: https://lkml.kernel.org/r/20240528005648.182376-3-yury.norov@gmail.com Signed-off-by: Yury Norov Cc: Amit Daniel Kachhap Cc: Anna-Maria Behnsen Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Dennis Zhou Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Juri Lelli Cc: Kees Cook Cc: Mathieu Desnoyers Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rasmus Villemoes Cc: Tejun Heo Cc: Thomas Gleixner Cc: Ulf Hansson Cc: Vincent Guittot Cc: Viresh Kumar Cc: Yury Norov Signed-off-by: Andrew Morton --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 61591ac6eab6..98abb07de149 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1604,7 +1604,7 @@ static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; - BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); + BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); return state_char[state]; } -- cgit v1.2.3 From eb4faa36d674eed60a522a72a3a40384f4b285d4 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 27 May 2024 17:56:45 -0700 Subject: cpumask: split out include/linux/cpumask_types.h Many core headers, like sched.h, include cpumask.h mostly for struct cpumask and cpumask_var_t. Those are frequently used headers and shouldn't pull more than the bare minimum. Link: https://lkml.kernel.org/r/20240528005648.182376-4-yury.norov@gmail.com Signed-off-by: Yury Norov Cc: Amit Daniel Kachhap Cc: Anna-Maria Behnsen Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Dennis Zhou Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Juri Lelli Cc: Kees Cook Cc: Mathieu Desnoyers Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rasmus Villemoes Cc: Tejun Heo Cc: Thomas Gleixner Cc: Ulf Hansson Cc: Vincent Guittot Cc: Viresh Kumar Signed-off-by: Andrew Morton --- include/linux/cpumask.h | 56 +----------------------------------- include/linux/cpumask_types.h | 66 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 55 deletions(-) create mode 100644 include/linux/cpumask_types.h (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 23686bed441d..76dca7b86189 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -9,25 +9,13 @@ */ #include #include -#include #include +#include #include #include #include #include -/* Don't assign or return these: may not be this big! */ -typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; - -/** - * cpumask_bits - get the bits in a cpumask - * @maskp: the struct cpumask * - * - * You should only assume nr_cpu_ids bits of this mask are valid. This is - * a macro so it's const-correct. - */ -#define cpumask_bits(maskp) ((maskp)->bits) - /** * cpumask_pr_args - printf args to output a cpumask * @maskp: cpumask to be printed @@ -922,48 +910,7 @@ static inline unsigned int cpumask_size(void) return bitmap_size(large_cpumask_bits); } -/* - * cpumask_var_t: struct cpumask for stack usage. - * - * Oh, the wicked games we play! In order to make kernel coding a - * little more difficult, we typedef cpumask_var_t to an array or a - * pointer: doing &mask on an array is a noop, so it still works. - * - * i.e. - * cpumask_var_t tmpmask; - * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) - * return -ENOMEM; - * - * ... use 'tmpmask' like a normal struct cpumask * ... - * - * free_cpumask_var(tmpmask); - * - * - * However, one notable exception is there. alloc_cpumask_var() allocates - * only nr_cpumask_bits bits (in the other hand, real cpumask_t always has - * NR_CPUS bits). Therefore you don't have to dereference cpumask_var_t. - * - * cpumask_var_t tmpmask; - * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) - * return -ENOMEM; - * - * var = *tmpmask; - * - * This code makes NR_CPUS length memcopy and brings to a memory corruption. - * cpumask_copy() provide safe copy functionality. - * - * Note that there is another evil here: If you define a cpumask_var_t - * as a percpu variable then the way to obtain the address of the cpumask - * structure differently influences what this_cpu_* operation needs to be - * used. Please use this_cpu_cpumask_var_t in those cases. The direct use - * of this_cpu_ptr() or this_cpu_read() will lead to failures when the - * other type of cpumask_var_t implementation is configured. - * - * Please also note that __cpumask_var_read_mostly can be used to declare - * a cpumask_var_t variable itself (not its content) as read mostly. - */ #ifdef CONFIG_CPUMASK_OFFSTACK -typedef struct cpumask *cpumask_var_t; #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly @@ -1010,7 +957,6 @@ static inline bool cpumask_available(cpumask_var_t mask) } #else -typedef struct cpumask cpumask_var_t[1]; #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly diff --git a/include/linux/cpumask_types.h b/include/linux/cpumask_types.h new file mode 100644 index 000000000000..461ed1b6bcdb --- /dev/null +++ b/include/linux/cpumask_types.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_CPUMASK_TYPES_H +#define __LINUX_CPUMASK_TYPES_H + +#include +#include + +/* Don't assign or return these: may not be this big! */ +typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; + +/** + * cpumask_bits - get the bits in a cpumask + * @maskp: the struct cpumask * + * + * You should only assume nr_cpu_ids bits of this mask are valid. This is + * a macro so it's const-correct. + */ +#define cpumask_bits(maskp) ((maskp)->bits) + +/* + * cpumask_var_t: struct cpumask for stack usage. + * + * Oh, the wicked games we play! In order to make kernel coding a + * little more difficult, we typedef cpumask_var_t to an array or a + * pointer: doing &mask on an array is a noop, so it still works. + * + * i.e. + * cpumask_var_t tmpmask; + * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + * return -ENOMEM; + * + * ... use 'tmpmask' like a normal struct cpumask * ... + * + * free_cpumask_var(tmpmask); + * + * + * However, one notable exception is there. alloc_cpumask_var() allocates + * only nr_cpumask_bits bits (in the other hand, real cpumask_t always has + * NR_CPUS bits). Therefore you don't have to dereference cpumask_var_t. + * + * cpumask_var_t tmpmask; + * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + * return -ENOMEM; + * + * var = *tmpmask; + * + * This code makes NR_CPUS length memcopy and brings to a memory corruption. + * cpumask_copy() provide safe copy functionality. + * + * Note that there is another evil here: If you define a cpumask_var_t + * as a percpu variable then the way to obtain the address of the cpumask + * structure differently influences what this_cpu_* operation needs to be + * used. Please use this_cpu_cpumask_var_t in those cases. The direct use + * of this_cpu_ptr() or this_cpu_read() will lead to failures when the + * other type of cpumask_var_t implementation is configured. + * + * Please also note that __cpumask_var_read_mostly can be used to declare + * a cpumask_var_t variable itself (not its content) as read mostly. + */ +#ifdef CONFIG_CPUMASK_OFFSTACK +typedef struct cpumask *cpumask_var_t; +#else +typedef struct cpumask cpumask_var_t[1]; +#endif /* CONFIG_CPUMASK_OFFSTACK */ + +#endif /* __LINUX_CPUMASK_TYPES_H */ -- cgit v1.2.3 From 361c1f04f3b43b00997b2845c7a9e1d0a9b4c38a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 27 May 2024 17:56:46 -0700 Subject: sched: drop sched.h dependency on cpumask sched.h needs cpumask.h mostly for types declaration. Now that we have cpumask_types.h, which is a significantly smaller header, we can rely on it. The only exception is UP stub for set_cpus_allowed_ptr(). The function needs to test bit #0 in a @new_mask, which can be trivially opencoded. Link: https://lkml.kernel.org/r/20240528005648.182376-5-yury.norov@gmail.com Signed-off-by: Yury Norov Cc: Amit Daniel Kachhap Cc: Anna-Maria Behnsen Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Dennis Zhou Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Juri Lelli Cc: Kees Cook Cc: Mathieu Desnoyers Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rasmus Villemoes Cc: Tejun Heo Cc: Thomas Gleixner Cc: Ulf Hansson Cc: Vincent Guittot Cc: Viresh Kumar Cc: Yury Norov Signed-off-by: Andrew Morton --- include/linux/sched.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 98abb07de149..f2f907ef1389 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include @@ -1778,7 +1778,8 @@ static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpuma } static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - if (!cpumask_test_cpu(0, new_mask)) + /* Opencoded cpumask_test_cpu(0, new_mask) to avoid dependency on cpumask.h */ + if ((*cpumask_bits(new_mask) & 1) == 0) return -EINVAL; return 0; } -- cgit v1.2.3 From 7f36688f126ba4a4ec510fa81466b1dacdec97ee Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 27 May 2024 17:56:47 -0700 Subject: cpumask: cleanup core headers inclusion Many core headers include cpumask.h for nothing. Drop it. Link: https://lkml.kernel.org/r/20240528005648.182376-6-yury.norov@gmail.com Signed-off-by: Yury Norov Cc: Amit Daniel Kachhap Cc: Anna-Maria Behnsen Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Dennis Zhou Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Juri Lelli Cc: Kees Cook Cc: Mathieu Desnoyers Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rasmus Villemoes Cc: Tejun Heo Cc: Thomas Gleixner Cc: Ulf Hansson Cc: Vincent Guittot Cc: Viresh Kumar Cc: Yury Norov Signed-off-by: Andrew Morton --- include/linux/cgroup.h | 1 - include/linux/cpu.h | 1 - include/linux/cpu_cooling.h | 1 - include/linux/kernel_stat.h | 1 - include/linux/node.h | 1 - include/linux/percpu.h | 1 - include/linux/profile.h | 1 - include/linux/rcupdate.h | 1 - include/linux/seq_file.h | 1 - include/linux/tracepoint.h | 1 - 10 files changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2150ca60394b..c60ba0ab1462 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -10,7 +10,6 @@ */ #include -#include #include #include #include diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 861c3bfc5f17..ea6ac8f98e4a 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h index a3bdc8a98f2c..2c774fb3c091 100644 --- a/include/linux/cpu_cooling.h +++ b/include/linux/cpu_cooling.h @@ -15,7 +15,6 @@ #include #include -#include struct cpufreq_policy; diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 9c042c6384bb..b97ce2df376f 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/node.h b/include/linux/node.h index dfc004e4bee7..9a881c2208b3 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -16,7 +16,6 @@ #define _LINUX_NODE_H_ #include -#include #include /** diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 03053de557cf..4b2047b78b67 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/profile.h b/include/linux/profile.h index 04ae5ebcb637..2fb487f61d12 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -4,7 +4,6 @@ #include #include -#include #include #include diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index dfd2399f2cde..fb8ab4618d63 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -29,7 +29,6 @@ #include #include #include -#include #include #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 8bd4fda6e027..2fb266ea69fa 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 689b6d71590e..6be396bb4297 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From e1b6705bcfb2797ea182e313d5ec4f57fa8571f2 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 27 May 2024 17:56:48 -0700 Subject: cpumask: make core headers including cpumask_types.h where possible Now that cpumask types are split out to a separate smaller header, many frequently included core headers may switch to using it. Link: https://lkml.kernel.org/r/20240528005648.182376-7-yury.norov@gmail.com Signed-off-by: Yury Norov Cc: Amit Daniel Kachhap Cc: Anna-Maria Behnsen Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Dennis Zhou Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Juri Lelli Cc: Kees Cook Cc: Mathieu Desnoyers Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rasmus Villemoes Cc: Tejun Heo Cc: Thomas Gleixner Cc: Ulf Hansson Cc: Vincent Guittot Cc: Viresh Kumar Cc: Yury Norov Signed-off-by: Andrew Morton --- include/linux/cacheinfo.h | 2 +- include/linux/clockchips.h | 2 +- include/linux/cpu_rmap.h | 2 +- include/linux/interrupt.h | 2 +- include/linux/irqchip/irq-partition-percpu.h | 2 +- include/linux/msi.h | 2 +- include/linux/pm_domain.h | 2 +- include/linux/stop_machine.h | 2 +- include/linux/torture.h | 2 +- include/linux/workqueue.h | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 2cb15fe4fe12..286db104e054 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -3,7 +3,7 @@ #define _LINUX_CACHEINFO_H #include -#include +#include #include struct device_node; diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 9aac31d856f3..b0df28ddd394 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -12,7 +12,7 @@ #ifdef CONFIG_GENERIC_CLOCKEVENTS # include -# include +# include # include # include diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h index cae324d10965..20b5729903d7 100644 --- a/include/linux/cpu_rmap.h +++ b/include/linux/cpu_rmap.h @@ -7,7 +7,7 @@ * Copyright 2011 Solarflare Communications Inc. */ -#include +#include #include #include #include diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5c9bdd3ffccc..136a55455529 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -5,13 +5,13 @@ #include #include -#include #include #include #include #include #include #include +#include #include #include diff --git a/include/linux/irqchip/irq-partition-percpu.h b/include/linux/irqchip/irq-partition-percpu.h index 2f6ae7551748..b35ee22c278f 100644 --- a/include/linux/irqchip/irq-partition-percpu.h +++ b/include/linux/irqchip/irq-partition-percpu.h @@ -8,7 +8,7 @@ #define __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H #include -#include +#include #include struct partition_affinity { diff --git a/include/linux/msi.h b/include/linux/msi.h index dc27cf3903d5..26588da88bdd 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -19,7 +19,7 @@ */ #include -#include +#include #include #include #include diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index f24546a3d3db..71e4f0fb8867 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include /* diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index ea7a74ea7389..3132262a404d 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -3,7 +3,7 @@ #define _LINUX_STOP_MACHINE #include -#include +#include #include #include diff --git a/include/linux/torture.h b/include/linux/torture.h index 1541454da03e..c2e979f82f8d 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index fb3993894536..52496f07fba5 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 2f183c68345a26213e5e7f798399bee68d1c4a97 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 31 May 2024 12:04:57 +0300 Subject: kernel/panic: add verbose logging of kernel taints in backtraces With nearly 20 taint flags and respective characters, it's getting a bit difficult to remember what each taint flag character means. Add verbose logging of the set taints in the format: Tainted: [P]=PROPRIETARY_MODULE, [W]=WARN in dump_stack_print_info() when there are taints. Note that the "negative flag" G is not included. Link: https://lkml.kernel.org/r/7321e306166cb2ca2807ab8639e665baa2462e9c.1717146197.git.jani.nikula@intel.com Signed-off-by: Jani Nikula Reviewed-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- include/linux/panic.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/panic.h b/include/linux/panic.h index 6717b15e798c..3130e0b5116b 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -77,9 +77,10 @@ static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout) #define TAINT_FLAGS_MAX ((1UL << TAINT_FLAGS_COUNT) - 1) struct taint_flag { - char c_true; /* character printed when tainted */ - char c_false; /* character printed when not tainted */ - bool module; /* also show as a per-module taint flag */ + char c_true; /* character printed when tainted */ + char c_false; /* character printed when not tainted */ + bool module; /* also show as a per-module taint flag */ + const char *desc; /* verbose description of the set taint flag */ }; extern const struct taint_flag taint_flags[TAINT_FLAGS_COUNT]; @@ -90,6 +91,7 @@ enum lockdep_ok { }; extern const char *print_tainted(void); +extern const char *print_tainted_verbose(void); extern void add_taint(unsigned flag, enum lockdep_ok); extern int test_taint(unsigned flag); extern unsigned long get_taint(void); -- cgit v1.2.3 From cc8d5a2f09a54405321769abfd6ec3395482336a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 25 Jun 2024 07:58:10 +0200 Subject: Revert "printk: Save console options for add_preferred_console_match()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f03e8c1060f86c23eb49bafee99d9fcbd1c1bd77. Let's roll back all of the serial core and printk console changes that went into 6.10-rc1 as there still are problems with them that need to be sorted out. Link: https://lore.kernel.org/r/ZnpRozsdw6zbjqze@tlindgre-MOBL1 Reported-by: Petr Mladek Reported-by: Tony Lindgren Cc: Jiri Slaby Cc: John Ogness Cc: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Greg Kroah-Hartman --- include/linux/printk.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 40afab23881a..65c5184470f1 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -60,9 +60,6 @@ static inline const char *printk_skip_headers(const char *buffer) #define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT #define CONSOLE_LOGLEVEL_QUIET CONFIG_CONSOLE_LOGLEVEL_QUIET -int add_preferred_console_match(const char *match, const char *name, - const short idx); - extern int console_printk[]; #define console_loglevel (console_printk[0]) -- cgit v1.2.3 From 9b6a14f08b4875aa22ea0b5bc35042e2580b311b Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Thu, 20 Jun 2024 11:23:33 +0800 Subject: fs: Export in_group_or_capable() Export in_group_or_capable() as a VFS helper function. Signed-off-by: Youling Tang Link: https://lore.kernel.org/r/20240620032335.147136-1-youling.tang@linux.dev Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bfc1e6407bf6..942cb11dba96 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1942,6 +1942,8 @@ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, extern bool may_open_dev(const struct path *path); umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode); +bool in_group_or_capable(struct mnt_idmap *idmap, + const struct inode *inode, vfsgid_t vfsgid); /* * This is the "filldir" function type, used by readdir() to let -- cgit v1.2.3 From 0fa8ab5f3533b307a7d0e438ab08ecd92725dad7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Jun 2024 14:47:27 +0200 Subject: linux/syscalls.h: add missing __user annotations A couple of declarations in linux/syscalls.h are missing __user annotations on their pointers, which can lead to warnings from sparse because these don't match the implementation that have the correct address space annotations. Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ba9337709878..63424af87bba 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -322,13 +322,13 @@ asmlinkage long sys_io_pgetevents(aio_context_t ctx_id, long nr, struct io_event __user *events, struct __kernel_timespec __user *timeout, - const struct __aio_sigset *sig); + const struct __aio_sigset __user *sig); asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id, long min_nr, long nr, struct io_event __user *events, struct old_timespec32 __user *timeout, - const struct __aio_sigset *sig); + const struct __aio_sigset __user *sig); asmlinkage long sys_io_uring_setup(u32 entries, struct io_uring_params __user *p); asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, @@ -441,7 +441,7 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, umode_t mode); asmlinkage long sys_openat2(int dfd, const char __user *filename, - struct open_how *how, size_t size); + struct open_how __user *how, size_t size); asmlinkage long sys_close(unsigned int fd); asmlinkage long sys_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); @@ -555,7 +555,7 @@ asmlinkage long sys_get_robust_list(int pid, asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); -asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, +asmlinkage long sys_futex_waitv(struct futex_waitv __user *waiters, unsigned int nr_futexes, unsigned int flags, struct __kernel_timespec __user *timeout, clockid_t clockid); @@ -907,7 +907,7 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); -asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); +asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size); asmlinkage long sys_execveat(int dfd, const char __user *filename, const char __user *const __user *argv, const char __user *const __user *envp, int flags); @@ -960,11 +960,11 @@ asmlinkage long sys_cachestat(unsigned int fd, struct cachestat_range __user *cstat_range, struct cachestat __user *cstat, unsigned int flags); asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags); -asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx, - u32 *size, u32 flags); -asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx, +asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx __user *ctx, + u32 __user *size, u32 flags); +asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx __user *ctx, u32 size, u32 flags); -asmlinkage long sys_lsm_list_modules(u64 *ids, u32 *size, u32 flags); +asmlinkage long sys_lsm_list_modules(u64 __user *ids, u32 __user *size, u32 flags); /* * Architecture-specific system calls -- cgit v1.2.3 From 98a563de231fcc91d65c6028c7f646fe28faf83a Mon Sep 17 00:00:00 2001 From: Dumitru Ceclan Date: Fri, 7 Jun 2024 17:53:09 +0300 Subject: iio: adc: ad_sigma_delta: add disable_one callback Sigma delta ADCs with a sequencer need to disable the previously enabled channel when reading using ad_sigma_delta_single_conversion(). This was done manually in drivers for devices with sequencers. This patch implements handling of single channel disabling after a single conversion. Reviewed-by: Nuno Sa Signed-off-by: Dumitru Ceclan Reviewed-by: David Lechner Link: https://patch.msgid.link/20240607-ad4111-v7-3-97e3855900a0@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 383614ebd760..f8c1d2505940 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -37,6 +37,10 @@ struct iio_dev; * @append_status: Will be called to enable status append at the end of the sample, may be NULL. * @set_mode: Will be called to select the current mode, may be NULL. * @disable_all: Will be called to disable all channels, may be NULL. + * @disable_one: Will be called to disable a single channel after + * ad_sigma_delta_single_conversion(), may be NULL. + * Usage of this callback expects iio_chan_spec.address to contain + * the value required for the driver to identify the channel. * @postprocess_sample: Is called for each sampled data word, can be used to * modify or drop the sample data, it, may be NULL. * @has_registers: true if the device has writable and readable registers, false @@ -55,6 +59,7 @@ struct ad_sigma_delta_info { int (*append_status)(struct ad_sigma_delta *, bool append); int (*set_mode)(struct ad_sigma_delta *, enum ad_sigma_delta_mode mode); int (*disable_all)(struct ad_sigma_delta *); + int (*disable_one)(struct ad_sigma_delta *, unsigned int chan); int (*postprocess_sample)(struct ad_sigma_delta *, unsigned int raw_sample); bool has_registers; unsigned int addr_shift; @@ -140,6 +145,15 @@ static inline int ad_sigma_delta_disable_all(struct ad_sigma_delta *sd) return 0; } +static inline int ad_sigma_delta_disable_one(struct ad_sigma_delta *sd, + unsigned int chan) +{ + if (sd->info->disable_one) + return sd->info->disable_one(sd, chan); + + return 0; +} + static inline int ad_sigma_delta_set_mode(struct ad_sigma_delta *sd, unsigned int mode) { -- cgit v1.2.3 From d305b7f34ee11c4901b8da150b641bcffbd3e951 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Tue, 18 Jun 2024 15:32:05 +0200 Subject: iio: imu: adis: move to the cleanup magic This makes locking and handling error paths simpler. Signed-off-by: Nuno Sa Link: https://patch.msgid.link/20240618-dev-iio-adis-cleanup-v1-2-bd93ce7845c7@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 54 +++++++++++--------------------------------- 1 file changed, 13 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 4bb0a53cf7ea..93dad627378f 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -9,6 +9,7 @@ #ifndef __IIO_ADIS_H__ #define __IIO_ADIS_H__ +#include #include #include #include @@ -150,13 +151,8 @@ int __adis_reset(struct adis *adis); */ static inline int adis_reset(struct adis *adis) { - int ret; - - mutex_lock(&adis->state_lock); - ret = __adis_reset(adis); - mutex_unlock(&adis->state_lock); - - return ret; + guard(mutex)(&adis->state_lock); + return __adis_reset(adis); } int __adis_write_reg(struct adis *adis, unsigned int reg, @@ -248,13 +244,8 @@ static inline int __adis_read_reg_32(struct adis *adis, unsigned int reg, static inline int adis_write_reg(struct adis *adis, unsigned int reg, unsigned int val, unsigned int size) { - int ret; - - mutex_lock(&adis->state_lock); - ret = __adis_write_reg(adis, reg, val, size); - mutex_unlock(&adis->state_lock); - - return ret; + guard(mutex)(&adis->state_lock); + return __adis_write_reg(adis, reg, val, size); } /** @@ -267,13 +258,8 @@ static inline int adis_write_reg(struct adis *adis, unsigned int reg, static int adis_read_reg(struct adis *adis, unsigned int reg, unsigned int *val, unsigned int size) { - int ret; - - mutex_lock(&adis->state_lock); - ret = __adis_read_reg(adis, reg, val, size); - mutex_unlock(&adis->state_lock); - - return ret; + guard(mutex)(&adis->state_lock); + return __adis_read_reg(adis, reg, val, size); } /** @@ -365,12 +351,8 @@ int __adis_update_bits_base(struct adis *adis, unsigned int reg, const u32 mask, static inline int adis_update_bits_base(struct adis *adis, unsigned int reg, const u32 mask, const u32 val, u8 size) { - int ret; - - mutex_lock(&adis->state_lock); - ret = __adis_update_bits_base(adis, reg, mask, val, size); - mutex_unlock(&adis->state_lock); - return ret; + guard(mutex)(&adis->state_lock); + return __adis_update_bits_base(adis, reg, mask, val, size); } /** @@ -411,24 +393,14 @@ int __adis_enable_irq(struct adis *adis, bool enable); static inline int adis_enable_irq(struct adis *adis, bool enable) { - int ret; - - mutex_lock(&adis->state_lock); - ret = __adis_enable_irq(adis, enable); - mutex_unlock(&adis->state_lock); - - return ret; + guard(mutex)(&adis->state_lock); + return __adis_enable_irq(adis, enable); } static inline int adis_check_status(struct adis *adis) { - int ret; - - mutex_lock(&adis->state_lock); - ret = __adis_check_status(adis); - mutex_unlock(&adis->state_lock); - - return ret; + guard(mutex)(&adis->state_lock); + return __adis_check_status(adis); } static inline void adis_dev_lock(struct adis *adis) -- cgit v1.2.3 From e6cab1ad9769a6b03dd1ba7ace106c36de85b1a8 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Tue, 18 Jun 2024 15:32:06 +0200 Subject: iio: imu: adis: add cleanup based lock helpers Add two new lock helpers that make use of the cleanup guard() and scoped_guard() macros. Thus, users won't have to worry about unlocking which is less prone to errors and allows for simpler error paths. Signed-off-by: Nuno Sa Link: https://patch.msgid.link/20240618-dev-iio-adis-cleanup-v1-3-bd93ce7845c7@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 93dad627378f..bc7332d12c44 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -403,6 +403,10 @@ static inline int adis_check_status(struct adis *adis) return __adis_check_status(adis); } +#define adis_dev_auto_lock(adis) guard(mutex)(&(adis)->state_lock) +#define adis_dev_auto_scoped_lock(adis) \ + scoped_guard(mutex, &(adis)->state_lock) + static inline void adis_dev_lock(struct adis *adis) { mutex_lock(&adis->state_lock); -- cgit v1.2.3 From bb78ad627659fc7a738356951999f2ba79e68059 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Tue, 18 Jun 2024 15:32:12 +0200 Subject: iio: imu: adis: remove legacy lock helpers Since all users were converted to the new cleanup based helper, adis_dev_lock() and adis_dev_unlock() can now be removed from the lib. Signed-off-by: Nuno Sa Link: https://patch.msgid.link/20240618-dev-iio-adis-cleanup-v1-9-bd93ce7845c7@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index bc7332d12c44..e6a75356567a 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -407,16 +407,6 @@ static inline int adis_check_status(struct adis *adis) #define adis_dev_auto_scoped_lock(adis) \ scoped_guard(mutex, &(adis)->state_lock) -static inline void adis_dev_lock(struct adis *adis) -{ - mutex_lock(&adis->state_lock); -} - -static inline void adis_dev_unlock(struct adis *adis) -{ - mutex_unlock(&adis->state_lock); -} - int adis_single_conversion(struct iio_dev *indio_dev, const struct iio_chan_spec *chan, unsigned int error_mask, int *val); -- cgit v1.2.3 From 0e942053e4dc42a760f48c1981f3239825430f15 Mon Sep 17 00:00:00 2001 From: Heng Qi Date: Fri, 21 Jun 2024 18:13:49 +0800 Subject: linux/dim: move useful macros to .h file Useful macros will be used effectively elsewhere. These will be utilized in subsequent patches. Signed-off-by: Heng Qi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240621101353.107425-2-hengqi@linux.alibaba.com Signed-off-by: Jakub Kicinski --- include/linux/dim.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dim.h b/include/linux/dim.h index f343bc9aa2ec..43398f5eade2 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -10,6 +10,13 @@ #include #include +/* Number of DIM profiles and period mode. */ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + /* * Number of events between DIM iterations. * Causes a moderation of the algorithm run. -- cgit v1.2.3 From f750dfe825b904164688adeb147950e0e0c4d262 Mon Sep 17 00:00:00 2001 From: Heng Qi Date: Fri, 21 Jun 2024 18:13:51 +0800 Subject: ethtool: provide customized dim profile management The NetDIM library, currently leveraged by an array of NICs, delivers excellent acceleration benefits. Nevertheless, NICs vary significantly in their dim profile list prerequisites. Specifically, virtio-net backends may present diverse sw or hw device implementation, making a one-size-fits-all parameter list impractical. On Alibaba Cloud, the virtio DPU's performance under the default DIM profile falls short of expectations, partly due to a mismatch in parameter configuration. I also noticed that ice/idpf/ena and other NICs have customized profilelist or placed some restrictions on dim capabilities. Motivated by this, I tried adding new params for "ethtool -C" that provides a per-device control to modify and access a device's interrupt parameters. Usage ======== The target NIC is named ethx. Assume that ethx only declares support for rx profile setting (with DIM_PROFILE_RX flag set in profile_flags) and supports modification of usec and pkt fields. 1. Query the currently customized list of the device $ ethtool -c ethx ... rx-profile: {.usec = 1, .pkts = 256, .comps = n/a,}, {.usec = 8, .pkts = 256, .comps = n/a,}, {.usec = 64, .pkts = 256, .comps = n/a,}, {.usec = 128, .pkts = 256, .comps = n/a,}, {.usec = 256, .pkts = 256, .comps = n/a,} tx-profile: n/a 2. Tune $ ethtool -C ethx rx-profile 1,1,n_2,n,n_3,3,n_4,4,n_n,5,n "n" means do not modify this field. $ ethtool -c ethx ... rx-profile: {.usec = 1, .pkts = 1, .comps = n/a,}, {.usec = 2, .pkts = 256, .comps = n/a,}, {.usec = 3, .pkts = 3, .comps = n/a,}, {.usec = 4, .pkts = 4, .comps = n/a,}, {.usec = 256, .pkts = 5, .comps = n/a,} tx-profile: n/a 3. Hint If the device does not support some type of customized dim profiles, the corresponding "n/a" will display. If the "n/a" field is being modified, -EOPNOTSUPP will be reported. Signed-off-by: Heng Qi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240621101353.107425-4-hengqi@linux.alibaba.com Signed-off-by: Jakub Kicinski --- include/linux/dim.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/ethtool.h | 4 +++- include/linux/netdevice.h | 3 +++ 3 files changed, 64 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dim.h b/include/linux/dim.h index 43398f5eade2..e0f39bd85432 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -10,6 +10,8 @@ #include #include +struct net_device; + /* Number of DIM profiles and period mode. */ #define NET_DIM_PARAMS_NUM_PROFILES 5 #define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256 @@ -45,12 +47,45 @@ * @pkts: CQ packet counter suggestion (by DIM) * @comps: Completion counter * @cq_period_mode: CQ period count mode (from CQE/EQE) + * @rcu: for asynchronous kfree_rcu */ struct dim_cq_moder { u16 usec; u16 pkts; u16 comps; u8 cq_period_mode; + struct rcu_head rcu; +}; + +#define DIM_PROFILE_RX BIT(0) /* support rx profile modification */ +#define DIM_PROFILE_TX BIT(1) /* support tx profile modification */ + +#define DIM_COALESCE_USEC BIT(0) /* support usec field modification */ +#define DIM_COALESCE_PKTS BIT(1) /* support pkts field modification */ +#define DIM_COALESCE_COMPS BIT(2) /* support comps field modification */ + +/** + * struct dim_irq_moder - Structure for irq moderation information. + * Used to collect irq moderation related information. + * + * @profile_flags: DIM_PROFILE_* + * @coal_flags: DIM_COALESCE_* for Rx and Tx + * @dim_rx_mode: Rx DIM period count mode: CQE or EQE + * @dim_tx_mode: Tx DIM period count mode: CQE or EQE + * @rx_profile: DIM profile list for Rx + * @tx_profile: DIM profile list for Tx + * @rx_dim_work: Rx DIM worker scheduled by net_dim() + * @tx_dim_work: Tx DIM worker scheduled by net_dim() + */ +struct dim_irq_moder { + u8 profile_flags; + u8 coal_flags; + u8 dim_rx_mode; + u8 dim_tx_mode; + struct dim_cq_moder __rcu *rx_profile; + struct dim_cq_moder __rcu *tx_profile; + void (*rx_dim_work)(struct work_struct *work); + void (*tx_dim_work)(struct work_struct *work); }; /** @@ -198,6 +233,29 @@ enum dim_step_result { DIM_ON_EDGE, }; +/** + * net_dim_init_irq_moder - collect information to initialize irq moderation + * @dev: target network device + * @profile_flags: Rx or Tx profile modification capability + * @coal_flags: irq moderation params flags + * @rx_mode: CQ period mode for Rx + * @tx_mode: CQ period mode for Tx + * @rx_dim_work: Rx worker called after dim decision + * @tx_dim_work: Tx worker called after dim decision + * + * Return: 0 on success or a negative error code. + */ +int net_dim_init_irq_moder(struct net_device *dev, u8 profile_flags, + u8 coal_flags, u8 rx_mode, u8 tx_mode, + void (*rx_dim_work)(struct work_struct *work), + void (*tx_dim_work)(struct work_struct *work)); + +/** + * net_dim_free_irq_moder - free fields for irq moderation + * @dev: target network device + */ +void net_dim_free_irq_moder(struct net_device *dev); + /** * dim_on_top - check if current state is a good place to stop (top location) * @dim: DIM context diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 6fd9107d3cc0..959196af7f5a 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -284,7 +284,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, #define ETHTOOL_COALESCE_TX_AGGR_MAX_BYTES BIT(24) #define ETHTOOL_COALESCE_TX_AGGR_MAX_FRAMES BIT(25) #define ETHTOOL_COALESCE_TX_AGGR_TIME_USECS BIT(26) -#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(26, 0) +#define ETHTOOL_COALESCE_RX_PROFILE BIT(27) +#define ETHTOOL_COALESCE_TX_PROFILE BIT(28) +#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(28, 0) #define ETHTOOL_COALESCE_USECS \ (ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4e81660b4462..cc18acd3c58b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2402,6 +2402,9 @@ struct net_device { /** @page_pools: page pools created for this netdevice */ struct hlist_head page_pools; #endif + + /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */ + struct dim_irq_moder *irq_moder; }; #define to_net_dev(d) container_of(d, struct net_device, dev) -- cgit v1.2.3 From 13ba28c5cd047e272f9dbbeb5c62c403873d8987 Mon Sep 17 00:00:00 2001 From: Heng Qi Date: Fri, 21 Jun 2024 18:13:52 +0800 Subject: dim: add new interfaces for initialization and getting results DIM-related mode and work have been collected in one same place, so new interfaces are added to provide convenience. Signed-off-by: Heng Qi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240621101353.107425-5-hengqi@linux.alibaba.com Signed-off-by: Jakub Kicinski --- include/linux/dim.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dim.h b/include/linux/dim.h index e0f39bd85432..1b581ff25a15 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -256,6 +256,54 @@ int net_dim_init_irq_moder(struct net_device *dev, u8 profile_flags, */ void net_dim_free_irq_moder(struct net_device *dev); +/** + * net_dim_setting - initialize DIM's cq mode and schedule worker + * @dev: target network device + * @dim: DIM context + * @is_tx: true indicates the tx direction, false indicates the rx direction + */ +void net_dim_setting(struct net_device *dev, struct dim *dim, bool is_tx); + +/** + * net_dim_work_cancel - synchronously cancel dim's worker + * @dim: DIM context + */ +void net_dim_work_cancel(struct dim *dim); + +/** + * net_dim_get_rx_irq_moder - get DIM rx results based on profile_ix + * @dev: target network device + * @dim: DIM context + * + * Return: DIM irq moderation + */ +struct dim_cq_moder +net_dim_get_rx_irq_moder(struct net_device *dev, struct dim *dim); + +/** + * net_dim_get_tx_irq_moder - get DIM tx results based on profile_ix + * @dev: target network device + * @dim: DIM context + * + * Return: DIM irq moderation + */ +struct dim_cq_moder +net_dim_get_tx_irq_moder(struct net_device *dev, struct dim *dim); + +/** + * net_dim_set_rx_mode - set DIM rx cq mode + * @dev: target network device + * @rx_mode: target rx cq mode + */ +void net_dim_set_rx_mode(struct net_device *dev, u8 rx_mode); + +/** + * net_dim_set_tx_mode - set DIM tx cq mode + * @dev: target network device + * @tx_mode: target tx cq mode + */ +void net_dim_set_tx_mode(struct net_device *dev, u8 tx_mode); + /** * dim_on_top - check if current state is a good place to stop (top location) * @dim: DIM context -- cgit v1.2.3 From eee5528890d54b22b46f833002355a5ee94c3bb4 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 24 Jun 2024 08:55:01 +0900 Subject: PCI: Add Edimax Vendor ID to pci_ids.h Add the Edimax Vendor ID (0x1432) for an ethernet driver for Tehuti Networks TN40xx chips. This ID can be used for Realtek 8180 and Ralink rt28xx wireless drivers. Signed-off-by: FUJITA Tomonori Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20240623235507.108147-2-fujita.tomonori@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 942a587bb97e..677aea20d3e1 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2126,6 +2126,8 @@ #define PCI_VENDOR_ID_CHELSIO 0x1425 +#define PCI_VENDOR_ID_EDIMAX 0x1432 + #define PCI_VENDOR_ID_ADLINK 0x144a #define PCI_VENDOR_ID_SAMSUNG 0x144d -- cgit v1.2.3 From e3943f00afdb71684c4f209f9d3a90d6b79771fc Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 19 Jun 2024 16:08:46 +0200 Subject: OPP: Introduce an OF helper function to inform if required-opps is used As being shown from a subsequent change to genpd, it's useful to understand if a device's OF node has an OPP-table described and whether it contains OPP nodes that makes use of the required-opps DT property. For this reason, let's introduce an OPP OF helper function called dev_pm_opp_of_has_required_opp(). Signed-off-by: Ulf Hansson Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index dd7c8441af42..6424692c30b7 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -474,6 +474,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpuma struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev); struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp); int of_get_required_opp_performance_state(struct device_node *np, int index); +bool dev_pm_opp_of_has_required_opp(struct device *dev); int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_table *opp_table); int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus); int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW, @@ -552,6 +553,11 @@ static inline int of_get_required_opp_performance_state(struct device_node *np, return -EOPNOTSUPP; } +static inline bool dev_pm_opp_of_has_required_opp(struct device *dev) +{ + return false; +} + static inline int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_table *opp_table) { return -EOPNOTSUPP; -- cgit v1.2.3 From 0c5275bf75ec3708d95654195ae4ed80d946d088 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Sun, 16 Jun 2024 19:10:36 +0300 Subject: RDMA/mlx5: Use sq timestamp as QP timestamp when RoCE is disabled When creating a QP, one of the attributes is TS format (timestamp). In some devices, we have a limitation that all QPs should have the same ts_format. The ts_format is chosen based on the device's capability. The qp_ts_format cap resides under the RoCE caps table, and the cap will be 0 when RoCE is disabled. So when RoCE is disabled, the value that should be queried is sq_ts_format under HCA caps. Consider the case when the system supports REAL_TIME_TS format (0x2), some QPs are created with REAL_TIME_TS as ts_format, and afterwards RoCE gets disabled. When trying to construct a new QP, we can't use the qp_ts_format, that is queried from the RoCE caps table, Since it leads to passing 0x0 (FREE_RUNNING_TS) as the value of the qp_ts_format, which is different than the ts_format of the previously allocated QPs REAL_TIME_TS format (0x2). Thus, to resolve this, read the sq_ts_format, which also reflect the supported ts format for the QP when RoCE is disabled. Fixes: 4806f1e2fee8 ("net/mlx5: Set QP timestamp mode to default") Signed-off-by: Maher Sanalla Signed-off-by: Or Har-Toov Link: https://lore.kernel.org/r/32801966eb767c7fd62b8dea3b63991d5fbfe213.1718554199.git.leon@kernel.org Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/qp.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index f0e55bf3ec8b..ad1ce650146c 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -576,9 +576,12 @@ static inline const char *mlx5_qp_state_str(int state) static inline int mlx5_get_qp_default_ts(struct mlx5_core_dev *dev) { - return !MLX5_CAP_ROCE(dev, qp_ts_format) ? - MLX5_TIMESTAMP_FORMAT_FREE_RUNNING : - MLX5_TIMESTAMP_FORMAT_DEFAULT; + u8 supported_ts_cap = mlx5_get_roce_state(dev) ? + MLX5_CAP_ROCE(dev, qp_ts_format) : + MLX5_CAP_GEN(dev, sq_ts_format); + + return supported_ts_cap ? MLX5_TIMESTAMP_FORMAT_DEFAULT : + MLX5_TIMESTAMP_FORMAT_FREE_RUNNING; } #endif /* MLX5_QP_H */ -- cgit v1.2.3 From 210b1f6576e8b367907e7ff51ef425062e1468e4 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Jun 2024 08:56:17 -0700 Subject: nvme-pci: do not directly handle subsys reset fallout Scheduling reset_work after a nvme subsystem reset is expected to fail on pcie, but this also prevents potential handling the platform's pcie services may provide that might successfully recovering the link without re-enumeration. Such examples include AER, DPC, and power's EEH. Provide a pci specific operation that safely initiates a subsystem reset, and instead of scheduling reset work, read back the status register to trigger a pcie read error. Since this only affects pci, the other fabrics drivers subscribe to a generic nvmf subsystem reset that is exactly the same as before. The loop fabric doesn't use it because nvmet doesn't support setting that property anyway. And since we're using the magic NSSR value in two places now, provide a symbolic define for it. Reported-by: Nilay Shroff Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 27faae34245d..57e27e48c913 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -25,6 +25,9 @@ #define NVME_NSID_ALL 0xffffffff +/* Special NSSR value, 'NVMe' */ +#define NVME_SUBSYS_RESET 0x4E564D65 + enum nvme_subsys_type { /* Referral to another discovery type target subsystem */ NVME_NQN_DISC = 1, -- cgit v1.2.3 From a6143bdcaf7e37ecce05df2a3d3c1c5d587f87b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 19 Jun 2024 12:11:42 +0200 Subject: mfd: stm32-timers: Unify alignment of register definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use tabs consistently for indention and properly align register names, values and comments. This improves readability (at least for my eyes). Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/da3b7f9af5794d7463aa62cbaa7251abf1af2018.1718791090.git.u.kleine-koenig@baylibre.com Signed-off-by: Lee Jones --- include/linux/mfd/stm32-timers.h | 170 +++++++++++++++++++-------------------- 1 file changed, 85 insertions(+), 85 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h index 9eb17481b07f..5794110b2b28 100644 --- a/include/linux/mfd/stm32-timers.h +++ b/include/linux/mfd/stm32-timers.h @@ -12,97 +12,97 @@ #include #include -#define TIM_CR1 0x00 /* Control Register 1 */ -#define TIM_CR2 0x04 /* Control Register 2 */ -#define TIM_SMCR 0x08 /* Slave mode control reg */ -#define TIM_DIER 0x0C /* DMA/interrupt register */ -#define TIM_SR 0x10 /* Status register */ -#define TIM_EGR 0x14 /* Event Generation Reg */ -#define TIM_CCMR1 0x18 /* Capt/Comp 1 Mode Reg */ -#define TIM_CCMR2 0x1C /* Capt/Comp 2 Mode Reg */ -#define TIM_CCER 0x20 /* Capt/Comp Enable Reg */ -#define TIM_CNT 0x24 /* Counter */ -#define TIM_PSC 0x28 /* Prescaler */ -#define TIM_ARR 0x2c /* Auto-Reload Register */ -#define TIM_CCR1 0x34 /* Capt/Comp Register 1 */ -#define TIM_CCR2 0x38 /* Capt/Comp Register 2 */ -#define TIM_CCR3 0x3C /* Capt/Comp Register 3 */ -#define TIM_CCR4 0x40 /* Capt/Comp Register 4 */ -#define TIM_BDTR 0x44 /* Break and Dead-Time Reg */ -#define TIM_DCR 0x48 /* DMA control register */ -#define TIM_DMAR 0x4C /* DMA register for transfer */ -#define TIM_TISEL 0x68 /* Input Selection */ +#define TIM_CR1 0x00 /* Control Register 1 */ +#define TIM_CR2 0x04 /* Control Register 2 */ +#define TIM_SMCR 0x08 /* Slave mode control reg */ +#define TIM_DIER 0x0C /* DMA/interrupt register */ +#define TIM_SR 0x10 /* Status register */ +#define TIM_EGR 0x14 /* Event Generation Reg */ +#define TIM_CCMR1 0x18 /* Capt/Comp 1 Mode Reg */ +#define TIM_CCMR2 0x1C /* Capt/Comp 2 Mode Reg */ +#define TIM_CCER 0x20 /* Capt/Comp Enable Reg */ +#define TIM_CNT 0x24 /* Counter */ +#define TIM_PSC 0x28 /* Prescaler */ +#define TIM_ARR 0x2c /* Auto-Reload Register */ +#define TIM_CCR1 0x34 /* Capt/Comp Register 1 */ +#define TIM_CCR2 0x38 /* Capt/Comp Register 2 */ +#define TIM_CCR3 0x3C /* Capt/Comp Register 3 */ +#define TIM_CCR4 0x40 /* Capt/Comp Register 4 */ +#define TIM_BDTR 0x44 /* Break and Dead-Time Reg */ +#define TIM_DCR 0x48 /* DMA control register */ +#define TIM_DMAR 0x4C /* DMA register for transfer */ +#define TIM_TISEL 0x68 /* Input Selection */ -#define TIM_CR1_CEN BIT(0) /* Counter Enable */ -#define TIM_CR1_DIR BIT(4) /* Counter Direction */ -#define TIM_CR1_ARPE BIT(7) /* Auto-reload Preload Ena */ -#define TIM_CR2_MMS (BIT(4) | BIT(5) | BIT(6)) /* Master mode selection */ -#define TIM_CR2_MMS2 GENMASK(23, 20) /* Master mode selection 2 */ -#define TIM_SMCR_SMS (BIT(0) | BIT(1) | BIT(2)) /* Slave mode selection */ -#define TIM_SMCR_TS (BIT(4) | BIT(5) | BIT(6)) /* Trigger selection */ -#define TIM_DIER_UIE BIT(0) /* Update interrupt */ -#define TIM_DIER_CC1IE BIT(1) /* CC1 Interrupt Enable */ -#define TIM_DIER_CC2IE BIT(2) /* CC2 Interrupt Enable */ -#define TIM_DIER_CC3IE BIT(3) /* CC3 Interrupt Enable */ -#define TIM_DIER_CC4IE BIT(4) /* CC4 Interrupt Enable */ -#define TIM_DIER_CC_IE(x) BIT((x) + 1) /* CC1, CC2, CC3, CC4 interrupt enable */ -#define TIM_DIER_UDE BIT(8) /* Update DMA request Enable */ -#define TIM_DIER_CC1DE BIT(9) /* CC1 DMA request Enable */ -#define TIM_DIER_CC2DE BIT(10) /* CC2 DMA request Enable */ -#define TIM_DIER_CC3DE BIT(11) /* CC3 DMA request Enable */ -#define TIM_DIER_CC4DE BIT(12) /* CC4 DMA request Enable */ -#define TIM_DIER_COMDE BIT(13) /* COM DMA request Enable */ -#define TIM_DIER_TDE BIT(14) /* Trigger DMA request Enable */ -#define TIM_SR_UIF BIT(0) /* Update interrupt flag */ -#define TIM_SR_CC_IF(x) BIT((x) + 1) /* CC1, CC2, CC3, CC4 interrupt flag */ -#define TIM_EGR_UG BIT(0) /* Update Generation */ -#define TIM_CCMR_PE BIT(3) /* Channel Preload Enable */ -#define TIM_CCMR_M1 (BIT(6) | BIT(5)) /* Channel PWM Mode 1 */ -#define TIM_CCMR_CC1S (BIT(0) | BIT(1)) /* Capture/compare 1 sel */ -#define TIM_CCMR_IC1PSC GENMASK(3, 2) /* Input capture 1 prescaler */ -#define TIM_CCMR_CC2S (BIT(8) | BIT(9)) /* Capture/compare 2 sel */ -#define TIM_CCMR_IC2PSC GENMASK(11, 10) /* Input capture 2 prescaler */ -#define TIM_CCMR_CC1S_TI1 BIT(0) /* IC1/IC3 selects TI1/TI3 */ -#define TIM_CCMR_CC1S_TI2 BIT(1) /* IC1/IC3 selects TI2/TI4 */ -#define TIM_CCMR_CC2S_TI2 BIT(8) /* IC2/IC4 selects TI2/TI4 */ -#define TIM_CCMR_CC2S_TI1 BIT(9) /* IC2/IC4 selects TI1/TI3 */ -#define TIM_CCMR_CC3S (BIT(0) | BIT(1)) /* Capture/compare 3 sel */ -#define TIM_CCMR_CC4S (BIT(8) | BIT(9)) /* Capture/compare 4 sel */ -#define TIM_CCMR_CC3S_TI3 BIT(0) /* IC3 selects TI3 */ -#define TIM_CCMR_CC4S_TI4 BIT(8) /* IC4 selects TI4 */ -#define TIM_CCER_CC1E BIT(0) /* Capt/Comp 1 out Ena */ -#define TIM_CCER_CC1P BIT(1) /* Capt/Comp 1 Polarity */ -#define TIM_CCER_CC1NE BIT(2) /* Capt/Comp 1N out Ena */ -#define TIM_CCER_CC1NP BIT(3) /* Capt/Comp 1N Polarity */ -#define TIM_CCER_CC2E BIT(4) /* Capt/Comp 2 out Ena */ -#define TIM_CCER_CC2P BIT(5) /* Capt/Comp 2 Polarity */ -#define TIM_CCER_CC2NP BIT(7) /* Capt/Comp 2N Polarity */ -#define TIM_CCER_CC3E BIT(8) /* Capt/Comp 3 out Ena */ -#define TIM_CCER_CC3P BIT(9) /* Capt/Comp 3 Polarity */ -#define TIM_CCER_CC3NP BIT(11) /* Capt/Comp 3N Polarity */ -#define TIM_CCER_CC4E BIT(12) /* Capt/Comp 4 out Ena */ -#define TIM_CCER_CC4P BIT(13) /* Capt/Comp 4 Polarity */ -#define TIM_CCER_CC4NP BIT(15) /* Capt/Comp 4N Polarity */ -#define TIM_CCER_CCXE (BIT(0) | BIT(4) | BIT(8) | BIT(12)) -#define TIM_BDTR_BKE(x) BIT(12 + (x) * 12) /* Break input enable */ -#define TIM_BDTR_BKP(x) BIT(13 + (x) * 12) /* Break input polarity */ -#define TIM_BDTR_AOE BIT(14) /* Automatic Output Enable */ -#define TIM_BDTR_MOE BIT(15) /* Main Output Enable */ -#define TIM_BDTR_BKF(x) (0xf << (16 + (x) * 4)) -#define TIM_DCR_DBA GENMASK(4, 0) /* DMA base addr */ -#define TIM_DCR_DBL GENMASK(12, 8) /* DMA burst len */ +#define TIM_CR1_CEN BIT(0) /* Counter Enable */ +#define TIM_CR1_DIR BIT(4) /* Counter Direction */ +#define TIM_CR1_ARPE BIT(7) /* Auto-reload Preload Ena */ +#define TIM_CR2_MMS (BIT(4) | BIT(5) | BIT(6)) /* Master mode selection */ +#define TIM_CR2_MMS2 GENMASK(23, 20) /* Master mode selection 2 */ +#define TIM_SMCR_SMS (BIT(0) | BIT(1) | BIT(2)) /* Slave mode selection */ +#define TIM_SMCR_TS (BIT(4) | BIT(5) | BIT(6)) /* Trigger selection */ +#define TIM_DIER_UIE BIT(0) /* Update interrupt */ +#define TIM_DIER_CC1IE BIT(1) /* CC1 Interrupt Enable */ +#define TIM_DIER_CC2IE BIT(2) /* CC2 Interrupt Enable */ +#define TIM_DIER_CC3IE BIT(3) /* CC3 Interrupt Enable */ +#define TIM_DIER_CC4IE BIT(4) /* CC4 Interrupt Enable */ +#define TIM_DIER_CC_IE(x) BIT((x) + 1) /* CC1, CC2, CC3, CC4 interrupt enable */ +#define TIM_DIER_UDE BIT(8) /* Update DMA request Enable */ +#define TIM_DIER_CC1DE BIT(9) /* CC1 DMA request Enable */ +#define TIM_DIER_CC2DE BIT(10) /* CC2 DMA request Enable */ +#define TIM_DIER_CC3DE BIT(11) /* CC3 DMA request Enable */ +#define TIM_DIER_CC4DE BIT(12) /* CC4 DMA request Enable */ +#define TIM_DIER_COMDE BIT(13) /* COM DMA request Enable */ +#define TIM_DIER_TDE BIT(14) /* Trigger DMA request Enable */ +#define TIM_SR_UIF BIT(0) /* Update interrupt flag */ +#define TIM_SR_CC_IF(x) BIT((x) + 1) /* CC1, CC2, CC3, CC4 interrupt flag */ +#define TIM_EGR_UG BIT(0) /* Update Generation */ +#define TIM_CCMR_PE BIT(3) /* Channel Preload Enable */ +#define TIM_CCMR_M1 (BIT(6) | BIT(5)) /* Channel PWM Mode 1 */ +#define TIM_CCMR_CC1S (BIT(0) | BIT(1)) /* Capture/compare 1 sel */ +#define TIM_CCMR_IC1PSC GENMASK(3, 2) /* Input capture 1 prescaler */ +#define TIM_CCMR_CC2S (BIT(8) | BIT(9)) /* Capture/compare 2 sel */ +#define TIM_CCMR_IC2PSC GENMASK(11, 10) /* Input capture 2 prescaler */ +#define TIM_CCMR_CC1S_TI1 BIT(0) /* IC1/IC3 selects TI1/TI3 */ +#define TIM_CCMR_CC1S_TI2 BIT(1) /* IC1/IC3 selects TI2/TI4 */ +#define TIM_CCMR_CC2S_TI2 BIT(8) /* IC2/IC4 selects TI2/TI4 */ +#define TIM_CCMR_CC2S_TI1 BIT(9) /* IC2/IC4 selects TI1/TI3 */ +#define TIM_CCMR_CC3S (BIT(0) | BIT(1)) /* Capture/compare 3 sel */ +#define TIM_CCMR_CC4S (BIT(8) | BIT(9)) /* Capture/compare 4 sel */ +#define TIM_CCMR_CC3S_TI3 BIT(0) /* IC3 selects TI3 */ +#define TIM_CCMR_CC4S_TI4 BIT(8) /* IC4 selects TI4 */ +#define TIM_CCER_CC1E BIT(0) /* Capt/Comp 1 out Ena */ +#define TIM_CCER_CC1P BIT(1) /* Capt/Comp 1 Polarity */ +#define TIM_CCER_CC1NE BIT(2) /* Capt/Comp 1N out Ena */ +#define TIM_CCER_CC1NP BIT(3) /* Capt/Comp 1N Polarity */ +#define TIM_CCER_CC2E BIT(4) /* Capt/Comp 2 out Ena */ +#define TIM_CCER_CC2P BIT(5) /* Capt/Comp 2 Polarity */ +#define TIM_CCER_CC2NP BIT(7) /* Capt/Comp 2N Polarity */ +#define TIM_CCER_CC3E BIT(8) /* Capt/Comp 3 out Ena */ +#define TIM_CCER_CC3P BIT(9) /* Capt/Comp 3 Polarity */ +#define TIM_CCER_CC3NP BIT(11) /* Capt/Comp 3N Polarity */ +#define TIM_CCER_CC4E BIT(12) /* Capt/Comp 4 out Ena */ +#define TIM_CCER_CC4P BIT(13) /* Capt/Comp 4 Polarity */ +#define TIM_CCER_CC4NP BIT(15) /* Capt/Comp 4N Polarity */ +#define TIM_CCER_CCXE (BIT(0) | BIT(4) | BIT(8) | BIT(12)) +#define TIM_BDTR_BKE(x) BIT(12 + (x) * 12) /* Break input enable */ +#define TIM_BDTR_BKP(x) BIT(13 + (x) * 12) /* Break input polarity */ +#define TIM_BDTR_AOE BIT(14) /* Automatic Output Enable */ +#define TIM_BDTR_MOE BIT(15) /* Main Output Enable */ +#define TIM_BDTR_BKF(x) (0xf << (16 + (x) * 4)) +#define TIM_DCR_DBA GENMASK(4, 0) /* DMA base addr */ +#define TIM_DCR_DBL GENMASK(12, 8) /* DMA burst len */ -#define MAX_TIM_PSC 0xFFFF -#define MAX_TIM_ICPSC 0x3 -#define TIM_CR2_MMS_SHIFT 4 -#define TIM_CR2_MMS2_SHIFT 20 +#define MAX_TIM_PSC 0xFFFF +#define MAX_TIM_ICPSC 0x3 +#define TIM_CR2_MMS_SHIFT 4 +#define TIM_CR2_MMS2_SHIFT 20 #define TIM_SMCR_SMS_SLAVE_MODE_DISABLED 0 /* counts on internal clock when CEN=1 */ #define TIM_SMCR_SMS_ENCODER_MODE_1 1 /* counts TI1FP1 edges, depending on TI2FP2 level */ #define TIM_SMCR_SMS_ENCODER_MODE_2 2 /* counts TI2FP2 edges, depending on TI1FP1 level */ #define TIM_SMCR_SMS_ENCODER_MODE_3 3 /* counts on both TI1FP1 and TI2FP2 edges */ -#define TIM_SMCR_TS_SHIFT 4 -#define TIM_BDTR_BKF_MASK 0xF -#define TIM_BDTR_BKF_SHIFT(x) (16 + (x) * 4) +#define TIM_SMCR_TS_SHIFT 4 +#define TIM_BDTR_BKF_MASK 0xF +#define TIM_BDTR_BKF_SHIFT(x) (16 + (x) * 4) enum stm32_timers_dmas { STM32_TIMERS_DMA_CH1, -- cgit v1.2.3 From 796b942f65967af55684bf520812787b97522151 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 19 Jun 2024 12:11:43 +0200 Subject: mfd: stm32-timers: Add some register definitions with a parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are some registers that belong together and are numbered from 1 to 4. Introduce a macro definition for these that takes the channel number as parameter and define the previously available constants using the new ones. This allows to simplify some users that up to now use constructs like TIM_CCER_CC1NE << (ch * 4) which is an ugly mix of using a predefined value and still knowing internal details about it. Note that there are several decrements by 1 involved. These are necessary because software guys start counting at 0 while the hardware designer started at 1 (and having TIM_CCER_CCxE(1) be TIM_CCER_CC2E isn't a sane option). The compiler is expected to optimize these out nicely. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/05df15f61dde81033407d3b4fcb67ee403ecc8db.1718791090.git.u.kleine-koenig@baylibre.com Signed-off-by: Lee Jones --- include/linux/mfd/stm32-timers.h | 60 +++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h index 5794110b2b28..92b45a559656 100644 --- a/include/linux/mfd/stm32-timers.h +++ b/include/linux/mfd/stm32-timers.h @@ -24,10 +24,11 @@ #define TIM_CNT 0x24 /* Counter */ #define TIM_PSC 0x28 /* Prescaler */ #define TIM_ARR 0x2c /* Auto-Reload Register */ -#define TIM_CCR1 0x34 /* Capt/Comp Register 1 */ -#define TIM_CCR2 0x38 /* Capt/Comp Register 2 */ -#define TIM_CCR3 0x3C /* Capt/Comp Register 3 */ -#define TIM_CCR4 0x40 /* Capt/Comp Register 4 */ +#define TIM_CCRx(x) (0x34 + 4 * ((x) - 1)) /* Capt/Comp Register x (x ∈ {1, .. 4}) */ +#define TIM_CCR1 TIM_CCRx(1) /* Capt/Comp Register 1 */ +#define TIM_CCR2 TIM_CCRx(2) /* Capt/Comp Register 2 */ +#define TIM_CCR3 TIM_CCRx(3) /* Capt/Comp Register 3 */ +#define TIM_CCR4 TIM_CCRx(4) /* Capt/Comp Register 4 */ #define TIM_BDTR 0x44 /* Break and Dead-Time Reg */ #define TIM_DCR 0x48 /* DMA control register */ #define TIM_DMAR 0x4C /* DMA register for transfer */ @@ -41,16 +42,18 @@ #define TIM_SMCR_SMS (BIT(0) | BIT(1) | BIT(2)) /* Slave mode selection */ #define TIM_SMCR_TS (BIT(4) | BIT(5) | BIT(6)) /* Trigger selection */ #define TIM_DIER_UIE BIT(0) /* Update interrupt */ -#define TIM_DIER_CC1IE BIT(1) /* CC1 Interrupt Enable */ -#define TIM_DIER_CC2IE BIT(2) /* CC2 Interrupt Enable */ -#define TIM_DIER_CC3IE BIT(3) /* CC3 Interrupt Enable */ -#define TIM_DIER_CC4IE BIT(4) /* CC4 Interrupt Enable */ +#define TIM_DIER_CCxIE(x) BIT(1 + ((x) - 1)) /* CCx Interrupt Enable (x ∈ {1, .. 4}) */ +#define TIM_DIER_CC1IE TIM_DIER_CCxIE(1) /* CC1 Interrupt Enable */ +#define TIM_DIER_CC2IE TIM_DIER_CCxIE(2) /* CC2 Interrupt Enable */ +#define TIM_DIER_CC3IE TIM_DIER_CCxIE(3) /* CC3 Interrupt Enable */ +#define TIM_DIER_CC4IE TIM_DIER_CCxIE(4) /* CC4 Interrupt Enable */ #define TIM_DIER_CC_IE(x) BIT((x) + 1) /* CC1, CC2, CC3, CC4 interrupt enable */ #define TIM_DIER_UDE BIT(8) /* Update DMA request Enable */ -#define TIM_DIER_CC1DE BIT(9) /* CC1 DMA request Enable */ -#define TIM_DIER_CC2DE BIT(10) /* CC2 DMA request Enable */ -#define TIM_DIER_CC3DE BIT(11) /* CC3 DMA request Enable */ -#define TIM_DIER_CC4DE BIT(12) /* CC4 DMA request Enable */ +#define TIM_DIER_CCxDE(x) BIT(9 + ((x) - 1)) /* CCx DMA request Enable (x ∈ {1, .. 4}) */ +#define TIM_DIER_CC1DE TIM_DIER_CCxDE(1) /* CC1 DMA request Enable */ +#define TIM_DIER_CC2DE TIM_DIER_CCxDE(2) /* CC2 DMA request Enable */ +#define TIM_DIER_CC3DE TIM_DIER_CCxDE(3) /* CC3 DMA request Enable */ +#define TIM_DIER_CC4DE TIM_DIER_CCxDE(4) /* CC4 DMA request Enable */ #define TIM_DIER_COMDE BIT(13) /* COM DMA request Enable */ #define TIM_DIER_TDE BIT(14) /* Trigger DMA request Enable */ #define TIM_SR_UIF BIT(0) /* Update interrupt flag */ @@ -70,19 +73,26 @@ #define TIM_CCMR_CC4S (BIT(8) | BIT(9)) /* Capture/compare 4 sel */ #define TIM_CCMR_CC3S_TI3 BIT(0) /* IC3 selects TI3 */ #define TIM_CCMR_CC4S_TI4 BIT(8) /* IC4 selects TI4 */ -#define TIM_CCER_CC1E BIT(0) /* Capt/Comp 1 out Ena */ -#define TIM_CCER_CC1P BIT(1) /* Capt/Comp 1 Polarity */ -#define TIM_CCER_CC1NE BIT(2) /* Capt/Comp 1N out Ena */ -#define TIM_CCER_CC1NP BIT(3) /* Capt/Comp 1N Polarity */ -#define TIM_CCER_CC2E BIT(4) /* Capt/Comp 2 out Ena */ -#define TIM_CCER_CC2P BIT(5) /* Capt/Comp 2 Polarity */ -#define TIM_CCER_CC2NP BIT(7) /* Capt/Comp 2N Polarity */ -#define TIM_CCER_CC3E BIT(8) /* Capt/Comp 3 out Ena */ -#define TIM_CCER_CC3P BIT(9) /* Capt/Comp 3 Polarity */ -#define TIM_CCER_CC3NP BIT(11) /* Capt/Comp 3N Polarity */ -#define TIM_CCER_CC4E BIT(12) /* Capt/Comp 4 out Ena */ -#define TIM_CCER_CC4P BIT(13) /* Capt/Comp 4 Polarity */ -#define TIM_CCER_CC4NP BIT(15) /* Capt/Comp 4N Polarity */ +#define TIM_CCER_CCxE(x) BIT(0 + 4 * ((x) - 1)) /* Capt/Comp x out Ena (x ∈ {1, .. 4}) */ +#define TIM_CCER_CCxP(x) BIT(1 + 4 * ((x) - 1)) /* Capt/Comp x Polarity (x ∈ {1, .. 4}) */ +#define TIM_CCER_CCxNE(x) BIT(2 + 4 * ((x) - 1)) /* Capt/Comp xN out Ena (x ∈ {1, .. 4}) */ +#define TIM_CCER_CCxNP(x) BIT(3 + 4 * ((x) - 1)) /* Capt/Comp xN Polarity (x ∈ {1, .. 4}) */ +#define TIM_CCER_CC1E TIM_CCER_CCxE(1) /* Capt/Comp 1 out Ena */ +#define TIM_CCER_CC1P TIM_CCER_CCxP(1) /* Capt/Comp 1 Polarity */ +#define TIM_CCER_CC1NE TIM_CCER_CCxNE(1) /* Capt/Comp 1N out Ena */ +#define TIM_CCER_CC1NP TIM_CCER_CCxNP(1) /* Capt/Comp 1N Polarity */ +#define TIM_CCER_CC2E TIM_CCER_CCxE(2) /* Capt/Comp 2 out Ena */ +#define TIM_CCER_CC2P TIM_CCER_CCxP(2) /* Capt/Comp 2 Polarity */ +#define TIM_CCER_CC2NE TIM_CCER_CCxNE(2) /* Capt/Comp 2N out Ena */ +#define TIM_CCER_CC2NP TIM_CCER_CCxNP(2) /* Capt/Comp 2N Polarity */ +#define TIM_CCER_CC3E TIM_CCER_CCxE(3) /* Capt/Comp 3 out Ena */ +#define TIM_CCER_CC3P TIM_CCER_CCxP(3) /* Capt/Comp 3 Polarity */ +#define TIM_CCER_CC3NE TIM_CCER_CCxNE(3) /* Capt/Comp 3N out Ena */ +#define TIM_CCER_CC3NP TIM_CCER_CCxNP(3) /* Capt/Comp 3N Polarity */ +#define TIM_CCER_CC4E TIM_CCER_CCxE(4) /* Capt/Comp 4 out Ena */ +#define TIM_CCER_CC4P TIM_CCER_CCxP(4) /* Capt/Comp 4 Polarity */ +#define TIM_CCER_CC4NE TIM_CCER_CCxNE(4) /* Capt/Comp 4N out Ena */ +#define TIM_CCER_CC4NP TIM_CCER_CCxNP(4) /* Capt/Comp 4N Polarity */ #define TIM_CCER_CCXE (BIT(0) | BIT(4) | BIT(8) | BIT(12)) #define TIM_BDTR_BKE(x) BIT(12 + (x) * 12) /* Break input enable */ #define TIM_BDTR_BKP(x) BIT(13 + (x) * 12) /* Break input polarity */ -- cgit v1.2.3 From 304d02aa711369da89b4f8c01702bf1b5d1f7abc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 19 Jun 2024 12:11:45 +0200 Subject: mfd: stm32-timers: Drop unused TIM_DIER_CC_IE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This macro is misleading as TIM_DIER_CC_IE(1) == TIM_DIER_CC2IE . The only user was updated to use TIM_DIER_CCxIE() instead which doesn't suffer from this mismatch, so TIM_DIER_CC_IE can be dropped. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/6c8fcc4ed159992a1dbb0796087e6ceb10c39c96.1718791090.git.u.kleine-koenig@baylibre.com Signed-off-by: Lee Jones --- include/linux/mfd/stm32-timers.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h index 92b45a559656..f09ba598c97a 100644 --- a/include/linux/mfd/stm32-timers.h +++ b/include/linux/mfd/stm32-timers.h @@ -47,7 +47,6 @@ #define TIM_DIER_CC2IE TIM_DIER_CCxIE(2) /* CC2 Interrupt Enable */ #define TIM_DIER_CC3IE TIM_DIER_CCxIE(3) /* CC3 Interrupt Enable */ #define TIM_DIER_CC4IE TIM_DIER_CCxIE(4) /* CC4 Interrupt Enable */ -#define TIM_DIER_CC_IE(x) BIT((x) + 1) /* CC1, CC2, CC3, CC4 interrupt enable */ #define TIM_DIER_UDE BIT(8) /* Update DMA request Enable */ #define TIM_DIER_CCxDE(x) BIT(9 + ((x) - 1)) /* CCx DMA request Enable (x ∈ {1, .. 4}) */ #define TIM_DIER_CC1DE TIM_DIER_CCxDE(1) /* CC1 DMA request Enable */ -- cgit v1.2.3 From cf546dd289e0f6d2594c25e2fb4e19ee67c6d988 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 27 May 2024 17:40:10 +0200 Subject: block: change rq_integrity_vec to respect the iterator If we allocate a bio that is larger than NVMe maximum request size, attach integrity metadata to it and send it to the NVMe subsystem, the integrity metadata will be corrupted. Splitting the bio works correctly. The function bio_split will clone the bio, trim the iterator of the first bio and advance the iterator of the second bio. However, the function rq_integrity_vec has a bug - it returns the first vector of the bio's metadata and completely disregards the metadata iterator that was advanced when the bio was split. Thus, the second bio uses the same metadata as the first bio and this leads to metadata corruption. This commit changes rq_integrity_vec, so that it calls mp_bvec_iter_bvec instead of returning the first vector. mp_bvec_iter_bvec reads the iterator and uses it to build a bvec for the current position in the iterator. The "queue_max_integrity_segments(rq->q) > 1" check was removed, because the updated rq_integrity_vec function works correctly with multiple segments. Signed-off-by: Mikulas Patocka Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/49d1afaa-f934-6ed2-a678-e0d428c63a65@redhat.com Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index d201140d77a3..0fdd62e6d4b0 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -90,14 +90,13 @@ static inline bool blk_integrity_rq(struct request *rq) } /* - * Return the first bvec that contains integrity data. Only drivers that are - * limited to a single integrity segment should use this helper. + * Return the current bvec that contains the integrity data. bip_iter may be + * advanced to iterate over the integrity data. */ -static inline struct bio_vec *rq_integrity_vec(struct request *rq) +static inline struct bio_vec rq_integrity_vec(struct request *rq) { - if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1)) - return NULL; - return rq->bio->bi_integrity->bip_vec; + return mp_bvec_iter_bvec(rq->bio->bi_integrity->bip_vec, + rq->bio->bi_integrity->bip_iter); } #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline int blk_rq_count_integrity_sg(struct request_queue *q, @@ -148,7 +147,8 @@ static inline int blk_integrity_rq(struct request *rq) static inline struct bio_vec *rq_integrity_vec(struct request *rq) { - return NULL; + /* the optimizer will remove all calls to this function */ + return (struct bio_vec){ }; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ -- cgit v1.2.3 From 5d55721d6e24c8e99cc86ee1fcb90d776ef47964 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 23 Jun 2024 08:45:33 +0200 Subject: power: supply: samsung-sdi-battery: Constify struct power_supply_vbat_ri_table 'struct power_supply_vbat_ri_table' are not modified in this driver. Constifying these structures moves some data to a read-only section, so increase overall security. In order to do it, some code also needs to be adjusted to this new const qualifier. On a x86_64, with allmodconfig: Before: ====== $ size drivers/power/supply/samsung-sdi-battery.o text data bss dec hex filename 955 7664 0 8619 21ab drivers/power/supply/samsung-sdi-battery.o After: ===== $ size drivers/power/supply/samsung-sdi-battery.o text data bss dec hex filename 4055 4584 0 8639 21bf drivers/power/supply/samsung-sdi-battery.o Signed-off-by: Christophe JAILLET Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/d01818abd880bf435d1106a9a6cc11a7a8a3e661.1719125040.git.christophe.jaillet@wanadoo.fr Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 65082ef75692..5061eeecf62e 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -755,9 +755,9 @@ struct power_supply_battery_info { int ocv_table_size[POWER_SUPPLY_OCV_TEMP_MAX]; struct power_supply_resistance_temp_table *resist_table; int resist_table_size; - struct power_supply_vbat_ri_table *vbat2ri_discharging; + const struct power_supply_vbat_ri_table *vbat2ri_discharging; int vbat2ri_discharging_size; - struct power_supply_vbat_ri_table *vbat2ri_charging; + const struct power_supply_vbat_ri_table *vbat2ri_charging; int vbat2ri_charging_size; int bti_resistance_ohm; int bti_resistance_tolerance; -- cgit v1.2.3 From 0b209ec85b2b73c38a09ba71dc05fbe4aee7be67 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 23 Jun 2024 10:04:45 +0200 Subject: power: supply: samsung-sdi-battery: Constify struct power_supply_maintenance_charge_table 'struct power_supply_maintenance_charge_table' is not modified in this driver. Constifying this structure moves some data to a read-only section, so increase overall security. In order to do it, some code also needs to be adjusted to this new const qualifier. On a x86_64, with allmodconfig: Before: ====== $ size drivers/power/supply/samsung-sdi-battery.o text data bss dec hex filename 4055 4584 0 8639 21bf drivers/power/supply/samsung-sdi-battery.o After: ===== $ size drivers/power/supply/samsung-sdi-battery.o text data bss dec hex filename 4087 4552 0 8639 21bf drivers/power/supply/samsung-sdi-battery.o Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/6caafd0ac2556a40405273b1a4badc508ea8e9b0.1719125040.git.christophe.jaillet@wanadoo.fr Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 5061eeecf62e..72dc7e45c90c 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -736,7 +736,7 @@ struct power_supply_battery_info { int overvoltage_limit_uv; int constant_charge_current_max_ua; int constant_charge_voltage_max_uv; - struct power_supply_maintenance_charge_table *maintenance_charge; + const struct power_supply_maintenance_charge_table *maintenance_charge; int maintenance_charge_size; int alert_low_temp_charge_current_ua; int alert_low_temp_charge_voltage_uv; @@ -810,7 +810,7 @@ power_supply_temp2resist_simple(struct power_supply_resistance_temp_table *table int table_len, int temp); extern int power_supply_vbat2ri(struct power_supply_battery_info *info, int vbat_uv, bool charging); -extern struct power_supply_maintenance_charge_table * +extern const struct power_supply_maintenance_charge_table * power_supply_get_maintenance_charging_setting(struct power_supply_battery_info *info, int index); extern bool power_supply_battery_bti_in_range(struct power_supply_battery_info *info, int resistance); @@ -824,7 +824,7 @@ extern int power_supply_set_battery_charged(struct power_supply *psy); static inline bool power_supply_supports_maintenance_charging(struct power_supply_battery_info *info) { - struct power_supply_maintenance_charge_table *mt; + const struct power_supply_maintenance_charge_table *mt; mt = power_supply_get_maintenance_charging_setting(info, 0); -- cgit v1.2.3 From aaa53168cbcc486ca1927faac00bd99e81d4ff04 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 28 May 2024 13:32:34 +0200 Subject: dm: optimize flushes Device mapper sends flush bios to all the targets and the targets send it to the underlying device. That may be inefficient, for example if a table contains 10 linear targets pointing to the same physical device, then device mapper would send 10 flush bios to that device - despite the fact that only one bio would be sufficient. This commit optimizes the flush behavior. It introduces a per-target variable flush_bypasses_map - it is set when the target supports flush optimization - currently, the dm-linear and dm-stripe targets support it. When all the targets in a table have flush_bypasses_map, flush_bypasses_map on the table is set. __send_empty_flush tests if the table has flush_bypasses_map - and if it has, no flush bios are sent to the targets via the "map" method and the list dm_table->devices is iterated and the flush bios are sent to each member of the list. Signed-off-by: Mikulas Patocka Reviewed-by: Mike Snitzer Suggested-by: Yang Yang --- include/linux/device-mapper.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 82b2195efaca..3611b230d0aa 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -397,6 +397,21 @@ struct dm_target { * bio_set_dev(). NOTE: ideally a target should _not_ need this. */ bool needs_bio_set_dev:1; + + /* + * Set if the target supports flush optimization. If all the targets in + * a table have flush_bypasses_map set, the dm core will not send + * flushes to the targets via a ->map method. It will iterate over + * dm_table->devices and send flushes to the devices directly. This + * optimization reduces the number of flushes being sent when multiple + * targets in a table use the same underlying device. + * + * This optimization may be enabled on targets that just pass the + * flushes to the underlying devices without performing any other + * actions on the flush request. Currently, dm-linear and dm-stripe + * support it. + */ + bool flush_bypasses_map:1; }; void *dm_per_bio_data(struct bio *bio, size_t data_size); -- cgit v1.2.3 From ec9b1cf0b0ebfb52274971a8a0e74e0a133f64fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:24 +0200 Subject: block: rename BLK_FEAT_MISALIGNED This is a flag for ->flags and not a feature for ->features. And fix the one place that actually incorrectly cleared it from ->features. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240626142637.300624-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b2f1362c4681..1a7e9d9c16d7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -347,7 +347,7 @@ enum { BLK_FLAG_WRITE_CACHE_DISABLED = (1u << 0), /* I/O topology is misaligned */ - BLK_FEAT_MISALIGNED = (1u << 1), + BLK_FLAG_MISALIGNED = (1u << 1), }; struct queue_limits { -- cgit v1.2.3 From fcf865e357f80285af12c0c9a49f89d71acb7f4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:25 +0200 Subject: block: convert features and flags to __bitwise types ... and let sparse help us catch mismatches or abuses. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240626142637.300624-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 85 +++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1a7e9d9c16d7..b37826b350a2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -283,55 +283,56 @@ static inline bool blk_op_is_passthrough(blk_opf_t op) } /* flags set by the driver in queue_limits.features */ -enum { - /* supports a volatile write cache */ - BLK_FEAT_WRITE_CACHE = (1u << 0), +typedef unsigned int __bitwise blk_features_t; - /* supports passing on the FUA bit */ - BLK_FEAT_FUA = (1u << 1), +/* supports a volatile write cache */ +#define BLK_FEAT_WRITE_CACHE ((__force blk_features_t)(1u << 0)) - /* rotational device (hard drive or floppy) */ - BLK_FEAT_ROTATIONAL = (1u << 2), +/* supports passing on the FUA bit */ +#define BLK_FEAT_FUA ((__force blk_features_t)(1u << 1)) - /* contributes to the random number pool */ - BLK_FEAT_ADD_RANDOM = (1u << 3), +/* rotational device (hard drive or floppy) */ +#define BLK_FEAT_ROTATIONAL ((__force blk_features_t)(1u << 2)) - /* do disk/partitions IO accounting */ - BLK_FEAT_IO_STAT = (1u << 4), +/* contributes to the random number pool */ +#define BLK_FEAT_ADD_RANDOM ((__force blk_features_t)(1u << 3)) - /* don't modify data until writeback is done */ - BLK_FEAT_STABLE_WRITES = (1u << 5), +/* do disk/partitions IO accounting */ +#define BLK_FEAT_IO_STAT ((__force blk_features_t)(1u << 4)) - /* always completes in submit context */ - BLK_FEAT_SYNCHRONOUS = (1u << 6), +/* don't modify data until writeback is done */ +#define BLK_FEAT_STABLE_WRITES ((__force blk_features_t)(1u << 5)) - /* supports REQ_NOWAIT */ - BLK_FEAT_NOWAIT = (1u << 7), +/* always completes in submit context */ +#define BLK_FEAT_SYNCHRONOUS ((__force blk_features_t)(1u << 6)) - /* supports DAX */ - BLK_FEAT_DAX = (1u << 8), +/* supports REQ_NOWAIT */ +#define BLK_FEAT_NOWAIT ((__force blk_features_t)(1u << 7)) - /* supports I/O polling */ - BLK_FEAT_POLL = (1u << 9), +/* supports DAX */ +#define BLK_FEAT_DAX ((__force blk_features_t)(1u << 8)) - /* is a zoned device */ - BLK_FEAT_ZONED = (1u << 10), +/* supports I/O polling */ +#define BLK_FEAT_POLL ((__force blk_features_t)(1u << 9)) - /* supports Zone Reset All */ - BLK_FEAT_ZONE_RESETALL = (1u << 11), +/* is a zoned device */ +#define BLK_FEAT_ZONED ((__force blk_features_t)(1u << 10)) - /* supports PCI(e) p2p requests */ - BLK_FEAT_PCI_P2PDMA = (1u << 12), +/* supports Zone Reset All */ +#define BLK_FEAT_ZONE_RESETALL ((__force blk_features_t)(1u << 11)) - /* skip this queue in blk_mq_(un)quiesce_tagset */ - BLK_FEAT_SKIP_TAGSET_QUIESCE = (1u << 13), +/* supports PCI(e) p2p requests */ +#define BLK_FEAT_PCI_P2PDMA ((__force blk_features_t)(1u << 12)) - /* bounce all highmem pages */ - BLK_FEAT_BOUNCE_HIGH = (1u << 14), +/* skip this queue in blk_mq_(un)quiesce_tagset */ +#define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13)) - /* undocumented magic for bcache */ - BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE = (1u << 15), -}; +/* bounce all highmem pages */ +#define BLK_FEAT_BOUNCE_HIGH ((__force blk_features_t)(1u << 14)) + +/* undocumented magic for bcache */ +#define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ + ((__force blk_features_t)(1u << 15)) /* * Flags automatically inherited when stacking limits. @@ -342,17 +343,17 @@ enum { BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) /* internal flags in queue_limits.flags */ -enum { - /* do not send FLUSH/FUA commands despite advertising a write cache */ - BLK_FLAG_WRITE_CACHE_DISABLED = (1u << 0), +typedef unsigned int __bitwise blk_flags_t; - /* I/O topology is misaligned */ - BLK_FLAG_MISALIGNED = (1u << 1), -}; +/* do not send FLUSH/FUA commands despite advertising a write cache */ +#define BLK_FLAG_WRITE_CACHE_DISABLED ((__force blk_flags_t)(1u << 0)) + +/* I/O topology is misaligned */ +#define BLK_FLAG_MISALIGNED ((__force blk_flags_t)(1u << 1)) struct queue_limits { - unsigned int features; - unsigned int flags; + blk_features_t features; + blk_flags_t flags; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; -- cgit v1.2.3 From 73781b3b81e76583708a652c853d54d03dce031d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:27 +0200 Subject: block: remove disk_update_readahead Mark blk_apply_bdi_limits non-static and open code disk_update_readahead in the only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240626142637.300624-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b37826b350a2..6b88382012e9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -973,7 +973,6 @@ static inline void blk_queue_disable_write_zeroes(struct request_queue *q) /* * Access functions for manipulating queue properties */ -void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); -- cgit v1.2.3 From abfc9d810926dfbf5645c7755c8d5ab96273f27d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:28 +0200 Subject: block: remove the fallback case in queue_dma_alignment Now that all updates go through blk_validate_limits the default of 511 is set at initialization time. Also remove the unused NULL check as calling this helper on a NULL queue can't happen (and doesn't make much sense to start with). Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240626142637.300624-8-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6b88382012e9..94fcbc912312 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1394,7 +1394,7 @@ static inline bool bdev_is_zone_start(struct block_device *bdev, static inline int queue_dma_alignment(const struct request_queue *q) { - return q ? q->limits.dma_alignment : 511; + return q->limits.dma_alignment; } static inline unsigned int -- cgit v1.2.3 From e94b45d08b5d1c230c0f59c3eed758d28658851e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 16:26:29 +0200 Subject: block: move dma_pad_mask into queue_limits dma_pad_mask is a queue_limits by all ways of looking at it, so move it there and set it through the atomic queue limits APIs. Add a little helper that takes the alignment and pad into account to simplify the code that is touched a bit. Note that there never was any need for the > check in blk_queue_update_dma_pad, this probably was just copy and paste from dma_update_dma_alignment. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240626142637.300624-9-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 94fcbc912312..a53e3434e1a2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -401,6 +401,7 @@ struct queue_limits { * due to possible offsets. */ unsigned int dma_alignment; + unsigned int dma_pad_mask; struct blk_integrity integrity; }; @@ -509,8 +510,6 @@ struct request_queue { */ int id; - unsigned int dma_pad_mask; - /* * queue settings */ @@ -981,7 +980,6 @@ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); -extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); struct blk_independent_access_ranges * @@ -1433,10 +1431,16 @@ static inline bool bdev_iter_is_aligned(struct block_device *bdev, bdev_logical_block_size(bdev) - 1); } +static inline int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) +{ + return lim->dma_alignment | lim->dma_pad_mask; +} + static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { - unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask; + unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); + return !(addr & alignment) && !(len & alignment); } -- cgit v1.2.3 From 769cb63166d90f1fadafa4352f180cbd96b6cb77 Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Fri, 21 Jun 2024 12:55:43 +0100 Subject: mfd: syscon: Add of_syscon_register_regmap() API The of_syscon_register_regmap() API allows an externally created regmap to be registered with syscon. This regmap can then be returned to client drivers using the syscon_regmap_lookup_by_phandle() APIs. The API is used by platforms where mmio access to the syscon registers is not possible, and a underlying soc driver like exynos-pmu provides a SoC specific regmap that can issue a SMC or hypervisor call to write the register. This approach keeps the SoC complexities out of syscon, but allows common drivers such as syscon-poweroff, syscon-reboot and friends that are used by many SoCs already to be re-used. Signed-off-by: Peter Griffin Reviewed-by: Arnd Bergmann Reviewed-by: Sam Protsenko Tested-by: Will McVicker Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240621115544.1655458-2-peter.griffin@linaro.org Signed-off-by: Lee Jones --- include/linux/mfd/syscon.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/syscon.h b/include/linux/mfd/syscon.h index c315903f6dab..aad9c6b50463 100644 --- a/include/linux/mfd/syscon.h +++ b/include/linux/mfd/syscon.h @@ -28,6 +28,8 @@ struct regmap *syscon_regmap_lookup_by_phandle_args(struct device_node *np, unsigned int *out_args); struct regmap *syscon_regmap_lookup_by_phandle_optional(struct device_node *np, const char *property); +int of_syscon_register_regmap(struct device_node *np, + struct regmap *regmap); #else static inline struct regmap *device_node_to_regmap(struct device_node *np) { @@ -67,6 +69,12 @@ static inline struct regmap *syscon_regmap_lookup_by_phandle_optional( return NULL; } +static inline int of_syscon_register_regmap(struct device_node *np, + struct regmap *regmap) +{ + return -EOPNOTSUPP; +} + #endif #endif /* __LINUX_MFD_SYSCON_H__ */ -- cgit v1.2.3 From 1cb7d29157603561af4c38535e936850ceb99f0f Mon Sep 17 00:00:00 2001 From: Biju Das Date: Sun, 16 Jun 2024 11:53:55 +0100 Subject: regulator: core: Add helper for allow HW access to enable/disable regulator Add a helper function that allow regulator consumers to allow low-level HW access, in order to enable/disable regulator in atomic context. The use-case for RZ/G2L SoC is to enable VBUS selection register based on vbus detection that happens in interrupt context. Signed-off-by: Biju Das Link: https://patch.msgid.link/20240616105402.45211-4-biju.das.jz@bp.renesas.com Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 59d0b9a79e6e..550a0e6523ae 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -250,6 +250,7 @@ int regulator_get_hardware_vsel_register(struct regulator *regulator, unsigned *vsel_mask); int regulator_list_hardware_vsel(struct regulator *regulator, unsigned selector); +int regulator_hardware_enable(struct regulator *regulator, bool enable); /* regulator notifier block */ int regulator_register_notifier(struct regulator *regulator, @@ -571,6 +572,12 @@ static inline int regulator_list_hardware_vsel(struct regulator *regulator, return -EOPNOTSUPP; } +static inline int regulator_hardware_enable(struct regulator *regulator, + bool enable) +{ + return -EOPNOTSUPP; +} + static inline int regulator_register_notifier(struct regulator *regulator, struct notifier_block *nb) { -- cgit v1.2.3 From 69b6517687a4b1fb250bd8c9c193a0a304c8ba17 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 Jun 2024 19:01:58 -0600 Subject: block: use the right type for stub rq_integrity_vec() For !CONFIG_BLK_DEV_INTEGRITY, rq_integrity_vec() wasn't updated properly. Fix it up. Fixes: cf546dd289e0 ("block: change rq_integrity_vec to respect the iterator") Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 0fdd62e6d4b0..c58634924782 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -145,7 +145,7 @@ static inline int blk_integrity_rq(struct request *rq) return 0; } -static inline struct bio_vec *rq_integrity_vec(struct request *rq) +static inline struct bio_vec rq_integrity_vec(struct request *rq) { /* the optimizer will remove all calls to this function */ return (struct bio_vec){ }; -- cgit v1.2.3 From 7276f425b744a81eb5a8e519b0c452d5e64b530c Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 27 Jun 2024 10:43:27 +0300 Subject: mfd: support ROHM BD96801 PMIC core The ROHM BD96801 PMIC is highly customizable automotive grade PMIC which integrates regulator and watchdog funtionalities. Provide INTB IRQ and register accesses for regulator/watchdog drivers. Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/c5260e2dd222e3c64cdf410802bba195637ccb93.1719473802.git.mazziesaccount@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/rohm-bd96801.h | 215 +++++++++++++++++++++++++++++++++++++++ include/linux/mfd/rohm-generic.h | 1 + 2 files changed, 216 insertions(+) create mode 100644 include/linux/mfd/rohm-bd96801.h (limited to 'include/linux') diff --git a/include/linux/mfd/rohm-bd96801.h b/include/linux/mfd/rohm-bd96801.h new file mode 100644 index 000000000000..e2d9e10b6364 --- /dev/null +++ b/include/linux/mfd/rohm-bd96801.h @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (C) 2024 ROHM Semiconductors */ + +#ifndef __MFD_BD96801_H__ +#define __MFD_BD96801_H__ + +#define BD96801_REG_SSCG_CTRL 0x09 +#define BD96801_REG_SHD_INTB 0x20 +#define BD96801_LDO5_VOL_LVL_REG 0x2c +#define BD96801_LDO6_VOL_LVL_REG 0x2d +#define BD96801_LDO7_VOL_LVL_REG 0x2e +#define BD96801_REG_BUCK_OVP 0x30 +#define BD96801_REG_BUCK_OVD 0x35 +#define BD96801_REG_LDO_OVP 0x31 +#define BD96801_REG_LDO_OVD 0x36 +#define BD96801_REG_BOOT_OVERTIME 0x3a +#define BD96801_REG_WD_TMO 0x40 +#define BD96801_REG_WD_CONF 0x41 +#define BD96801_REG_WD_FEED 0x42 +#define BD96801_REG_WD_FAILCOUNT 0x43 +#define BD96801_REG_WD_ASK 0x46 +#define BD96801_REG_WD_STATUS 0x4a +#define BD96801_REG_PMIC_STATE 0x4f +#define BD96801_REG_EXT_STATE 0x50 + +#define BD96801_STATE_STBY 0x09 + +#define BD96801_LOCK_REG 0x04 +#define BD96801_UNLOCK 0x9d +#define BD96801_LOCK 0x00 + +/* IRQ register area */ +#define BD96801_REG_INT_MAIN 0x51 + +/* + * The BD96801 has two physical IRQ lines, INTB and ERRB. + * + * The 'main status register' is located at 0x51. + * The ERRB status registers are located at 0x52 ... 0x5B + * INTB status registers are at range 0x5c ... 0x63 + */ +#define BD96801_REG_INT_SYS_ERRB1 0x52 +#define BD96801_REG_INT_SYS_INTB 0x5c +#define BD96801_REG_INT_LDO7_INTB 0x63 + +/* MASK registers */ +#define BD96801_REG_MASK_SYS_INTB 0x73 +#define BD96801_REG_MASK_SYS_ERRB 0x69 + +#define BD96801_MAX_REGISTER 0x7a + +#define BD96801_OTP_ERR_MASK BIT(0) +#define BD96801_DBIST_ERR_MASK BIT(1) +#define BD96801_EEP_ERR_MASK BIT(2) +#define BD96801_ABIST_ERR_MASK BIT(3) +#define BD96801_PRSTB_ERR_MASK BIT(4) +#define BD96801_DRMOS1_ERR_MASK BIT(5) +#define BD96801_DRMOS2_ERR_MASK BIT(6) +#define BD96801_SLAVE_ERR_MASK BIT(7) +#define BD96801_VREF_ERR_MASK BIT(0) +#define BD96801_TSD_ERR_MASK BIT(1) +#define BD96801_UVLO_ERR_MASK BIT(2) +#define BD96801_OVLO_ERR_MASK BIT(3) +#define BD96801_OSC_ERR_MASK BIT(4) +#define BD96801_PON_ERR_MASK BIT(5) +#define BD96801_POFF_ERR_MASK BIT(6) +#define BD96801_CMD_SHDN_ERR_MASK BIT(7) +#define BD96801_INT_PRSTB_WDT_ERR_MASK BIT(0) +#define BD96801_INT_CHIP_IF_ERR_MASK BIT(3) +#define BD96801_INT_SHDN_ERR_MASK BIT(7) +#define BD96801_OUT_PVIN_ERR_MASK BIT(0) +#define BD96801_OUT_OVP_ERR_MASK BIT(1) +#define BD96801_OUT_UVP_ERR_MASK BIT(2) +#define BD96801_OUT_SHDN_ERR_MASK BIT(7) + +/* ERRB IRQs */ +enum { + /* Reg 0x52, 0x53, 0x54 - ERRB system IRQs */ + BD96801_OTP_ERR_STAT, + BD96801_DBIST_ERR_STAT, + BD96801_EEP_ERR_STAT, + BD96801_ABIST_ERR_STAT, + BD96801_PRSTB_ERR_STAT, + BD96801_DRMOS1_ERR_STAT, + BD96801_DRMOS2_ERR_STAT, + BD96801_SLAVE_ERR_STAT, + BD96801_VREF_ERR_STAT, + BD96801_TSD_ERR_STAT, + BD96801_UVLO_ERR_STAT, + BD96801_OVLO_ERR_STAT, + BD96801_OSC_ERR_STAT, + BD96801_PON_ERR_STAT, + BD96801_POFF_ERR_STAT, + BD96801_CMD_SHDN_ERR_STAT, + BD96801_INT_PRSTB_WDT_ERR, + BD96801_INT_CHIP_IF_ERR, + BD96801_INT_SHDN_ERR_STAT, + + /* Reg 0x55 BUCK1 ERR IRQs */ + BD96801_BUCK1_PVIN_ERR_STAT, + BD96801_BUCK1_OVP_ERR_STAT, + BD96801_BUCK1_UVP_ERR_STAT, + BD96801_BUCK1_SHDN_ERR_STAT, + + /* Reg 0x56 BUCK2 ERR IRQs */ + BD96801_BUCK2_PVIN_ERR_STAT, + BD96801_BUCK2_OVP_ERR_STAT, + BD96801_BUCK2_UVP_ERR_STAT, + BD96801_BUCK2_SHDN_ERR_STAT, + + /* Reg 0x57 BUCK3 ERR IRQs */ + BD96801_BUCK3_PVIN_ERR_STAT, + BD96801_BUCK3_OVP_ERR_STAT, + BD96801_BUCK3_UVP_ERR_STAT, + BD96801_BUCK3_SHDN_ERR_STAT, + + /* Reg 0x58 BUCK4 ERR IRQs */ + BD96801_BUCK4_PVIN_ERR_STAT, + BD96801_BUCK4_OVP_ERR_STAT, + BD96801_BUCK4_UVP_ERR_STAT, + BD96801_BUCK4_SHDN_ERR_STAT, + + /* Reg 0x59 LDO5 ERR IRQs */ + BD96801_LDO5_PVIN_ERR_STAT, + BD96801_LDO5_OVP_ERR_STAT, + BD96801_LDO5_UVP_ERR_STAT, + BD96801_LDO5_SHDN_ERR_STAT, + + /* Reg 0x5a LDO6 ERR IRQs */ + BD96801_LDO6_PVIN_ERR_STAT, + BD96801_LDO6_OVP_ERR_STAT, + BD96801_LDO6_UVP_ERR_STAT, + BD96801_LDO6_SHDN_ERR_STAT, + + /* Reg 0x5b LDO7 ERR IRQs */ + BD96801_LDO7_PVIN_ERR_STAT, + BD96801_LDO7_OVP_ERR_STAT, + BD96801_LDO7_UVP_ERR_STAT, + BD96801_LDO7_SHDN_ERR_STAT, +}; + +/* INTB IRQs */ +enum { + /* Reg 0x5c (System INTB) */ + BD96801_TW_STAT, + BD96801_WDT_ERR_STAT, + BD96801_I2C_ERR_STAT, + BD96801_CHIP_IF_ERR_STAT, + + /* Reg 0x5d (BUCK1 INTB) */ + BD96801_BUCK1_OCPH_STAT, + BD96801_BUCK1_OCPL_STAT, + BD96801_BUCK1_OCPN_STAT, + BD96801_BUCK1_OVD_STAT, + BD96801_BUCK1_UVD_STAT, + BD96801_BUCK1_TW_CH_STAT, + + /* Reg 0x5e (BUCK2 INTB) */ + BD96801_BUCK2_OCPH_STAT, + BD96801_BUCK2_OCPL_STAT, + BD96801_BUCK2_OCPN_STAT, + BD96801_BUCK2_OVD_STAT, + BD96801_BUCK2_UVD_STAT, + BD96801_BUCK2_TW_CH_STAT, + + /* Reg 0x5f (BUCK3 INTB)*/ + BD96801_BUCK3_OCPH_STAT, + BD96801_BUCK3_OCPL_STAT, + BD96801_BUCK3_OCPN_STAT, + BD96801_BUCK3_OVD_STAT, + BD96801_BUCK3_UVD_STAT, + BD96801_BUCK3_TW_CH_STAT, + + /* Reg 0x60 (BUCK4 INTB)*/ + BD96801_BUCK4_OCPH_STAT, + BD96801_BUCK4_OCPL_STAT, + BD96801_BUCK4_OCPN_STAT, + BD96801_BUCK4_OVD_STAT, + BD96801_BUCK4_UVD_STAT, + BD96801_BUCK4_TW_CH_STAT, + + /* Reg 0x61 (LDO5 INTB) */ + BD96801_LDO5_OCPH_STAT, /* bit [0] */ + BD96801_LDO5_OVD_STAT, /* bit [3] */ + BD96801_LDO5_UVD_STAT, /* bit [4] */ + + /* Reg 0x62 (LDO6 INTB) */ + BD96801_LDO6_OCPH_STAT, /* bit [0] */ + BD96801_LDO6_OVD_STAT, /* bit [3] */ + BD96801_LDO6_UVD_STAT, /* bit [4] */ + + /* Reg 0x63 (LDO7 INTB) */ + BD96801_LDO7_OCPH_STAT, /* bit [0] */ + BD96801_LDO7_OVD_STAT, /* bit [3] */ + BD96801_LDO7_UVD_STAT, /* bit [4] */ +}; + +/* IRQ MASKs */ +#define BD96801_TW_STAT_MASK BIT(0) +#define BD96801_WDT_ERR_STAT_MASK BIT(1) +#define BD96801_I2C_ERR_STAT_MASK BIT(2) +#define BD96801_CHIP_IF_ERR_STAT_MASK BIT(3) + +#define BD96801_BUCK_OCPH_STAT_MASK BIT(0) +#define BD96801_BUCK_OCPL_STAT_MASK BIT(1) +#define BD96801_BUCK_OCPN_STAT_MASK BIT(2) +#define BD96801_BUCK_OVD_STAT_MASK BIT(3) +#define BD96801_BUCK_UVD_STAT_MASK BIT(4) +#define BD96801_BUCK_TW_CH_STAT_MASK BIT(5) + +#define BD96801_LDO_OCPH_STAT_MASK BIT(0) +#define BD96801_LDO_OVD_STAT_MASK BIT(3) +#define BD96801_LDO_UVD_STAT_MASK BIT(4) + +#endif diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h index 4eeb22876bad..e7d4e6afe388 100644 --- a/include/linux/mfd/rohm-generic.h +++ b/include/linux/mfd/rohm-generic.h @@ -16,6 +16,7 @@ enum rohm_chip_type { ROHM_CHIP_TYPE_BD71828, ROHM_CHIP_TYPE_BD71837, ROHM_CHIP_TYPE_BD71847, + ROHM_CHIP_TYPE_BD96801, ROHM_CHIP_TYPE_AMOUNT }; -- cgit v1.2.3 From 67eccf151d76a9939ad8a50c6db5cb486b01df24 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 26 Jun 2024 15:46:23 +0200 Subject: HID: add source argument to HID low level functions This allows to know who actually sent what when we process the request to the device. This will be useful for a BPF firewall program to allow or not requests coming from a dedicated hidraw node client. Link: https://patch.msgid.link/20240626-hid_hw_req_bpf-v2-2-cfd60fb6c79f@kernel.org Acked-by: Jiri Kosina Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 6 ++++++ include/linux/hid_bpf.h | 16 ++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 8e06d89698e6..dac2804b4562 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1125,6 +1125,12 @@ int __must_check hid_hw_open(struct hid_device *hdev); void hid_hw_close(struct hid_device *hdev); void hid_hw_request(struct hid_device *hdev, struct hid_report *report, enum hid_class_request reqtype); +int __hid_hw_raw_request(struct hid_device *hdev, + unsigned char reportnum, __u8 *buf, + size_t len, enum hid_report_type rtype, + enum hid_class_request reqtype, + __u64 source); +int __hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len, __u64 source); int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index 65d7e0acc8c2..a54741db0415 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -66,10 +66,12 @@ struct hid_ops { int (*hid_hw_raw_request)(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, - enum hid_class_request reqtype); - int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len); + enum hid_class_request reqtype, + __u64 source); + int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len, + __u64 source); int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 size, int interrupt); + u8 *data, u32 size, int interrupt, u64 source); struct module *owner; const struct bus_type *bus_type; }; @@ -110,7 +112,8 @@ struct hid_bpf_ops { * * Context: Interrupt context. */ - int (*hid_device_event)(struct hid_bpf_ctx *ctx, enum hid_report_type report_type); + int (*hid_device_event)(struct hid_bpf_ctx *ctx, enum hid_report_type report_type, + __u64 source); /** * @hid_rdesc_fixup: called when the probe function parses the report descriptor @@ -146,7 +149,7 @@ struct hid_bpf { #ifdef CONFIG_HID_BPF u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, - u32 *size, int interrupt); + u32 *size, int interrupt, u64 source); int hid_bpf_connect_device(struct hid_device *hdev); void hid_bpf_disconnect_device(struct hid_device *hdev); void hid_bpf_destroy_device(struct hid_device *hid); @@ -154,7 +157,8 @@ void hid_bpf_device_init(struct hid_device *hid); u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, u8 *rdesc, unsigned int *size); #else /* CONFIG_HID_BPF */ static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 *size, int interrupt) { return data; } + u8 *data, u32 *size, int interrupt, + u64 source) { return data; } static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; } static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {} static inline void hid_bpf_destroy_device(struct hid_device *hid) {} -- cgit v1.2.3 From 6cd735f0e57a6c8510ad92f5b63837a8d0cff3a7 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 26 Jun 2024 15:46:24 +0200 Subject: HID: bpf: protect HID-BPF prog_list access by a SRCU We want to add sleepable callbacks for hid_hw_raw_request() and hid_hw_output_report(), but we can not use a plain RCU for those. Prepare for a SRCU so we can extend HID-BPF. This changes a little bit how hid_bpf_device_init() behaves, as it may now fail, so there is a tiny hid-core.c change to accommodate for this. Link: https://patch.msgid.link/20240626-hid_hw_req_bpf-v2-3-cfd60fb6c79f@kernel.org Acked-by: Jiri Kosina Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index a54741db0415..f93845de5cac 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -5,6 +5,7 @@ #include #include +#include #include struct hid_device; @@ -145,6 +146,7 @@ struct hid_bpf { struct hid_bpf_ops *rdesc_ops; struct list_head prog_list; struct mutex prog_list_lock; /* protects prog_list update */ + struct srcu_struct srcu; /* protects prog_list read-only access */ }; #ifdef CONFIG_HID_BPF @@ -153,7 +155,7 @@ u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type t int hid_bpf_connect_device(struct hid_device *hdev); void hid_bpf_disconnect_device(struct hid_device *hdev); void hid_bpf_destroy_device(struct hid_device *hid); -void hid_bpf_device_init(struct hid_device *hid); +int hid_bpf_device_init(struct hid_device *hid); u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, u8 *rdesc, unsigned int *size); #else /* CONFIG_HID_BPF */ static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, @@ -162,7 +164,7 @@ static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; } static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {} static inline void hid_bpf_destroy_device(struct hid_device *hid) {} -static inline void hid_bpf_device_init(struct hid_device *hid) {} +static inline int hid_bpf_device_init(struct hid_device *hid) { return 0; } #define call_hid_bpf_rdesc_fixup(_hdev, _rdesc, _size) \ ((u8 *)kmemdup(_rdesc, *(_size), GFP_KERNEL)) -- cgit v1.2.3 From 8bd0488b5ea58655ad6fdcbe0408ef49b16882b1 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 26 Jun 2024 15:46:25 +0200 Subject: HID: bpf: add HID-BPF hooks for hid_hw_raw_requests This allows to intercept and prevent or change the behavior of hid_hw_raw_request() from a bpf program. The intent is to solve a couple of use case: - firewalling a HID device: a firewall can monitor who opens the hidraw nodes and then prevent or allow access to write operations on that hidraw node. - change the behavior of a device and emulate a new HID feature request The hook is allowed to be run as sleepable so it can itself call hid_bpf_hw_request(), which allows to "convert" one feature request into another or even call the feature request on a different HID device on the same physical device. Link: https://patch.msgid.link/20240626-hid_hw_req_bpf-v2-4-cfd60fb6c79f@kernel.org Acked-by: Jiri Kosina Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index f93845de5cac..3c01f7f8b6fc 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -130,6 +130,31 @@ struct hid_bpf_ops { */ int (*hid_rdesc_fixup)(struct hid_bpf_ctx *ctx); + /** + * @hid_hw_request: called whenever a hid_hw_raw_request() call is emitted + * on the HID device + * + * It has the following arguments: + * + * ``ctx``: The HID-BPF context as &struct hid_bpf_ctx + * ``reportnum``: the report number, as in hid_hw_raw_request() + * ``rtype``: the report type (``HID_INPUT_REPORT``, ``HID_FEATURE_REPORT``, + * ``HID_OUTPUT_REPORT``) + * ``reqtype``: the request + * ``source``: a u64 referring to a uniq but identifiable source. If %0, the + * kernel itself emitted that call. For hidraw, ``source`` is set + * to the associated ``struct file *``. + * + * Return: %0 to keep processing the request by hid-core; any other value + * stops hid-core from processing that event. A positive value should be + * returned with the number of bytes returned in the incoming buffer; a + * negative error code interrupts the processing of this call. + */ + int (*hid_hw_request)(struct hid_bpf_ctx *ctx, unsigned char reportnum, + enum hid_report_type rtype, enum hid_class_request reqtype, + __u64 source); + + /* private: do not show up in the docs */ struct hid_device *hdev; }; @@ -152,6 +177,11 @@ struct hid_bpf { #ifdef CONFIG_HID_BPF u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 *size, int interrupt, u64 source); +int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, + unsigned char reportnum, __u8 *buf, + u32 size, enum hid_report_type rtype, + enum hid_class_request reqtype, + __u64 source); int hid_bpf_connect_device(struct hid_device *hdev); void hid_bpf_disconnect_device(struct hid_device *hdev); void hid_bpf_destroy_device(struct hid_device *hid); @@ -161,6 +191,11 @@ u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, u8 *rdesc, unsigned int *s static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 *size, int interrupt, u64 source) { return data; } +static inline int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, + unsigned char reportnum, u8 *buf, + u32 size, enum hid_report_type rtype, + enum hid_class_request reqtype, + u64 source) { return 0; } static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; } static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {} static inline void hid_bpf_destroy_device(struct hid_device *hid) {} -- cgit v1.2.3 From 75839101ce52e319cb2154a027d14f1f0aa3be09 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 26 Jun 2024 15:46:26 +0200 Subject: HID: bpf: prevent infinite recursions with hid_hw_raw_requests hooks When we attach a sleepable hook to hid_hw_raw_requests, we can (and in many cases should) call ourself hid_bpf_raw_request(), to actually fetch data from the device itself. However, this means that we might enter an infinite loop between hid_hw_raw_requests hooks and hid_bpf_hw_request() call. To prevent that, if a hid_bpf_hw_request() call is emitted, we prevent any new call of this kfunc by storing the information in the context. This way we can always trace/monitor/filter the incoming bpf requests, while preventing those loops to happen. I don't think exposing "from_bpf" is very interesting because while writing such a bpf program, you need to match at least the report number and/or the source of the call. So a blind "if there is a hid_hw_raw_request() call, I'm emitting another one" makes no real sense. Link: https://patch.msgid.link/20240626-hid_hw_req_bpf-v2-5-cfd60fb6c79f@kernel.org Acked-by: Jiri Kosina Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 2 +- include/linux/hid_bpf.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index dac2804b4562..24d0d7c0bd33 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1129,7 +1129,7 @@ int __hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype, - __u64 source); + __u64 source, bool from_bpf); int __hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len, __u64 source); int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index 3c01f7f8b6fc..088c94b6d8ec 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -68,7 +68,7 @@ struct hid_ops { unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype, - __u64 source); + __u64 source, bool from_bpf); int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len, __u64 source); int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type, @@ -181,7 +181,7 @@ int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, u32 size, enum hid_report_type rtype, enum hid_class_request reqtype, - __u64 source); + __u64 source, bool from_bpf); int hid_bpf_connect_device(struct hid_device *hdev); void hid_bpf_disconnect_device(struct hid_device *hdev); void hid_bpf_destroy_device(struct hid_device *hid); @@ -195,7 +195,7 @@ static inline int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, u8 *buf, u32 size, enum hid_report_type rtype, enum hid_class_request reqtype, - u64 source) { return 0; } + u64 source, bool from_bpf) { return 0; } static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; } static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {} static inline void hid_bpf_destroy_device(struct hid_device *hid) {} -- cgit v1.2.3 From 9286675a2aed40a517be8cc4e283a04f473275b5 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 26 Jun 2024 15:46:28 +0200 Subject: HID: bpf: add HID-BPF hooks for hid_hw_output_report Same story than hid_hw_raw_requests: This allows to intercept and prevent or change the behavior of hid_hw_output_report() from a bpf program. The intent is to solve a couple of use case: - firewalling a HID device: a firewall can monitor who opens the hidraw nodes and then prevent or allow access to write operations on that hidraw node. - change the behavior of a device and emulate a new HID feature request The hook is allowed to be run as sleepable so it can itself call hid_hw_output_report(), which allows to "convert" one feature request into another or even call the feature request on a different HID device on the same physical device. Link: https://patch.msgid.link/20240626-hid_hw_req_bpf-v2-7-cfd60fb6c79f@kernel.org Acked-by: Jiri Kosina Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 3 ++- include/linux/hid_bpf.h | 24 +++++++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 24d0d7c0bd33..1533c9dcd3a6 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1130,7 +1130,8 @@ int __hid_hw_raw_request(struct hid_device *hdev, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype, __u64 source, bool from_bpf); -int __hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len, __u64 source); +int __hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len, __u64 source, + bool from_bpf); int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index 088c94b6d8ec..f35508a73067 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -70,7 +70,7 @@ struct hid_ops { enum hid_class_request reqtype, __u64 source, bool from_bpf); int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len, - __u64 source); + __u64 source, bool from_bpf); int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt, u64 source); struct module *owner; @@ -154,6 +154,24 @@ struct hid_bpf_ops { enum hid_report_type rtype, enum hid_class_request reqtype, __u64 source); + /** + * @hid_hw_output_report: called whenever a hid_hw_output_report() call is emitted + * on the HID device + * + * It has the following arguments: + * + * ``ctx``: The HID-BPF context as &struct hid_bpf_ctx + * ``source``: a u64 referring to a uniq but identifiable source. If %0, the + * kernel itself emitted that call. For hidraw, ``source`` is set + * to the associated ``struct file *``. + * + * Return: %0 to keep processing the request by hid-core; any other value + * stops hid-core from processing that event. A positive value should be + * returned with the number of bytes written to the device; a negative error + * code interrupts the processing of this call. + */ + int (*hid_hw_output_report)(struct hid_bpf_ctx *ctx, __u64 source); + /* private: do not show up in the docs */ struct hid_device *hdev; @@ -182,6 +200,8 @@ int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, u32 size, enum hid_report_type rtype, enum hid_class_request reqtype, __u64 source, bool from_bpf); +int dispatch_hid_bpf_output_report(struct hid_device *hdev, __u8 *buf, u32 size, + __u64 source, bool from_bpf); int hid_bpf_connect_device(struct hid_device *hdev); void hid_bpf_disconnect_device(struct hid_device *hdev); void hid_bpf_destroy_device(struct hid_device *hid); @@ -196,6 +216,8 @@ static inline int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, u32 size, enum hid_report_type rtype, enum hid_class_request reqtype, u64 source, bool from_bpf) { return 0; } +static inline int dispatch_hid_bpf_output_report(struct hid_device *hdev, __u8 *buf, u32 size, + __u64 source, bool from_bpf) { return 0; } static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; } static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {} static inline void hid_bpf_destroy_device(struct hid_device *hid) {} -- cgit v1.2.3 From fa03f398a8ac46f46927e0b509b302ebe0ed7e8a Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 26 Jun 2024 15:46:30 +0200 Subject: HID: bpf: make hid_bpf_input_report() sleep until the device is ready hid_bpf_input_report() is already marked to be used in sleepable context only. So instead of hammering with timers the device to hopefully get an available slot where the device is not sending events, we can make that kfunc wait for the current event to be terminated before it goes in. This allows to work with the following pseudo code: in struct_ops/hid_device_event: - schedule a bpf_wq, which calls hid_bpf_input_report() - once this struct_ops function terminates, hid_bpf_input_report() immediately starts before the next event Link: https://patch.msgid.link/20240626-hid_hw_req_bpf-v2-9-cfd60fb6c79f@kernel.org Acked-by: Jiri Kosina Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index f35508a73067..7f04353d09e9 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -72,7 +72,8 @@ struct hid_ops { int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len, __u64 source, bool from_bpf); int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 size, int interrupt, u64 source); + u8 *data, u32 size, int interrupt, u64 source, + bool lock_already_taken); struct module *owner; const struct bus_type *bus_type; }; -- cgit v1.2.3 From 9acbb7ba4589d4715141d4e14230a828ddc95f3d Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 26 Jun 2024 15:46:32 +0200 Subject: HID: bpf: allow hid_device_event hooks to inject input reports on self This is the same logic than hid_hw_raw_request or hid_hw_output_report: we can allow hid_bpf_try_input_report to be called from a hook on hid_input_report if we ensure that the call can not be made twice in a row. There is one extra subtlety in which there is a lock in hid_input_report. But given that we can detect if we are already in the hook, we can notify hid_input_report to not take the lock. This is done by checking if ctx_kern data is valid or null, and if it is equal to the dedicated incoming data buffer. In order to have more control on whether the lock needs to be taken or not we introduce a new kfunc for it: hid_bpf_try_input_report() Link: https://patch.msgid.link/20240626-hid_hw_req_bpf-v2-11-cfd60fb6c79f@kernel.org Acked-by: Jiri Kosina Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index 7f04353d09e9..93546ee7677a 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -72,7 +72,7 @@ struct hid_ops { int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len, __u64 source, bool from_bpf); int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 size, int interrupt, u64 source, + u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, bool lock_already_taken); struct module *owner; const struct bus_type *bus_type; @@ -195,7 +195,7 @@ struct hid_bpf { #ifdef CONFIG_HID_BPF u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, - u32 *size, int interrupt, u64 source); + u32 *size, int interrupt, u64 source, bool from_bpf); int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, u32 size, enum hid_report_type rtype, @@ -211,7 +211,7 @@ u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, u8 *rdesc, unsigned int *s #else /* CONFIG_HID_BPF */ static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 *size, int interrupt, - u64 source) { return data; } + u64 source, bool from_bpf) { return data; } static inline int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, u8 *buf, u32 size, enum hid_report_type rtype, -- cgit v1.2.3 From 58de63ddd0ecc84548b8fd24c72deb3739365c78 Mon Sep 17 00:00:00 2001 From: "Jason-JH.Lin" Date: Tue, 25 Jun 2024 16:39:57 +0800 Subject: soc: mtk-cmdq: Add cmdq_pkt_logic_command to support math operation Add cmdq_pkt_logic_command to support math operation. cmdq_pkt_logic_command can append logic command to the CMDQ packet, ask GCE to execute a arithmetic calculate instruction, such as add, subtract, multiply, AND, OR and NOT, etc. Note that all arithmetic instructions are unsigned calculations. If there are any overflows, GCE will sent the invalid IRQ to notify CMDQ driver. Signed-off-by: Jason-JH.Lin Signed-off-by: Hsiao Chien Sung Link: https://lore.kernel.org/r/20240625083957.3540-1-jason-jh.lin@mediatek.com Signed-off-by: AngeloGioacchino Del Regno --- include/linux/soc/mediatek/mtk-cmdq.h | 42 +++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index d4a8e34505e6..5bee6f7fc400 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -25,6 +25,31 @@ struct cmdq_pkt; +enum cmdq_logic_op { + CMDQ_LOGIC_ASSIGN = 0, + CMDQ_LOGIC_ADD = 1, + CMDQ_LOGIC_SUBTRACT = 2, + CMDQ_LOGIC_MULTIPLY = 3, + CMDQ_LOGIC_XOR = 8, + CMDQ_LOGIC_NOT = 9, + CMDQ_LOGIC_OR = 10, + CMDQ_LOGIC_AND = 11, + CMDQ_LOGIC_LEFT_SHIFT = 12, + CMDQ_LOGIC_RIGHT_SHIFT = 13, + CMDQ_LOGIC_MAX, +}; + +struct cmdq_operand { + /* register type */ + bool reg; + union { + /* index */ + u16 idx; + /* value */ + u16 value; + }; +}; + struct cmdq_client_reg { u8 subsys; u16 offset; @@ -272,6 +297,23 @@ int cmdq_pkt_poll(struct cmdq_pkt *pkt, u8 subsys, int cmdq_pkt_poll_mask(struct cmdq_pkt *pkt, u8 subsys, u16 offset, u32 value, u32 mask); +/** + * cmdq_pkt_logic_command() - Append logic command to the CMDQ packet, ask GCE to + * execute an instruction that store the result of logic operation + * with left and right operand into result_reg_idx. + * @pkt: the CMDQ packet + * @result_reg_idx: SPR index that store operation result of left_operand and right_operand + * @left_operand: left operand + * @s_op: the logic operator enum + * @right_operand: right operand + * + * Return: 0 for success; else the error code is returned + */ +int cmdq_pkt_logic_command(struct cmdq_pkt *pkt, u16 result_reg_idx, + struct cmdq_operand *left_operand, + enum cmdq_logic_op s_op, + struct cmdq_operand *right_operand); + /** * cmdq_pkt_assign() - Append logic assign command to the CMDQ packet, ask GCE * to execute an instruction that set a constant value into -- cgit v1.2.3 From 7e1f4eb9a60d40dd17a97d9b76818682a024a127 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Apr 2024 12:04:54 +0200 Subject: kallsyms: rework symbol lookup return codes Building with W=1 in some configurations produces a false positive warning for kallsyms: kernel/kallsyms.c: In function '__sprint_symbol.isra': kernel/kallsyms.c:503:17: error: 'strcpy' source argument is the same as destination [-Werror=restrict] 503 | strcpy(buffer, name); | ^~~~~~~~~~~~~~~~~~~~ This originally showed up while building with -O3, but later started happening in other configurations as well, depending on inlining decisions. The underlying issue is that the local 'name' variable is always initialized to the be the same as 'buffer' in the called functions that fill the buffer, which gcc notices while inlining, though it could see that the address check always skips the copy. The calling conventions here are rather unusual, as all of the internal lookup functions (bpf_address_lookup, ftrace_mod_address_lookup, ftrace_func_address_lookup, module_address_lookup and kallsyms_lookup_buildid) already use the provided buffer and either return the address of that buffer to indicate success, or NULL for failure, but the callers are written to also expect an arbitrary other buffer to be returned. Rework the calling conventions to return the length of the filled buffer instead of its address, which is simpler and easier to follow as well as avoiding the warning. Leave only the kallsyms_lookup() calling conventions unchanged, since that is called from 16 different functions and adapting this would be a much bigger change. Link: https://lore.kernel.org/lkml/20200107214042.855757-1-arnd@arndb.de/ Link: https://lore.kernel.org/lkml/20240326130647.7bfb1d92@gandalf.local.home/ Tested-by: Geert Uytterhoeven Reviewed-by: Luis Chamberlain Acked-by: Steven Rostedt (Google) Signed-off-by: Arnd Bergmann --- include/linux/filter.h | 14 +++++++------- include/linux/ftrace.h | 6 +++--- include/linux/module.h | 14 +++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 0f12cf01070e..5669da513cd7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1208,18 +1208,18 @@ static inline bool bpf_jit_kallsyms_enabled(void) return false; } -const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, +int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym); bool is_bpf_text_address(unsigned long addr); int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); struct bpf_prog *bpf_prog_ksym_find(unsigned long addr); -static inline const char * +static inline int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { - const char *ret = __bpf_address_lookup(addr, size, off, sym); + int ret = __bpf_address_lookup(addr, size, off, sym); if (ret && modname) *modname = NULL; @@ -1263,11 +1263,11 @@ static inline bool bpf_jit_kallsyms_enabled(void) return false; } -static inline const char * +static inline int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { - return NULL; + return 0; } static inline bool is_bpf_text_address(unsigned long addr) @@ -1286,11 +1286,11 @@ static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) return NULL; } -static inline const char * +static inline int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { - return NULL; + return 0; } static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 800995c425e0..b792274189a3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -86,15 +86,15 @@ struct ftrace_hash; #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \ defined(CONFIG_DYNAMIC_FTRACE) -const char * +int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym); #else -static inline const char * +static inline int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { - return NULL; + return 0; } #endif diff --git a/include/linux/module.h b/include/linux/module.h index ffa1c603163c..330ffb59efe5 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -931,11 +931,11 @@ int module_kallsyms_on_each_symbol(const char *modname, * least KSYM_NAME_LEN long: a pointer to namebuf is returned if * found, otherwise NULL. */ -const char *module_address_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, const unsigned char **modbuildid, - char *namebuf); +int module_address_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, const unsigned char **modbuildid, + char *namebuf); int lookup_module_symbol_name(unsigned long addr, char *symname); int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, @@ -964,14 +964,14 @@ static inline int module_kallsyms_on_each_symbol(const char *modname, } /* For kallsyms to ask for address resolution. NULL means not found. */ -static inline const char *module_address_lookup(unsigned long addr, +static inline int module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { - return NULL; + return 0; } static inline int lookup_module_symbol_name(unsigned long addr, char *symname) -- cgit v1.2.3 From 1bc6d4452d5c91beb09e37a98a590808e1997b79 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 30 Apr 2024 13:54:46 +0200 Subject: fs: new helper vfs_empty_path() Make it possible to quickly check whether AT_EMPTY_PATH is valid. Note, after some discussion we decided to also allow NULL to be passed instead of requiring the empty string. Signed-off-by: Christian Brauner --- include/linux/fs.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 942cb11dba96..5ff362277834 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3631,4 +3631,21 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +static inline bool vfs_empty_path(int dfd, const char __user *path) +{ + char c; + + if (dfd < 0) + return false; + + /* We now allow NULL to be used for empty path. */ + if (!path) + return true; + + if (unlikely(get_user(c, path))) + return false; + + return !c; +} + #endif /* _LINUX_FS_H */ -- cgit v1.2.3 From f378ec4eec8b7e607dbc0112913fd3d5d84eb1b8 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 27 Jun 2024 18:11:52 +0200 Subject: vfs: rename parent_ino to d_parent_ino and make it use RCU The routine is used by procfs through dir_emit_dots. The combined RCU and lock fallback implementation is too big for an inline. Given that the routine takes a dentry argument fs/dcache.c seems like the place to put it in. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240627161152.802567-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/dcache.h | 2 ++ include/linux/fs.h | 16 +--------------- 2 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bf53e3894aae..ea58843942b9 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -278,6 +278,8 @@ static inline unsigned d_count(const struct dentry *dentry) return dentry->d_lockref.count; } +ino_t d_parent_ino(struct dentry *dentry); + /* * helper function for dentry_operations.d_dname() members */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 5ff362277834..2fa06a4d197a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3454,20 +3454,6 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return 0; } -static inline ino_t parent_ino(struct dentry *dentry) -{ - ino_t res; - - /* - * Don't strictly need d_lock here? If the parent ino could change - * then surely we'd have a deeper race in the caller? - */ - spin_lock(&dentry->d_lock); - res = dentry->d_parent->d_inode->i_ino; - spin_unlock(&dentry->d_lock); - return res; -} - /* Transaction based IO helpers */ /* @@ -3592,7 +3578,7 @@ static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, "..", 2, ctx->pos, - parent_ino(file->f_path.dentry), DT_DIR); + d_parent_ino(file->f_path.dentry), DT_DIR); } static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) { -- cgit v1.2.3 From 63db4a1f795a19e4e12f036a12a5f61c48b03e5c Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 27 Jun 2024 16:07:35 +0000 Subject: block: Delete blk_queue_flag_test_and_set() Since commit 70200574cc22 ("block: remove QUEUE_FLAG_DISCARD"), blk_queue_flag_test_and_set() has not been used, so delete it. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240627160735.842189-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a53e3434e1a2..53c41ef4222c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -609,7 +609,6 @@ struct request_queue { void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); -bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) -- cgit v1.2.3 From d3dcb084c70727be4a2f61bd94796e66147cfa35 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Wed, 26 Jun 2024 05:06:17 +0200 Subject: net: phy: phy_device: Fix PHY LED blinking code comment Fix copy-paste error in the code comment. The code refers to LED blinking configuration, not brightness configuration. It was likely copied from comment above this one which does refer to brightness configuration. Fixes: 4e901018432e ("net: phy: phy_device: Call into the PHY driver to set LED blinking") Signed-off-by: Marek Vasut Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20240626030638.512069-1-marex@denx.de Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index e6e83304558e..3be430cf3132 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1122,7 +1122,7 @@ struct phy_driver { u8 index, enum led_brightness value); /** - * @led_blink_set: Set a PHY LED brightness. Index indicates + * @led_blink_set: Set a PHY LED blinking. Index indicates * which of the PHYs led should be configured to blink. Delays * are in milliseconds and if both are zero then a sensible * default should be chosen. The call should adjust the -- cgit v1.2.3 From ff2c570ef7eaa9ded58e7a02dd7a68874a897508 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 7 Jun 2024 16:55:35 +0200 Subject: path: add cleanup helper Add a simple cleanup helper so we can cleanup struct path easily. No need for any extra machinery. Avoid DEFINE_FREE() as it causes a local copy of struct path to be used. Just rely on path_put() directly called from a cleanup helper. Link: https://lore.kernel.org/r/20240607-vfs-listmount-reverse-v1-2-7877a2bfa5e5@kernel.org Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- include/linux/path.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/path.h b/include/linux/path.h index 475225a03d0d..ca073e70decd 100644 --- a/include/linux/path.h +++ b/include/linux/path.h @@ -24,4 +24,13 @@ static inline void path_put_init(struct path *path) *path = (struct path) { }; } +/* + * Cleanup macro for use with __free(path_put). Avoids dereference and + * copying @path unlike DEFINE_FREE(). path_put() will handle the empty + * path correctly just ensure @path is initialized: + * + * struct path path __free(path_put) = {}; + */ +#define __free_path_put path_put + #endif /* _LINUX_PATH_H */ -- cgit v1.2.3 From c6269149cbf7053272d918101981869438ff7c1e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 27 Jun 2024 16:11:39 +0200 Subject: file: add take_fd() cleanup helper Add a helper that returns the file descriptor and ensures that the old variable contains a negative value. This makes it easy to rely on CLASS(get_unused_fd). Link: https://lore.kernel.org/r/20240627-work-pidfs-v1-1-7e9ab6cc3bb1@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Josef Bacik Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- include/linux/cleanup.h | 13 ++++++++----- include/linux/file.h | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index c2d09bc4f976..80c4181e194a 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -63,17 +63,20 @@ #define __free(_name) __cleanup(__free_##_name) -#define __get_and_null_ptr(p) \ - ({ __auto_type __ptr = &(p); \ - __auto_type __val = *__ptr; \ - *__ptr = NULL; __val; }) +#define __get_and_null(p, nullvalue) \ + ({ \ + __auto_type __ptr = &(p); \ + __auto_type __val = *__ptr; \ + *__ptr = nullvalue; \ + __val; \ + }) static inline __must_check const volatile void * __must_check_fn(const volatile void *val) { return val; } #define no_free_ptr(p) \ - ((typeof(p)) __must_check_fn(__get_and_null_ptr(p))) + ((typeof(p)) __must_check_fn(__get_and_null(p, NULL))) #define return_ptr(p) return no_free_ptr(p) diff --git a/include/linux/file.h b/include/linux/file.h index 45d0f4800abd..237931f20739 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -97,6 +97,26 @@ extern void put_unused_fd(unsigned int fd); DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), get_unused_fd_flags(flags), unsigned flags) +/* + * take_fd() will take care to set @fd to -EBADF ensuring that + * CLASS(get_unused_fd) won't call put_unused_fd(). This makes it + * easier to rely on CLASS(get_unused_fd): + * + * struct file *f; + * + * CLASS(get_unused_fd, fd)(O_CLOEXEC); + * if (fd < 0) + * return fd; + * + * f = dentry_open(&path, O_RDONLY, current_cred()); + * if (IS_ERR(f)) + * return PTR_ERR(fd); + * + * fd_install(fd, f); + * return take_fd(fd); + */ +#define take_fd(fd) __get_and_null(fd, -EBADF) + extern void fd_install(unsigned int fd, struct file *file); int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); -- cgit v1.2.3 From d057c108155ab8d02b603c7ee5da7df615c2f32f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 27 Jun 2024 16:11:40 +0200 Subject: nsproxy: add a cleanup helper for nsproxy Add a simple cleanup helper for nsproxy. Link: https://lore.kernel.org/r/20240627-work-pidfs-v1-2-7e9ab6cc3bb1@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Josef Bacik Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- include/linux/nsproxy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 5601d14e2886..e6bec522b139 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -112,4 +112,6 @@ static inline void get_nsproxy(struct nsproxy *ns) refcount_inc(&ns->count); } +DEFINE_FREE(put_nsproxy, struct nsproxy *, if (_T) put_nsproxy(_T)) + #endif -- cgit v1.2.3 From 85e4daaeb70bc68ec920e0c268eb428113d31b21 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 28 Jun 2024 10:05:19 +0200 Subject: nsproxy: add helper to go from arbitrary namespace to ns_common They all contains struct ns_common ns and if there ever is one where that isn't the case we'll catch it here at build time. Signed-off-by: Christian Brauner --- include/linux/nsproxy.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index e6bec522b139..dab6a1734a22 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -42,6 +42,17 @@ struct nsproxy { }; extern struct nsproxy init_nsproxy; +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns->ns), \ + struct ipc_namespace *: &(__ns->ns), \ + struct net *: &(__ns->ns), \ + struct pid_namespace *: &(__ns->ns), \ + struct mnt_namespace *: &(__ns->ns), \ + struct time_namespace *: &(__ns->ns), \ + struct user_namespace *: &(__ns->ns), \ + struct uts_namespace *: &(__ns->ns)) + /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. -- cgit v1.2.3 From 69540b7987ef05d1dff36981d4cc7c31e9716b36 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 27 Jun 2024 17:08:48 +0300 Subject: ethtool: Add ethtool operation to write to a transceiver module EEPROM Ethtool can already retrieve information from a transceiver module EEPROM by invoking the ethtool_ops::get_module_eeprom_by_page operation. Add a corresponding operation that allows ethtool to write to a transceiver module EEPROM. The new write operation is purely an in-kernel API and is not exposed to user space. The purpose of this operation is not to enable arbitrary read / write access, but to allow the kernel to write to specific addresses as part of transceiver module firmware flashing. In the future, more functionality can be implemented on top of these read / write operations. Adjust the comments of the 'ethtool_module_eeprom' structure as it is no longer used only for read access. Signed-off-by: Ido Schimmel Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/ethtool.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 959196af7f5a..c7f6f2bc9cac 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -506,17 +506,16 @@ struct ethtool_ts_stats { #define ETH_MODULE_MAX_I2C_ADDRESS 0x7f /** - * struct ethtool_module_eeprom - EEPROM dump from specified page - * @offset: Offset within the specified EEPROM page to begin read, in bytes. - * @length: Number of bytes to read. - * @page: Page number to read from. - * @bank: Page bank number to read from, if applicable by EEPROM spec. + * struct ethtool_module_eeprom - plug-in module EEPROM read / write parameters + * @offset: When @offset is 0-127, it is used as an address to the Lower Memory + * (@page must be 0). Otherwise, it is used as an address to the + * Upper Memory. + * @length: Number of bytes to read / write. + * @page: Page number. + * @bank: Bank number, if supported by EEPROM spec. * @i2c_address: I2C address of a page. Value less than 0x7f expected. Most * EEPROMs use 0x50 or 0x51. * @data: Pointer to buffer with EEPROM data of @length size. - * - * This can be used to manage pages during EEPROM dump in ethtool and pass - * required information to the driver. */ struct ethtool_module_eeprom { u32 offset; @@ -824,6 +823,8 @@ struct ethtool_rxfh_param { * @get_module_eeprom_by_page: Get a region of plug-in module EEPROM data from * specified page. Returns a negative error code or the amount of bytes * read. + * @set_module_eeprom_by_page: Write to a region of plug-in module EEPROM, + * from kernel space only. Returns a negative error code or zero. * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics. * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics. * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics. @@ -958,6 +959,9 @@ struct ethtool_ops { int (*get_module_eeprom_by_page)(struct net_device *dev, const struct ethtool_module_eeprom *page, struct netlink_ext_ack *extack); + int (*set_module_eeprom_by_page)(struct net_device *dev, + const struct ethtool_module_eeprom *page, + struct netlink_ext_ack *extack); void (*get_eth_phy_stats)(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats); void (*get_eth_mac_stats)(struct net_device *dev, -- cgit v1.2.3 From 31e0aa99dc02b2b038a270b0670fc8201b69ec8a Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 27 Jun 2024 17:08:52 +0300 Subject: ethtool: Veto some operations during firmware flashing process Some operations cannot be performed during the firmware flashing process. For example: - Port must be down during the whole flashing process to avoid packet loss while committing reset for example. - Writing to EEPROM interrupts the flashing process, so operations like ethtool dump, module reset, get and set power mode should be vetoed. - Split port firmware flashing should be vetoed. In order to veto those scenarios, add a flag in 'struct net_device' that indicates when a firmware flash is taking place on the module and use it to prevent interruptions during the process. Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cc18acd3c58b..1e3401093c13 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1990,6 +1990,8 @@ enum netdev_reg_state { * * @threaded: napi threaded mode is enabled * + * @module_fw_flash_in_progress: Module firmware flashing is in progress. + * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved * to another network namespace. @@ -2374,7 +2376,7 @@ struct net_device { bool proto_down; bool threaded; unsigned wol_enabled:1; - + unsigned module_fw_flash_in_progress:1; struct list_head net_notifier_list; #if IS_ENABLED(CONFIG_MACSEC) -- cgit v1.2.3 From e4f91936993ce4d0954688ec3cb80b3471b9eda5 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 27 Jun 2024 17:08:53 +0300 Subject: net: sfp: Add more extended compliance codes SFF-8024 is used to define various constants re-used in several SFF SFP-related specifications. Add SFF-8024 extended compliance code definitions for CMIS compliant modules and use them in the next patch to determine the firmware flashing work. Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Reviewed-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/sfp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index a45da7eef9a2..b14be59550e3 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -284,6 +284,12 @@ enum { SFF8024_ID_QSFP_8438 = 0x0c, SFF8024_ID_QSFP_8436_8636 = 0x0d, SFF8024_ID_QSFP28_8636 = 0x11, + SFF8024_ID_QSFP_DD = 0x18, + SFF8024_ID_OSFP = 0x19, + SFF8024_ID_DSFP = 0x1B, + SFF8024_ID_QSFP_PLUS_CMIS = 0x1E, + SFF8024_ID_SFP_DD_CMIS = 0x1F, + SFF8024_ID_SFP_PLUS_CMIS = 0x20, SFF8024_ENCODING_UNSPEC = 0x00, SFF8024_ENCODING_8B10B = 0x01, -- cgit v1.2.3 From 048a403648fcef8bd9f4f1a290c57b626ad16296 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 27 Jun 2024 21:02:34 +0300 Subject: net/mlx5: IFC updates for changing max EQs Expose new capability to support changing the number of EQs available to other functions. Fixes: 93197c7c509d ("mlx5/core: Support max_io_eqs for a function") Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Reviewed-by: William Tu Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5df52e15f7d6..d45bfb7cf81d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2029,7 +2029,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 pcc_ifa2[0x1]; u8 reserved_at_3f1[0xf]; - u8 reserved_at_400[0x400]; + u8 reserved_at_400[0x40]; + + u8 reserved_at_440[0x8]; + u8 max_num_eqs_24b[0x18]; + u8 reserved_at_460[0x3a0]; }; enum mlx5_ifc_flow_destination_type { -- cgit v1.2.3 From 205fdba5d0ffe1ad8de61763d74323e88b640d41 Mon Sep 17 00:00:00 2001 From: James Ogletree Date: Thu, 20 Jun 2024 16:17:41 +0000 Subject: firmware: cs_dsp: Add write sequence interface A write sequence is a sequence of register addresses and values executed by some Cirrus DSPs following certain power state transitions. Add support for Cirrus drivers to update or add to a write sequence present in firmware. Reviewed-by: Charles Keepax Signed-off-by: James Ogletree Reviewed-by: Jeff LaBundy Link: https://lore.kernel.org/r/20240620161745.2312359-2-jogletre@opensource.cirrus.com Signed-off-by: Lee Jones --- include/linux/firmware/cirrus/cs_dsp.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index 82687e07a7c2..c28a6b9c3b2d 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -42,6 +42,16 @@ #define CS_DSP_ACKED_CTL_MIN_VALUE 0 #define CS_DSP_ACKED_CTL_MAX_VALUE 0xFFFFFF +/* + * Write sequence operation codes + */ +#define CS_DSP_WSEQ_FULL 0x00 +#define CS_DSP_WSEQ_ADDR8 0x02 +#define CS_DSP_WSEQ_L16 0x04 +#define CS_DSP_WSEQ_H16 0x05 +#define CS_DSP_WSEQ_UNLOCK 0xFD +#define CS_DSP_WSEQ_END 0xFF + /** * struct cs_dsp_region - Describes a logical memory region in DSP address space * @type: Memory region type @@ -258,6 +268,23 @@ struct cs_dsp_alg_region *cs_dsp_find_alg_region(struct cs_dsp *dsp, const char *cs_dsp_mem_region_name(unsigned int type); +/** + * struct cs_dsp_wseq - Describes a write sequence + * @ctl: Write sequence cs_dsp control + * @ops: Operations contained within + */ +struct cs_dsp_wseq { + struct cs_dsp_coeff_ctl *ctl; + struct list_head ops; +}; + +int cs_dsp_wseq_init(struct cs_dsp *dsp, struct cs_dsp_wseq *wseqs, unsigned int num_wseqs); +int cs_dsp_wseq_write(struct cs_dsp *dsp, struct cs_dsp_wseq *wseq, u32 addr, u32 data, + u8 op_code, bool update); +int cs_dsp_wseq_multi_write(struct cs_dsp *dsp, struct cs_dsp_wseq *wseq, + const struct reg_sequence *reg_seq, int num_regs, + u8 op_code, bool update); + /** * struct cs_dsp_chunk - Describes a buffer holding data formatted for the DSP * @data: Pointer to underlying buffer memory -- cgit v1.2.3 From cb626376cbd00cd69329371519a8e9568baef715 Mon Sep 17 00:00:00 2001 From: James Ogletree Date: Thu, 20 Jun 2024 16:17:43 +0000 Subject: mfd: cs40l50: Add support for CS40L50 core driver Introduce support for Cirrus Logic Device CS40L50: a haptic driver with waveform memory, integrated DSP, and closed-loop algorithms. The MFD component registers and initializes the device. Signed-off-by: James Ogletree Reviewed-by: Jeff LaBundy Link: https://lore.kernel.org/r/20240620161745.2312359-4-jogletre@opensource.cirrus.com Signed-off-by: Lee Jones --- include/linux/mfd/cs40l50.h | 137 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 include/linux/mfd/cs40l50.h (limited to 'include/linux') diff --git a/include/linux/mfd/cs40l50.h b/include/linux/mfd/cs40l50.h new file mode 100644 index 000000000000..e5dc49860944 --- /dev/null +++ b/include/linux/mfd/cs40l50.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * CS40L50 Advanced Haptic Driver with waveform memory, + * integrated DSP, and closed-loop algorithms + * + * Copyright 2024 Cirrus Logic, Inc. + * + * Author: James Ogletree + */ + +#ifndef __MFD_CS40L50_H__ +#define __MFD_CS40L50_H__ + +#include +#include +#include +#include + +/* Power Supply Configuration */ +#define CS40L50_BLOCK_ENABLES2 0x201C +#define CS40L50_ERR_RLS 0x2034 +#define CS40L50_BST_LPMODE_SEL 0x3810 +#define CS40L50_DCM_LOW_POWER 0x1 +#define CS40L50_OVERTEMP_WARN 0x4000010 + +/* Interrupts */ +#define CS40L50_IRQ1_INT_1 0xE010 +#define CS40L50_IRQ1_BASE CS40L50_IRQ1_INT_1 +#define CS40L50_IRQ1_INT_2 0xE014 +#define CS40L50_IRQ1_INT_8 0xE02C +#define CS40L50_IRQ1_INT_9 0xE030 +#define CS40L50_IRQ1_INT_10 0xE034 +#define CS40L50_IRQ1_INT_18 0xE054 +#define CS40L50_IRQ1_MASK_1 0xE090 +#define CS40L50_IRQ1_MASK_2 0xE094 +#define CS40L50_IRQ1_MASK_20 0xE0DC +#define CS40L50_IRQ1_INT_1_OFFSET (CS40L50_IRQ1_INT_1 - CS40L50_IRQ1_BASE) +#define CS40L50_IRQ1_INT_2_OFFSET (CS40L50_IRQ1_INT_2 - CS40L50_IRQ1_BASE) +#define CS40L50_IRQ1_INT_8_OFFSET (CS40L50_IRQ1_INT_8 - CS40L50_IRQ1_BASE) +#define CS40L50_IRQ1_INT_9_OFFSET (CS40L50_IRQ1_INT_9 - CS40L50_IRQ1_BASE) +#define CS40L50_IRQ1_INT_10_OFFSET (CS40L50_IRQ1_INT_10 - CS40L50_IRQ1_BASE) +#define CS40L50_IRQ1_INT_18_OFFSET (CS40L50_IRQ1_INT_18 - CS40L50_IRQ1_BASE) +#define CS40L50_IRQ_MASK_2_OVERRIDE 0xFFDF7FFF +#define CS40L50_IRQ_MASK_20_OVERRIDE 0x15C01000 +#define CS40L50_AMP_SHORT_MASK BIT(31) +#define CS40L50_DSP_QUEUE_MASK BIT(21) +#define CS40L50_TEMP_ERR_MASK BIT(31) +#define CS40L50_BST_UVP_MASK BIT(6) +#define CS40L50_BST_SHORT_MASK BIT(7) +#define CS40L50_BST_ILIMIT_MASK BIT(18) +#define CS40L50_UVLO_VDDBATT_MASK BIT(16) +#define CS40L50_GLOBAL_ERROR_MASK BIT(15) + +enum cs40l50_irq_list { + CS40L50_DSP_QUEUE_IRQ, + CS40L50_GLOBAL_ERROR_IRQ, + CS40L50_UVLO_VDDBATT_IRQ, + CS40L50_BST_ILIMIT_IRQ, + CS40L50_BST_SHORT_IRQ, + CS40L50_BST_UVP_IRQ, + CS40L50_TEMP_ERR_IRQ, + CS40L50_AMP_SHORT_IRQ, +}; + +/* DSP */ +#define CS40L50_XMEM_PACKED_0 0x2000000 +#define CS40L50_XMEM_UNPACKED24_0 0x2800000 +#define CS40L50_SYS_INFO_ID 0x25E0000 +#define CS40L50_DSP_QUEUE_WT 0x28042C8 +#define CS40L50_DSP_QUEUE_RD 0x28042CC +#define CS40L50_NUM_WAVES 0x2805C18 +#define CS40L50_CORE_BASE 0x2B80000 +#define CS40L50_YMEM_PACKED_0 0x2C00000 +#define CS40L50_YMEM_UNPACKED24_0 0x3400000 +#define CS40L50_PMEM_0 0x3800000 +#define CS40L50_DSP_POLL_US 1000 +#define CS40L50_DSP_TIMEOUT_COUNT 100 +#define CS40L50_RESET_PULSE_US 2200 +#define CS40L50_CP_READY_US 3100 +#define CS40L50_AUTOSUSPEND_MS 2000 +#define CS40L50_PM_ALGO 0x9F206 +#define CS40L50_GLOBAL_ERR_RLS_SET BIT(11) +#define CS40L50_GLOBAL_ERR_RLS_CLEAR 0 + +enum cs40l50_wseqs { + CS40L50_PWR_ON, + CS40L50_STANDBY, + CS40L50_ACTIVE, + CS40L50_NUM_WSEQS, +}; + +/* DSP Queue */ +#define CS40L50_DSP_QUEUE_BASE 0x11004 +#define CS40L50_DSP_QUEUE_END 0x1101C +#define CS40L50_DSP_QUEUE 0x11020 +#define CS40L50_PREVENT_HIBER 0x2000003 +#define CS40L50_ALLOW_HIBER 0x2000004 +#define CS40L50_SHUTDOWN 0x2000005 +#define CS40L50_SYSTEM_RESET 0x2000007 +#define CS40L50_START_I2S 0x3000002 +#define CS40L50_OWT_PUSH 0x3000008 +#define CS40L50_STOP_PLAYBACK 0x5000000 +#define CS40L50_OWT_DELETE 0xD000000 + +/* Firmware files */ +#define CS40L50_FW "cs40l50.wmfw" +#define CS40L50_WT "cs40l50.bin" + +/* Device */ +#define CS40L50_DEVID 0x0 +#define CS40L50_REVID 0x4 +#define CS40L50_DEVID_A 0x40A50 +#define CS40L50_REVID_B0 0xB0 + +struct cs40l50 { + struct device *dev; + struct regmap *regmap; + struct mutex lock; + struct cs_dsp dsp; + struct gpio_desc *reset_gpio; + struct regmap_irq_chip_data *irq_data; + const struct firmware *fw; + const struct firmware *bin; + struct cs_dsp_wseq wseqs[CS40L50_NUM_WSEQS]; + int irq; + u32 devid; + u32 revid; +}; + +int cs40l50_dsp_write(struct device *dev, struct regmap *regmap, u32 val); +int cs40l50_probe(struct cs40l50 *cs40l50); +int cs40l50_remove(struct cs40l50 *cs40l50); + +extern const struct regmap_config cs40l50_regmap; +extern const struct dev_pm_ops cs40l50_pm_ops; + +#endif /* __MFD_CS40L50_H__ */ -- cgit v1.2.3 From 860f8e3beac0b800bbe20f23c5f3440b1c470b8f Mon Sep 17 00:00:00 2001 From: Karel Balej Date: Fri, 31 May 2024 19:34:57 +0200 Subject: mfd: Add driver for Marvell 88PM886 PMIC Marvell 88PM886 is a PMIC which provides various functions such as onkey, battery, charger and regulators. It is found for instance in the samsung,coreprimevelte smartphone with which this was tested. Implement basic support to allow for the use of regulators and onkey. Signed-off-by: Karel Balej Link: https://lore.kernel.org/r/20240531175109.15599-3-balejk@matfyz.cz Signed-off-by: Lee Jones --- include/linux/mfd/88pm886.h | 69 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 include/linux/mfd/88pm886.h (limited to 'include/linux') diff --git a/include/linux/mfd/88pm886.h b/include/linux/mfd/88pm886.h new file mode 100644 index 000000000000..133aa302e492 --- /dev/null +++ b/include/linux/mfd/88pm886.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __MFD_88PM886_H +#define __MFD_88PM886_H + +#include +#include + +#define PM886_A1_CHIP_ID 0xa1 + +#define PM886_IRQ_ONKEY 0 + +#define PM886_PAGE_OFFSET_REGULATORS 1 + +#define PM886_REG_ID 0x00 + +#define PM886_REG_STATUS1 0x01 +#define PM886_ONKEY_STS1 BIT(0) + +#define PM886_REG_INT_STATUS1 0x05 + +#define PM886_REG_INT_ENA_1 0x0a +#define PM886_INT_ENA1_ONKEY BIT(0) + +#define PM886_REG_MISC_CONFIG1 0x14 +#define PM886_SW_PDOWN BIT(5) + +#define PM886_REG_MISC_CONFIG2 0x15 +#define PM886_INT_INV BIT(0) +#define PM886_INT_CLEAR BIT(1) +#define PM886_INT_RC 0x00 +#define PM886_INT_WC BIT(1) +#define PM886_INT_MASK_MODE BIT(2) + +#define PM886_REG_RTC_SPARE6 0xef + +#define PM886_REG_BUCK_EN 0x08 +#define PM886_REG_LDO_EN1 0x09 +#define PM886_REG_LDO_EN2 0x0a +#define PM886_REG_LDO1_VOUT 0x20 +#define PM886_REG_LDO2_VOUT 0x26 +#define PM886_REG_LDO3_VOUT 0x2c +#define PM886_REG_LDO4_VOUT 0x32 +#define PM886_REG_LDO5_VOUT 0x38 +#define PM886_REG_LDO6_VOUT 0x3e +#define PM886_REG_LDO7_VOUT 0x44 +#define PM886_REG_LDO8_VOUT 0x4a +#define PM886_REG_LDO9_VOUT 0x50 +#define PM886_REG_LDO10_VOUT 0x56 +#define PM886_REG_LDO11_VOUT 0x5c +#define PM886_REG_LDO12_VOUT 0x62 +#define PM886_REG_LDO13_VOUT 0x68 +#define PM886_REG_LDO14_VOUT 0x6e +#define PM886_REG_LDO15_VOUT 0x74 +#define PM886_REG_LDO16_VOUT 0x7a +#define PM886_REG_BUCK1_VOUT 0xa5 +#define PM886_REG_BUCK2_VOUT 0xb3 +#define PM886_REG_BUCK3_VOUT 0xc1 +#define PM886_REG_BUCK4_VOUT 0xcf +#define PM886_REG_BUCK5_VOUT 0xdd + +#define PM886_LDO_VSEL_MASK 0x0f +#define PM886_BUCK_VSEL_MASK 0x7f + +struct pm886_chip { + struct i2c_client *client; + unsigned int chip_id; + struct regmap *regmap; +}; +#endif /* __MFD_88PM886_H */ -- cgit v1.2.3 From d19b46340b3c0ea66bef0f6c58876cc085813ba8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2024 06:59:38 +0200 Subject: block: remove bio_integrity_process Move the bvec interation into the generate/verify helpers to avoid a bit of argument passing churn. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240626045950.189758-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index c58634924782..804f856ed3e5 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -14,15 +14,6 @@ enum blk_integrity_flags { BLK_INTEGRITY_STACKED = 1 << 4, }; -struct blk_integrity_iter { - void *prot_buf; - void *data_buf; - sector_t seed; - unsigned int data_size; - unsigned short interval; - const char *disk_name; -}; - const char *blk_integrity_profile_name(struct blk_integrity *bi); bool queue_limits_stack_integrity(struct queue_limits *t, struct queue_limits *b); -- cgit v1.2.3 From 36b921637e900c77f5f9bd5b45db522124068a96 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 29 May 2024 14:34:34 +0100 Subject: ACPI: processor: Add acpi_get_processor_handle() helper If CONFIG_ACPI_PROCESSOR provide a helper to retrieve the acpi_handle for a given CPU allowing access to methods in DSDT. Tested-by: Miguel Luis Reviewed-by: Gavin Shan Acked-by: Rafael J. Wysocki Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240529133446.28446-8-Jonathan.Cameron@huawei.com Signed-off-by: Catalin Marinas --- include/linux/acpi.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 28c3fb2bef0d..fd45bfab66b8 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -304,6 +304,8 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int acpi_unmap_cpu(int cpu); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ +acpi_handle acpi_get_processor_handle(int cpu); + #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr); #endif @@ -1076,6 +1078,11 @@ static inline bool acpi_sleep_state_supported(u8 sleep_state) return false; } +static inline acpi_handle acpi_get_processor_handle(int cpu) +{ + return NULL; +} + #endif /* !CONFIG_ACPI */ extern void arch_post_acpi_subsys_init(void); -- cgit v1.2.3 From d633da5d3ab1a0eb26a2213d65da1e189e82f8ab Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 29 May 2024 14:34:41 +0100 Subject: irqchip/gic-v3: Add support for ACPI's disabled but 'online capable' CPUs To support virtual CPU hotplug, ACPI has added an 'online capable' bit to the MADT GICC entries. This indicates a disabled CPU entry may not be possible to online via PSCI until firmware has set enabled bit in _STA. This means that a "usable" GIC redistributor is one that is marked as either enabled, or online capable. The meaning of the acpi_gicc_is_usable() would become less clear than just checking the pair of flags at call sites. As such, drop that helper function. The test in gic_acpi_match_gicc() remains as testing just the enabled bit so the count of enabled distributors is correct. What about the redistributor in the GICC entry? ACPI doesn't want to say. Assume the worst: When a redistributor is described in the GICC entry, but the entry is marked as disabled at boot, assume the redistributor is inaccessible. The GICv3 driver doesn't support late online of redistributors, so this means the corresponding CPU can't be brought online either. Rather than modifying cpu masks that may already have been used, register a new cpuhp callback to fail this case. This must run earlier than the main gic_starting_cpu() so that this case can be rejected before the section of cpuhp that runs on the CPU that is coming up as that is not allowed to fail. This solution keeps the handling of this broken firmware corner case local to the GIC driver. As precise ordering of this callback doesn't need to be controlled as long as it is in that initial prepare phase, use CPUHP_BP_PREPARE_DYN. Systems that want CPU hotplug in a VM can ensure their redistributors are always-on, and describe them that way with a GICR entry in the MADT. Suggested-by: Marc Zyngier Signed-off-by: James Morse Signed-off-by: Russell King (Oracle) Tested-by: Miguel Luis Co-developed-by: Jonathan Cameron Signed-off-by: Jonathan Cameron Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240529133446.28446-15-Jonathan.Cameron@huawei.com Signed-off-by: Catalin Marinas --- include/linux/acpi.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fd45bfab66b8..9f8c9d29b035 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -237,11 +237,6 @@ acpi_table_parse_cedt(enum acpi_cedt_type id, int acpi_parse_mcfg (struct acpi_table_header *header); void acpi_table_print_madt_entry (struct acpi_subtable_header *madt); -static inline bool acpi_gicc_is_usable(struct acpi_madt_generic_interrupt *gicc) -{ - return gicc->flags & ACPI_MADT_ENABLED; -} - #if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa); #else -- cgit v1.2.3 From 4e1a7df4548003fc081360b0f4edce3f7a991bfc Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 29 May 2024 14:34:46 +0100 Subject: cpumask: Add enabled cpumask for present CPUs that can be brought online The 'offline' file in sysfs shows all offline CPUs, including those that aren't present. User-space is expected to remove not-present CPUs from this list to learn which CPUs could be brought online. CPUs can be present but not-enabled. These CPUs can't be brought online until the firmware policy changes, which comes with an ACPI notification that will register the CPUs. With only the offline and present files, user-space is unable to determine which CPUs it can try to bring online. Add a new CPU mask that shows this based on all the registered CPUs. Signed-off-by: James Morse Tested-by: Miguel Luis Tested-by: Vishnu Pajjuri Tested-by: Jianyong Wu Acked-by: Thomas Gleixner Signed-off-by: Russell King (Oracle) Reviewed-by: Gavin Shan Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240529133446.28446-20-Jonathan.Cameron@huawei.com Signed-off-by: Catalin Marinas --- include/linux/cpumask.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 23686bed441d..954d4adc8f81 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -93,6 +93,7 @@ static inline void set_nr_cpu_ids(unsigned int nr) * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated + * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * @@ -125,11 +126,13 @@ static inline void set_nr_cpu_ids(unsigned int nr) extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; +extern struct cpumask __cpu_enabled_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; extern struct cpumask __cpu_dying_mask; #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) #define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) +#define cpu_enabled_mask ((const struct cpumask *)&__cpu_enabled_mask) #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) @@ -1075,6 +1078,7 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #else #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) +#define for_each_enabled_cpu(cpu) for_each_cpu((cpu), cpu_enabled_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) #endif @@ -1092,6 +1096,15 @@ set_cpu_possible(unsigned int cpu, bool possible) cpumask_clear_cpu(cpu, &__cpu_possible_mask); } +static inline void +set_cpu_enabled(unsigned int cpu, bool can_be_onlined) +{ + if (can_be_onlined) + cpumask_set_cpu(cpu, &__cpu_enabled_mask); + else + cpumask_clear_cpu(cpu, &__cpu_enabled_mask); +} + static inline void set_cpu_present(unsigned int cpu, bool present) { @@ -1173,6 +1186,7 @@ static __always_inline unsigned int num_online_cpus(void) return raw_atomic_read(&__num_online_cpus); } #define num_possible_cpus() cpumask_weight(cpu_possible_mask) +#define num_enabled_cpus() cpumask_weight(cpu_enabled_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) @@ -1181,6 +1195,11 @@ static inline bool cpu_online(unsigned int cpu) return cpumask_test_cpu(cpu, cpu_online_mask); } +static inline bool cpu_enabled(unsigned int cpu) +{ + return cpumask_test_cpu(cpu, cpu_enabled_mask); +} + static inline bool cpu_possible(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_possible_mask); @@ -1205,6 +1224,7 @@ static inline bool cpu_dying(unsigned int cpu) #define num_online_cpus() 1U #define num_possible_cpus() 1U +#define num_enabled_cpus() 1U #define num_present_cpus() 1U #define num_active_cpus() 1U @@ -1218,6 +1238,11 @@ static inline bool cpu_possible(unsigned int cpu) return cpu == 0; } +static inline bool cpu_enabled(unsigned int cpu) +{ + return cpu == 0; +} + static inline bool cpu_present(unsigned int cpu) { return cpu == 0; -- cgit v1.2.3 From 43c0226c9ba56ffebdef9a858dbcf6eb0d376291 Mon Sep 17 00:00:00 2001 From: Dhruva Gole Date: Thu, 27 Jun 2024 11:31:17 +0530 Subject: cpufreq: make cpufreq_boost_enabled() return bool Since this function is supposed to return boost_enabled which is anyway a bool type make sure that it's return value is also marked as bool. This helps maintain better consistency in data types being used. Signed-off-by: Dhruva Gole Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/20240627060117.1809477-1-d-gole@ti.com Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 20f7e98ee8af..6f57de7de433 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -785,7 +785,7 @@ ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf); #ifdef CONFIG_CPU_FREQ int cpufreq_boost_trigger_state(int state); -int cpufreq_boost_enabled(void); +bool cpufreq_boost_enabled(void); int cpufreq_enable_boost_support(void); bool policy_has_boost_freq(struct cpufreq_policy *policy); @@ -1164,9 +1164,9 @@ static inline int cpufreq_boost_trigger_state(int state) { return 0; } -static inline int cpufreq_boost_enabled(void) +static inline bool cpufreq_boost_enabled(void) { - return 0; + return false; } static inline int cpufreq_enable_boost_support(void) -- cgit v1.2.3 From 0214b27fc949a7bcaf57e71a7f9f534d6481d08b Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Mon, 24 Jun 2024 13:46:00 -0400 Subject: iio: Add iio_read_channel_label to inkern API It can be convenient for other in-kernel drivers to reuse IIO channel labels. Export the iio_read_channel_label function to allow this. The signature is different depending on where we are calling it from, so the meat is moved to do_iio_read_channel_label. Signed-off-by: Sean Anderson Acked-by: Jonathan Cameron Link: https://patch.msgid.link/20240624174601.1527244-2-sean.anderson@linux.dev Signed-off-by: Jonathan Cameron --- include/linux/iio/consumer.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index e9910b41d48e..333d1d8ccb37 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -441,4 +441,14 @@ ssize_t iio_read_channel_ext_info(struct iio_channel *chan, ssize_t iio_write_channel_ext_info(struct iio_channel *chan, const char *attr, const char *buf, size_t len); +/** + * iio_read_channel_label() - read label for a given channel + * @chan: The channel being queried. + * @buf: Where to store the attribute value. Assumed to hold + * at least PAGE_SIZE bytes. + * + * Returns the number of bytes written to buf, or an error code. + */ +ssize_t iio_read_channel_label(struct iio_channel *chan, char *buf); + #endif -- cgit v1.2.3 From 5476394aa9f27d670dd2bac426fdb6ac12b12cb3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jun 2024 13:14:01 +0200 Subject: block: simplify queue_logical_block_size queue_logical_block_size is never called with a 0 queue, and the logical_block_size field in queue_limits is always initialized for a live queue. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Link: https://lore.kernel.org/r/20240627111407.476276-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 53c41ef4222c..4d0d4b83bc74 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1227,12 +1227,7 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev) static inline unsigned queue_logical_block_size(const struct request_queue *q) { - int retval = 512; - - if (q && q->limits.logical_block_size) - retval = q->limits.logical_block_size; - - return retval; + return q->limits.logical_block_size; } static inline unsigned int bdev_logical_block_size(struct block_device *bdev) -- cgit v1.2.3 From 3ebbd9f6de7ec6d538639ebb657246f629ace81e Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 27 Jun 2024 16:33:46 +0100 Subject: net: move ethtool-related netdev state into its own struct net_dev->ethtool is a pointer to new struct ethtool_netdev_state, which currently contains only the wol_enabled field. Suggested-by: Jakub Kicinski Signed-off-by: Edward Cree Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/293a562278371de7534ed1eb17531838ca090633.1719502239.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 8 ++++++++ include/linux/netdevice.h | 8 +++++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c7f6f2bc9cac..374639e661d1 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1004,6 +1004,14 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd, u32 *dev_speed, u8 *dev_duplex); +/** + * struct ethtool_netdev_state - per-netdevice state for ethtool features + * @wol_enabled: Wake-on-LAN is enabled + */ +struct ethtool_netdev_state { + unsigned wol_enabled:1; +}; + struct phy_device; struct phy_tdr_config; struct phy_plca_cfg; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1e3401093c13..3c719f0d5f5a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -80,6 +80,7 @@ struct xdp_buff; struct xdp_frame; struct xdp_metadata_ops; struct xdp_md; +struct ethtool_netdev_state; typedef u32 xdp_features_t; @@ -1986,8 +1987,6 @@ enum netdev_reg_state { * switch driver and used to set the phys state of the * switch port. * - * @wol_enabled: Wake-on-LAN is enabled - * * @threaded: napi threaded mode is enabled * * @module_fw_flash_in_progress: Module firmware flashing is in progress. @@ -2001,6 +2000,7 @@ enum netdev_reg_state { * @udp_tunnel_nic_info: static structure describing the UDP tunnel * offload capabilities of the device * @udp_tunnel_nic: UDP tunnel offload state + * @ethtool: ethtool related state * @xdp_state: stores info on attached XDP BPF programs * * @nested_level: Used as a parameter of spin_lock_nested() of @@ -2375,7 +2375,7 @@ struct net_device { struct lock_class_key *qdisc_tx_busylock; bool proto_down; bool threaded; - unsigned wol_enabled:1; + unsigned module_fw_flash_in_progress:1; struct list_head net_notifier_list; @@ -2386,6 +2386,8 @@ struct net_device { const struct udp_tunnel_nic_info *udp_tunnel_nic_info; struct udp_tunnel_nic *udp_tunnel_nic; + struct ethtool_netdev_state *ethtool; + /* protected by rtnl_lock */ struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; -- cgit v1.2.3 From 6ad2962f8adfd53fca52dce7f830783e95d99ce7 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 27 Jun 2024 16:33:47 +0100 Subject: net: ethtool: attach an XArray of custom RSS contexts to a netdevice Each context stores the RXFH settings (indir, key, and hfunc) as well as optionally some driver private data. Delete any still-existing contexts at netdev unregister time. Signed-off-by: Edward Cree Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/cbd1c402cec38f2e03124f2ab65b4ae4e08bd90d.1719502240.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 374639e661d1..c741d6403364 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -159,6 +159,46 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) return index % n_rx_rings; } +/** + * struct ethtool_rxfh_context - a custom RSS context configuration + * @indir_size: Number of u32 entries in indirection table + * @key_size: Size of hash key, in bytes + * @priv_size: Size of driver private data, in bytes + * @hfunc: RSS hash function identifier. One of the %ETH_RSS_HASH_* + * @input_xfrm: Defines how the input data is transformed. Valid values are one + * of %RXH_XFRM_*. + * @indir_configured: indir has been specified (at create time or subsequently) + * @key_configured: hkey has been specified (at create time or subsequently) + */ +struct ethtool_rxfh_context { + u32 indir_size; + u32 key_size; + u16 priv_size; + u8 hfunc; + u8 input_xfrm; + u8 indir_configured:1; + u8 key_configured:1; + /* private: driver private data, indirection table, and hash key are + * stored sequentially in @data area. Use below helpers to access. + */ + u8 data[] __aligned(sizeof(void *)); +}; + +static inline void *ethtool_rxfh_context_priv(struct ethtool_rxfh_context *ctx) +{ + return ctx->data; +} + +static inline u32 *ethtool_rxfh_context_indir(struct ethtool_rxfh_context *ctx) +{ + return (u32 *)(ctx->data + ALIGN(ctx->priv_size, sizeof(u32))); +} + +static inline u8 *ethtool_rxfh_context_key(struct ethtool_rxfh_context *ctx) +{ + return (u8 *)(ethtool_rxfh_context_indir(ctx) + ctx->indir_size); +} + /* declare a link mode bitmap */ #define __ETHTOOL_DECLARE_LINK_MODE_MASK(name) \ DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS) @@ -1006,9 +1046,11 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, /** * struct ethtool_netdev_state - per-netdevice state for ethtool features + * @rss_ctx: XArray of custom RSS contexts * @wol_enabled: Wake-on-LAN is enabled */ struct ethtool_netdev_state { + struct xarray rss_ctx; unsigned wol_enabled:1; }; -- cgit v1.2.3 From eac9122f0c41b832065e01977c34946ec8e76c24 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 27 Jun 2024 16:33:48 +0100 Subject: net: ethtool: record custom RSS contexts in the XArray Since drivers are still choosing the context IDs, we have to force the XArray to use the ID they've chosen rather than picking one ourselves, and handle the case where they give us an ID that's already in use. Signed-off-by: Edward Cree Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/801f5faa4cec87c65b2c6e27fb220c944bce593a.1719502240.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c741d6403364..43a2a143034f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -199,6 +199,17 @@ static inline u8 *ethtool_rxfh_context_key(struct ethtool_rxfh_context *ctx) return (u8 *)(ethtool_rxfh_context_indir(ctx) + ctx->indir_size); } +static inline size_t ethtool_rxfh_context_size(u32 indir_size, u32 key_size, + u16 priv_size) +{ + size_t indir_bytes = array_size(indir_size, sizeof(u32)); + size_t flex_len; + + flex_len = size_add(size_add(indir_bytes, key_size), + ALIGN(priv_size, sizeof(u32))); + return struct_size_t(struct ethtool_rxfh_context, data, flex_len); +} + /* declare a link mode bitmap */ #define __ETHTOOL_DECLARE_LINK_MODE_MASK(name) \ DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS) @@ -710,6 +721,8 @@ struct ethtool_rxfh_param { * contexts. * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor * RSS. + * @rxfh_priv_size: size of the driver private data area the core should + * allocate for an RSS context (in &struct ethtool_rxfh_context). * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. * @get_drvinfo: Report driver/device information. Modern drivers no @@ -895,6 +908,7 @@ struct ethtool_ops { u32 cap_link_lanes_supported:1; u32 cap_rss_ctx_supported:1; u32 cap_rss_sym_xor_supported:1; + u16 rxfh_priv_size; u32 supported_coalesce_params; u32 supported_ring_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); -- cgit v1.2.3 From 847a8ab186767be6ee95643f9739fa9d0f839589 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 27 Jun 2024 16:33:49 +0100 Subject: net: ethtool: let the core choose RSS context IDs Add a new API to create/modify/remove RSS contexts, that passes in the newly-chosen context ID (not as a pointer) rather than leaving the driver to choose it on create. Also pass in the ctx, allowing drivers to easily use its private data area to store their hardware-specific state. Keep the existing .set_rxfh API for now as a fallback, but deprecate it for custom contexts (rss_context != 0). Signed-off-by: Edward Cree Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/45f1fe61df2163c091ec394c9f52000c8b16cc3b.1719502240.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 43a2a143034f..4292a25b2427 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -723,6 +723,10 @@ struct ethtool_rxfh_param { * RSS. * @rxfh_priv_size: size of the driver private data area the core should * allocate for an RSS context (in &struct ethtool_rxfh_context). + * @rxfh_max_context_id: maximum (exclusive) supported RSS context ID. If this + * is zero then the core may choose any (nonzero) ID, otherwise the core + * will only use IDs strictly less than this value, as the @rss_context + * argument to @create_rxfh_context and friends. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. * @get_drvinfo: Report driver/device information. Modern drivers no @@ -819,6 +823,32 @@ struct ethtool_rxfh_param { * will remain unchanged. * Returns a negative error code or zero. An error code must be returned * if at least one unsupported change was requested. + * @create_rxfh_context: Create a new RSS context with the specified RX flow + * hash indirection table, hash key, and hash function. + * The &struct ethtool_rxfh_context for this context is passed in @ctx; + * note that the indir table, hkey and hfunc are not yet populated as + * of this call. The driver does not need to update these; the core + * will do so if this op succeeds. + * However, if @rxfh.indir is set to %NULL, the driver must update the + * indir table in @ctx with the (default or inherited) table actually in + * use; similarly, if @rxfh.key is %NULL, @rxfh.hfunc is + * %ETH_RSS_HASH_NO_CHANGE, or @rxfh.input_xfrm is %RXH_XFRM_NO_CHANGE, + * the driver should update the corresponding information in @ctx. + * If the driver provides this method, it must also provide + * @modify_rxfh_context and @remove_rxfh_context. + * Returns a negative error code or zero. + * @modify_rxfh_context: Reconfigure the specified RSS context. Allows setting + * the contents of the RX flow hash indirection table, hash key, and/or + * hash function associated with the given context. + * Parameters which are set to %NULL or zero will remain unchanged. + * The &struct ethtool_rxfh_context for this context is passed in @ctx; + * note that it will still contain the *old* settings. The driver does + * not need to update these; the core will do so if this op succeeds. + * Returns a negative error code or zero. An error code must be returned + * if at least one unsupported change was requested. + * @remove_rxfh_context: Remove the specified RSS context. + * The &struct ethtool_rxfh_context for this context is passed in @ctx. + * Returns a negative error code or zero. * @get_channels: Get number of channels. * @set_channels: Set number of channels. Returns a negative error code or * zero. @@ -909,6 +939,7 @@ struct ethtool_ops { u32 cap_rss_ctx_supported:1; u32 cap_rss_sym_xor_supported:1; u16 rxfh_priv_size; + u32 rxfh_max_context_id; u32 supported_coalesce_params; u32 supported_ring_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); @@ -971,6 +1002,15 @@ struct ethtool_ops { int (*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *); int (*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *, struct netlink_ext_ack *extack); + int (*create_rxfh_context)(struct net_device *, + struct ethtool_rxfh_context *ctx, + const struct ethtool_rxfh_param *rxfh); + int (*modify_rxfh_context)(struct net_device *, + struct ethtool_rxfh_context *ctx, + const struct ethtool_rxfh_param *rxfh); + int (*remove_rxfh_context)(struct net_device *, + struct ethtool_rxfh_context *ctx, + u32 rss_context); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); int (*get_dump_flag)(struct net_device *, struct ethtool_dump *); -- cgit v1.2.3 From 30a32cdf6b130356805b3193a6208de25cbb2015 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 27 Jun 2024 16:33:50 +0100 Subject: net: ethtool: add an extack parameter to new rxfh_context APIs Currently passed as NULL, but will allow drivers to report back errors when ethnl support for these ops is added. Signed-off-by: Edward Cree Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/6e0012347d175fdd1280363d7bfa76a2f2777e17.1719502240.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4292a25b2427..9cdbc8e3ed5c 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1004,13 +1004,16 @@ struct ethtool_ops { struct netlink_ext_ack *extack); int (*create_rxfh_context)(struct net_device *, struct ethtool_rxfh_context *ctx, - const struct ethtool_rxfh_param *rxfh); + const struct ethtool_rxfh_param *rxfh, + struct netlink_ext_ack *extack); int (*modify_rxfh_context)(struct net_device *, struct ethtool_rxfh_context *ctx, - const struct ethtool_rxfh_param *rxfh); + const struct ethtool_rxfh_param *rxfh, + struct netlink_ext_ack *extack); int (*remove_rxfh_context)(struct net_device *, struct ethtool_rxfh_context *ctx, - u32 rss_context); + u32 rss_context, + struct netlink_ext_ack *extack); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); int (*get_dump_flag)(struct net_device *, struct ethtool_dump *); -- cgit v1.2.3 From 87925151191b64d9623e63ccf11e517eacc99d7d Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 27 Jun 2024 16:33:51 +0100 Subject: net: ethtool: add a mutex protecting RSS contexts While this is not needed to serialise the ethtool entry points (which are all under RTNL), drivers may have cause to asynchronously access dev->ethtool->rss_ctx; taking dev->ethtool->rss_lock allows them to do this safely without needing to take the RTNL. Signed-off-by: Edward Cree Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/7f9c15eb7525bf87af62c275dde3a8570ee8bf0a.1719502240.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 9cdbc8e3ed5c..f74bb0cf8ed1 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1104,10 +1104,13 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, /** * struct ethtool_netdev_state - per-netdevice state for ethtool features * @rss_ctx: XArray of custom RSS contexts + * @rss_lock: Protects entries in @rss_ctx. May be taken from + * within RTNL. * @wol_enabled: Wake-on-LAN is enabled */ struct ethtool_netdev_state { struct xarray rss_ctx; + struct mutex rss_lock; unsigned wol_enabled:1; }; -- cgit v1.2.3 From b8c7dd15ceb87e5f37ec1ed7b56c279d98f3eb53 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Tue, 11 Jun 2024 17:12:22 -0700 Subject: kernel-wide: fix spelling mistakes like "assocative" -> "associative" There were several instances of the string "assocat" in the kernel, which should have been spelled "associat", with the various endings of -ive, -ed, -ion, and sometimes beginnging with dis-. Add to the spelling dictionary the corrections so that future instances will be caught by checkpatch, and fix the instances found. Originally noticed by accident with a 'git grep socat'. Link: https://lkml.kernel.org/r/20240612001247.356867-1-jesse.brandeburg@intel.com Signed-off-by: Jesse Brandeburg Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/nvme-fc-driver.h | 2 +- include/linux/soc/apple/rtkit.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 4109f1bd6128..1177dde77104 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -620,7 +620,7 @@ enum { * * Structure used between LLDD and nvmet-fc layer to represent the exchange * context for a FC-NVME FCP I/O operation (e.g. a nvme sqe, the sqe-related - * memory transfers, and its assocated cqe transfer). + * memory transfers, and its associated cqe transfer). * * The structure is allocated by the LLDD whenever a FCP CMD IU is received * from the FC link. The address of the structure is passed to the nvmet-fc diff --git a/include/linux/soc/apple/rtkit.h b/include/linux/soc/apple/rtkit.h index 8c9ca857ccf6..c06d17599ae7 100644 --- a/include/linux/soc/apple/rtkit.h +++ b/include/linux/soc/apple/rtkit.h @@ -69,7 +69,7 @@ struct apple_rtkit; * Initializes the internal state required to handle RTKit. This * should usually be called within _probe. * - * @dev: Pointer to the device node this coprocessor is assocated with + * @dev: Pointer to the device node this coprocessor is associated with * @cookie: opaque cookie passed to all functions defined in rtkit_ops * @mbox_name: mailbox name used to communicate with the co-processor * @mbox_idx: mailbox index to be used if mbox_name is NULL @@ -83,7 +83,7 @@ struct apple_rtkit *devm_apple_rtkit_init(struct device *dev, void *cookie, * Non-devm version of devm_apple_rtkit_init. Must be freed with * apple_rtkit_free. * - * @dev: Pointer to the device node this coprocessor is assocated with + * @dev: Pointer to the device node this coprocessor is associated with * @cookie: opaque cookie passed to all functions defined in rtkit_ops * @mbox_name: mailbox name used to communicate with the co-processor * @mbox_idx: mailbox index to be used if mbox_name is NULL -- cgit v1.2.3 From 7c812814e8c34a41bff6fe49987760ffaf8702af Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 24 Jun 2024 18:39:49 +0300 Subject: compiler.h: simplify data_race() macro -Wdeclaration-after-statement used since forever required statement expressions to inject __kcsan_disable_current(), __kcsan_enable_current() to mark data race. Now that it is gone, make macro expansion simpler. __unqual_scalar_typeof() is wordy macro by itself. "expr" is expanded twice. Link: https://lkml.kernel.org/r/fb62163f-ba21-4661-be5b-bb5124abc87d@p183 Signed-off-by: Alexey Dobriyan Reviewed-by: Marco Elver Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/compiler.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 8c252e073bd8..2ea6120f3f7a 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -200,10 +200,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, */ #define data_race(expr) \ ({ \ - __unqual_scalar_typeof(({ expr; })) __v = ({ \ - __kcsan_disable_current(); \ - expr; \ - }); \ + __kcsan_disable_current(); \ + __auto_type __v = (expr); \ __kcsan_enable_current(); \ __v; \ }) -- cgit v1.2.3 From 3e26d9f08fbe0b73e951a5e810fdb7a332b7e37f Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Thu, 20 Jun 2024 14:27:22 +0200 Subject: iio: core: Add new DMABUF interface infrastructure Add the necessary infrastructure to the IIO core to support a new optional DMABUF based interface. With this new interface, DMABUF objects (externally created) can be attached to a IIO buffer, and subsequently used for data transfer. A userspace application can then use this interface to share DMABUF objects between several interfaces, allowing it to transfer data in a zero-copy fashion, for instance between IIO and the USB stack. The userspace application can also memory-map the DMABUF objects, and access the sample data directly. The advantage of doing this vs. the read() interface is that it avoids an extra copy of the data between the kernel and userspace. This is particularly userful for high-speed devices which produce several megabytes or even gigabytes of data per second. As part of the interface, 3 new IOCTLs have been added: IIO_BUFFER_DMABUF_ATTACH_IOCTL(int fd): Attach the DMABUF object identified by the given file descriptor to the buffer. IIO_BUFFER_DMABUF_DETACH_IOCTL(int fd): Detach the DMABUF object identified by the given file descriptor from the buffer. Note that closing the IIO buffer's file descriptor will automatically detach all previously attached DMABUF objects. IIO_BUFFER_DMABUF_ENQUEUE_IOCTL(struct iio_dmabuf *): Request a data transfer to/from the given DMABUF object. Its file descriptor, as well as the transfer size and flags are provided in the "iio_dmabuf" structure. These three IOCTLs have to be performed on the IIO buffer's file descriptor, obtained using the IIO_BUFFER_GET_FD_IOCTL() ioctl. Signed-off-by: Paul Cercueil Co-developed-by: Nuno Sa Signed-off-by: Nuno Sa Link: https://patch.msgid.link/20240620122726.41232-4-paul@crapouillou.net Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer_impl.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h index 89c3fd7c29ca..e72552e026f3 100644 --- a/include/linux/iio/buffer_impl.h +++ b/include/linux/iio/buffer_impl.h @@ -9,8 +9,12 @@ #include #include +struct dma_buf_attachment; +struct dma_fence; struct iio_dev; +struct iio_dma_buffer_block; struct iio_buffer; +struct sg_table; /** * INDIO_BUFFER_FLAG_FIXED_WATERMARK - Watermark level of the buffer can not be @@ -39,6 +43,16 @@ struct iio_buffer; * device stops sampling. Calles are balanced with @enable. * @release: called when the last reference to the buffer is dropped, * should free all resources allocated by the buffer. + * @attach_dmabuf: called from userspace via ioctl to attach one external + * DMABUF. + * @detach_dmabuf: called from userspace via ioctl to detach one previously + * attached DMABUF. + * @enqueue_dmabuf: called from userspace via ioctl to queue this DMABUF + * object to this buffer. Requires a valid DMABUF fd, that + * was previouly attached to this buffer. + * @lock_queue: called when the core needs to lock the buffer queue; + * it is used when enqueueing DMABUF objects. + * @unlock_queue: used to unlock a previously locked buffer queue * @modes: Supported operating modes by this buffer type * @flags: A bitmask combination of INDIO_BUFFER_FLAG_* * @@ -68,6 +82,17 @@ struct iio_buffer_access_funcs { void (*release)(struct iio_buffer *buffer); + struct iio_dma_buffer_block * (*attach_dmabuf)(struct iio_buffer *buffer, + struct dma_buf_attachment *attach); + void (*detach_dmabuf)(struct iio_buffer *buffer, + struct iio_dma_buffer_block *block); + int (*enqueue_dmabuf)(struct iio_buffer *buffer, + struct iio_dma_buffer_block *block, + struct dma_fence *fence, struct sg_table *sgt, + size_t size, bool cyclic); + void (*lock_queue)(struct iio_buffer *buffer); + void (*unlock_queue)(struct iio_buffer *buffer); + unsigned int modes; unsigned int flags; }; @@ -136,6 +161,12 @@ struct iio_buffer { /* @ref: Reference count of the buffer. */ struct kref ref; + + /* @dmabufs: List of DMABUF attachments */ + struct list_head dmabufs; /* P: dmabufs_mutex */ + + /* @dmabufs_mutex: Protects dmabufs */ + struct mutex dmabufs_mutex; }; /** @@ -159,6 +190,8 @@ void iio_buffer_init(struct iio_buffer *buffer); struct iio_buffer *iio_buffer_get(struct iio_buffer *buffer); void iio_buffer_put(struct iio_buffer *buffer); +void iio_buffer_signal_dmabuf_done(struct dma_fence *fence, int ret); + #else /* CONFIG_IIO_BUFFER */ static inline void iio_buffer_get(struct iio_buffer *buffer) {} -- cgit v1.2.3 From d85318900c1c06a251ad3d86fba6bbab116a95d5 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Thu, 20 Jun 2024 14:27:23 +0200 Subject: iio: buffer-dma: Enable support for DMABUFs Implement iio_dma_buffer_attach_dmabuf(), iio_dma_buffer_detach_dmabuf() and iio_dma_buffer_transfer_dmabuf(), which can then be used by the IIO DMA buffer implementations. Signed-off-by: Paul Cercueil Co-developed-by: Nuno Sa Signed-off-by: Nuno Sa Link: https://patch.msgid.link/20240620122726.41232-5-paul@crapouillou.net Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer-dma.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h index 6e27e47077d5..5eb66a399002 100644 --- a/include/linux/iio/buffer-dma.h +++ b/include/linux/iio/buffer-dma.h @@ -7,6 +7,7 @@ #ifndef __INDUSTRIALIO_DMA_BUFFER_H__ #define __INDUSTRIALIO_DMA_BUFFER_H__ +#include #include #include #include @@ -16,6 +17,9 @@ struct iio_dma_buffer_queue; struct iio_dma_buffer_ops; struct device; +struct dma_buf_attachment; +struct dma_fence; +struct sg_table; /** * enum iio_block_state - State of a struct iio_dma_buffer_block @@ -41,6 +45,10 @@ enum iio_block_state { * @queue: Parent DMA buffer queue * @kref: kref used to manage the lifetime of block * @state: Current state of the block + * @cyclic: True if this is a cyclic buffer + * @fileio: True if this buffer is used for fileio mode + * @sg_table: DMA table for the transfer when transferring a DMABUF + * @fence: DMA fence to be signaled when a DMABUF transfer is complete */ struct iio_dma_buffer_block { /* May only be accessed by the owner of the block */ @@ -63,6 +71,12 @@ struct iio_dma_buffer_block { * queue->list_lock if the block is not owned by the core. */ enum iio_block_state state; + + bool cyclic; + bool fileio; + + struct sg_table *sg_table; + struct dma_fence *fence; }; /** @@ -72,6 +86,7 @@ struct iio_dma_buffer_block { * @pos: Read offset in the active block * @block_size: Size of each block * @next_dequeue: index of next block that will be dequeued + * @enabled: Whether the buffer is operating in fileio mode */ struct iio_dma_buffer_queue_fileio { struct iio_dma_buffer_block *blocks[2]; @@ -80,6 +95,7 @@ struct iio_dma_buffer_queue_fileio { size_t block_size; unsigned int next_dequeue; + bool enabled; }; /** @@ -95,6 +111,7 @@ struct iio_dma_buffer_queue_fileio { * the DMA controller * @incoming: List of buffers on the incoming queue * @active: Whether the buffer is currently active + * @num_dmabufs: Total number of DMABUFs attached to this queue * @fileio: FileIO state */ struct iio_dma_buffer_queue { @@ -107,6 +124,7 @@ struct iio_dma_buffer_queue { struct list_head incoming; bool active; + atomic_t num_dmabufs; struct iio_dma_buffer_queue_fileio fileio; }; @@ -144,4 +162,17 @@ int iio_dma_buffer_init(struct iio_dma_buffer_queue *queue, void iio_dma_buffer_exit(struct iio_dma_buffer_queue *queue); void iio_dma_buffer_release(struct iio_dma_buffer_queue *queue); +struct iio_dma_buffer_block * +iio_dma_buffer_attach_dmabuf(struct iio_buffer *buffer, + struct dma_buf_attachment *attach); +void iio_dma_buffer_detach_dmabuf(struct iio_buffer *buffer, + struct iio_dma_buffer_block *block); +int iio_dma_buffer_enqueue_dmabuf(struct iio_buffer *buffer, + struct iio_dma_buffer_block *block, + struct dma_fence *fence, + struct sg_table *sgt, + size_t size, bool cyclic); +void iio_dma_buffer_lock_queue(struct iio_buffer *buffer); +void iio_dma_buffer_unlock_queue(struct iio_buffer *buffer); + #endif -- cgit v1.2.3 From dbbe7eaf0e4795bf003ac06872aaf52b6b6b1310 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Thu, 6 Jun 2024 09:22:37 +0200 Subject: dev_printk: add new dev_err_probe() helpers This is similar to dev_err_probe() but for cases where an ERR_PTR() or ERR_CAST() is to be returned simplifying patterns like: dev_err_probe(dev, ret, ...); return ERR_PTR(ret) or dev_err_probe(dev, PTR_ERR(ptr), ...); return ERR_CAST(ptr) Signed-off-by: Nuno Sa Link: https://patch.msgid.link/20240606-dev-add_dev_errp_probe-v3-1-51bb229edd79@analog.com Signed-off-by: Jonathan Cameron --- include/linux/dev_printk.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h index ae80a303c216..ca32b5bb28eb 100644 --- a/include/linux/dev_printk.h +++ b/include/linux/dev_printk.h @@ -277,4 +277,12 @@ do { \ __printf(3, 4) int dev_err_probe(const struct device *dev, int err, const char *fmt, ...); +/* Simple helper for dev_err_probe() when ERR_PTR() is to be returned. */ +#define dev_err_ptr_probe(dev, ___err, fmt, ...) \ + ERR_PTR(dev_err_probe(dev, ___err, fmt, ##__VA_ARGS__)) + +/* Simple helper for dev_err_probe() when ERR_CAST() is to be returned. */ +#define dev_err_cast_probe(dev, ___err_ptr, fmt, ...) \ + ERR_PTR(dev_err_probe(dev, PTR_ERR(___err_ptr), fmt, ##__VA_ARGS__)) + #endif /* _DEVICE_PRINTK_H_ */ -- cgit v1.2.3 From f6549f538fe0b2c389e1a7037f4e21039e25137a Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Sat, 29 Jun 2024 14:42:12 +0200 Subject: ata,scsi: libata-core: Do not leak memory for ata_port struct members libsas is currently not freeing all the struct ata_port struct members, e.g. ncq_sense_buf for a driver supporting Command Duration Limits (CDL). Add a function, ata_port_free(), that is used to free a ata_port, including its struct members. It makes sense to keep the code related to freeing a ata_port in its own function, which will also free all the struct members of struct ata_port. Fixes: 18bd7718b5c4 ("scsi: ata: libata: Handle completion of CDL commands using policy 0xD") Reviewed-by: John Garry Link: https://lore.kernel.org/r/20240629124210.181537-8-cassel@kernel.org Signed-off-by: Niklas Cassel --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 13fb41d25da6..7d3bd7c9664a 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1249,6 +1249,7 @@ extern int ata_slave_link_init(struct ata_port *ap); extern struct ata_port *ata_sas_port_alloc(struct ata_host *, struct ata_port_info *, struct Scsi_Host *); extern void ata_port_probe(struct ata_port *ap); +extern void ata_port_free(struct ata_port *ap); extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap); extern void ata_sas_tport_delete(struct ata_port *ap); int ata_sas_device_configure(struct scsi_device *sdev, struct queue_limits *lim, -- cgit v1.2.3 From 61842868de13aa7fd7391c626e889f4d6f1450bf Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Fri, 10 May 2024 10:57:22 +0200 Subject: module: create weak dependecies It has been seen that for some network mac drivers (i.e. lan78xx) the related module for the phy is loaded dynamically depending on the current hardware. In this case, the associated phy is read using mdio bus and then the associated phy module is loaded during runtime (kernel function phy_request_driver_module). However, no software dependency is defined, so the user tools will no be able to get this dependency. For example, if dracut is used and the hardware is present, lan78xx will be included but no phy module will be added, and in the next restart the device will not work from boot because no related phy will be found during initramfs stage. In order to solve this, we could define a normal 'pre' software dependency in lan78xx module with all the possible phy modules (there may be some), but proceeding in that way, all the possible phy modules would be loaded while only one is necessary. The idea is to create a new type of dependency, that we are going to call 'weak' to be used only by the user tools that need to detect this situation. In that way, for example, dracut could check the 'weak' dependency of the modules involved in order to install these dependencies in initramfs too. That is, for the commented lan78xx module, defining the 'weak' dependency with the possible phy modules list, only the necessary phy would be loaded on demand keeping the same behavior, but all the possible phy modules would be available from initramfs. The 'weak' dependency support has been included in kmod: https://github.com/kmod-project/kmod/commit/05828b4a6e9327a63ef94df544a042b5e9ce4fe7 But, take into account that this can only be used if depmod is new enough. If it isn't, depmod will have the same behavior as always (keeping backward compatibility) and the information for the 'weak' dependency will not be provided. Signed-off-by: Jose Ignacio Tornos Martinez Reviewed-by: Lucas De Marchi Signed-off-by: Luis Chamberlain --- include/linux/module.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index ffa1c603163c..ae19bf7e3131 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -173,6 +173,12 @@ extern void cleanup_module(void); */ #define MODULE_SOFTDEP(_softdep) MODULE_INFO(softdep, _softdep) +/* + * Weak module dependencies. See man modprobe.d for details. + * Example: MODULE_WEAKDEP("module-foo") + */ +#define MODULE_WEAKDEP(_weakdep) MODULE_INFO(weakdep, _weakdep) + /* * MODULE_FILE is used for generating modules.builtin * So, make it no-op when this is being built as a module -- cgit v1.2.3 From 65528cfb21fdb68de8ae6dccae19af180d93e143 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:34 +0300 Subject: net/mlx5: mlx5_ifc update for multi-plane support Add new fields to support mlx5 multi-plane feature. Actual support will be added in following patches. Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/36a74a1b1d2b7b59c99cda4abad1794ddde30230.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 09d9d87d62c6..61738990e399 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -793,7 +793,7 @@ struct mlx5_ifc_ads_bits { u8 reserved_at_2[0xe]; u8 pkey_index[0x10]; - u8 reserved_at_20[0x8]; + u8 plane_index[0x8]; u8 grh[0x1]; u8 mlid[0x7]; u8 rlid[0x10]; @@ -1990,7 +1990,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_c0[0x8]; u8 migration_multi_load[0x1]; u8 migration_tracking_state[0x1]; - u8 reserved_at_ca[0x6]; + u8 multiplane_qp_ud[0x1]; + u8 reserved_at_cb[0x5]; u8 migration_in_chunks[0x1]; u8 reserved_at_d1[0xf]; @@ -4172,7 +4173,8 @@ struct mlx5_ifc_hca_vport_context_bits { u8 has_smi[0x1]; u8 has_raw[0x1]; u8 grh_required[0x1]; - u8 reserved_at_104[0xc]; + u8 reserved_at_104[0x4]; + u8 num_port_plane[0x8]; u8 port_physical_state[0x4]; u8 vport_state_policy[0x4]; u8 port_state[0x4]; @@ -7692,7 +7694,7 @@ struct mlx5_ifc_mad_ifc_in_bits { u8 op_mod[0x10]; u8 remote_lid[0x10]; - u8 reserved_at_50[0x8]; + u8 plane_index[0x8]; u8 port[0x8]; u8 reserved_at_60[0x20]; @@ -9621,7 +9623,9 @@ struct mlx5_ifc_ptys_reg_bits { u8 an_disable_cap[0x1]; u8 reserved_at_3[0x5]; u8 local_port[0x8]; - u8 reserved_at_10[0xd]; + u8 reserved_at_10[0x8]; + u8 plane_ind[0x4]; + u8 reserved_at_1c[0x1]; u8 proto_mask[0x3]; u8 an_status[0x4]; -- cgit v1.2.3 From 2a5db20fa532198639671713c6213f96ff285b85 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:35 +0300 Subject: RDMA/mlx5: Add support to multi-plane device and port When multi-plane is supported, a logical port, which is aggregation of multiple physical plane ports, is exposed for data transmission. Compared with a normal mlx5 IB port, this logical port supports all functionalities except Subnet Management. Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/7e37c06c9cb243be9ac79930cd17053903785b95.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 0d31f77396fc..a96438ded15f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -917,6 +917,7 @@ struct mlx5_hca_vport_context { u16 qkey_violation_counter; u16 pkey_violation_counter; bool grh_required; + u8 num_plane; }; #define STRUCT_FIELD(header, field) \ -- cgit v1.2.3 From 3b43399b297c52cfe4dfe0194b6ad89d52bf8c35 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:42 +0300 Subject: RDMA/mlx5: Add plane index support when querying PTYS registers Support the new "plane_ind" field when querying port PTYS registers. This is needed when querying the rate of a plane port. Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/1f703c36306aa46917fcd88eadbb23b3e380d526.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/port.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 26092c78a985..e68d42b8ce65 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -155,10 +155,11 @@ struct mlx5_port_eth_proto { int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps); int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, - int ptys_size, int proto_mask, u8 local_port); + int ptys_size, int proto_mask, + u8 local_port, u8 plane_index); int mlx5_query_ib_port_oper(struct mlx5_core_dev *dev, u16 *link_width_oper, - u16 *proto_oper, u8 local_port); + u16 *proto_oper, u8 local_port, u8 plane_index); void mlx5_toggle_port_link(struct mlx5_core_dev *dev); int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, enum mlx5_port_status status); -- cgit v1.2.3 From c6b6677d85d47f5562020aa082d9b5f90738fdcd Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:43 +0300 Subject: net/mlx5: mlx5_ifc update for accessing ppcnt register of plane ports This patch adds new fields to support multi-plane and the extend port counters group. Actual support will be added in the next patch. Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/70221cdd79aad0e21cbf385d9567e3ebffbc5137.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 47 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 61738990e399..5fea7b747607 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2651,6 +2651,46 @@ struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits { u8 port_xmit_wait[0x20]; }; +struct mlx5_ifc_ib_ext_port_cntrs_grp_data_layout_bits { + u8 reserved_at_0[0x300]; + + u8 port_xmit_data_high[0x20]; + + u8 port_xmit_data_low[0x20]; + + u8 port_rcv_data_high[0x20]; + + u8 port_rcv_data_low[0x20]; + + u8 port_xmit_pkts_high[0x20]; + + u8 port_xmit_pkts_low[0x20]; + + u8 port_rcv_pkts_high[0x20]; + + u8 port_rcv_pkts_low[0x20]; + + u8 reserved_at_400[0x80]; + + u8 port_unicast_xmit_pkts_high[0x20]; + + u8 port_unicast_xmit_pkts_low[0x20]; + + u8 port_multicast_xmit_pkts_high[0x20]; + + u8 port_multicast_xmit_pkts_low[0x20]; + + u8 port_unicast_rcv_pkts_high[0x20]; + + u8 port_unicast_rcv_pkts_low[0x20]; + + u8 port_multicast_rcv_pkts_high[0x20]; + + u8 port_multicast_rcv_pkts_low[0x20]; + + u8 reserved_at_580[0x240]; +}; + struct mlx5_ifc_eth_per_tc_prio_grp_data_layout_bits { u8 transmit_queue_high[0x20]; @@ -4543,6 +4583,7 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { struct mlx5_ifc_eth_per_tc_prio_grp_data_layout_bits eth_per_tc_prio_grp_data_layout; struct mlx5_ifc_eth_per_tc_congest_prio_grp_data_layout_bits eth_per_tc_congest_prio_grp_data_layout; struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout; + struct mlx5_ifc_ib_ext_port_cntrs_grp_data_layout_bits ib_ext_port_cntrs_grp_data_layout; struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; struct mlx5_ifc_phys_layer_statistical_cntrs_bits phys_layer_statistical_cntrs; u8 reserved_at_0[0x7c0]; @@ -9851,8 +9892,10 @@ struct mlx5_ifc_ppcnt_reg_bits { u8 grp[0x6]; u8 clr[0x1]; - u8 reserved_at_21[0x1c]; - u8 prio_tc[0x3]; + u8 reserved_at_21[0x13]; + u8 plane_ind[0x4]; + u8 reserved_at_38[0x3]; + u8 prio_tc[0x5]; union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set; }; -- cgit v1.2.3 From 7a2210a57d423cbdb9c5f2e8b12c65d7472ff147 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:44 +0300 Subject: RDMA/mlx5: Support per-plane port IB counters by querying PPCNT register Supports per-plane port counters by querying PPCNT register with the "extended port counters" group, as the query_vport_counter command doesn't support plane ports. Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/06ffb582d67159b7def4654c8272d3d6e8bd2f2f.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index d7bb31d9a446..68bd1b4737ea 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1466,6 +1466,7 @@ enum { MLX5_PER_TRAFFIC_CLASS_CONGESTION_GROUP = 0x13, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP = 0x16, MLX5_INFINIBAND_PORT_COUNTERS_GROUP = 0x20, + MLX5_INFINIBAND_EXTENDED_PORT_COUNTERS_GROUP = 0x21, }; enum { -- cgit v1.2.3 From 762ced1630a97a457ad2fd5f5a36849009808431 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 1 Jul 2024 14:39:50 +0200 Subject: HID: bpf: fix gcc warning and unify __u64 into u64 I've got multiple reports of: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast]. Let's use the same trick than kernel/bpf/helpers.c to shut up that warning. Even if we were on an architecture with addresses on more than 64 bits, this isn't much of an issue as the address is not used as a pointer, but as an hash and the caller is not supposed to go back to the kernel address ever. And while we change those, make sure we use u64 instead of __u64 for consistency Reported-by: Stephen Rothwell Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202406280633.OPB5uIFj-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202406282304.UydSVncq-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202406282242.Fk738zzy-lkp@intel.com/ Reported-by: Mirsad Todorovac Fixes: 67eccf151d76 ("HID: add source argument to HID low level functions") Link: https://patch.msgid.link/20240701-fix-cki-v2-2-20564e2e1393@kernel.org Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index 93546ee7677a..3f6584014311 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -68,9 +68,9 @@ struct hid_ops { unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype, - __u64 source, bool from_bpf); + u64 source, bool from_bpf); int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len, - __u64 source, bool from_bpf); + u64 source, bool from_bpf); int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, bool lock_already_taken); @@ -115,7 +115,7 @@ struct hid_bpf_ops { * Context: Interrupt context. */ int (*hid_device_event)(struct hid_bpf_ctx *ctx, enum hid_report_type report_type, - __u64 source); + u64 source); /** * @hid_rdesc_fixup: called when the probe function parses the report descriptor -- cgit v1.2.3 From 260ffc9676b635c2ededc39285bfa41f83536ee1 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 1 Jul 2024 14:39:51 +0200 Subject: HID: bpf: doc fixes for hid_hw_request() hooks We had the following errors while doing make htmldocs: Documentation/hid/hid-bpf:185: include/linux/hid_bpf.h:144: ERROR: Unexpected indentation. Documentation/hid/hid-bpf:185: include/linux/hid_bpf.h:145: WARNING: Block quote ends without a blank line; unexpected unindent. Documentation/hid/hid-bpf:185: include/linux/hid_bpf.h:147: ERROR: Unexpected indentation. Reported-by: Stephen Rothwell Fixes: 8bd0488b5ea5 ("HID: bpf: add HID-BPF hooks for hid_hw_raw_requests") Link: https://patch.msgid.link/20240701-fix-cki-v2-3-20564e2e1393@kernel.org Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index 3f6584014311..c30c31b79419 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -139,12 +139,15 @@ struct hid_bpf_ops { * * ``ctx``: The HID-BPF context as &struct hid_bpf_ctx * ``reportnum``: the report number, as in hid_hw_raw_request() + * * ``rtype``: the report type (``HID_INPUT_REPORT``, ``HID_FEATURE_REPORT``, - * ``HID_OUTPUT_REPORT``) + * ``HID_OUTPUT_REPORT``) + * * ``reqtype``: the request + * * ``source``: a u64 referring to a uniq but identifiable source. If %0, the - * kernel itself emitted that call. For hidraw, ``source`` is set - * to the associated ``struct file *``. + * kernel itself emitted that call. For hidraw, ``source`` is set + * to the associated ``struct file *``. * * Return: %0 to keep processing the request by hid-core; any other value * stops hid-core from processing that event. A positive value should be @@ -153,7 +156,7 @@ struct hid_bpf_ops { */ int (*hid_hw_request)(struct hid_bpf_ctx *ctx, unsigned char reportnum, enum hid_report_type rtype, enum hid_class_request reqtype, - __u64 source); + u64 source); /** * @hid_hw_output_report: called whenever a hid_hw_output_report() call is emitted -- cgit v1.2.3 From c79de517a226b86419a5baa867e65e3f8118829f Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 1 Jul 2024 14:39:52 +0200 Subject: HID: bpf: doc fixes for hid_hw_request() hooks We had the following errors while doing make htmldocs: Documentation/hid/hid-bpf:185: include/linux/hid_bpf.h:167: ERROR: Unexpected indentation. Also ensure consistency with the rest of the __u64 vs u64. Reported-by: Stephen Rothwell Fixes: 9286675a2aed ("HID: bpf: add HID-BPF hooks for hid_hw_output_report") Link: https://patch.msgid.link/20240701-fix-cki-v2-4-20564e2e1393@kernel.org Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index c30c31b79419..9ca96fc90449 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -138,6 +138,7 @@ struct hid_bpf_ops { * It has the following arguments: * * ``ctx``: The HID-BPF context as &struct hid_bpf_ctx + * * ``reportnum``: the report number, as in hid_hw_raw_request() * * ``rtype``: the report type (``HID_INPUT_REPORT``, ``HID_FEATURE_REPORT``, @@ -165,16 +166,17 @@ struct hid_bpf_ops { * It has the following arguments: * * ``ctx``: The HID-BPF context as &struct hid_bpf_ctx + * * ``source``: a u64 referring to a uniq but identifiable source. If %0, the - * kernel itself emitted that call. For hidraw, ``source`` is set - * to the associated ``struct file *``. + * kernel itself emitted that call. For hidraw, ``source`` is set + * to the associated ``struct file *``. * * Return: %0 to keep processing the request by hid-core; any other value * stops hid-core from processing that event. A positive value should be * returned with the number of bytes written to the device; a negative error * code interrupts the processing of this call. */ - int (*hid_hw_output_report)(struct hid_bpf_ctx *ctx, __u64 source); + int (*hid_hw_output_report)(struct hid_bpf_ctx *ctx, u64 source); /* private: do not show up in the docs */ @@ -203,9 +205,9 @@ int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, u32 size, enum hid_report_type rtype, enum hid_class_request reqtype, - __u64 source, bool from_bpf); + u64 source, bool from_bpf); int dispatch_hid_bpf_output_report(struct hid_device *hdev, __u8 *buf, u32 size, - __u64 source, bool from_bpf); + u64 source, bool from_bpf); int hid_bpf_connect_device(struct hid_device *hdev); void hid_bpf_disconnect_device(struct hid_device *hdev); void hid_bpf_destroy_device(struct hid_device *hid); @@ -221,7 +223,7 @@ static inline int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, enum hid_class_request reqtype, u64 source, bool from_bpf) { return 0; } static inline int dispatch_hid_bpf_output_report(struct hid_device *hdev, __u8 *buf, u32 size, - __u64 source, bool from_bpf) { return 0; } + u64 source, bool from_bpf) { return 0; } static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; } static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {} static inline void hid_bpf_destroy_device(struct hid_device *hid) {} -- cgit v1.2.3 From 63e2f40c9e3187641afacde4153f54b3ee4dbc8c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 29 Jun 2024 21:48:41 +0200 Subject: syscalls: fix sys_fanotify_mark prototype My earlier fix missed an incorrect function prototype that shows up on native 32-bit builds: In file included from fs/notify/fanotify/fanotify_user.c:14: include/linux/syscalls.h:248:25: error: conflicting types for 'sys_fanotify_mark'; have 'long int(int, unsigned int, u32, u32, int, const char *)' {aka 'long int(int, unsigned int, unsigned int, unsigned int, int, const char *)'} 1924 | SYSCALL32_DEFINE6(fanotify_mark, | ^~~~~~~~~~~~~~~~~ include/linux/syscalls.h:862:17: note: previous declaration of 'sys_fanotify_mark' with type 'long int(int, unsigned int, u64, int, const char *)' {aka 'long int(int, unsigned int, long long unsigned int, int, const char *)'} On x86 and powerpc, the prototype is also wrong but hidden in an #ifdef, so it never caused problems. Add another alternative declaration that matches the conditional function definition. Fixes: 403f17a33073 ("parisc: use generic sys_fanotify_mark implementation") Cc: stable@vger.kernel.org Reported-by: Guenter Roeck Reported-by: Geert Uytterhoeven Reported-by: kernel test robot Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 63424af87bba..fff820c3e93e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -859,9 +859,15 @@ asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource, const struct rlimit64 __user *new_rlim, struct rlimit64 __user *old_rlim); asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags); +#if defined(CONFIG_ARCH_SPLIT_ARG64) +asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags, + unsigned int mask_1, unsigned int mask_2, + int dfd, const char __user * pathname); +#else asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags, u64 mask, int fd, const char __user *pathname); +#endif asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name, struct file_handle __user *handle, int __user *mnt_id, int flag); -- cgit v1.2.3 From c05cb5bdf413a2a5051701a397082380758e37ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 30 Jun 2024 22:54:09 +0200 Subject: platform/chrome: Update binary interface for EC-based charge control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The charge-control command v2/v3 is more featureful than v1, it additionally supports charge thresholds. The definitions were imported from ChromeOS EC commit 32870d602317 ("squirtle: modify motionsense rotation matrix") Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20240630-cros_ec-charge-control-v5-2-8f649d018c52@weissschuh.net Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_commands.h | 49 ++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index ec598057d1da..e574b790be6f 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -3843,16 +3843,61 @@ struct ec_params_i2c_write { * discharge the battery. */ #define EC_CMD_CHARGE_CONTROL 0x0096 -#define EC_VER_CHARGE_CONTROL 1 +#define EC_VER_CHARGE_CONTROL 3 enum ec_charge_control_mode { CHARGE_CONTROL_NORMAL = 0, CHARGE_CONTROL_IDLE, CHARGE_CONTROL_DISCHARGE, + /* Add no more entry below. */ + CHARGE_CONTROL_COUNT, +}; + +#define EC_CHARGE_MODE_TEXT \ + { \ + [CHARGE_CONTROL_NORMAL] = "NORMAL", \ + [CHARGE_CONTROL_IDLE] = "IDLE", \ + [CHARGE_CONTROL_DISCHARGE] = "DISCHARGE", \ + } + +enum ec_charge_control_cmd { + EC_CHARGE_CONTROL_CMD_SET = 0, + EC_CHARGE_CONTROL_CMD_GET, +}; + +enum ec_charge_control_flag { + EC_CHARGE_CONTROL_FLAG_NO_IDLE = BIT(0), }; struct ec_params_charge_control { - uint32_t mode; /* enum charge_control_mode */ + uint32_t mode; /* enum charge_control_mode */ + + /* Below are the fields added in V2. */ + uint8_t cmd; /* enum ec_charge_control_cmd. */ + uint8_t flags; /* enum ec_charge_control_flag (v3+) */ + /* + * Lower and upper thresholds for battery sustainer. This struct isn't + * named to avoid tainting foreign projects' name spaces. + * + * If charge mode is explicitly set (e.g. DISCHARGE), battery sustainer + * will be disabled. To disable battery sustainer, set mode=NORMAL, + * lower=-1, upper=-1. + */ + struct { + int8_t lower; /* Display SoC in percentage. */ + int8_t upper; /* Display SoC in percentage. */ + } sustain_soc; +} __ec_align4; + +/* Added in v2 */ +struct ec_response_charge_control { + uint32_t mode; /* enum charge_control_mode */ + struct { /* Battery sustainer thresholds */ + int8_t lower; + int8_t upper; + } sustain_soc; + uint8_t flags; /* enum ec_charge_control_flag (v3+) */ + uint8_t reserved; } __ec_align4; /*****************************************************************************/ -- cgit v1.2.3 From 69a13742b7c64ed89894caf7091539e164b3e97c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 30 Jun 2024 22:54:10 +0200 Subject: platform/chrome: cros_ec_proto: Introduce cros_ec_get_cmd_versions() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Retrieving the supported versions of a command is a fairly common operation. Provide a helper for it. If the command is not supported at all the EC returns -EINVAL/EC_RES_INVALID_PARAMS. This error is translated into an empty version mask as that is easier to handle for callers and they don't need to know about the error details. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20240630-cros_ec-charge-control-v5-3-8f649d018c52@weissschuh.net Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_proto.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 6e9225bdf903..b34ed0cc1f8d 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -263,6 +263,8 @@ int cros_ec_cmd(struct cros_ec_device *ec_dev, unsigned int version, int command int cros_ec_cmd_readmem(struct cros_ec_device *ec_dev, u8 offset, u8 size, void *dest); +int cros_ec_get_cmd_versions(struct cros_ec_device *ec_dev, u16 cmd); + /** * cros_ec_get_time_ns() - Return time in ns. * -- cgit v1.2.3 From 992f1a3d4e88498de04b0b13b94705d8540f3d81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 1 Jul 2024 13:30:04 +0200 Subject: platform: cznic: Add preliminary support for Turris Omnia MCU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the basic skeleton for a new platform driver for the microcontroller found on the Turris Omnia board. Signed-off-by: Marek Behún Reviewed-by: Andy Shevchenko Acked-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20240701113010.16447-3-kabel@kernel.org Signed-off-by: Arnd Bergmann --- include/linux/turris-omnia-mcu-interface.h | 249 +++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 include/linux/turris-omnia-mcu-interface.h (limited to 'include/linux') diff --git a/include/linux/turris-omnia-mcu-interface.h b/include/linux/turris-omnia-mcu-interface.h new file mode 100644 index 000000000000..2da8cbeb158a --- /dev/null +++ b/include/linux/turris-omnia-mcu-interface.h @@ -0,0 +1,249 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * CZ.NIC's Turris Omnia MCU I2C interface commands definitions + * + * 2024 by Marek Behún + */ + +#ifndef __TURRIS_OMNIA_MCU_INTERFACE_H +#define __TURRIS_OMNIA_MCU_INTERFACE_H + +#include +#include + +enum omnia_commands_e { + OMNIA_CMD_GET_STATUS_WORD = 0x01, /* slave sends status word back */ + OMNIA_CMD_GENERAL_CONTROL = 0x02, + OMNIA_CMD_LED_MODE = 0x03, /* default/user */ + OMNIA_CMD_LED_STATE = 0x04, /* LED on/off */ + OMNIA_CMD_LED_COLOR = 0x05, /* LED number + RED + GREEN + BLUE */ + OMNIA_CMD_USER_VOLTAGE = 0x06, + OMNIA_CMD_SET_BRIGHTNESS = 0x07, + OMNIA_CMD_GET_BRIGHTNESS = 0x08, + OMNIA_CMD_GET_RESET = 0x09, + OMNIA_CMD_GET_FW_VERSION_APP = 0x0A, /* 20B git hash number */ + OMNIA_CMD_SET_WATCHDOG_STATE = 0x0B, /* 0 - disable + * 1 - enable / ping + * after boot watchdog is started + * with 2 minutes timeout + */ + + /* OMNIA_CMD_WATCHDOG_STATUS = 0x0C, not implemented anymore */ + + OMNIA_CMD_GET_WATCHDOG_STATE = 0x0D, + OMNIA_CMD_GET_FW_VERSION_BOOT = 0x0E, /* 20B Git hash number */ + OMNIA_CMD_GET_FW_CHECKSUM = 0x0F, /* 4B length, 4B checksum */ + + /* available if FEATURES_SUPPORTED bit set in status word */ + OMNIA_CMD_GET_FEATURES = 0x10, + + /* available if EXT_CMD bit set in features */ + OMNIA_CMD_GET_EXT_STATUS_DWORD = 0x11, + OMNIA_CMD_EXT_CONTROL = 0x12, + OMNIA_CMD_GET_EXT_CONTROL_STATUS = 0x13, + + /* available if NEW_INT_API bit set in features */ + OMNIA_CMD_GET_INT_AND_CLEAR = 0x14, + OMNIA_CMD_GET_INT_MASK = 0x15, + OMNIA_CMD_SET_INT_MASK = 0x16, + + /* available if FLASHING bit set in features */ + OMNIA_CMD_FLASH = 0x19, + + /* available if WDT_PING bit set in features */ + OMNIA_CMD_SET_WDT_TIMEOUT = 0x20, + OMNIA_CMD_GET_WDT_TIMELEFT = 0x21, + + /* available if POWEROFF_WAKEUP bit set in features */ + OMNIA_CMD_SET_WAKEUP = 0x22, + OMNIA_CMD_GET_UPTIME_AND_WAKEUP = 0x23, + OMNIA_CMD_POWER_OFF = 0x24, + + /* available if USB_OVC_PROT_SETTING bit set in features */ + OMNIA_CMD_SET_USB_OVC_PROT = 0x25, + OMNIA_CMD_GET_USB_OVC_PROT = 0x26, + + /* available if TRNG bit set in features */ + OMNIA_CMD_TRNG_COLLECT_ENTROPY = 0x28, + + /* available if CRYPTO bit set in features */ + OMNIA_CMD_CRYPTO_GET_PUBLIC_KEY = 0x29, + OMNIA_CMD_CRYPTO_SIGN_MESSAGE = 0x2A, + OMNIA_CMD_CRYPTO_COLLECT_SIGNATURE = 0x2B, + + /* available if BOARD_INFO it set in features */ + OMNIA_CMD_BOARD_INFO_GET = 0x2C, + OMNIA_CMD_BOARD_INFO_BURN = 0x2D, + + /* available only at address 0x2b (LED-controller) */ + /* available only if LED_GAMMA_CORRECTION bit set in features */ + OMNIA_CMD_SET_GAMMA_CORRECTION = 0x30, + OMNIA_CMD_GET_GAMMA_CORRECTION = 0x31, + + /* available only at address 0x2b (LED-controller) */ + /* available only if PER_LED_CORRECTION bit set in features */ + /* available only if FROM_BIT_16_INVALID bit NOT set in features */ + OMNIA_CMD_SET_LED_CORRECTIONS = 0x32, + OMNIA_CMD_GET_LED_CORRECTIONS = 0x33, +}; + +enum omnia_flashing_commands_e { + OMNIA_FLASH_CMD_UNLOCK = 0x01, + OMNIA_FLASH_CMD_SIZE_AND_CSUM = 0x02, + OMNIA_FLASH_CMD_PROGRAM = 0x03, + OMNIA_FLASH_CMD_RESET = 0x04, +}; + +enum omnia_sts_word_e { + OMNIA_STS_MCU_TYPE_MASK = GENMASK(1, 0), + OMNIA_STS_MCU_TYPE_STM32 = FIELD_PREP_CONST(OMNIA_STS_MCU_TYPE_MASK, 0), + OMNIA_STS_MCU_TYPE_GD32 = FIELD_PREP_CONST(OMNIA_STS_MCU_TYPE_MASK, 1), + OMNIA_STS_MCU_TYPE_MKL = FIELD_PREP_CONST(OMNIA_STS_MCU_TYPE_MASK, 2), + OMNIA_STS_FEATURES_SUPPORTED = BIT(2), + OMNIA_STS_USER_REGULATOR_NOT_SUPPORTED = BIT(3), + OMNIA_STS_CARD_DET = BIT(4), + OMNIA_STS_MSATA_IND = BIT(5), + OMNIA_STS_USB30_OVC = BIT(6), + OMNIA_STS_USB31_OVC = BIT(7), + OMNIA_STS_USB30_PWRON = BIT(8), + OMNIA_STS_USB31_PWRON = BIT(9), + OMNIA_STS_ENABLE_4V5 = BIT(10), + OMNIA_STS_BUTTON_MODE = BIT(11), + OMNIA_STS_BUTTON_PRESSED = BIT(12), + OMNIA_STS_BUTTON_COUNTER_MASK = GENMASK(15, 13), +}; + +enum omnia_ctl_byte_e { + OMNIA_CTL_LIGHT_RST = BIT(0), + OMNIA_CTL_HARD_RST = BIT(1), + /* BIT(2) is currently reserved */ + OMNIA_CTL_USB30_PWRON = BIT(3), + OMNIA_CTL_USB31_PWRON = BIT(4), + OMNIA_CTL_ENABLE_4V5 = BIT(5), + OMNIA_CTL_BUTTON_MODE = BIT(6), + OMNIA_CTL_BOOTLOADER = BIT(7), +}; + +enum omnia_features_e { + OMNIA_FEAT_PERIPH_MCU = BIT(0), + OMNIA_FEAT_EXT_CMDS = BIT(1), + OMNIA_FEAT_WDT_PING = BIT(2), + OMNIA_FEAT_LED_STATE_EXT_MASK = GENMASK(4, 3), + OMNIA_FEAT_LED_STATE_EXT = FIELD_PREP_CONST(OMNIA_FEAT_LED_STATE_EXT_MASK, 1), + OMNIA_FEAT_LED_STATE_EXT_V32 = FIELD_PREP_CONST(OMNIA_FEAT_LED_STATE_EXT_MASK, 2), + OMNIA_FEAT_LED_GAMMA_CORRECTION = BIT(5), + OMNIA_FEAT_NEW_INT_API = BIT(6), + OMNIA_FEAT_BOOTLOADER = BIT(7), + OMNIA_FEAT_FLASHING = BIT(8), + OMNIA_FEAT_NEW_MESSAGE_API = BIT(9), + OMNIA_FEAT_BRIGHTNESS_INT = BIT(10), + OMNIA_FEAT_POWEROFF_WAKEUP = BIT(11), + OMNIA_FEAT_CAN_OLD_MESSAGE_API = BIT(12), + OMNIA_FEAT_TRNG = BIT(13), + OMNIA_FEAT_CRYPTO = BIT(14), + OMNIA_FEAT_BOARD_INFO = BIT(15), + + /* + * Orginally the features command replied only 16 bits. If more were + * read, either the I2C transaction failed or 0xff bytes were sent. + * Therefore to consider bits 16 - 31 valid, one bit (20) was reserved + * to be zero. + */ + + /* Bits 16 - 19 correspond to bits 0 - 3 of status word */ + OMNIA_FEAT_MCU_TYPE_MASK = GENMASK(17, 16), + OMNIA_FEAT_MCU_TYPE_STM32 = FIELD_PREP_CONST(OMNIA_FEAT_MCU_TYPE_MASK, 0), + OMNIA_FEAT_MCU_TYPE_GD32 = FIELD_PREP_CONST(OMNIA_FEAT_MCU_TYPE_MASK, 1), + OMNIA_FEAT_MCU_TYPE_MKL = FIELD_PREP_CONST(OMNIA_FEAT_MCU_TYPE_MASK, 2), + OMNIA_FEAT_FEATURES_SUPPORTED = BIT(18), + OMNIA_FEAT_USER_REGULATOR_NOT_SUPPORTED = BIT(19), + + /* must not be set */ + OMNIA_FEAT_FROM_BIT_16_INVALID = BIT(20), + + OMNIA_FEAT_PER_LED_CORRECTION = BIT(21), + OMNIA_FEAT_USB_OVC_PROT_SETTING = BIT(22), +}; + +enum omnia_ext_sts_dword_e { + OMNIA_EXT_STS_SFP_nDET = BIT(0), + OMNIA_EXT_STS_LED_STATES_MASK = GENMASK(31, 12), + OMNIA_EXT_STS_WLAN0_MSATA_LED = BIT(12), + OMNIA_EXT_STS_WLAN1_LED = BIT(13), + OMNIA_EXT_STS_WLAN2_LED = BIT(14), + OMNIA_EXT_STS_WPAN0_LED = BIT(15), + OMNIA_EXT_STS_WPAN1_LED = BIT(16), + OMNIA_EXT_STS_WPAN2_LED = BIT(17), + OMNIA_EXT_STS_WAN_LED0 = BIT(18), + OMNIA_EXT_STS_WAN_LED1 = BIT(19), + OMNIA_EXT_STS_LAN0_LED0 = BIT(20), + OMNIA_EXT_STS_LAN0_LED1 = BIT(21), + OMNIA_EXT_STS_LAN1_LED0 = BIT(22), + OMNIA_EXT_STS_LAN1_LED1 = BIT(23), + OMNIA_EXT_STS_LAN2_LED0 = BIT(24), + OMNIA_EXT_STS_LAN2_LED1 = BIT(25), + OMNIA_EXT_STS_LAN3_LED0 = BIT(26), + OMNIA_EXT_STS_LAN3_LED1 = BIT(27), + OMNIA_EXT_STS_LAN4_LED0 = BIT(28), + OMNIA_EXT_STS_LAN4_LED1 = BIT(29), + OMNIA_EXT_STS_LAN5_LED0 = BIT(30), + OMNIA_EXT_STS_LAN5_LED1 = BIT(31), +}; + +enum omnia_ext_ctl_e { + OMNIA_EXT_CTL_nRES_MMC = BIT(0), + OMNIA_EXT_CTL_nRES_LAN = BIT(1), + OMNIA_EXT_CTL_nRES_PHY = BIT(2), + OMNIA_EXT_CTL_nPERST0 = BIT(3), + OMNIA_EXT_CTL_nPERST1 = BIT(4), + OMNIA_EXT_CTL_nPERST2 = BIT(5), + OMNIA_EXT_CTL_PHY_SFP = BIT(6), + OMNIA_EXT_CTL_PHY_SFP_AUTO = BIT(7), + OMNIA_EXT_CTL_nVHV_CTRL = BIT(8), +}; + +enum omnia_int_e { + OMNIA_INT_CARD_DET = BIT(0), + OMNIA_INT_MSATA_IND = BIT(1), + OMNIA_INT_USB30_OVC = BIT(2), + OMNIA_INT_USB31_OVC = BIT(3), + OMNIA_INT_BUTTON_PRESSED = BIT(4), + OMNIA_INT_SFP_nDET = BIT(5), + OMNIA_INT_BRIGHTNESS_CHANGED = BIT(6), + OMNIA_INT_TRNG = BIT(7), + OMNIA_INT_MESSAGE_SIGNED = BIT(8), + + OMNIA_INT_LED_STATES_MASK = GENMASK(31, 12), + OMNIA_INT_WLAN0_MSATA_LED = BIT(12), + OMNIA_INT_WLAN1_LED = BIT(13), + OMNIA_INT_WLAN2_LED = BIT(14), + OMNIA_INT_WPAN0_LED = BIT(15), + OMNIA_INT_WPAN1_LED = BIT(16), + OMNIA_INT_WPAN2_LED = BIT(17), + OMNIA_INT_WAN_LED0 = BIT(18), + OMNIA_INT_WAN_LED1 = BIT(19), + OMNIA_INT_LAN0_LED0 = BIT(20), + OMNIA_INT_LAN0_LED1 = BIT(21), + OMNIA_INT_LAN1_LED0 = BIT(22), + OMNIA_INT_LAN1_LED1 = BIT(23), + OMNIA_INT_LAN2_LED0 = BIT(24), + OMNIA_INT_LAN2_LED1 = BIT(25), + OMNIA_INT_LAN3_LED0 = BIT(26), + OMNIA_INT_LAN3_LED1 = BIT(27), + OMNIA_INT_LAN4_LED0 = BIT(28), + OMNIA_INT_LAN4_LED1 = BIT(29), + OMNIA_INT_LAN5_LED0 = BIT(30), + OMNIA_INT_LAN5_LED1 = BIT(31), +}; + +enum omnia_cmd_poweroff_e { + OMNIA_CMD_POWER_OFF_POWERON_BUTTON = BIT(0), + OMNIA_CMD_POWER_OFF_MAGIC = 0xdead, +}; + +enum omnia_cmd_usb_ovc_prot_e { + OMNIA_CMD_xET_USB_OVC_PROT_PORT_MASK = GENMASK(3, 0), + OMNIA_CMD_xET_USB_OVC_PROT_ENABLE = BIT(4), +}; + +#endif /* __TURRIS_OMNIA_MCU_INTERFACE_H */ -- cgit v1.2.3 From face1c543e89ebc657f7948b0541a76954cf9382 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Thu, 20 Jun 2024 21:14:10 +0200 Subject: ACPI: bus: Indicate support for battery charge limiting thru _OSC The ACPI battery driver can handle the "charge limiting" state of the battery, so the platform can advertise this state. Indicate this by setting bit 19 ("Battery Charge Limiting Support") when evaluating _OSC. Tested on a Lenovo Ideapad S145-14IWL. Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20240620191410.3646-2-W_Armin@gmx.de Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 28c3fb2bef0d..b87ef8f3caa0 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -576,6 +576,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); #define OSC_SB_CPC_FLEXIBLE_ADR_SPACE 0x00004000 #define OSC_SB_GENERIC_INITIATOR_SUPPORT 0x00020000 #define OSC_SB_NATIVE_USB4_SUPPORT 0x00040000 +#define OSC_SB_BATTERY_CHARGE_LIMITING_SUPPORT 0x00080000 #define OSC_SB_PRM_SUPPORT 0x00200000 #define OSC_SB_FFH_OPR_SUPPORT 0x00400000 -- cgit v1.2.3 From e38a82df2c9924577d39ce5ccee9e9edcadc3b5d Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 4 Jun 2024 14:30:06 +0200 Subject: math.h: Add unsigned 8 bits fractional numbers type Some users may be requiring only rather small numbers as both numerator and denominator: add signed and unsigned 8 bits structs {s8,u8}_fract. Signed-off-by: AngeloGioacchino Del Regno Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20240604123008.327424-4-angelogioacchino.delregno@collabora.com Signed-off-by: Jonathan Cameron --- include/linux/math.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/math.h b/include/linux/math.h index dd4152711de7..f5f18dc3616b 100644 --- a/include/linux/math.h +++ b/include/linux/math.h @@ -112,6 +112,8 @@ struct type##_fract { \ __##type numerator; \ __##type denominator; \ }; +__STRUCT_FRACT(s8) +__STRUCT_FRACT(u8) __STRUCT_FRACT(s16) __STRUCT_FRACT(u16) __STRUCT_FRACT(s32) -- cgit v1.2.3 From 9f111059e725f7ca79a136bfc734da3c8c1838b4 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 27 Jun 2024 19:26:24 -0500 Subject: fs_parse: add uid & gid option option parsing helpers Multiple filesystems take uid and gid as options, and the code to create the ID from an integer and validate it is standard boilerplate that can be moved into common helper functions, so do that for consistency and less cut&paste. This also helps avoid the buggy pattern noted by Seth Jenkins at https://lore.kernel.org/lkml/CALxfFW4BXhEwxR0Q5LSkg-8Vb4r2MONKCcUCVioehXQKr35eHg@mail.gmail.com/ because uid/gid parsing will fail before any assignment in most filesystems. Signed-off-by: Eric Sandeen Link: https://lore.kernel.org/r/de859d0a-feb9-473d-a5e2-c195a3d47abb@redhat.com Signed-off-by: Christian Brauner --- include/linux/fs_parser.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index d3350979115f..6cf713a7e6c6 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -28,7 +28,7 @@ typedef int fs_param_type(struct p_log *, */ fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64, fs_param_is_enum, fs_param_is_string, fs_param_is_blob, fs_param_is_blockdev, - fs_param_is_path, fs_param_is_fd; + fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid; /* * Specification of the type of value a parameter wants. @@ -57,6 +57,8 @@ struct fs_parse_result { int int_32; /* For spec_s32/spec_enum */ unsigned int uint_32; /* For spec_u32{,_octal,_hex}/spec_enum */ u64 uint_64; /* For spec_u64 */ + kuid_t uid; + kgid_t gid; }; }; @@ -131,6 +133,8 @@ static inline bool fs_validate_description(const char *name, #define fsparam_bdev(NAME, OPT) __fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL) #define fsparam_path(NAME, OPT) __fsparam(fs_param_is_path, NAME, OPT, 0, NULL) #define fsparam_fd(NAME, OPT) __fsparam(fs_param_is_fd, NAME, OPT, 0, NULL) +#define fsparam_uid(NAME, OPT) __fsparam(fs_param_is_uid, NAME, OPT, 0, NULL) +#define fsparam_gid(NAME, OPT) __fsparam(fs_param_is_gid, NAME, OPT, 0, NULL) /* String parameter that allows empty argument */ #define fsparam_string_empty(NAME, OPT) \ -- cgit v1.2.3 From 6f2a875024993449f1b19a144d3e4391411a1b51 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 25 Jun 2024 09:38:15 +0200 Subject: gpiolib: unexport gpiochip_get_desc() This function has been deprecated for some time and is now only used within the GPIOLIB core. Remove it from the public header and unexport it as all current users are linked against the compilation unit where it is defined. Link: https://lore.kernel.org/r/20240625073815.12376-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 6d31388dde0a..2dd7cb9cc270 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -787,7 +787,6 @@ struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *gc, enum gpiod_flags dflags); void gpiochip_free_own_desc(struct gpio_desc *desc); -struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum); struct gpio_desc * gpio_device_get_desc(struct gpio_device *gdev, unsigned int hwnum); -- cgit v1.2.3 From 060f4ba6e40338a70932603a3564903acf5f5734 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jun 2024 13:59:44 +0100 Subject: io_uring/net: move charging socket out of zc io_uring Currently, io_uring's io_sg_from_iter() duplicates the part of __zerocopy_sg_from_iter() charging pages to the socket. It'd be too easy to miss while changing it in net/, the chunk is not the most straightforward for outside users and full of internal implementation details. io_uring is not a good place to keep it, deduplicate it by moving out of the callback into __zerocopy_sg_from_iter(). Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Reviewed-by: Jens Axboe Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 3 +++ include/linux/socket.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f4cda3fbdb75..9c29bdd5596d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1703,6 +1703,9 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); +int zerocopy_fill_skb_from_iter(struct sk_buff *skb, + struct iov_iter *from, size_t length); + static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { diff --git a/include/linux/socket.h b/include/linux/socket.h index 89d16b90370b..2a1ff91d1914 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -76,7 +76,7 @@ struct msghdr { __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ struct ubuf_info *msg_ubuf; - int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb, + int (*sg_from_iter)(struct sk_buff *skb, struct iov_iter *from, size_t length); }; -- cgit v1.2.3 From 32267c29bc7d5c9654b71e4f354064217a5fb053 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Mon, 17 Jun 2024 17:44:47 +0100 Subject: phy: exynos5-usbdrd: support Exynos USBDRD 3.1 combo phy (HS & SS) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the Exynos USB 3.1 DRD combo phy, as found in Exynos 9 SoCs like Google GS101. It supports USB SS, HS and DisplayPort. In terms of UTMI+, this is very similar to the existing Exynos850 support in this driver. The difference is that this combo phy supports both UTMI+ (HS) and PIPE3 (SS). It also supports DP alt mode. The number of ports for UTMI+ and PIPE3 can be determined using the LINKPORT register (which also exists on Exynos E850). For SuperSpeed (SS) a new SS phy is in use and its PIPE3 interface is new compared to Exynos E850, and also very different from the existing support for older Exynos SoCs in this driver. The SS phy needs a bit more configuration work and register tuning for signal quality to work reliably, presumably due to the higher frequency, e.g. to account for different board layouts. Additionally, power needs to be enabled before writing to the SS phy registers. This commit adds the necessary changes for USB HS and SS to work. DisplayPort is out of scope in this commit. Notes: * For the register tuning, exynos5_usbdrd_apply_phy_tunes() has been added with the appropriate data structures to support tuning at various stages during initialisation. Since these are hardware specific, the platform data is supposed to be populated accordingly. The implementation is loosely modelled after the Samsung UFS PHY driver. There is one tuning state for UTMI+, PTS_UTMI_POSTINIT, to execute after init and generally intended for HS signal tuning, as done in this commit. PTS_PIPE3_PREINIT PTS_PIPE3_INIT PTS_PIPE3_POSTINIT PTS_PIPE3_POSTLOCK are tuning states for PIPE3. In the downstream driver, preinit differs by Exynos SoC, and postinit and postlock are different per board. The latter haven't been implemented for gs101 here, because downstream doesn't use them on gs101 either. * Signal lock acquisition for SS depends on the orientation of the USB-C plug. Since there currently is no infrastructure to chain connector events to both the USB DWC3 driver and this phy driver, a work-around has been added in exynos5_usbdrd_usbdp_g2_v4_pma_check_cdr_lock() to check both registers if it failed in one of the orientations. * Equally, we can only establish SS speed in one of the connector orientations due to programming differences when selecting the lane mux in exynos5_usbdrd_usbdp_g2_v4_pma_lane_mux_sel(), which really needs to be dynamic, based on the orientation of the connector. * As is, we can establish a HS link using any cable, and an SS link in one orientation of the plug, falling back to HS if the orientation is reversed to the expectation. Signed-off-by: André Draszik Reviewed-by: Peter Griffin Tested-by: Peter Griffin Tested-by: Will McVicker Link: https://lore.kernel.org/r/20240617-usb-phy-gs101-v3-6-b66de9ae7424@linaro.org Signed-off-by: Vinod Koul --- include/linux/soc/samsung/exynos-regs-pmu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index aa840ed043e1..6765160eaab2 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -657,4 +657,8 @@ #define EXYNOS5433_PAD_RETENTION_UFS_OPTION (0x3268) #define EXYNOS5433_PAD_RETENTION_FSYSGENIO_OPTION (0x32A8) +/* For GS101 */ +#define GS101_PHY_CTRL_USB20 0x3eb0 +#define GS101_PHY_CTRL_USBDP 0x3eb4 + #endif /* __LINUX_SOC_EXYNOS_REGS_PMU_H */ -- cgit v1.2.3 From d839a73179ae91c07f5f2f97ccb9c69b2b7c3306 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 28 Jun 2024 12:18:55 +0200 Subject: net: Optimize xdp_do_flush() with bpf_net_context infos. Every NIC driver utilizing XDP should invoke xdp_do_flush() after processing all packages. With the introduction of the bpf_net_context logic the flush lists (for dev, CPU-map and xsk) are lazy initialized only if used. However xdp_do_flush() tries to flush all three of them so all three lists are always initialized and the likely empty lists are "iterated". Without the usage of XDP but with CONFIG_DEBUG_NET the lists are also initialized due to xdp_do_check_flushed(). Jakub suggest to utilize the hints in bpf_net_context and avoid invoking the flush function. This will also avoiding initializing the lists which are otherwise unused. Introduce bpf_net_ctx_get_all_used_flush_lists() to return the individual list if not-empty. Use the logic in xdp_do_flush() and xdp_do_check_flushed(). Remove the not needed .*_check_flush(). Suggested-by: Jakub Kicinski Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/bpf.h | 10 ++++------ include/linux/filter.h | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a834f4b761bc..f5c6bc9093a6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2494,7 +2494,7 @@ struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; -void __dev_flush(void); +void __dev_flush(struct list_head *flush_list); int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, @@ -2507,7 +2507,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress); -void __cpu_map_flush(void); +void __cpu_map_flush(struct list_head *flush_list); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx); int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, @@ -2644,8 +2644,6 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); -bool dev_check_flush(void); -bool cpu_map_check_flush(void); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2738,7 +2736,7 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) return ERR_PTR(-EOPNOTSUPP); } -static inline void __dev_flush(void) +static inline void __dev_flush(struct list_head *flush_list) { } @@ -2784,7 +2782,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, return 0; } -static inline void __cpu_map_flush(void) +static inline void __cpu_map_flush(struct list_head *flush_list) { } diff --git a/include/linux/filter.h b/include/linux/filter.h index c0349522de8f..02ddcfdf94c4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -829,6 +829,33 @@ static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void) return &bpf_net_ctx->xskmap_map_flush_list; } +static inline void bpf_net_ctx_get_all_used_flush_lists(struct list_head **lh_map, + struct list_head **lh_dev, + struct list_head **lh_xsk) +{ + struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + u32 kern_flags = bpf_net_ctx->ri.kern_flags; + struct list_head *lh; + + *lh_map = *lh_dev = *lh_xsk = NULL; + + if (!IS_ENABLED(CONFIG_BPF_SYSCALL)) + return; + + lh = &bpf_net_ctx->dev_map_flush_list; + if (kern_flags & BPF_RI_F_DEV_MAP_INIT && !list_empty(lh)) + *lh_dev = lh; + + lh = &bpf_net_ctx->cpu_map_flush_list; + if (kern_flags & BPF_RI_F_CPU_MAP_INIT && !list_empty(lh)) + *lh_map = lh; + + lh = &bpf_net_ctx->xskmap_map_flush_list; + if (IS_ENABLED(CONFIG_XDP_SOCKETS) && + kern_flags & BPF_RI_F_XSK_MAP_INIT && !list_empty(lh)) + *lh_xsk = lh; +} + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) -- cgit v1.2.3 From 7a4479680d7fd05c7a3efa87b41f421af48fbbdf Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 1 Jul 2024 16:49:37 -0700 Subject: cgroup_misc: add kernel-doc comments for enum misc_res_type Fully document enum misc_res_type with kernel-doc comments to prevent kernel-doc warnings: misc_cgroup.h:12: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Types of misc cgroup entries supported by the host. misc_cgroup.h:12: warning: missing initial short description on line: * Types of misc cgroup entries supported by the host. Fixes: a72232eabdfc ("cgroup: Add misc cgroup controller") Signed-off-by: Randy Dunlap Cc: cgroups@vger.kernel.org Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index e799b1f8d05b..d70eab2501ee 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -9,15 +9,16 @@ #define _MISC_CGROUP_H_ /** - * Types of misc cgroup entries supported by the host. + * enum misc_res_type - Types of misc cgroup entries supported by the host. */ enum misc_res_type { #ifdef CONFIG_KVM_AMD_SEV - /* AMD SEV ASIDs resource */ + /** @MISC_CG_RES_SEV: AMD SEV ASIDs resource */ MISC_CG_RES_SEV, - /* AMD SEV-ES ASIDs resource */ + /** @MISC_CG_RES_SEV_ES: AMD SEV-ES ASIDs resource */ MISC_CG_RES_SEV_ES, #endif + /** @MISC_CG_RES_TYPES: count of enum misc_res_type constants */ MISC_CG_RES_TYPES }; -- cgit v1.2.3 From f436cb6913a57bf3e1e66d18bc663e6c20751929 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:01 -0700 Subject: x86/resctrl: Prepare for new domain scope Resctrl resources operate on subsets of CPUs in the system with the defining attribute of each subset being an instance of a particular level of cache. E.g. all CPUs sharing an L3 cache would be part of the same domain. In preparation for features that are scoped at the NUMA node level, change the code from explicit references to "cache_level" to a more generic scope. At this point the only options for this scope are groups of CPUs that share an L2 cache or L3 cache. Clean up the error handling when looking up domains. Report invalid ids before calling rdt_find_domain() in preparation for better messages when scope can be other than cache scope. This means that rdt_find_domain() will never return an error. So remove checks for error from the call sites. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-2-tony.luck@intel.com --- include/linux/resctrl.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index a365f67131ec..ed693bfe474d 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -150,13 +150,18 @@ struct resctrl_membw { struct rdt_parse_data; struct resctrl_schema; +enum resctrl_scope { + RESCTRL_L2_CACHE = 2, + RESCTRL_L3_CACHE = 3, +}; + /** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine * @num_rmid: Number of RMIDs available - * @cache_level: Which cache level defines scope of this resource + * @scope: Scope of this resource * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. * @domains: RCU list of all domains for this resource @@ -174,7 +179,7 @@ struct rdt_resource { bool alloc_capable; bool mon_capable; int num_rmid; - int cache_level; + enum resctrl_scope scope; struct resctrl_cache cache; struct resctrl_membw membw; struct list_head domains; -- cgit v1.2.3 From c103d4d48e1599a88001fa6215be27d55f3c025b Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:02 -0700 Subject: x86/resctrl: Prepare to split rdt_domain structure The rdt_domain structure is used for both control and monitor features. It is about to be split into separate structures for these two usages because the scope for control and monitoring features for a resource will be different for future resources. To allow for common code that scans a list of domains looking for a specific domain id, move all the common fields ("list", "id", "cpu_mask") into their own structure within the rdt_domain structure. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-3-tony.luck@intel.com --- include/linux/resctrl.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index ed693bfe474d..f63fcf17a3bc 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -59,10 +59,20 @@ struct resctrl_staged_config { }; /** - * struct rdt_domain - group of CPUs sharing a resctrl resource + * struct rdt_domain_hdr - common header for different domain types * @list: all instances of this resource * @id: unique id for this instance * @cpu_mask: which CPUs share this resource + */ +struct rdt_domain_hdr { + struct list_head list; + int id; + struct cpumask cpu_mask; +}; + +/** + * struct rdt_domain - group of CPUs sharing a resctrl resource + * @hdr: common header for different domain types * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold * @mbm_total: saved state for MBM total bandwidth * @mbm_local: saved state for MBM local bandwidth @@ -77,9 +87,7 @@ struct resctrl_staged_config { * by closid */ struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; + struct rdt_domain_hdr hdr; unsigned long *rmid_busy_llc; struct mbm_state *mbm_total; struct mbm_state *mbm_local; -- cgit v1.2.3 From cd84f72b6a5c10f79f19fab67b0edfbc4fdbc5b1 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:03 -0700 Subject: x86/resctrl: Prepare for different scope for control/monitor operations Resctrl assumes that control and monitor operations on a resource are performed at the same scope. Prepare for systems that use different scope (specifically Intel needs to split the RDT_RESOURCE_L3 resource to use L3 scope for cache control and NODE scope for cache occupancy and memory bandwidth monitoring). Create separate domain lists for control and monitor operations. Note that errors during initialization of either control or monitor functions on a domain would previously result in that domain being excluded from both control and monitor operations. Now the domains are allocated independently it is no longer required to disable both control and monitor operations if either fail. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-4-tony.luck@intel.com --- include/linux/resctrl.h | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index f63fcf17a3bc..96ddf9ff3183 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -58,15 +58,22 @@ struct resctrl_staged_config { bool have_new_ctrl; }; +enum resctrl_domain_type { + RESCTRL_CTRL_DOMAIN, + RESCTRL_MON_DOMAIN, +}; + /** * struct rdt_domain_hdr - common header for different domain types * @list: all instances of this resource * @id: unique id for this instance + * @type: type of this instance * @cpu_mask: which CPUs share this resource */ struct rdt_domain_hdr { struct list_head list; int id; + enum resctrl_domain_type type; struct cpumask cpu_mask; }; @@ -169,10 +176,12 @@ enum resctrl_scope { * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine * @num_rmid: Number of RMIDs available - * @scope: Scope of this resource + * @ctrl_scope: Scope of this resource for control functions + * @mon_scope: Scope of this resource for monitor functions * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. - * @domains: RCU list of all domains for this resource + * @ctrl_domains: RCU list of all control domains for this resource + * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. * @data_width: Character width of data when displaying * @default_ctrl: Specifies default cache cbm or memory B/W percent. @@ -187,10 +196,12 @@ struct rdt_resource { bool alloc_capable; bool mon_capable; int num_rmid; - enum resctrl_scope scope; + enum resctrl_scope ctrl_scope; + enum resctrl_scope mon_scope; struct resctrl_cache cache; struct resctrl_membw membw; - struct list_head domains; + struct list_head ctrl_domains; + struct list_head mon_domains; char *name; int data_width; u32 default_ctrl; @@ -236,8 +247,10 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); -int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); -void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); +int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d); +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain *d); +void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d); +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_online_cpu(unsigned int cpu); void resctrl_offline_cpu(unsigned int cpu); -- cgit v1.2.3 From cae2bcb6a2c691ef7b537ad07e9819a5ed645bcc Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:04 -0700 Subject: x86/resctrl: Split the rdt_domain and rdt_hw_domain structures The same rdt_domain structure is used for both control and monitor functions. But this results in wasted memory as some of the fields are only used by control functions, while most are only used for monitor functions. Split into separate rdt_ctrl_domain and rdt_mon_domain structures with just the fields required for control and monitoring respectively. Similar split of the rdt_hw_domain structure into rdt_hw_ctrl_domain and rdt_hw_mon_domain. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-5-tony.luck@intel.com --- include/linux/resctrl.h | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 96ddf9ff3183..aa2c22a8e37b 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -78,7 +78,23 @@ struct rdt_domain_hdr { }; /** - * struct rdt_domain - group of CPUs sharing a resctrl resource + * struct rdt_ctrl_domain - group of CPUs sharing a resctrl control resource + * @hdr: common header for different domain types + * @plr: pseudo-locked region (if any) associated with domain + * @staged_config: parsed configuration to be applied + * @mbps_val: When mba_sc is enabled, this holds the array of user + * specified control values for mba_sc in MBps, indexed + * by closid + */ +struct rdt_ctrl_domain { + struct rdt_domain_hdr hdr; + struct pseudo_lock_region *plr; + struct resctrl_staged_config staged_config[CDP_NUM_TYPES]; + u32 *mbps_val; +}; + +/** + * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold * @mbm_total: saved state for MBM total bandwidth @@ -87,13 +103,8 @@ struct rdt_domain_hdr { * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters * @cqm_work_cpu: worker CPU for CQM h/w counters - * @plr: pseudo-locked region (if any) associated with domain - * @staged_config: parsed configuration to be applied - * @mbps_val: When mba_sc is enabled, this holds the array of user - * specified control values for mba_sc in MBps, indexed - * by closid */ -struct rdt_domain { +struct rdt_mon_domain { struct rdt_domain_hdr hdr; unsigned long *rmid_busy_llc; struct mbm_state *mbm_total; @@ -102,9 +113,6 @@ struct rdt_domain { struct delayed_work cqm_limbo; int mbm_work_cpu; int cqm_work_cpu; - struct pseudo_lock_region *plr; - struct resctrl_staged_config staged_config[CDP_NUM_TYPES]; - u32 *mbps_val; }; /** @@ -208,7 +216,7 @@ struct rdt_resource { const char *format_str; int (*parse_ctrlval)(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d); + struct rdt_ctrl_domain *d); struct list_head evt_list; unsigned long fflags; bool cdp_capable; @@ -242,15 +250,15 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. */ -int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val); -u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type); -int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d); -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain *d); -void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d); -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain *d); +int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d); +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d); +void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d); +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d); void resctrl_online_cpu(unsigned int cpu); void resctrl_offline_cpu(unsigned int cpu); @@ -279,7 +287,7 @@ void resctrl_offline_cpu(unsigned int cpu); * Return: * 0 on success, or -EIO, -EINVAL etc on error. */ -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *arch_mon_ctx); @@ -312,7 +320,7 @@ static inline void resctrl_arch_rmid_read_context_check(void) * * This can be called from any CPU. */ -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid); @@ -325,7 +333,7 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, * * This can be called from any CPU. */ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d); +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d); extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; -- cgit v1.2.3 From 1a171608ee8d40d22d604303e42f033c69151123 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:05 -0700 Subject: x86/resctrl: Add node-scope to the options for feature scope Currently supported resctrl features are all domain scoped the same as the scope of the L2 or L3 caches. Add RESCTRL_L3_NODE as a new option for features that are scoped at the same granularity as NUMA nodes. This is needed for Intel's Sub-NUMA Cluster (SNC) feature where monitoring features are divided between nodes that share an L3 cache. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-6-tony.luck@intel.com --- include/linux/resctrl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index aa2c22a8e37b..64b6ad1b22a1 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -176,6 +176,7 @@ struct resctrl_schema; enum resctrl_scope { RESCTRL_L2_CACHE = 2, RESCTRL_L3_CACHE = 3, + RESCTRL_L3_NODE, }; /** -- cgit v1.2.3 From 328ea688746420e12ced6cfbc5064413180244cc Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Jun 2024 14:56:08 -0700 Subject: x86/resctrl: Prepare for new Sub-NUMA Cluster (SNC) monitor files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When SNC is enabled, monitoring data is collected at the SNC node granularity, but must be reported at L3-cache granularity for backwards compatibility in addition to reporting at the node level. Add a "ci" field to the rdt_mon_domain structure to save the cache information about the enclosing L3 cache for the domain. This provides: 1) The cache id which is needed to compose the name of the legacy monitoring directory, and to determine which domains should be summed to provide L3-scoped data. 2) The shared_cpu_map which is needed to determine which CPUs can be used to read the RMID counters with the MSR interface. This is the first step to an eventual goal of monitor reporting files like this (for a system with two SNC nodes per L3): $ cd /sys/fs/resctrl/mon_data $ tree mon_L3_00 mon_L3_00 <- 00 here is L3 cache id ├── llc_occupancy \ These files provide legacy support ├── mbm_local_bytes > for non-SNC aware monitor apps ├── mbm_total_bytes / that expect data at L3 cache level ├── mon_sub_L3_00 <- 00 here is SNC node id │   ├── llc_occupancy \ These files are finer grained │   ├── mbm_local_bytes > data from each SNC node │   └── mbm_total_bytes / └── mon_sub_L3_01 ├── llc_occupancy \ ├── mbm_local_bytes > As above, but for node 1. └── mbm_total_bytes / [ bp: Massage commit message. ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20240628215619.76401-9-tony.luck@intel.com --- include/linux/resctrl.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 64b6ad1b22a1..b0875b99e811 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -2,6 +2,7 @@ #ifndef _RESCTRL_H #define _RESCTRL_H +#include #include #include #include @@ -96,6 +97,7 @@ struct rdt_ctrl_domain { /** * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types + * @ci: cache info for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold * @mbm_total: saved state for MBM total bandwidth * @mbm_local: saved state for MBM local bandwidth @@ -106,6 +108,7 @@ struct rdt_ctrl_domain { */ struct rdt_mon_domain { struct rdt_domain_hdr hdr; + struct cacheinfo *ci; unsigned long *rmid_busy_llc; struct mbm_state *mbm_total; struct mbm_state *mbm_local; -- cgit v1.2.3 From 675e979db473d08be346a3190c5f0db095a57153 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Fri, 7 Jun 2024 16:43:58 +0200 Subject: cxl/events: Use a common struct for DRAM and General Media events cxl_event_common was an unfortunate naming choice and caused confusion with the existing Common Event Record. Furthermore, its fields didn't map all the common information between DRAM and General Media Events. Remove cxl_event_common and introduce cxl_event_media_hdr to record common information between DRAM and General Media events. cxl_event_media_hdr, which is embedded in both cxl_event_gen_media and cxl_event_dram, leverages the commonalities between the two events to simplify their respective handling. Suggested-by: Dan Williams Reviewed-by: Alison Schofield Reviewed-by: Dan Williams Reviewed-by: Ira Weiny Signed-off-by: Fabio M. De Francesco Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240607144423.48681-1-fabio.m.de.francesco@linux.intel.com Signed-off-by: Dave Jiang --- include/linux/cxl-event.h | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 60b25020281f..0bea1afbd747 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -21,6 +21,21 @@ struct cxl_event_record_hdr { u8 reserved[15]; } __packed; +struct cxl_event_media_hdr { + struct cxl_event_record_hdr hdr; + __le64 phys_addr; + u8 descriptor; + u8 type; + u8 transaction_type; + /* + * The meaning of Validity Flags from bit 2 is + * different across DRAM and General Media records + */ + u8 validity_flags[2]; + u8 channel; + u8 rank; +} __packed; + #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 struct cxl_event_generic { struct cxl_event_record_hdr hdr; @@ -33,14 +48,7 @@ struct cxl_event_generic { */ #define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 struct cxl_event_gen_media { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; - u8 descriptor; - u8 type; - u8 transaction_type; - u8 validity_flags[2]; - u8 channel; - u8 rank; + struct cxl_event_media_hdr media_hdr; u8 device[3]; u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE]; u8 reserved[46]; @@ -52,14 +60,7 @@ struct cxl_event_gen_media { */ #define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 struct cxl_event_dram { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; - u8 descriptor; - u8 type; - u8 transaction_type; - u8 validity_flags[2]; - u8 channel; - u8 rank; + struct cxl_event_media_hdr media_hdr; u8 nibble_mask[3]; u8 bank_group; u8 bank; @@ -95,21 +96,13 @@ struct cxl_event_mem_module { u8 reserved[0x3d]; } __packed; -/* - * General Media or DRAM Event Common Fields - * - provides common access to phys_addr - */ -struct cxl_event_common { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; -} __packed; - union cxl_event { struct cxl_event_generic generic; struct cxl_event_gen_media gen_media; struct cxl_event_dram dram; struct cxl_event_mem_module mem_module; - struct cxl_event_common common; + /* dram & gen_media event header */ + struct cxl_event_media_hdr media_hdr; } __packed; /* -- cgit v1.2.3 From 4dec64c52e24c2c9a15f81c115f1be5ea35121cb Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Fri, 28 Jun 2024 00:32:42 +0000 Subject: page_pool: convert to use netmem Abstract the memory type from the page_pool so we can later add support for new memory types. Convert the page_pool to use the new netmem type abstraction, rather than use struct page directly. As of this patch the netmem type is a no-op abstraction: it's always a struct page underneath. All the page pool internals are converted to use struct netmem instead of struct page, and the page pool now exports 2 APIs: 1. The existing struct page API. 2. The new struct netmem API. Keeping the existing API is transitional; we do not want to refactor all the current drivers using the page pool at once. The netmem abstraction is currently a no-op. The page_pool uses page_to_netmem() to convert allocated pages to netmem, and uses netmem_to_page() to convert the netmem back to pages to pass to mm APIs, Follow up patches to this series add non-paged netmem support to the page_pool. This change is factored out on its own to limit the code churn to this 1 patch, for ease of code review. Signed-off-by: Mina Almasry Reviewed-by: Pavel Begunkov Link: https://patch.msgid.link/20240628003253.1694510-6-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff_ref.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h index 11f0a4063403..16c241a23472 100644 --- a/include/linux/skbuff_ref.h +++ b/include/linux/skbuff_ref.h @@ -32,13 +32,13 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } -bool napi_pp_put_page(struct page *page); +bool napi_pp_put_page(netmem_ref netmem); static inline void skb_page_unref(struct page *page, bool recycle) { #ifdef CONFIG_PAGE_POOL - if (recycle && napi_pp_put_page(page)) + if (recycle && napi_pp_put_page(page_to_netmem(page))) return; #endif put_page(page); -- cgit v1.2.3 From 18a5daf0e4974ded74b2ed8a6aed1da49bde356d Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 13 Jun 2024 02:12:15 +0200 Subject: vfs: move d_lockref out of the area used by RCU lookup Stock kernel scales worse than FreeBSD when doing a 20-way stat(2) on the same tmpfs-backed file. According to perf top: 38.09% lockref_put_return 26.08% lockref_get_not_dead 25.60% __d_lookup_rcu 0.89% clear_bhb_loop __d_lookup_rcu is participating in cacheline ping pong due to the embedded name sharing a cacheline with lockref. Moving it out resolves the problem: 41.50% lockref_put_return 41.03% lockref_get_not_dead 1.54% clear_bhb_loop benchmark (will-it-scale, Sapphire Rapids, tmpfs, ops/s): FreeBSD:7219334 before: 5038006 after: 7842883 (+55%) One minor remark: the 'after' result is unstable, fluctuating in the range ~7.8 mln to ~9 mln during different runs. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240613001215.648829-3-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/dcache.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bf53e3894aae..326dbccc3736 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -89,13 +89,18 @@ struct dentry { struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ + /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ /* Ref lookup also touches following */ - struct lockref d_lockref; /* per-dentry lock and refcount */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ void *d_fsdata; /* fs-specific data */ + /* --- cacheline 2 boundary (128 bytes) --- */ + struct lockref d_lockref; /* per-dentry lock and refcount + * keep separate from RCU lookup area if + * possible! + */ union { struct list_head d_lru; /* LRU list */ -- cgit v1.2.3 From dc99c0ff53f588bb210b1e8b3314c7581cde68a2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 18 Jun 2024 14:12:28 +0200 Subject: fs: fix dentry size On CONFIG_SMP=y and on 32bit we need to decrease DNAME_INLINE_LEN to 36 btyes to end up with 128 bytes in total. Reported-by: Linus Torvalds Links: https://lore.kernel.org/r/CAHk-=whtoqTSCcAvV-X-KPqoDWxS4vxmWpuKLB+Vv8=FtUd5vA@mail.gmail.com Signed-off-by: Christian Brauner --- include/linux/dcache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 326dbccc3736..58916b3f53ad 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -71,7 +71,7 @@ extern const struct qstr dotdot_name; # define DNAME_INLINE_LEN 40 /* 192 bytes */ #else # ifdef CONFIG_SMP -# define DNAME_INLINE_LEN 40 /* 128 bytes */ +# define DNAME_INLINE_LEN 36 /* 128 bytes */ # else # define DNAME_INLINE_LEN 44 /* 128 bytes */ # endif -- cgit v1.2.3 From 85b08b31a22b481ec6528130daf94eee4452e23f Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 28 Jun 2024 14:29:22 +0800 Subject: netfs, fscache: export fscache_put_volume() and add fscache_try_get_volume() Export fscache_put_volume() and add fscache_try_get_volume() helper function to allow cachefiles to get/put fscache_volume via linux/fscache-cache.h. Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240628062930.2467993-2-libaokun@huaweicloud.com Signed-off-by: Christian Brauner --- include/linux/fscache-cache.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index bdf7f3eddf0a..4c91a019972b 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -19,6 +19,7 @@ enum fscache_cache_trace; enum fscache_cookie_trace; enum fscache_access_trace; +enum fscache_volume_trace; enum fscache_cache_state { FSCACHE_CACHE_IS_NOT_PRESENT, /* No cache is present for this name */ @@ -97,6 +98,11 @@ extern void fscache_withdraw_cookie(struct fscache_cookie *cookie); extern void fscache_io_error(struct fscache_cache *cache); +extern struct fscache_volume * +fscache_try_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +extern void fscache_put_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); extern void fscache_end_volume_access(struct fscache_volume *volume, struct fscache_cookie *cookie, enum fscache_access_trace why); -- cgit v1.2.3 From ad59baa3169591e0b4cf1a217c9139f2145f4c7f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Jul 2024 09:25:21 +0200 Subject: slab, rust: extend kmalloc() alignment guarantees to remove Rust padding Slab allocators have been guaranteeing natural alignment for power-of-two sizes since commit 59bb47985c1d ("mm, sl[aou]b: guarantee natural alignment for kmalloc(power-of-two)"), while any other sizes are guaranteed to be aligned only to ARCH_KMALLOC_MINALIGN bytes (although in practice are aligned more than that in non-debug scenarios). Rust's allocator API specifies size and alignment per allocation, which have to satisfy the following rules, per Alice Ryhl [1]: 1. The alignment is a power of two. 2. The size is non-zero. 3. When you round up the size to the next multiple of the alignment, then it must not overflow the signed type isize / ssize_t. In order to map this to kmalloc()'s guarantees, some requested allocation sizes have to be padded to the next power-of-two size [2]. For example, an allocation of size 96 and alignment of 32 will be padded to an allocation of size 128, because the existing kmalloc-96 bucket doesn't guarantee alignent above ARCH_KMALLOC_MINALIGN. Without slab debugging active, the layout of the kmalloc-96 slabs however naturally align the objects to 32 bytes, so extending the size to 128 bytes is wasteful. To improve the situation we can extend the kmalloc() alignment guarantees in a way that 1) doesn't change the current slab layout (and thus does not increase internal fragmentation) when slab debugging is not active 2) reduces waste in the Rust allocator use case 3) is a superset of the current guarantee for power-of-two sizes. The extended guarantee is that alignment is at least the largest power-of-two divisor of the requested size. For power-of-two sizes the largest divisor is the size itself, but let's keep this case documented separately for clarity. For current kmalloc size buckets, it means kmalloc-96 will guarantee alignment of 32 bytes and kmalloc-196 will guarantee 64 bytes. This covers the rules 1 and 2 above of Rust's API as long as the size is a multiple of the alignment. The Rust layer should now only need to round up the size to the next multiple if it isn't, while enforcing the rule 3. Implementation-wise, this changes the alignment calculation in create_boot_cache(). While at it also do the calulation only for caches with the SLAB_KMALLOC flag, because the function is also used to create the initial kmem_cache and kmem_cache_node caches, where no alignment guarantee is necessary. In the Rust allocator's krealloc_aligned(), remove the code that padded sizes to the next power of two (suggested by Alice Ryhl) as it's no longer necessary with the new guarantees. Reported-by: Alice Ryhl Reported-by: Boqun Feng Link: https://lore.kernel.org/all/CAH5fLggjrbdUuT-H-5vbQfMazjRDpp2%2Bk3%3DYhPyS17ezEqxwcw@mail.gmail.com/ [1] Link: https://lore.kernel.org/all/CAH5fLghsZRemYUwVvhk77o6y1foqnCeDzW4WZv6ScEWna2+_jw@mail.gmail.com/ [2] Reviewed-by: Boqun Feng Acked-by: Roman Gushchin Reviewed-by: Alice Ryhl Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index ed6bee5ec2b6..640cea6e6323 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -604,7 +604,8 @@ void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) * * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN * bytes. For @size of power of two bytes, the alignment is also guaranteed - * to be at least to the size. + * to be at least to the size. For other sizes, the alignment is guaranteed to + * be at least the largest power-of-two divisor of @size. * * The @flags argument may be one of the GFP flags defined at * include/linux/gfp_types.h and described at -- cgit v1.2.3 From 72e0fe2241ce113cbba339ca8c2450b167774530 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:12:58 -0700 Subject: mm/slab: Introduce kmem_buckets typedef Encapsulate the concept of a single set of kmem_caches that are used for the kmalloc size buckets. Redefine kmalloc_caches as an array of these buckets (for the different global cache buckets). Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 640cea6e6323..922bf15794f7 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -426,8 +426,9 @@ enum kmalloc_cache_type { NR_KMALLOC_TYPES }; -extern struct kmem_cache * -kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; +typedef struct kmem_cache * kmem_buckets[KMALLOC_SHIFT_HIGH + 1]; + +extern kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES]; /* * Define gfp bits that should not be set for KMALLOC_NORMAL. -- cgit v1.2.3 From 67f2df3b82d091ed095d0e47e1f3a9d3e18e4e41 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:12:59 -0700 Subject: mm/slab: Plumb kmem_buckets into __do_kmalloc_node() Introduce CONFIG_SLAB_BUCKETS which provides the infrastructure to support separated kmalloc buckets (in the following kmem_buckets_create() patches and future codetag-based separation). Since this will provide a mitigation for a very common case of exploits, it is recommended to enable this feature for general purpose distros. By default, the new Kconfig will be enabled if CONFIG_SLAB_FREELIST_HARDENED is enabled (and it is added to the hardening.config Kconfig fragment). To be able to choose which buckets to allocate from, make the buckets available to the internal kmalloc interfaces by adding them as the second argument, rather than depending on the buckets being chosen from the fixed set of global buckets. Where the bucket is not available, pass NULL, which means "use the default system kmalloc bucket set" (the prior existing behavior), as implemented in kmalloc_slab(). To avoid adding the extra argument when !CONFIG_SLAB_BUCKETS, only the top-level macros and static inlines use the buckets argument (where they are stripped out and compiled out respectively). The actual extern functions can then be built without the argument, and the internals fall back to the global kmalloc buckets unconditionally. Co-developed-by: Vlastimil Babka Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 922bf15794f7..a9200d453087 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -570,6 +570,21 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; #define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) +/* + * These macros allow declaring a kmem_buckets * parameter alongside size, which + * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call + * sites don't have to pass NULL. + */ +#ifdef CONFIG_SLAB_BUCKETS +#define DECL_BUCKET_PARAMS(_size, _b) size_t (_size), kmem_buckets *(_b) +#define PASS_BUCKET_PARAMS(_size, _b) (_size), (_b) +#define PASS_BUCKET_PARAM(_b) (_b) +#else +#define DECL_BUCKET_PARAMS(_size, _b) size_t (_size) +#define PASS_BUCKET_PARAMS(_size, _b) (_size) +#define PASS_BUCKET_PARAM(_b) NULL +#endif + /* * The following functions are not to be used directly and are intended only * for internal use from kmalloc() and kmalloc_node() @@ -579,7 +594,7 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, void *__kmalloc_noprof(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); -void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) +void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __assume_kmalloc_alignment __alloc_size(1); void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t flags, size_t size) @@ -680,7 +695,7 @@ static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gf kmalloc_caches[kmalloc_type(flags, _RET_IP_)][index], flags, node, size); } - return __kmalloc_node_noprof(size, flags, node); + return __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node); } #define kmalloc_node(...) alloc_hooks(kmalloc_node_noprof(__VA_ARGS__)) @@ -731,8 +746,10 @@ static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(voi */ #define kcalloc(n, size, flags) kmalloc_array(n, size, (flags) | __GFP_ZERO) -void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, int node, - unsigned long caller) __alloc_size(1); +void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node, + unsigned long caller) __alloc_size(1); +#define kmalloc_node_track_caller_noprof(size, flags, node, caller) \ + __kmalloc_node_track_caller_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node, caller) #define kmalloc_node_track_caller(...) \ alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_)) @@ -758,7 +775,7 @@ static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_ return NULL; if (__builtin_constant_p(n) && __builtin_constant_p(size)) return kmalloc_node_noprof(bytes, flags, node); - return __kmalloc_node_noprof(bytes, flags, node); + return __kmalloc_node_noprof(PASS_BUCKET_PARAMS(bytes, NULL), flags, node); } #define kmalloc_array_node(...) alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__)) -- cgit v1.2.3 From 2e8000b826fcd2716449d09753d5ed843067881e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:13:00 -0700 Subject: mm/slab: Introduce kvmalloc_buckets_node() that can take kmem_buckets argument Plumb kmem_buckets arguments through kvmalloc_node_noprof() so it is possible to provide an API to perform kvmalloc-style allocations with a particular set of buckets. Introduce kvmalloc_buckets_node() that takes a kmem_buckets argument. Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index a9200d453087..837005314f96 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -799,7 +799,9 @@ static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags) #define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__)) #define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node) -extern void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) __alloc_size(1); +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __alloc_size(1); +#define kvmalloc_node_noprof(size, flags, node) \ + __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node) #define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__)) #define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE) -- cgit v1.2.3 From b32801d1255be1da62ea8134df3ed9f3331fba12 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:13:01 -0700 Subject: mm/slab: Introduce kmem_buckets_create() and family Dedicated caches are available for fixed size allocations via kmem_cache_alloc(), but for dynamically sized allocations there is only the global kmalloc API's set of buckets available. This means it isn't possible to separate specific sets of dynamically sized allocations into a separate collection of caches. This leads to a use-after-free exploitation weakness in the Linux kernel since many heap memory spraying/grooming attacks depend on using userspace-controllable dynamically sized allocations to collide with fixed size allocations that end up in same cache. While CONFIG_RANDOM_KMALLOC_CACHES provides a probabilistic defense against these kinds of "type confusion" attacks, including for fixed same-size heap objects, we can create a complementary deterministic defense for dynamically sized allocations that are directly user controlled. Addressing these cases is limited in scope, so isolating these kinds of interfaces will not become an unbounded game of whack-a-mole. For example, many pass through memdup_user(), making isolation there very effective. In order to isolate user-controllable dynamically-sized allocations from the common system kmalloc allocations, introduce kmem_buckets_create(), which behaves like kmem_cache_create(). Introduce kmem_buckets_alloc(), which behaves like kmem_cache_alloc(). Introduce kmem_buckets_alloc_track_caller() for where caller tracking is needed. Introduce kmem_buckets_valloc() for cases where vmalloc fallback is needed. Note that these caches are specifically flagged with SLAB_NO_MERGE, since merging would defeat the entire purpose of the mitigation. This can also be used in the future to extend allocation profiling's use of code tagging to implement per-caller allocation cache isolation[1] even for dynamic allocations. Memory allocation pinning[2] is still needed to plug the Use-After-Free cross-allocator weakness (where attackers can arrange to free an entire slab page and have it reallocated to a different cache), but that is an existing and separate issue which is complementary to this improvement. Development continues for that feature via the SLAB_VIRTUAL[3] series (which could also provide guard pages -- another complementary improvement). Link: https://lore.kernel.org/lkml/202402211449.401382D2AF@keescook [1] Link: https://googleprojectzero.blogspot.com/2021/10/how-simple-linux-kernel-memory.html [2] Link: https://lore.kernel.org/lkml/20230915105933.495735-1-matteorizzo@google.com/ [3] Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 837005314f96..d99afce36098 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -549,6 +549,10 @@ void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, void kmem_cache_free(struct kmem_cache *s, void *objp); +kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)); + /* * Bulk allocation and freeing operations. These are accelerated in an * allocator specific way to avoid taking locks repeatedly or building @@ -682,6 +686,12 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f } #define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__)) +#define kmem_buckets_alloc(_b, _size, _flags) \ + alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) + +#define kmem_buckets_alloc_track_caller(_b, _size, _flags) \ + alloc_hooks(__kmalloc_node_track_caller_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE, _RET_IP_)) + static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && size) { @@ -809,6 +819,8 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) #define kvzalloc(_size, _flags) kvmalloc(_size, (_flags)|__GFP_ZERO) #define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node) +#define kmem_buckets_valloc(_b, _size, _flags) \ + alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) static inline __alloc_size(1, 2) void * kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) -- cgit v1.2.3 From d688ffa269421cec73c7e13fd0cb03879b07db89 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 26 Jun 2024 16:32:29 -0600 Subject: perf: arm_pmuv3: Include asm/arm_pmuv3.h from linux/perf/arm_pmuv3.h The arm64 asm/arm_pmuv3.h depends on defines from linux/perf/arm_pmuv3.h. Rather than depend on include order, follow the usual pattern of "linux" headers including "asm" headers of the same name. With this change, the include of linux/kvm_host.h is problematic due to circular includes: In file included from ../arch/arm64/include/asm/arm_pmuv3.h:9, from ../include/linux/perf/arm_pmuv3.h:312, from ../include/kvm/arm_pmu.h:11, from ../arch/arm64/include/asm/kvm_host.h:38, from ../arch/arm64/mm/init.c:41: ../include/linux/kvm_host.h:383:30: error: field 'arch' has incomplete type Switching to asm/kvm_host.h solves the issue. Signed-off-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20240626-arm-pmu-3-9-icntr-v2-5-c9784b4f4065@kernel.org Signed-off-by: Will Deacon --- include/linux/perf/arm_pmuv3.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 46377e134d67..7867db04ec98 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -309,4 +309,6 @@ } \ } while (0) +#include + #endif -- cgit v1.2.3 From d69d804845985c29ab5be5a4b3b1f4787893daf8 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 1 Jul 2024 14:07:37 +0200 Subject: driver core: have match() callback in struct bus_type take a const * In the match() callback, the struct device_driver * should not be changed, so change the function callback to be a const *. This is one step of many towards making the driver core safe to have struct device_driver in read-only memory. Because the match() callback is in all busses, all busses are modified to handle this properly. This does entail switching some container_of() calls to container_of_const() to properly handle the constant *. For some busses, like PCI and USB and HV, the const * is cast away in the match callback as those busses do want to modify those structures at this point in time (they have a local lock in the driver structure.) That will have to be changed in the future if they wish to have their struct device * in read-only-memory. Cc: Rafael J. Wysocki Reviewed-by: Alex Elder Acked-by: Sumit Garg Link: https://lore.kernel.org/r/2024070136-wrongdoer-busily-01e8@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/arm_ffa.h | 2 +- include/linux/cdx/cdx_bus.h | 2 +- include/linux/device/bus.h | 2 +- include/linux/dfl.h | 2 +- include/linux/eisa.h | 2 +- include/linux/fsi.h | 2 +- include/linux/fsl/mc.h | 2 +- include/linux/gameport.h | 2 +- include/linux/greybus.h | 2 +- include/linux/hyperv.h | 6 +----- include/linux/i2c.h | 2 +- include/linux/i3c/device.h | 5 +---- include/linux/maple.h | 2 +- include/linux/mcb.h | 5 +---- include/linux/mdio.h | 19 ++++++------------- include/linux/mhi.h | 2 +- include/linux/mhi_ep.h | 2 +- include/linux/moxtet.h | 9 ++------- include/linux/nd.h | 6 +----- include/linux/pci-epf.h | 3 +-- include/linux/pci.h | 6 ++---- include/linux/phy.h | 2 +- include/linux/pnp.h | 2 +- include/linux/rio.h | 2 +- include/linux/scmi_protocol.h | 2 +- include/linux/serio.h | 2 +- include/linux/slimbus.h | 2 +- include/linux/soc/qcom/apr.h | 2 +- include/linux/soundwire/sdw_type.h | 2 +- include/linux/spi/spi.h | 6 ++---- include/linux/ssb/ssb.h | 2 +- include/linux/tc.h | 2 +- include/linux/tee_drv.h | 2 +- include/linux/virtio.h | 5 +---- 34 files changed, 42 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index c82d56768101..ae70704bfa90 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -149,7 +149,7 @@ struct ffa_driver { struct device_driver driver; }; -#define to_ffa_driver(d) container_of(d, struct ffa_driver, driver) +#define to_ffa_driver(d) container_of_const(d, struct ffa_driver, driver) static inline void ffa_dev_set_drvdata(struct ffa_device *fdev, void *data) { diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h index b57118aaa679..79bb80e56790 100644 --- a/include/linux/cdx/cdx_bus.h +++ b/include/linux/cdx/cdx_bus.h @@ -211,7 +211,7 @@ struct cdx_driver { }; #define to_cdx_driver(_drv) \ - container_of(_drv, struct cdx_driver, driver) + container_of_const(_drv, struct cdx_driver, driver) /* Macro to avoid include chaining to get THIS_MODULE */ #define cdx_driver_register(drv) \ diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index 5ef4ec1c36c3..807831d6bf0f 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -81,7 +81,7 @@ struct bus_type { const struct attribute_group **dev_groups; const struct attribute_group **drv_groups; - int (*match)(struct device *dev, struct device_driver *drv); + int (*match)(struct device *dev, const struct device_driver *drv); int (*uevent)(const struct device *dev, struct kobj_uevent_env *env); int (*probe)(struct device *dev); void (*sync_state)(struct device *dev); diff --git a/include/linux/dfl.h b/include/linux/dfl.h index 0a7a00a0ee7f..1f02db0c1897 100644 --- a/include/linux/dfl.h +++ b/include/linux/dfl.h @@ -71,7 +71,7 @@ struct dfl_driver { }; #define to_dfl_dev(d) container_of(d, struct dfl_device, dev) -#define to_dfl_drv(d) container_of(d, struct dfl_driver, drv) +#define to_dfl_drv(d) container_of_const(d, struct dfl_driver, drv) /* * use a macro to avoid include chaining to get THIS_MODULE. diff --git a/include/linux/eisa.h b/include/linux/eisa.h index b012e30afebd..f98200cae637 100644 --- a/include/linux/eisa.h +++ b/include/linux/eisa.h @@ -60,7 +60,7 @@ struct eisa_driver { struct device_driver driver; }; -#define to_eisa_driver(drv) container_of(drv,struct eisa_driver, driver) +#define to_eisa_driver(drv) container_of_const(drv,struct eisa_driver, driver) /* These external functions are only available when EISA support is enabled. */ #ifdef CONFIG_EISA diff --git a/include/linux/fsi.h b/include/linux/fsi.h index 3df8c54868df..8c5eef808788 100644 --- a/include/linux/fsi.h +++ b/include/linux/fsi.h @@ -44,7 +44,7 @@ struct fsi_driver { }; #define to_fsi_dev(devp) container_of(devp, struct fsi_device, dev) -#define to_fsi_drv(drvp) container_of(drvp, struct fsi_driver, drv) +#define to_fsi_drv(drvp) container_of_const(drvp, struct fsi_driver, drv) extern int fsi_driver_register(struct fsi_driver *fsi_drv); extern void fsi_driver_unregister(struct fsi_driver *fsi_drv); diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index a1b3de87a3d1..083c860fd28e 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -56,7 +56,7 @@ struct fsl_mc_driver { }; #define to_fsl_mc_driver(_drv) \ - container_of(_drv, struct fsl_mc_driver, driver) + container_of_const(_drv, struct fsl_mc_driver, driver) /** * enum fsl_mc_pool_type - Types of allocatable MC bus resources diff --git a/include/linux/gameport.h b/include/linux/gameport.h index 07e370113b2b..86d62fdafd7a 100644 --- a/include/linux/gameport.h +++ b/include/linux/gameport.h @@ -58,7 +58,7 @@ struct gameport_driver { bool ignore; }; -#define to_gameport_driver(d) container_of(d, struct gameport_driver, driver) +#define to_gameport_driver(d) container_of_const(d, struct gameport_driver, driver) int gameport_open(struct gameport *gameport, struct gameport_driver *drv, int mode); void gameport_close(struct gameport *gameport); diff --git a/include/linux/greybus.h b/include/linux/greybus.h index 634c9511cf78..4d58e27ceaf6 100644 --- a/include/linux/greybus.h +++ b/include/linux/greybus.h @@ -64,7 +64,7 @@ struct greybus_driver { struct device_driver driver; }; -#define to_greybus_driver(d) container_of(d, struct greybus_driver, driver) +#define to_greybus_driver(d) container_of_const(d, struct greybus_driver, driver) static inline void greybus_set_drvdata(struct gb_bundle *bundle, void *data) { diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 5e39baa7f6cb..22c22fb91042 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1330,11 +1330,7 @@ struct hv_device { #define device_to_hv_device(d) container_of_const(d, struct hv_device, device) - -static inline struct hv_driver *drv_to_hv_drv(struct device_driver *d) -{ - return container_of(d, struct hv_driver, driver); -} +#define drv_to_hv_drv(d) container_of_const(d, struct hv_driver, driver) static inline void hv_set_drvdata(struct hv_device *dev, void *data) { diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 9709537370ee..cde3de35a89f 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -304,7 +304,7 @@ struct i2c_driver { u32 flags; }; -#define to_i2c_driver(d) container_of(d, struct i2c_driver, driver) +#define to_i2c_driver(d) container_of_const(d, struct i2c_driver, driver) /** * struct i2c_client - represent an I2C slave device diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index e119f11948ef..0a8a44ac2f02 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -183,10 +183,7 @@ struct i3c_driver { const struct i3c_device_id *id_table; }; -static inline struct i3c_driver *drv_to_i3cdrv(struct device_driver *drv) -{ - return container_of(drv, struct i3c_driver, driver); -} +#define drv_to_i3cdrv(__drv) container_of_const(__drv, struct i3c_driver, driver) struct device *i3cdev_to_dev(struct i3c_device *i3cdev); diff --git a/include/linux/maple.h b/include/linux/maple.h index 9aae44efcfd4..3be4e567473c 100644 --- a/include/linux/maple.h +++ b/include/linux/maple.h @@ -97,7 +97,7 @@ int maple_add_packet(struct maple_device *mdev, u32 function, void maple_clear_dev(struct maple_device *mdev); #define to_maple_dev(n) container_of(n, struct maple_device, dev) -#define to_maple_driver(n) container_of(n, struct maple_driver, drv) +#define to_maple_driver(n) container_of_const(n, struct maple_driver, drv) #define maple_get_drvdata(d) dev_get_drvdata(&(d)->dev) #define maple_set_drvdata(d,p) dev_set_drvdata(&(d)->dev, (p)) diff --git a/include/linux/mcb.h b/include/linux/mcb.h index 0b971b24a804..4ab2691f51a6 100644 --- a/include/linux/mcb.h +++ b/include/linux/mcb.h @@ -94,10 +94,7 @@ struct mcb_driver { void (*shutdown)(struct mcb_device *mdev); }; -static inline struct mcb_driver *to_mcb_driver(struct device_driver *drv) -{ - return container_of(drv, struct mcb_driver, driver); -} +#define to_mcb_driver(__drv) container_of_const(__drv, struct mcb_driver, driver) static inline void *mcb_get_drvdata(struct mcb_device *dev) { diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 68f8d2e970d4..efeca5bd7600 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -31,7 +31,7 @@ struct mdio_device { struct mii_bus *bus; char modalias[MDIO_NAME_SIZE]; - int (*bus_match)(struct device *dev, struct device_driver *drv); + int (*bus_match)(struct device *dev, const struct device_driver *drv); void (*device_free)(struct mdio_device *mdiodev); void (*device_remove)(struct mdio_device *mdiodev); @@ -57,11 +57,8 @@ struct mdio_driver_common { }; #define MDIO_DEVICE_FLAG_PHY 1 -static inline struct mdio_driver_common * -to_mdio_common_driver(const struct device_driver *driver) -{ - return container_of(driver, struct mdio_driver_common, driver); -} +#define to_mdio_common_driver(__drv_c) container_of_const(__drv_c, struct mdio_driver_common, \ + driver) /* struct mdio_driver: Generic MDIO driver */ struct mdio_driver { @@ -80,12 +77,8 @@ struct mdio_driver { void (*shutdown)(struct mdio_device *mdiodev); }; -static inline struct mdio_driver * -to_mdio_driver(const struct device_driver *driver) -{ - return container_of(to_mdio_common_driver(driver), struct mdio_driver, - mdiodrv); -} +#define to_mdio_driver(__drv_m) container_of_const(to_mdio_common_driver(__drv_m), \ + struct mdio_driver, mdiodrv) /* device driver data */ static inline void mdiodev_set_drvdata(struct mdio_device *mdio, void *data) @@ -105,7 +98,7 @@ void mdio_device_remove(struct mdio_device *mdiodev); void mdio_device_reset(struct mdio_device *mdiodev, int value); int mdio_driver_register(struct mdio_driver *drv); void mdio_driver_unregister(struct mdio_driver *drv); -int mdio_device_bus_match(struct device *dev, struct device_driver *drv); +int mdio_device_bus_match(struct device *dev, const struct device_driver *drv); static inline void mdio_device_get(struct mdio_device *mdiodev) { diff --git a/include/linux/mhi.h b/include/linux/mhi.h index b573f15762f8..ce1f9d737964 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -526,7 +526,7 @@ struct mhi_driver { struct device_driver driver; }; -#define to_mhi_driver(drv) container_of(drv, struct mhi_driver, driver) +#define to_mhi_driver(drv) container_of_const(drv, struct mhi_driver, driver) #define to_mhi_device(dev) container_of(dev, struct mhi_device, dev) /** diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index 11bf3212f782..7b40fc8cbe77 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -221,7 +221,7 @@ struct mhi_ep_driver { }; #define to_mhi_ep_device(dev) container_of(dev, struct mhi_ep_device, dev) -#define to_mhi_ep_driver(drv) container_of(drv, struct mhi_ep_driver, driver) +#define to_mhi_ep_driver(drv) container_of_const(drv, struct mhi_ep_driver, driver) /* * module_mhi_ep_driver() - Helper macro for drivers that don't do diff --git a/include/linux/moxtet.h b/include/linux/moxtet.h index ac577699edfd..dfa4800306ee 100644 --- a/include/linux/moxtet.h +++ b/include/linux/moxtet.h @@ -61,13 +61,8 @@ struct moxtet_driver { struct device_driver driver; }; -static inline struct moxtet_driver * -to_moxtet_driver(struct device_driver *drv) -{ - if (!drv) - return NULL; - return container_of(drv, struct moxtet_driver, driver); -} +#define to_moxtet_driver(__drv) \ + ( __drv ? container_of_const(__drv, struct moxtet_driver, driver) : NULL ) extern int __moxtet_register_driver(struct module *owner, struct moxtet_driver *mdrv); diff --git a/include/linux/nd.h b/include/linux/nd.h index b9771ba1ef87..fa099e295f78 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -84,11 +84,7 @@ struct nd_device_driver { void (*notify)(struct device *dev, enum nvdimm_event event); }; -static inline struct nd_device_driver *to_nd_device_driver( - struct device_driver *drv) -{ - return container_of(drv, struct nd_device_driver, drv); -}; +#define to_nd_device_driver(__drv) container_of_const(__drv, struct nd_device_driver, drv) /** * struct nd_namespace_common - core infrastructure of a namespace diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index adee6a1b35db..980a3c90a5ee 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -105,8 +105,7 @@ struct pci_epf_driver { const struct pci_epf_device_id *id_table; }; -#define to_pci_epf_driver(drv) (container_of((drv), struct pci_epf_driver, \ - driver)) +#define to_pci_epf_driver(drv) container_of_const((drv), struct pci_epf_driver, driver) /** * struct pci_epf_bar - represents the BAR of EPF device diff --git a/include/linux/pci.h b/include/linux/pci.h index cafc5ab1cbcb..aa1c3280b7d0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -958,10 +958,8 @@ struct pci_driver { bool driver_managed_dma; }; -static inline struct pci_driver *to_pci_driver(struct device_driver *drv) -{ - return drv ? container_of(drv, struct pci_driver, driver) : NULL; -} +#define to_pci_driver(__drv) \ + ( __drv ? container_of_const(__drv, struct pci_driver, driver) : NULL ) /** * PCI_DEVICE - macro used to describe a specific PCI device diff --git a/include/linux/phy.h b/include/linux/phy.h index e6e83304558e..8237d5006a99 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1179,7 +1179,7 @@ struct phy_driver { int (*led_polarity_set)(struct phy_device *dev, int index, unsigned long modes); }; -#define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ +#define to_phy_driver(d) container_of_const(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) #define PHY_ANY_ID "MATCH ANY PHY" diff --git a/include/linux/pnp.h b/include/linux/pnp.h index 7f2ff95d2deb..b7a7158aaf65 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -383,7 +383,7 @@ struct pnp_driver { struct device_driver driver; }; -#define to_pnp_driver(drv) container_of(drv, struct pnp_driver, driver) +#define to_pnp_driver(drv) container_of_const(drv, struct pnp_driver, driver) struct pnp_card_driver { struct list_head global_list; diff --git a/include/linux/rio.h b/include/linux/rio.h index 2cd637268b4f..3c29f40f3c94 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -465,7 +465,7 @@ struct rio_driver { struct device_driver driver; }; -#define to_rio_driver(drv) container_of(drv,struct rio_driver, driver) +#define to_rio_driver(drv) container_of_const(drv,struct rio_driver, driver) union rio_pw_msg { struct { diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 3a9bb5b9a9e8..688466a0e816 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -945,7 +945,7 @@ struct scmi_device { struct scmi_handle *handle; }; -#define to_scmi_dev(d) container_of(d, struct scmi_device, dev) +#define to_scmi_dev(d) container_of_const(d, struct scmi_device, dev) struct scmi_device_id { u8 protocol_id; diff --git a/include/linux/serio.h b/include/linux/serio.h index 7ca41af93b37..bf2191f25350 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -80,7 +80,7 @@ struct serio_driver { struct device_driver driver; }; -#define to_serio_driver(d) container_of(d, struct serio_driver, driver) +#define to_serio_driver(d) container_of_const(d, struct serio_driver, driver) int serio_open(struct serio *serio, struct serio_driver *drv); void serio_close(struct serio *serio); diff --git a/include/linux/slimbus.h b/include/linux/slimbus.h index 3042385b7b40..a4608d9a9684 100644 --- a/include/linux/slimbus.h +++ b/include/linux/slimbus.h @@ -91,7 +91,7 @@ struct slim_driver { struct device_driver driver; const struct slim_device_id *id_table; }; -#define to_slim_driver(d) container_of(d, struct slim_driver, driver) +#define to_slim_driver(d) container_of_const(d, struct slim_driver, driver) /** * struct slim_val_inf - Slimbus value or information element diff --git a/include/linux/soc/qcom/apr.h b/include/linux/soc/qcom/apr.h index 7161a3183eda..a532d1e4b1f4 100644 --- a/include/linux/soc/qcom/apr.h +++ b/include/linux/soc/qcom/apr.h @@ -162,7 +162,7 @@ struct apr_driver { }; typedef struct apr_driver gpr_driver_t; -#define to_apr_driver(d) container_of(d, struct apr_driver, driver) +#define to_apr_driver(d) container_of_const(d, struct apr_driver, driver) /* * use a macro to avoid include chaining to get THIS_MODULE diff --git a/include/linux/soundwire/sdw_type.h b/include/linux/soundwire/sdw_type.h index 693320b4f5c2..d405935a45fe 100644 --- a/include/linux/soundwire/sdw_type.h +++ b/include/linux/soundwire/sdw_type.h @@ -13,7 +13,7 @@ static inline int is_sdw_slave(const struct device *dev) return dev->type == &sdw_slave_type; } -#define drv_to_sdw_driver(_drv) container_of(_drv, struct sdw_driver, driver) +#define drv_to_sdw_driver(_drv) container_of_const(_drv, struct sdw_driver, driver) #define sdw_register_driver(drv) \ __sdw_register_driver(drv, THIS_MODULE) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e8e1e798924f..3fc559686d38 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -351,10 +351,8 @@ struct spi_driver { struct device_driver driver; }; -static inline struct spi_driver *to_spi_driver(struct device_driver *drv) -{ - return drv ? container_of(drv, struct spi_driver, driver) : NULL; -} +#define to_spi_driver(__drv) \ + ( __drv ? container_of_const(__drv, struct spi_driver, driver) : NULL ) extern int __spi_register_driver(struct module *owner, struct spi_driver *sdrv); diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index a2257380c3f1..e1fb11e0f12c 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -325,7 +325,7 @@ struct ssb_driver { struct device_driver drv; }; -#define drv_to_ssb_drv(_drv) container_of(_drv, struct ssb_driver, drv) +#define drv_to_ssb_drv(_drv) container_of_const(_drv, struct ssb_driver, drv) extern int __ssb_driver_register(struct ssb_driver *drv, struct module *owner); #define ssb_driver_register(drv) \ diff --git a/include/linux/tc.h b/include/linux/tc.h index 1638660abf5e..8416bae9b126 100644 --- a/include/linux/tc.h +++ b/include/linux/tc.h @@ -108,7 +108,7 @@ struct tc_driver { struct device_driver driver; }; -#define to_tc_driver(drv) container_of(drv, struct tc_driver, driver) +#define to_tc_driver(drv) container_of_const(drv, struct tc_driver, driver) /* * Return TURBOchannel clock frequency in Hz. diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 786b9ae6cf4d..a54c203000ed 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -298,6 +298,6 @@ struct tee_client_driver { }; #define to_tee_client_driver(d) \ - container_of(d, struct tee_client_driver, driver) + container_of_const(d, struct tee_client_driver, driver) #endif /*__TEE_DRV_H*/ diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 96fea920873b..ecc5cb7b8c91 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -209,10 +209,7 @@ struct virtio_driver { int (*restore)(struct virtio_device *dev); }; -static inline struct virtio_driver *drv_to_virtio(struct device_driver *drv) -{ - return container_of(drv, struct virtio_driver, driver); -} +#define drv_to_virtio(__drv) container_of_const(__drv, struct virtio_driver, driver) /* use a macro to avoid include chaining to get THIS_MODULE */ #define register_virtio_driver(drv) \ -- cgit v1.2.3 From 633478695d6b52e0b52f1273d8d11275b627b87d Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Mon, 1 Jul 2024 10:12:15 +0800 Subject: bus: mhi: host: Allow controller drivers to specify name for the MHI controller MHI devices usually have a product/device name to identify each device uniquely. So let's specify that name in 'struct mhi_controller' so that the client drivers can use this name to uniquely identify the devices and apply any device specific quirks. Signed-off-by: Slark Xiao Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20240701021216.17734-2-slark_xiao@163.com [mani: reworked subject and description] Signed-off-by: Manivannan Sadhasivam --- include/linux/mhi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index b573f15762f8..fabd6ed8d258 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -290,6 +290,7 @@ struct mhi_controller_config { /** * struct mhi_controller - Master MHI controller structure + * @name: Device name of the MHI controller * @cntrl_dev: Pointer to the struct device of physical bus acting as the MHI * controller (required) * @mhi_dev: MHI device instance for the controller @@ -367,6 +368,7 @@ struct mhi_controller_config { * they can be populated depending on the usecase. */ struct mhi_controller { + const char *name; struct device *cntrl_dev; struct mhi_device *mhi_dev; struct dentry *debugfs_dentry; -- cgit v1.2.3 From 62ce9ef1479729b1a3f8a1b19900e28d68b3625e Mon Sep 17 00:00:00 2001 From: Marco Felsch Date: Mon, 1 Jul 2024 15:21:24 +0200 Subject: usb: typec: tcpci: add support to set connector orientation This add the support to set the optional connector orientation bit which is part of the optional CONFIG_STANDARD_OUTPUT register 0x18 [1]. This allows system designers to connect the tcpc orientation pin directly to the 2:1 ss-mux. [1] https://www.usb.org/sites/default/files/documents/usb-port_controller_specification_rev2.0_v1.0_0.pdf Signed-off-by: Marco Felsch Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240701132133.3054394-1-m.felsch@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index 47a86b8a4a50..0ab39b6ea205 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -47,6 +47,9 @@ #define TCPC_SINK_FAST_ROLE_SWAP BIT(0) #define TCPC_CONFIG_STD_OUTPUT 0x18 +#define TCPC_CONFIG_STD_OUTPUT_ORIENTATION_MASK BIT(0) +#define TCPC_CONFIG_STD_OUTPUT_ORIENTATION_NORMAL 0 +#define TCPC_CONFIG_STD_OUTPUT_ORIENTATION_FLIPPED 1 #define TCPC_TCPC_CTRL 0x19 #define TCPC_TCPC_CTRL_ORIENTATION BIT(0) @@ -127,6 +130,7 @@ #define TCPC_DEV_CAP_2 0x26 #define TCPC_STD_INPUT_CAP 0x28 #define TCPC_STD_OUTPUT_CAP 0x29 +#define TCPC_STD_OUTPUT_CAP_ORIENTATION BIT(0) #define TCPC_MSG_HDR_INFO 0x2e #define TCPC_MSG_HDR_INFO_DATA_ROLE BIT(3) @@ -209,6 +213,9 @@ struct tcpci; * swap following Discover Identity on SOP' occurs. * Return true when the TCPM is allowed to request a Vconn swap * after Discovery Identity on SOP. + * @set_orientation: + * Optional; Enable setting the connector orientation + * CONFIG_STANDARD_OUTPUT (0x18) bit0. */ struct tcpci_data { struct regmap *regmap; @@ -216,6 +223,7 @@ struct tcpci_data { unsigned char auto_discharge_disconnect:1; unsigned char vbus_vsafe0v:1; unsigned char cable_comm_capable:1; + unsigned char set_orientation:1; int (*init)(struct tcpci *tcpci, struct tcpci_data *data); int (*set_vconn)(struct tcpci *tcpci, struct tcpci_data *data, -- cgit v1.2.3 From f9a748fa5ce0958bea2a03c25f092fbc22e72c67 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 May 2024 16:48:21 +0100 Subject: parport: Remove 'drivers' list The list has been empty since: 'commit 3275158fa52a ("parport: remove use of devmodel")' This also means we can remove the 'list_head' from struct parport_driver. Signed-off-by: Dr. David Alan Gilbert Acked-by: Sudip Mukherjee Link: https://lore.kernel.org/r/20240502154823.67235-2-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/parport.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/parport.h b/include/linux/parport.h index fff39bc30629..2a4424b60156 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -258,7 +258,6 @@ struct parport_driver { int (*probe)(struct pardevice *); struct device_driver driver; bool devmodel; - struct list_head list; }; #define to_parport_driver(n) container_of(n, struct parport_driver, driver) -- cgit v1.2.3 From ed06e054906c5e7853a36d9ddad8fc94dbb8c252 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 May 2024 16:48:22 +0100 Subject: parport: Remove attach function pointer The attach function pointers haven't actually been called since: 'commit 3275158fa52a ("parport: remove use of devmodel")' topped adding entries to the drivers list. If you're converting a driver, look at the 'match_port' function pointer instead. (There are lots of comment references to 'attach' all over, but they probably need some deeper understanding to check the semantics to see if they can be replaced by match_port). Signed-off-by: Dr. David Alan Gilbert Acked-by: Sudip Mukherjee Link: https://lore.kernel.org/r/20240502154823.67235-3-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/parport.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/parport.h b/include/linux/parport.h index 2a4424b60156..190de3569e25 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -252,7 +252,6 @@ struct parport { struct parport_driver { const char *name; - void (*attach) (struct parport *); void (*detach) (struct parport *); void (*match_port)(struct parport *); int (*probe)(struct pardevice *); -- cgit v1.2.3 From dfd19866d1a3f681cc12aae67ab05011eb3aa3d8 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 May 2024 16:48:23 +0100 Subject: parport: Remove parport_driver.devmodel 'devmodel' hasn't actually been used since: 'commit 3275158fa52a ("parport: remove use of devmodel")' and everyone now has it set to true and has been fixed up; remove the flag. (There are still comments all over about it) Signed-off-by: Dr. David Alan Gilbert Acked-by: Sudip Mukherjee Link: https://lore.kernel.org/r/20240502154823.67235-4-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/parport.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/parport.h b/include/linux/parport.h index 190de3569e25..464c2ad28039 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -256,7 +256,6 @@ struct parport_driver { void (*match_port)(struct parport *); int (*probe)(struct pardevice *); struct device_driver driver; - bool devmodel; }; #define to_parport_driver(n) container_of(n, struct parport_driver, driver) @@ -299,9 +298,6 @@ int __must_check __parport_register_driver(struct parport_driver *, * to receive notifications about ports being found in the * system, as well as ports no longer available. * - * If devmodel is true then the new device model is used - * for registration. - * * The @driver structure is allocated by the caller and must not be * deallocated until after calling parport_unregister_driver(). * -- cgit v1.2.3 From eb054d67b21a53f6ccf3af49a62fb99397b48fc2 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 3 Jul 2024 11:16:03 +0100 Subject: iommu/arm-smmu-v3: Add support for dirty tracking in domain alloc This provides all the infrastructure to enable dirty tracking if the hardware has the capability and domain alloc request for it. Also, add a device_iommu_capable() check in iommufd core for IOMMU_CAP_DIRTY_TRACKING before we request a user domain with dirty tracking support. Please note, we still report no support for IOMMU_CAP_DIRTY_TRACKING as it will finally be enabled in a subsequent patch. Signed-off-by: Joao Martins Reviewed-by: Ryan Roberts Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20240703101604.2576-5-shameerali.kolothum.thodi@huawei.com Signed-off-by: Will Deacon --- include/linux/io-pgtable.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 86cf1f7ae389..f9a81761bfce 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -85,6 +85,8 @@ struct io_pgtable_cfg { * * IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability * attributes set in the TCR for a non-coherent page-table walker. + * + * IO_PGTABLE_QUIRK_ARM_HD: Enables dirty tracking in stage 1 pagetable. */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -92,6 +94,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) + #define IO_PGTABLE_QUIRK_ARM_HD BIT(7) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias; -- cgit v1.2.3 From da042a3655151157c06e71a583e883ab2d86d1ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 17:10:19 +0200 Subject: block: split integrity support out of bio.h Split struct bio_integrity_payload and the related prototypes out of bio.h into a separate bio-integrity.h header so that it is only pulled in by the few places that need it. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240702151047.1746127-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 153 +++++++++++++++++++++++++++++++++++++++++ include/linux/bio.h | 156 ------------------------------------------ include/linux/blk-integrity.h | 1 + 3 files changed, 154 insertions(+), 156 deletions(-) create mode 100644 include/linux/bio-integrity.h (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h new file mode 100644 index 000000000000..70ef19a0dc7e --- /dev/null +++ b/include/linux/bio-integrity.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BIO_INTEGRITY_H +#define _LINUX_BIO_INTEGRITY_H + +#include + +enum bip_flags { + BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ + BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ + BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ + BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ + BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ + BIP_INTEGRITY_USER = 1 << 5, /* Integrity payload is user address */ + BIP_COPY_USER = 1 << 6, /* Kernel bounce buffer in use */ +}; + +struct bio_integrity_payload { + struct bio *bip_bio; /* parent bio */ + + struct bvec_iter bip_iter; + + unsigned short bip_vcnt; /* # of integrity bio_vecs */ + unsigned short bip_max_vcnt; /* integrity bio_vec slots */ + unsigned short bip_flags; /* control flags */ + + struct bvec_iter bio_iter; /* for rewinding parent bio */ + + struct work_struct bip_work; /* I/O completion */ + + struct bio_vec *bip_vec; + struct bio_vec bip_inline_vecs[];/* embedded bvec array */ +}; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + +#define bip_for_each_vec(bvl, bip, iter) \ + for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) + +#define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ + for_each_bio(_bio) \ + bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) + +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) +{ + if (bio->bi_opf & REQ_INTEGRITY) + return bio->bi_integrity; + + return NULL; +} + +static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (bip) + return bip->bip_flags & flag; + + return false; +} + +static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) +{ + return bip->bip_iter.bi_sector; +} + +static inline void bip_set_seed(struct bio_integrity_payload *bip, + sector_t seed) +{ + bip->bip_iter.bi_sector = seed; +} + +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, + unsigned int nr); +int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, + unsigned int offset); +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); +void bio_integrity_unmap_free_user(struct bio *bio); +bool bio_integrity_prep(struct bio *bio); +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); +void bio_integrity_trim(struct bio *bio); +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); +int bioset_integrity_create(struct bio_set *bs, int pool_size); +void bioset_integrity_free(struct bio_set *bs); +void bio_integrity_init(void); + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +static inline void *bio_integrity(struct bio *bio) +{ + return NULL; +} + +static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + return 0; +} + +static inline void bioset_integrity_free(struct bio_set *bs) +{ +} + +static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, + ssize_t len, u32 seed) +{ + return -EINVAL; +} + +static inline void bio_integrity_unmap_free_user(struct bio *bio) +{ +} + +static inline bool bio_integrity_prep(struct bio *bio) +{ + return true; +} + +static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + gfp_t gfp_mask) +{ + return 0; +} + +static inline void bio_integrity_advance(struct bio *bio, + unsigned int bytes_done) +{ +} + +static inline void bio_integrity_trim(struct bio *bio) +{ +} + +static inline void bio_integrity_init(void) +{ +} + +static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) +{ + return false; +} + +static inline void *bio_integrity_alloc(struct bio *bio, gfp_t gfp, + unsigned int nr) +{ + return ERR_PTR(-EINVAL); +} + +static inline int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return 0; +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ +#endif /* _LINUX_BIO_INTEGRITY_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h index 818e93612947..a46e2047bea4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -321,69 +321,6 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) #define bio_for_each_folio_all(fi, bio) \ for (bio_first_folio(&fi, bio, 0); fi.folio; bio_next_folio(&fi, bio)) -enum bip_flags { - BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ - BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ - BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ - BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ - BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ - BIP_INTEGRITY_USER = 1 << 5, /* Integrity payload is user address */ - BIP_COPY_USER = 1 << 6, /* Kernel bounce buffer in use */ -}; - -/* - * bio integrity payload - */ -struct bio_integrity_payload { - struct bio *bip_bio; /* parent bio */ - - struct bvec_iter bip_iter; - - unsigned short bip_vcnt; /* # of integrity bio_vecs */ - unsigned short bip_max_vcnt; /* integrity bio_vec slots */ - unsigned short bip_flags; /* control flags */ - - struct bvec_iter bio_iter; /* for rewinding parent bio */ - - struct work_struct bip_work; /* I/O completion */ - - struct bio_vec *bip_vec; - struct bio_vec bip_inline_vecs[];/* embedded bvec array */ -}; - -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) -{ - if (bio->bi_opf & REQ_INTEGRITY) - return bio->bi_integrity; - - return NULL; -} - -static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) -{ - struct bio_integrity_payload *bip = bio_integrity(bio); - - if (bip) - return bip->bip_flags & flag; - - return false; -} - -static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) -{ - return bip->bip_iter.bi_sector; -} - -static inline void bip_set_seed(struct bio_integrity_payload *bip, - sector_t seed) -{ - bip->bip_iter.bi_sector = seed; -} - -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); @@ -721,99 +658,6 @@ static inline bool bioset_initialized(struct bio_set *bs) return bs->bio_slab != NULL; } -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -#define bip_for_each_vec(bvl, bip, iter) \ - for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) - -#define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ - for_each_bio(_bio) \ - bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) - -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); -void bio_integrity_unmap_free_user(struct bio *bio); -extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); -extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); -extern bool bio_integrity_prep(struct bio *); -extern void bio_integrity_advance(struct bio *, unsigned int); -extern void bio_integrity_trim(struct bio *); -extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t); -extern int bioset_integrity_create(struct bio_set *, int); -extern void bioset_integrity_free(struct bio_set *); -extern void bio_integrity_init(void); - -#else /* CONFIG_BLK_DEV_INTEGRITY */ - -static inline void *bio_integrity(struct bio *bio) -{ - return NULL; -} - -static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - return 0; -} - -static inline void bioset_integrity_free (struct bio_set *bs) -{ - return; -} - -static inline bool bio_integrity_prep(struct bio *bio) -{ - return true; -} - -static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - gfp_t gfp_mask) -{ - return 0; -} - -static inline void bio_integrity_advance(struct bio *bio, - unsigned int bytes_done) -{ - return; -} - -static inline void bio_integrity_trim(struct bio *bio) -{ - return; -} - -static inline void bio_integrity_init(void) -{ - return; -} - -static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) -{ - return false; -} - -static inline void *bio_integrity_alloc(struct bio * bio, gfp_t gfp, - unsigned int nr) -{ - return ERR_PTR(-EINVAL); -} - -static inline int bio_integrity_add_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - return 0; -} - -static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, - ssize_t len, u32 seed) -{ - return -EINVAL; -} -static inline void bio_integrity_unmap_free_user(struct bio *bio) -{ -} - -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - /* * Mark a bio as polled. Note that for async polled IO, the caller must * expect -EWOULDBLOCK if we cannot allocate a request (or other resources). diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 804f856ed3e5..de98049b7ded 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -3,6 +3,7 @@ #define _LINUX_BLK_INTEGRITY_H #include +#include struct request; -- cgit v1.2.3 From 21671a1ed1ff22e158ebe9d619943f926f03f5cd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 17:10:20 +0200 Subject: block: also return bio_integrity_payload * from stubs struct bio_integrity_payload is defined unconditionally. No need to return void * from bio_integrity() and bio_integrity_alloc(). Signed-off-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Anuj Gupta Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240702151047.1746127-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 70ef19a0dc7e..cac24dac06ff 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -85,7 +85,7 @@ void bio_integrity_init(void); #else /* CONFIG_BLK_DEV_INTEGRITY */ -static inline void *bio_integrity(struct bio *bio) +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { return NULL; } @@ -138,8 +138,8 @@ static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) return false; } -static inline void *bio_integrity_alloc(struct bio *bio, gfp_t gfp, - unsigned int nr) +static inline struct bio_integrity_payload * +bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr) { return ERR_PTR(-EINVAL); } -- cgit v1.2.3 From 85253bac4d02b1f95d6109c221aeccd7a262ec4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 17:10:23 +0200 Subject: block: don't free submitter owned integrity payload on I/O completion Currently __bio_integrity_endio frees the integrity payload unless it is explicitly marked as user-mapped. This means in-kernel callers that allocate their own integrity payload never get to see it on I/O completion. The current two users don't need it as they just pre-mapped PI tuples received over the network, but this limits uses of integrity data lot. Change bio_integrity_endio to call __bio_integrity_endio for block layer generated integrity data only, and leave freeing of submitter allocated integrity data to bio_uninit which also gets called from the final bio_put. This requires that unmapping user mapped or copied integrity data is now always done by the caller, and the special BIP_INTEGRITY_USER flag can go away. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240702151047.1746127-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index cac24dac06ff..3823d9be0d07 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -10,8 +10,7 @@ enum bip_flags { BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ - BIP_INTEGRITY_USER = 1 << 5, /* Integrity payload is user address */ - BIP_COPY_USER = 1 << 6, /* Kernel bounce buffer in use */ + BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ }; struct bio_integrity_payload { -- cgit v1.2.3 From 74cc150282e41c6c0704cd305c9a4392dc64ef4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Jul 2024 17:10:24 +0200 Subject: block: don't free the integrity payload in bio_integrity_unmap_free_user Now that the integrity payload is always freed in bio_uninit, don't bother freeing it a little earlier in bio_integrity_unmap_free_user. With that the separate bio_integrity_unmap_free_user can go away by just passing the bio to bio_integrity_unmap_user. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240702151047.1746127-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 3823d9be0d07..dd831c269e99 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -73,7 +73,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); -void bio_integrity_unmap_free_user(struct bio *bio); +void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); @@ -104,7 +104,7 @@ static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, return -EINVAL; } -static inline void bio_integrity_unmap_free_user(struct bio *bio) +static inline void bio_integrity_unmap_user(struct bio *bio) { } -- cgit v1.2.3 From 1028f391d5f9d4248e2f49193e6de2516ad630f8 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 3 Jul 2024 00:36:46 +0000 Subject: cgroup/misc: Introduce misc.peak Introduce misc.peak to record the historical maximum usage of the resource, as in some scenarios the value of misc.max could be adjusted based on the peak usage of the resource. Signed-off-by: Xiu Jianfeng Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index d70eab2501ee..618392d41975 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -31,11 +31,13 @@ struct misc_cg; /** * struct misc_res: Per cgroup per misc type resource * @max: Maximum limit on the resource. + * @watermark: Historical maximum usage of the resource. * @usage: Current usage of the resource. * @events: Number of times, the resource limit exceeded. */ struct misc_res { u64 max; + atomic64_t watermark; atomic64_t usage; atomic64_t events; }; -- cgit v1.2.3 From a19621ed4e0ac9e1d296d07dc8ff8c27d5c03b43 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 15 May 2024 15:07:06 +0800 Subject: mm: add folio_alloc_mpol() Patch series "mm: convert to folio_alloc_mpol()". This patch (of 4): This adds a new folio_alloc_mpol() like folio_alloc() but allocate folio according to NUMA mempolicy. Link: https://lkml.kernel.org/r/20240515070709.78529-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240515070709.78529-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7f9691d375f0..f53f76e0b17e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -303,6 +303,8 @@ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); +struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); #else @@ -319,6 +321,11 @@ static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } +static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid) +{ + return folio_alloc_noprof(gfp, order); +} #define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \ folio_alloc_noprof(gfp, order) #endif @@ -326,6 +333,7 @@ static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) #define alloc_pages_mpol(...) alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) +#define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -- cgit v1.2.3 From 7f83bf14603ef41a44dc907594d749a283e22c37 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Wed, 15 May 2024 10:47:54 +0800 Subject: mm/huge_memory: mark racy access onhuge_anon_orders_always huge_anon_orders_always is accessed lockless, it is better to use the READ_ONCE() wrapper. This is not fixing any visible bug, hopefully this can cease some KCSAN complains in the future. Also do that for huge_anon_orders_madvise. Link: https://lkml.kernel.org/r/20240515104754889HqrahFPePOIE1UlANHVAh@zte.com.cn Signed-off-by: Ran Xiaokai Acked-by: David Hildenbrand Reviewed-by: Lu Zhongjun Reviewed-by: xu xin Cc: Yang Yang Cc: Matthew Wilcox (Oracle) Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2aa986a5cd1b..088d66a54643 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -134,8 +134,8 @@ static inline bool hugepage_flags_enabled(void) * So we don't need to look at huge_anon_orders_inherit. */ return hugepage_global_enabled() || - huge_anon_orders_always || - huge_anon_orders_madvise; + READ_ONCE(huge_anon_orders_always) || + READ_ONCE(huge_anon_orders_madvise); } static inline int highest_order(unsigned long orders) -- cgit v1.2.3 From 564a2ee9f9f66ceef135b7208cff705d48ad76c4 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:51 +0800 Subject: mm: remove page_file_offset and folio_file_pos These two helpers were useful for mixed usage of swap cache and page cache, which help retrieve the corresponding file or swap device offset of a page or folio. They were introduced in commit f981c5950fa8 ("mm: methods for teaching filesystems about PG_swapcache pages") and used in commit d56b4ddf7781 ("nfs: teach the NFS client how to treat PG_swapcache pages"), suppose to be used with direct_IO for swap over fs. But after commit e1209d3a7a67 ("mm: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space"), swap with direct_IO is no more, and swap cache mapping is never exposed to fs. Now we have dropped all users of page_file_offset and folio_file_pos, so they can be deleted. Link: https://lkml.kernel.org/r/20240521175854.96038-10-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 59f1df0cde5a..78f2cd8c73ff 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -932,11 +932,6 @@ static inline loff_t page_offset(struct page *page) return ((loff_t)page->index) << PAGE_SHIFT; } -static inline loff_t page_file_offset(struct page *page) -{ - return ((loff_t)page_index(page)) << PAGE_SHIFT; -} - /** * folio_pos - Returns the byte position of this folio in its file. * @folio: The folio. @@ -946,18 +941,6 @@ static inline loff_t folio_pos(struct folio *folio) return page_offset(&folio->page); } -/** - * folio_file_pos - Returns the byte position of this folio in its file. - * @folio: The folio. - * - * This differs from folio_pos() for folios which belong to a swap file. - * NFS is the only filesystem today which needs to use folio_file_pos(). - */ -static inline loff_t folio_file_pos(struct folio *folio) -{ - return page_file_offset(&folio->page); -} - /* * Get the offset in PAGE_SIZE (even for hugetlb folios). */ -- cgit v1.2.3 From 05b0c7edad9b8a5ccf1b46b01e1b96fcd10b50d8 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:52 +0800 Subject: mm: drop page_index and simplify folio_index There are two helpers for retrieving the index within address space for mixed usage of swap cache and page cache: - page_index - folio_index This commit drops page_index, as we have eliminated all users, and converts folio_index's helper __page_file_index to use folio to avoid the page conversion. Link: https://lkml.kernel.org/r/20240521175854.96038-11-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 ------------- include/linux/pagemap.h | 8 ++++---- 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index eb7c96d24ac0..99174fac3d47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2295,19 +2295,6 @@ static inline void *folio_address(const struct folio *folio) return page_address(&folio->page); } -extern pgoff_t __page_file_index(struct page *page); - -/* - * Return the pagecache index of the passed page. Regular pagecache pages - * use ->index whereas swapcache pages use swp_offset(->private) - */ -static inline pgoff_t page_index(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return __page_file_index(page); - return page->index; -} - /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 78f2cd8c73ff..1586b5c21e81 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -792,7 +792,7 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } -#define swapcache_index(folio) __page_file_index(&(folio)->page) +extern pgoff_t __folio_swap_cache_index(struct folio *folio); /** * folio_index - File index of a folio. @@ -807,9 +807,9 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, */ static inline pgoff_t folio_index(struct folio *folio) { - if (unlikely(folio_test_swapcache(folio))) - return swapcache_index(folio); - return folio->index; + if (unlikely(folio_test_swapcache(folio))) + return __folio_swap_cache_index(folio); + return folio->index; } /** -- cgit v1.2.3 From 63818aaf0da8c036d600424ac961620dcdc13ce2 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 20 May 2024 15:44:07 -0700 Subject: mm/hugetlb: remove {Set,Clear}Hpage macros All users have been converted to use the folio version of these macros, we can safely remove the page based interface. Link: https://lkml.kernel.org/r/20240520224407.110062-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2b3c3a404769..15a58f69782c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -625,18 +625,14 @@ static __always_inline \ void folio_set_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ set_bit(HPG_##flname, private); \ - } \ -static inline void SetHPage##uname(struct page *page) \ - { set_bit(HPG_##flname, &(page->private)); } + } #define CLEARHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_clear_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ clear_bit(HPG_##flname, private); \ - } \ -static inline void ClearHPage##uname(struct page *page) \ - { clear_bit(HPG_##flname, &(page->private)); } + } #else #define TESTHPAGEFLAG(uname, flname) \ static inline bool \ @@ -648,15 +644,11 @@ static inline int HPage##uname(struct page *page) \ #define SETHPAGEFLAG(uname, flname) \ static inline void \ folio_set_hugetlb_##flname(struct folio *folio) \ - { } \ -static inline void SetHPage##uname(struct page *page) \ { } #define CLEARHPAGEFLAG(uname, flname) \ static inline void \ folio_clear_hugetlb_##flname(struct folio *folio) \ - { } \ -static inline void ClearHPage##uname(struct page *page) \ { } #endif -- cgit v1.2.3 From 6ad28e7e52e2bae76b1d3eb4466999d04d50db92 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 22 May 2024 14:57:13 +0200 Subject: mm/rmap: sanity check that zeropages are not passed to RMAP Using insert_page() we might have previously ended up passing the zeropage into rmap code. Make sure that won't happen again. Note that we won't check the huge zeropage for now, which might still end up in RMAP code. Link: https://lkml.kernel.org/r/20240522125713.775114-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Dan Williams Cc: Vincent Donnefort Signed-off-by: Andrew Morton --- include/linux/rmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 7229b9baf20d..5cb0d419a1d7 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -200,6 +200,9 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio, /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + /* When (un)mapping zeropages, we should never touch ref+mapcount. */ + VM_WARN_ON_FOLIO(is_zero_folio(folio), folio); + /* * TODO: we get driver-allocated folios that have nothing to do with * the rmap using vm_insert_page(); therefore, we cannot assume that -- cgit v1.2.3 From 23b1b44e6c61295084284aa7d87db863a7802b92 Mon Sep 17 00:00:00 2001 From: Bang Li Date: Wed, 22 May 2024 14:12:02 +0800 Subject: mm: add update_mmu_tlb_range() Patch series "Add update_mmu_tlb_range() to simplify code", v4. This series of commits mainly adds the update_mmu_tlb_range() to batch update tlb in an address range and implement update_mmu_tlb() using update_mmu_tlb_range(). After commit 19eaf44954df ("mm: thp: support allocation of anonymous multi-size THP"), We may need to batch update tlb of a certain address range by calling update_mmu_tlb() in a loop. Using the update_mmu_tlb_range(), we can simplify the code and possibly reduce the execution of some unnecessary code in some architectures. This patch (of 3): Add update_mmu_tlb_range(), we can batch update tlb of an address range. Link: https://lkml.kernel.org/r/20240522061204.117421-1-libang.li@antgroup.com Link: https://lkml.kernel.org/r/20240522061204.117421-2-libang.li@antgroup.com Signed-off-by: Bang Li Acked-by: David Hildenbrand Cc: Chris Zankel Cc: Huacai Chen Cc: Lance Yang Cc: Max Filippov Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Ryan Roberts Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 18019f037bae..17d1caee39ab 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -729,6 +729,13 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ +#ifndef update_mmu_tlb_range +static inline void update_mmu_tlb_range(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) +{ +} +#endif + #ifndef __HAVE_ARCH_UPDATE_MMU_TLB static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) -- cgit v1.2.3 From 8f65aa32239f1c3f11b7a25bd5921223bafc5fed Mon Sep 17 00:00:00 2001 From: Bang Li Date: Wed, 22 May 2024 14:12:03 +0800 Subject: mm: implement update_mmu_tlb() using update_mmu_tlb_range() Let's make update_mmu_tlb() simply a generic wrapper around update_mmu_tlb_range(). Only the latter can now be overridden by the architecture. We can now remove __HAVE_ARCH_UPDATE_MMU_TLB as well. Link: https://lkml.kernel.org/r/20240522061204.117421-3-libang.li@antgroup.com Signed-off-by: Bang Li Acked-by: David Hildenbrand Cc: Chris Zankel Cc: Huacai Chen Cc: Lance Yang Cc: Max Filippov Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Ryan Roberts Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 17d1caee39ab..117b807e3f89 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -736,13 +736,11 @@ static inline void update_mmu_tlb_range(struct vm_area_struct *vma, } #endif -#ifndef __HAVE_ARCH_UPDATE_MMU_TLB static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + update_mmu_tlb_range(vma, address, ptep, 1); } -#define __HAVE_ARCH_UPDATE_MMU_TLB -#endif /* * Some architectures may be able to avoid expensive synchronization -- cgit v1.2.3 From b8b9488d50b7150bd4830dfff487e8d4ef6589ba Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Fri, 24 May 2024 15:53:04 -0600 Subject: mm/memory-failure: improve memory failure action_result messages Added two explicit MF_MSG messages describing failure in get_hwpoison_page. Attemped to document the definition of various action names, and made a few adjustment to the action_result() calls. Link: https://lkml.kernel.org/r/20240524215306.2705454-4-jane.chu@oracle.com Signed-off-by: Jane Chu Reviewed-by: Oscar Salvador Acked-by: Miaohe Lin Cc: Naoya Horiguchi Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 99174fac3d47..f4bf94ca99e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4096,6 +4096,7 @@ enum mf_action_page_type { MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, + MF_MSG_GET_HWPOISON, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, @@ -4109,6 +4110,7 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, + MF_MSG_ALREADY_POISONED, MF_MSG_UNKNOWN, }; -- cgit v1.2.3 From 188f87f2648b13f5de17d5e068f18d317e0c1f98 Mon Sep 17 00:00:00 2001 From: Eric Chanudet Date: Wed, 22 May 2024 16:38:01 -0400 Subject: mm/mm_init: use node's number of cpus in deferred_page_init_max_threads x86_64 is already using the node's cpu as maximum threads. Make that the default for all archs setting DEFERRED_STRUCT_PAGE_INIT. This returns to the behavior prior making the function arch-specific with commit ecd096506922 ("mm: make deferred init's max threads arch-specific"). Setting DEFERRED_STRUCT_PAGE_INIT and testing on a few arm64 platforms shows faster deferred_init_memmap completions: | | x13s | SA8775p-ride | Ampere R137-P31 | Ampere HR330 | | | Metal, 32GB | VM, 36GB | VM, 58GB | Metal, 128GB | | | 8cpus | 8cpus | 8cpus | 32cpus | |---------|-------------|--------------|-----------------|--------------| | threads | ms (%) | ms (%) | ms (%) | ms (%) | |---------|-------------|--------------|-----------------|--------------| | 1 | 108 (0%) | 72 (0%) | 224 (0%) | 324 (0%) | | cpus | 24 (-77%) | 36 (-50%) | 40 (-82%) | 56 (-82%) | Michael Ellerman reported: : On a machine here (1TB, 40 cores, 4KB pages) the existing code gives: : : [ 0.500124] node 2 deferred pages initialised in 210ms : [ 0.515790] node 3 deferred pages initialised in 230ms : [ 0.516061] node 0 deferred pages initialised in 230ms : [ 0.516522] node 7 deferred pages initialised in 230ms : [ 0.516672] node 4 deferred pages initialised in 230ms : [ 0.516798] node 6 deferred pages initialised in 230ms : [ 0.517051] node 5 deferred pages initialised in 230ms : [ 0.523887] node 1 deferred pages initialised in 240ms : : vs with the patch: : : [ 0.379613] node 0 deferred pages initialised in 90ms : [ 0.380388] node 1 deferred pages initialised in 90ms : [ 0.380540] node 4 deferred pages initialised in 100ms : [ 0.390239] node 6 deferred pages initialised in 100ms : [ 0.390249] node 2 deferred pages initialised in 100ms : [ 0.390786] node 3 deferred pages initialised in 110ms : [ 0.396721] node 5 deferred pages initialised in 110ms : [ 0.397095] node 7 deferred pages initialised in 110ms : : Which is a nice speedup. [echanude@redhat.com: v3] Link: https://lkml.kernel.org/r/20240528185455.643227-4-echanude@redhat.com Link: https://lkml.kernel.org/r/20240522203758.626932-4-echanude@redhat.com Signed-off-by: Eric Chanudet Tested-by: Michael Ellerman (powerpc) Reviewed-by: Baoquan He Acked-by: Alexander Gordeev Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/memblock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e2082240586d..40c62aca36ec 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -335,8 +335,6 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, for (; i != U64_MAX; \ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) -int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); - #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** -- cgit v1.2.3 From ffc3c8a631eeadd0de63605cecfb6ba544245352 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 09:49:50 +0800 Subject: mm: memcontrol: remove page_memcg() The page_memcg() only called by mod_memcg_page_state(), so squash it to cleanup page_memcg(). Link: https://lkml.kernel.org/r/20240524014950.187805-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 030d34e9d117..3d1599146afe 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -443,11 +443,6 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return __folio_memcg(folio); } -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return folio_memcg(page_folio(page)); -} - /** * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. * @folio: Pointer to the folio. @@ -1014,7 +1009,7 @@ static inline void mod_memcg_page_state(struct page *page, return; rcu_read_lock(); - memcg = page_memcg(page); + memcg = folio_memcg(page_folio(page)); if (memcg) mod_memcg_state(memcg, idx, val); rcu_read_unlock(); @@ -1133,11 +1128,6 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return NULL; } -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return NULL; -} - static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { WARN_ON_ONCE(!rcu_read_lock_held()); @@ -1636,7 +1626,7 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, spin_unlock_irqrestore(&lruvec->lru_lock, flags); } -/* Test requires a stable page->memcg binding, see page_memcg() */ +/* Test requires a stable folio->memcg binding, see folio_memcg() */ static inline bool folio_matches_lruvec(struct folio *folio, struct lruvec *lruvec) { -- cgit v1.2.3 From 06668257a355a05483c188e35a18c80471d06987 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 24 May 2024 19:18:10 +0100 Subject: mm: remove page_mapping() All callers are now converted, delete this compatibility wrapper. Also fix up some comments which referred to page_mapping. Link: https://lkml.kernel.org/r/20240423225552.4113447-7-willy@infradead.org Link: https://lkml.kernel.org/r/20240524181813.698813-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Eric Biggers Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 2 +- include/linux/page-flags.h | 23 ++++++++++++----------- include/linux/pagemap.h | 1 - 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e022e40b099e..14acf1bbe0ce 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -53,7 +53,7 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); * filesystem and block layers. Nowadays the basic I/O unit * is the bio, and buffer_heads are used for extracting block * mappings (via a get_block_t call), for tracking state within - * a page (via a page_mapping) and for wrapping bio submission + * a folio (via a folio_mapping) and for wrapping bio submission * for backward compatibility reasons (e.g. submit_bh). */ struct buffer_head { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b9e914e1face..813eea2efe26 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -655,27 +655,28 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif /* - * On an anonymous page mapped into a user virtual memory area, - * page->mapping points to its anon_vma, not to a struct address_space; + * On an anonymous folio mapped into a user virtual memory area, + * folio->mapping points to its anon_vma, not to a struct address_space; * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON - * bit; and then page->mapping points, not to an anon_vma, but to a private + * bit; and then folio->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged page. See ksm.h. * * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable - * page and then page->mapping points to a struct movable_operations. + * page and then folio->mapping points to a struct movable_operations. * - * Please note that, confusingly, "page_mapping" refers to the inode - * address_space which maps the page from disk; whereas "page_mapped" - * refers to user virtual address space into which the page is mapped. + * Please note that, confusingly, "folio_mapping" refers to the inode + * address_space which maps the folio from disk; whereas "folio_mapped" + * refers to user virtual address space into which the folio is mapped. * * For slab pages, since slab reuses the bits in struct page to store its - * internal states, the page->mapping does not exist as such, nor do these - * flags below. So in order to avoid testing non-existent bits, please - * make sure that PageSlab(page) actually evaluates to false before calling - * the following functions (e.g., PageAnon). See mm/slab.h. + * internal states, the folio->mapping does not exist as such, nor do + * these flags below. So in order to avoid testing non-existent bits, + * please make sure that folio_test_slab(folio) actually evaluates to + * false before calling the following functions (e.g., folio_test_anon). + * See mm/slab.h. */ #define PAGE_MAPPING_ANON 0x1 #define PAGE_MAPPING_MOVABLE 0x2 diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 1586b5c21e81..e37e16ebff7a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -426,7 +426,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) #endif } -struct address_space *page_mapping(struct page *); struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); -- cgit v1.2.3 From 1df07243483c1a32c8f2f93b06dc52a583a4dd1c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 13:36:18 +0800 Subject: rmap: remove DEFINE_PAGE_VMA_WALK() This are no users since commit 40d707f33db5 ("mm/ksm: use folio in write_protect_page"), so remove DEFINE_PAGE_VMA_WALK(). Link: https://lkml.kernel.org/r/20240524053618.208895-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Cc: Alex Shi Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/rmap.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5cb0d419a1d7..bb53e5920b88 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -684,16 +684,6 @@ struct page_vma_mapped_walk { unsigned int flags; }; -#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \ - struct page_vma_mapped_walk name = { \ - .pfn = page_to_pfn(_page), \ - .nr_pages = compound_nr(_page), \ - .pgoff = page_to_pgoff(_page), \ - .vma = _vma, \ - .address = _address, \ - .flags = _flags, \ - } - #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ .pfn = folio_pfn(_folio), \ -- cgit v1.2.3 From 940d6683c79950b21b3762124eabfa9b2f6fee96 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 13:28:42 +0800 Subject: mm: migrate: remove migrate_folio_extra() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migrate_folio_extra() is only called in migrate.c now, convert it a static function and take a new src_private argument which could be shared by migrate_folio() and filemap_migrate_folio() to simplify code a bit. Link: https://lkml.kernel.org/r/20240524052843.182275-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Tony Luck Cc: Vishal Moola (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/migrate.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 2ce13e8a309b..517f70b70620 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -63,8 +63,6 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION void putback_movable_pages(struct list_head *l); -int migrate_folio_extra(struct address_space *mapping, struct folio *dst, - struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free, -- cgit v1.2.3 From 906632843d00a4a42072b19c423b30a9e7adef3c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 13:28:43 +0800 Subject: mm: remove MIGRATE_SYNC_NO_COPY mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 2916ecc0f9d4 ("mm/migrate: new migrate mode MIGRATE_SYNC_NO_COPY") introduce a new MIGRATE_SYNC_NO_COPY mode to allow to offload the copy to a device DMA engine, which is only used __migrate_device_pages() to decide whether or not copy the old page, and the MIGRATE_SYNC_NO_COPY mode only set in hmm, as the MIGRATE_SYNC_NO_COPY set is removed by previous cleanup, it seems that we could remove the unnecessary MIGRATE_SYNC_NO_COPY. Link: https://lkml.kernel.org/r/20240524052843.182275-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Tony Luck Cc: Vishal Moola (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/migrate_mode.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index f37cc03f9369..9fb482bb7323 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -7,16 +7,11 @@ * on most operations but not ->writepage as the potential stall time * is too significant * MIGRATE_SYNC will block when migrating pages - * MIGRATE_SYNC_NO_COPY will block when migrating pages but will not copy pages - * with the CPU. Instead, page copy happens outside the migratepage() - * callback and is likely using a DMA engine. See migrate_vma() and HMM - * (mm/hmm.c) for users of this mode. */ enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC, - MIGRATE_SYNC_NO_COPY, }; enum migrate_reason { -- cgit v1.2.3 From ebfba0045176cb013f49cb3e5bd9f0b16eba203c Mon Sep 17 00:00:00 2001 From: Chuanhua Han Date: Wed, 29 May 2024 20:28:19 +1200 Subject: mm: swap: introduce swap_free_nr() for batched swap_free() Patch series "large folios swap-in: handle refault cases first", v5. This patchset is extracted from the large folio swapin series[1], primarily addressing the handling of scenarios involving large folios in the swap cache. Currently, it is particularly focused on addressing the refaulting of mTHP, which is still undergoing reclamation. This approach aims to streamline code review and expedite the integration of this segment into the MM tree. It relies on Ryan's swap-out series[2], leveraging the helper function swap_pte_batch() introduced by that series. Presently, do_swap_page only encounters a large folio in the swap cache before the large folio is released by vmscan. However, the code should remain equally useful once we support large folio swap-in via swapin_readahead(). This approach can effectively reduce page faults and eliminate most redundant checks and early exits for MTE restoration in recent MTE patchset[3]. The large folio swap-in for SWP_SYNCHRONOUS_IO and swapin_readahead() will be split into separate patch sets and sent at a later time. [1] https://lore.kernel.org/linux-mm/20240304081348.197341-1-21cnbao@gmail.com/ [2] https://lore.kernel.org/linux-mm/20240408183946.2991168-1-ryan.roberts@arm.com/ [3] https://lore.kernel.org/linux-mm/20240322114136.61386-1-21cnbao@gmail.com/ This patch (of 6): While swapping in a large folio, we need to free swaps related to the whole folio. To avoid frequently acquiring and releasing swap locks, it is better to introduce an API for batched free. Furthermore, this new function, swap_free_nr(), is designed to efficiently handle various scenarios for releasing a specified number, nr, of swap entries. Link: https://lkml.kernel.org/r/20240529082824.150954-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240529082824.150954-2-21cnbao@gmail.com Signed-off-by: Chuanhua Han Co-developed-by: Barry Song Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Acked-by: Chris Li Reviewed-by: "Huang, Ying" Cc: Baolin Wang Cc: David Hildenbrand Cc: Gao Xiang Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Cc: Andreas Larsson Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Khalid Aziz Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index bd450023b9a4..0f41fe49c9dc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -478,6 +478,7 @@ extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); +extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); @@ -559,6 +560,10 @@ static inline void swap_free(swp_entry_t swp) { } +static inline void swap_free_nr(swp_entry_t entry, int nr_pages) +{ +} + static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } -- cgit v1.2.3 From 54f7a49c20ebb5189980c53e6e66709d22bee572 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 29 May 2024 20:28:20 +1200 Subject: mm: remove the implementation of swap_free() and always use swap_free_nr() To streamline maintenance efforts, we propose removing the implementation of swap_free(). Instead, we can simply invoke swap_free_nr() with nr set to 1. swap_free_nr() is designed with a bitmap consisting of only one long, resulting in overhead that can be ignored for cases where nr equals 1. A prime candidate for leveraging swap_free_nr() lies within kernel/power/swap.c. Implementing this change facilitates the adoption of batch processing for hibernation. Link: https://lkml.kernel.org/r/20240529082824.150954-3-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Acked-by: Chris Li Reviewed-by: Ryan Roberts Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Len Brown Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Andreas Larsson Cc: Baolin Wang Cc: Chuanhua Han Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gao Xiang Cc: Johannes Weiner Cc: Kairui Song Cc: Khalid Aziz Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0f41fe49c9dc..d33ce740b695 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -477,7 +477,6 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); -extern void swap_free(swp_entry_t); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); @@ -556,10 +555,6 @@ static inline int swapcache_prepare(swp_entry_t swp) return 0; } -static inline void swap_free(swp_entry_t swp) -{ -} - static inline void swap_free_nr(swp_entry_t entry, int nr_pages) { } @@ -608,6 +603,11 @@ static inline void free_swap_and_cache(swp_entry_t entry) free_swap_and_cache_nr(entry, 1); } +static inline void swap_free(swp_entry_t entry) +{ + swap_free_nr(entry, 1); +} + #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { -- cgit v1.2.3 From 29f252cdc293f4a50b5d3dcbed53701d8444614d Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 29 May 2024 20:28:22 +1200 Subject: mm: introduce arch_do_swap_page_nr() which allows restore metadata for nr pages Should do_swap_page() have the capability to directly map a large folio, metadata restoration becomes necessary for a specified number of pages denoted as nr. It's important to highlight that metadata restoration is solely required by the SPARC platform, which, however, does not enable THP_SWAP. Consequently, in the present kernel configuration, there exists no practical scenario where users necessitate the restoration of nr metadata. Platforms implementing THP_SWAP might invoke this function with nr values exceeding 1, subsequent to do_swap_page() successfully mapping an entire large folio. Nonetheless, their arch_do_swap_page_nr() functions remain empty. Link: https://lkml.kernel.org/r/20240529082824.150954-5-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Reviewed-by: Khalid Aziz Cc: "David S. Miller" Cc: Andreas Larsson Cc: Baolin Wang Cc: Chris Li Cc: Christoph Hellwig Cc: Chuanhua Han Cc: David Hildenbrand Cc: Gao Xiang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Len Brown Cc: Matthew Wilcox (Oracle) Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 117b807e3f89..2f32eaccf0b9 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1089,6 +1089,15 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) }) #ifndef __HAVE_ARCH_DO_SWAP_PAGE +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + +} +#else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be @@ -1097,12 +1106,17 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ -static inline void arch_do_swap_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long addr, - pte_t pte, pte_t oldpte) -{ - +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + for (int i = 0; i < nr; i++) { + arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, + pte_advance_pfn(pte, i), + pte_advance_pfn(oldpte, i)); + } } #endif -- cgit v1.2.3 From 16540dae959d862909716d5c854e9b3aee285609 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 30 May 2024 10:14:27 -0700 Subject: mm/hugetlb: mm/memory_hotplug: use a folio in scan_movable_pages() By using a folio in scan_movable_pages() we convert the last user of the page-based hugetlb information macro functions to the folio version. After this conversion, we can safely remove the page-based definitions from include/linux/hugetlb.h. Link: https://lkml.kernel.org/r/20240530171427.242018-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 15a58f69782c..279aca379b95 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -616,9 +616,7 @@ static __always_inline \ bool folio_test_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ return test_bit(HPG_##flname, private); \ - } \ -static inline int HPage##uname(struct page *page) \ - { return test_bit(HPG_##flname, &(page->private)); } + } #define SETHPAGEFLAG(uname, flname) \ static __always_inline \ @@ -637,8 +635,6 @@ void folio_clear_hugetlb_##flname(struct folio *folio) \ #define TESTHPAGEFLAG(uname, flname) \ static inline bool \ folio_test_hugetlb_##flname(struct folio *folio) \ - { return 0; } \ -static inline int HPage##uname(struct page *page) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ -- cgit v1.2.3 From fefc6e6631ff43427e81f08c8e49f7787ff0213a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 28 May 2024 09:40:50 -0700 Subject: memcg: rearrange fields of mem_cgroup_per_node Kernel test robot reported [1] performance regression for will-it-scale test suite's page_fault2 test case for the commit 70a64b7919cb ("memcg: dynamically allocate lruvec_stats"). After inspection it seems like the commit has unintentionally introduced false cache sharing. After the commit the fields of mem_cgroup_per_node which get read on the performance critical path share the cacheline with the fields which get updated often on LRU page allocations or deallocations. This has caused contention on that cacheline and the workloads which manipulates a lot of LRU pages are regressed as reported by the test report. The solution is to rearrange the fields of mem_cgroup_per_node such that the false sharing is eliminated. Let's move all the read only pointers at the start of the struct, followed by memcg-v1 only fields and at the end fields which get updated often. Experiment setup: Ran fallocate1, fallocate2, page_fault1, page_fault2 and page_fault3 from the will-it-scale test suite inside a three level memcg with /tmp mounted as tmpfs on two different machines, one a single numa node and the other one, two node machine. $ ./[testcase]_processes -t $NR_CPUS -s 50 Results for single node, 52 CPU machine: Testcase base with-patch fallocate1 1031081 1431291 (38.80 %) fallocate2 1029993 1421421 (38.00 %) page_fault1 2269440 3405788 (50.07 %) page_fault2 2375799 3572868 (50.30 %) page_fault3 28641143 28673950 ( 0.11 %) Results for dual node, 80 CPU machine: Testcase base with-patch fallocate1 2976288 3641185 (22.33 %) fallocate2 2979366 3638181 (22.11 %) page_fault1 6221790 7748245 (24.53 %) page_fault2 6482854 7847698 (21.05 %) page_fault3 28804324 28991870 ( 0.65 %) Link: https://lkml.kernel.org/r/20240528164050.2625718-1-shakeel.butt@linux.dev Fixes: 70a64b7919cb ("memcg: dynamically allocate lruvec_stats") Signed-off-by: Shakeel Butt Reported-by: kernel test robot Reviewed-by: Yosry Ahmed Reviewed-by: Roman Gushchin Cc: Feng Tang Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3d1599146afe..7403dd5926eb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -96,23 +96,29 @@ struct mem_cgroup_reclaim_iter { * per-node information in memory controller. */ struct mem_cgroup_per_node { - struct lruvec lruvec; + /* Keep the read-only fields at the start */ + struct mem_cgroup *memcg; /* Back pointer, we cannot */ + /* use container_of */ struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; struct lruvec_stats *lruvec_stats; - - unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; - - struct mem_cgroup_reclaim_iter iter; - struct shrinker_info __rcu *shrinker_info; + /* + * Memcg-v1 only stuff in middle as buffer between read mostly fields + * and update often fields to avoid false sharing. Once v1 stuff is + * moved in a separate struct, an explicit padding is needed. + */ + struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; - struct mem_cgroup *memcg; /* Back pointer, we cannot */ - /* use container_of */ + + /* Fields which get updated often at the end. */ + struct lruvec lruvec; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; + struct mem_cgroup_reclaim_iter iter; }; struct mem_cgroup_threshold { -- cgit v1.2.3 From 21664442be1bf79094dd9d21b8833acbfbd80ea7 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 28 May 2024 16:43:13 +0200 Subject: percpu: add __this_cpu_try_cmpxchg() Add __this_cpu_try_cmpxchg() version of the percpu op. Link: https://lkml.kernel.org/r/20240528144345.5980-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Dennis Zhou Cc: Christoph Hellwig Cc: Lorenzo Stoakes Cc: Tejun Heo Cc: Christoph Lameter Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index ec3573119923..8efce7414fad 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -475,6 +475,12 @@ do { \ raw_cpu_cmpxchg(pcp, oval, nval); \ }) +#define __this_cpu_try_cmpxchg(pcp, ovalp, nval) \ +({ \ + __this_cpu_preempt_check("try_cmpxchg"); \ + raw_cpu_try_cmpxchg(pcp, ovalp, nval); \ +}) + #define __this_cpu_sub(pcp, val) __this_cpu_add(pcp, -(typeof(pcp))(val)) #define __this_cpu_inc(pcp) __this_cpu_add(pcp, 1) #define __this_cpu_dec(pcp) __this_cpu_sub(pcp, 1) -- cgit v1.2.3 From 4b98995530b77a97912230d8e1564ba7738db19c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:07 +0800 Subject: mm: shmem: add multi-size THP sysfs interface for anonymous shmem To support the use of mTHP with anonymous shmem, add a new sysfs interface 'shmem_enabled' in the '/sys/kernel/mm/transparent_hugepage/hugepages-kB/' directory for each mTHP to control whether shmem is enabled for that mTHP, with a value similar to the top level 'shmem_enabled', which can be set to: "always", "inherit (to inherit the top level setting)", "within_size", "advise", "never". An 'inherit' option is added to ensure compatibility with these global settings, and the options 'force' and 'deny' are dropped, which are rather testing artifacts from the old ages. By default, PMD-sized hugepages have enabled="inherit" and all other hugepage sizes have enabled="never" for '/sys/kernel/mm/transparent_hugepage/hugepages-xxkB/shmem_enabled'. In addition, if top level value is 'force', then only PMD-sized hugepages have enabled="inherit", otherwise configuration will be failed and vice versa. That means now we will avoid using non-PMD sized THP to override the global huge allocation. [baolin.wang@linux.alibaba.com: fix transhuge.rst indentation] Link: https://lkml.kernel.org/r/b189d815-998b-4dfd-ba89-218ff51313f8@linux.alibaba.com [akpm@linux-foundation.org: reflow transhuge.rst addition to 80 cols] [baolin.wang@linux.alibaba.com: move huge_shmem_orders_lock under CONFIG_SYSFS] Link: https://lkml.kernel.org/r/eb34da66-7f12-44f3-a39e-2bcc90c33354@linux.alibaba.com [akpm@linux-foundation.org: huge_memory.c needs mm_types.h] Link: https://lkml.kernel.org/r/ffddfa8b3cb4266ff963099ab78cfd7184c57ac7.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 088d66a54643..6a1edd752968 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,6 +6,7 @@ #include #include /* only for vma_is_dax() */ +#include vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -63,6 +64,7 @@ ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; +extern struct kobj_attribute thpsize_shmem_enabled_attr; /* * Mask of all large folio orders supported for anonymous THP; all orders up to @@ -265,6 +267,14 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); } +struct thpsize { + struct kobject kobj; + struct list_head node; + int order; +}; + +#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) + enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, -- cgit v1.2.3 From e7a2ab7b3bb5d87f99f2ea3d4481d52fc5ceb52d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:08 +0800 Subject: mm: shmem: add mTHP support for anonymous shmem Commit 19eaf44954df adds multi-size THP (mTHP) for anonymous pages, that can allow THP to be configured through the sysfs interface located at '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled'. However, the anonymous shmem will ignore the anonymous mTHP rule configured through the sysfs interface, and can only use the PMD-mapped THP, that is not reasonable. Users expect to apply the mTHP rule for all anonymous pages, including the anonymous shmem, in order to enjoy the benefits of mTHP. For example, lower latency than PMD-mapped THP, smaller memory bloat than PMD-mapped THP, contiguous PTEs on ARM architecture to reduce TLB miss etc. In addition, the mTHP interfaces can be extended to support all shmem/tmpfs scenarios in the future, especially for the shmem mmap() case. The primary strategy is similar to supporting anonymous mTHP. Introduce a new interface '/mm/transparent_hugepage/hugepage-XXkb/shmem_enabled', which can have almost the same values as the top-level '/sys/kernel/mm/transparent_hugepage/shmem_enabled', with adding a new additional "inherit" option and dropping the testing options 'force' and 'deny'. By default all sizes will be set to "never" except PMD size, which is set to "inherit". This ensures backward compatibility with the anonymous shmem enabled of the top level, meanwhile also allows independent control of anonymous shmem enabled for each mTHP. Link: https://lkml.kernel.org/r/65796c1e72e51e15f3410195b5c2d5b6c160d411.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 6a1edd752968..53b7a137f460 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -560,6 +560,16 @@ static inline bool thp_migration_supported(void) { return false; } + +static inline int highest_order(unsigned long orders) +{ + return 0; +} + +static inline int next_order(unsigned long *orders, int prev) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, -- cgit v1.2.3 From 66f44583f9b617d74ffa2487e75a9c3adf344ddb Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:10 +0800 Subject: mm: shmem: add mTHP counters for anonymous shmem Add mTHP counters for anonymous shmem. [baolin.wang@linux.alibaba.com: update Documentation/admin-guide/mm/transhuge.rst] Link: https://lkml.kernel.org/r/d86e2e7f-4141-432b-b2ba-c6691f36ef0b@linux.alibaba.com Link: https://lkml.kernel.org/r/4fd9e467d49ae4a747e428bcd821c7d13125ae67.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Lance Yang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 53b7a137f460..7ad41de5eaea 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -281,6 +281,9 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, + MTHP_STAT_FILE_ALLOC, + MTHP_STAT_FILE_FALLBACK, + MTHP_STAT_FILE_FALLBACK_CHARGE, __MTHP_STAT_COUNT }; -- cgit v1.2.3 From cdd9a571b7d8465353fe2e2d8d1edd8a58d82021 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Jun 2024 14:23:56 +0200 Subject: fs/proc: move page_mapcount() to fs/proc/internal.h ... and rename it to folio_precise_page_mapcount(). fs/proc is the last remaining user, and that should stay that way. While at it, cleanup kpagecount_read() a bit: there are still some legacy leftovers -- when the interface was introduced it returned the page refcount, but was changed briefly afterwards to return the page mapcount. Further, some simple folio conversion. Once we stop using the per-page mapcounts of large folios, all folio_precise_page_mapcount() users will have to implement an alternative way to achieve what they are trying to achieve, possibly in a less precise way. [dan.carpenter@linaro.org: fix uninitialized variable in pagemap_pmd_range()] Link: https://lkml.kernel.org/r/9d6eaba7-92f8-4a70-8765-38a519680a87@moroto.mountain Link: https://lkml.kernel.org/r/20240607122357.115423-6-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Dan Carpenter Cc: Alexey Dobriyan Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f4bf94ca99e3..7f864d5f52b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1202,8 +1202,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for - * debugging purposes - it does not include PTE-mapped sub-pages; look - * at folio_mapcount() or page_mapcount() instead. + * debugging purposes or implementation of other core folio_*() primitives. */ static inline int folio_entire_mapcount(const struct folio *folio) { @@ -1221,30 +1220,6 @@ static inline void page_mapcount_reset(struct page *page) atomic_set(&(page)->_mapcount, -1); } -/** - * page_mapcount() - Number of times this precise page is mapped. - * @page: The page. - * - * The number of times this page is mapped. If this page is part of - * a large folio, it includes the number of times this page is mapped - * as part of that folio. - * - * Will report 0 for pages which cannot be mapped into userspace, eg - * slab, page tables and similar. - */ -static inline int page_mapcount(struct page *page) -{ - int mapcount = atomic_read(&page->_mapcount) + 1; - - /* Handle page_has_type() pages */ - if (mapcount < PAGE_MAPCOUNT_RESERVE + 1) - mapcount = 0; - if (unlikely(PageCompound(page))) - mapcount += folio_entire_mapcount(page_folio(page)); - - return mapcount; -} - static inline int folio_large_mapcount(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); -- cgit v1.2.3 From 7a581204b1fa4fed233ed3b7ffcf6847cc383dbc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Jun 2024 10:37:10 +0200 Subject: mm/highmem: reimplement totalhigh_pages() by walking zones Patch series "mm/highmem: don't track highmem pages manually". Let's remove highmem special-casing from adjust_managed_page_count(), to result in less confusion why memblock manually adjusts totalram_pages, and __free_pages_core() only adjusts the zone's managed pages -- what about the highmem pages that adjust_managed_page_count() updates? Now, we only maintain totalram_pages and a zone's managed pages independent of highmem support. We can derive the number of highmem pages simply by looking at the relevant zone's managed pages. I don't think there is any particular fast path that needs a maximum-efficient totalhigh_pages() implementation. Note that highmem memory is currently initialized using free_highmem_page()->free_reserved_page(), not __free_pages_core(). In the future we might want to also use __free_pages_core() to initialize highmem memory, to make that less special, and consider moving totalram_pages updates into __free_pages_core() [1], so we can just use adjust_managed_page_count() in there as well. Booting a simple kernel in QEMU reveals no highmem accounting change: Before: Memory: 3095448K/3145208K available (14802K kernel code, 2073K rwdata, 5000K rodata, 740K init, 556K bss, 49760K reserved, 0K cma-reserved, 2244488K highmem) After: Memory: 3095276K/3145208K available (14802K kernel code, 2073K rwdata, 5000K rodata, 740K init, 556K bss, 49932K reserved, 0K cma-reserved, 2244488K highmem) [1] https://lkml.kernel.org/r/20240601133402.2675-1-richard.weiyang@gmail.com This patch (of 2): Can we get rid of the highmem ifdef in adjust_managed_page_count()? Likely yes: we don't have that many totalhigh_pages() users, and they all don't seem to be very performance critical. So let's implement totalhigh_pages() like nr_free_highpages(), collecting information from all zones. This is now similar to what we do in si_meminfo_node() to collect the per-node highmem page count. In the common case (single node, 3-4 zones), we really shouldn't care. We could optimize a bit further (only walk ZONE_HIGHMEM and ZONE_MOVABLE if required), but there doesn't seem a real need for that. [david@redhat.com: fix build bot complaint] Link: https://lkml.kernel.org/r/b57e5bc4-eb72-40e3-add4-57dfa6e03df6@redhat.com Link: https://lkml.kernel.org/r/20240607083711.62833-1-david@redhat.com Link: https://lkml.kernel.org/r/20240607083711.62833-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Wei Yang Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/highmem-internal.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index a3028e400a9c..65f865fbbac0 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -132,7 +132,7 @@ static inline void __kunmap_atomic(const void *addr) } unsigned int __nr_free_highpages(void); -extern atomic_long_t _totalhigh_pages; +unsigned long __totalhigh_pages(void); static inline unsigned int nr_free_highpages(void) { @@ -141,12 +141,7 @@ static inline unsigned int nr_free_highpages(void) static inline unsigned long totalhigh_pages(void) { - return (unsigned long)atomic_long_read(&_totalhigh_pages); -} - -static inline void totalhigh_pages_add(long count) -{ - atomic_long_add(count, &_totalhigh_pages); + return __totalhigh_pages(); } static inline bool is_kmap_addr(const void *x) -- cgit v1.2.3 From 90b8fab5cdc441735d388e286c715da7c85b24f1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Jun 2024 10:37:11 +0200 Subject: mm/highmem: make nr_free_highpages() return "unsigned long" It looks rather weird that totalhigh_pages() returns an "unsigned long" but nr_free_highpages() returns an "unsigned int". Let's return an "unsigned long" from nr_free_highpages() to be consistent. While at it, use a plain "0" instead of a "0UL" in the !CONFIG_HIGHMEM totalhigh_pages() implementation, to make these look alike as well. Link: https://lkml.kernel.org/r/20240607083711.62833-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Wei Yang Signed-off-by: Andrew Morton --- include/linux/highmem-internal.h | 8 ++++---- include/linux/highmem.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 65f865fbbac0..dd100e849f5e 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -131,10 +131,10 @@ static inline void __kunmap_atomic(const void *addr) preempt_enable(); } -unsigned int __nr_free_highpages(void); +unsigned long __nr_free_highpages(void); unsigned long __totalhigh_pages(void); -static inline unsigned int nr_free_highpages(void) +static inline unsigned long nr_free_highpages(void) { return __nr_free_highpages(); } @@ -234,8 +234,8 @@ static inline void __kunmap_atomic(const void *addr) preempt_enable(); } -static inline unsigned int nr_free_highpages(void) { return 0; } -static inline unsigned long totalhigh_pages(void) { return 0UL; } +static inline unsigned long nr_free_highpages(void) { return 0; } +static inline unsigned long totalhigh_pages(void) { return 0; } static inline bool is_kmap_addr(const void *x) { diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 00341b56d291..fa6891e06316 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -179,7 +179,7 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset); static inline void *kmap_atomic(struct page *page); /* Highmem related interfaces for management code */ -static inline unsigned int nr_free_highpages(void); +static inline unsigned long nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); #ifndef ARCH_HAS_FLUSH_ANON_PAGE -- cgit v1.2.3 From 29e847d2ade3cdff36afe095fdbeb9b5f71a197a Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 14 Jun 2024 09:51:37 +0800 Subject: mm/rmap: integrate PMD-mapped folio splitting into pagewalk loop In preparation for supporting try_to_unmap_one() to unmap PMD-mapped folios, start the pagewalk first, then call split_huge_pmd_address() to split the folio. Link: https://lkml.kernel.org/r/20240614015138.31461-3-ioworker0@gmail.com Signed-off-by: Lance Yang Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Suggested-by: Baolin Wang Acked-by: Zi Yan Cc: Bang Li Cc: Barry Song Cc: Fangrui Song Cc: Jeff Xie Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: SeongJae Park Cc: Yang Shi Cc: Yin Fengwei Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ++++++ include/linux/rmap.h | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7ad41de5eaea..9f720b0731c4 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -428,6 +428,9 @@ static inline bool thp_migration_supported(void) return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } +void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, bool freeze, struct folio *folio); + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool folio_test_pmd_mappable(struct folio *folio) @@ -490,6 +493,9 @@ static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio) {} +static inline void split_huge_pmd_locked(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + bool freeze, struct folio *folio) {} #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bb53e5920b88..bf46787c8eba 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -703,6 +703,30 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) spin_unlock(pvmw->ptl); } +/** + * page_vma_mapped_walk_restart - Restart the page table walk. + * @pvmw: Pointer to struct page_vma_mapped_walk. + * + * It restarts the page table walk when changes occur in the page + * table, such as splitting a PMD. Ensures that the PTL held during + * the previous walk is released and resets the state to allow for + * a new walk starting at the current address stored in pvmw->address. + */ +static inline void +page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw) +{ + WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte); + + if (likely(pvmw->ptl)) + spin_unlock(pvmw->ptl); + else + WARN_ON_ONCE(1); + + pvmw->ptl = NULL; + pvmw->pmd = NULL; + pvmw->pte = NULL; +} + bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); /* -- cgit v1.2.3 From 735ecdfaf4e802209caf34b7ac45adf448c54ccc Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 14 Jun 2024 09:51:38 +0800 Subject: mm/vmscan: avoid split lazyfree THP during shrink_folio_list() When the user no longer requires the pages, they would use madvise(MADV_FREE) to mark the pages as lazy free. Subsequently, they typically would not re-write to that memory again. During memory reclaim, if we detect that the large folio and its PMD are both still marked as clean and there are no unexpected references (such as GUP), so we can just discard the memory lazily, improving the efficiency of memory reclamation in this case. On an Intel i5 CPU, reclaiming 1GiB of lazyfree THPs using mem_cgroup_force_empty() results in the following runtimes in seconds (shorter is better): -------------------------------------------- | Old | New | Change | -------------------------------------------- | 0.683426 | 0.049197 | -92.80% | -------------------------------------------- [ioworker0@gmail.com: minor changes per David] Link: https://lkml.kernel.org/r/20240622100057.3352-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240614015138.31461-4-ioworker0@gmail.com Signed-off-by: Lance Yang Suggested-by: Zi Yan Suggested-by: David Hildenbrand Cc: Bang Li Cc: Baolin Wang Cc: Barry Song Cc: Fangrui Song Cc: Jeff Xie Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: SeongJae Park Cc: Yang Shi Cc: Yin Fengwei Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9f720b0731c4..212cca384d7e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -430,6 +430,8 @@ static inline bool thp_migration_supported(void) void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze, struct folio *folio); +bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmdp, struct folio *folio); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -497,6 +499,13 @@ static inline void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze, struct folio *folio) {} +static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp, + struct folio *folio) +{ + return false; +} + #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) -- cgit v1.2.3 From 2b33a97c94bc44468fc1d54b745269c0cf0b7bb2 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 11 Jun 2024 02:45:14 +0000 Subject: mm: zswap: rename is_zswap_enabled() to zswap_is_enabled() In preparation for introducing a similar function, rename is_zswap_enabled() to use zswap_* prefix like other zswap functions. Link: https://lkml.kernel.org/r/20240611024516.1375191-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Barry Song Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Chris Li Cc: David Hildenbrand Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 2a85b941db97..ce5e7bfe8f1e 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -35,7 +35,7 @@ void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); void zswap_folio_swapin(struct folio *folio); -bool is_zswap_enabled(void); +bool zswap_is_enabled(void); #else struct zswap_lruvec_state {}; @@ -60,7 +60,7 @@ static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} static inline void zswap_folio_swapin(struct folio *folio) {} -static inline bool is_zswap_enabled(void) +static inline bool zswap_is_enabled(void) { return false; } -- cgit v1.2.3 From 2d4d2b1cfb85cc07f6d5619acb882d8b11e55cf4 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 11 Jun 2024 02:45:15 +0000 Subject: mm: zswap: add zswap_never_enabled() Add zswap_never_enabled() to skip the xarray lookup in zswap_load() if zswap was never enabled on the system. It is implemented using static branches for efficiency, as enabling zswap should be a rare event. This could shave some cycles off zswap_load() when CONFIG_ZSWAP is used but zswap is never enabled. However, the real motivation behind this patch is two-fold: - Incoming large folio swapin work will need to fallback to order-0 folios if zswap was ever enabled, because any part of the folio could be in zswap, until proper handling of large folios with zswap is added. - A warning and recovery attempt will be added in a following change in case the above was not done incorrectly. Zswap will fail the read if the folio is large and it was ever enabled. Expose zswap_never_enabled() in the header for the swapin work to use it later. [yosryahmed@google.com: expose zswap_never_enabled() in the header] Link: https://lkml.kernel.org/r/Zmjf0Dr8s9xSW41X@google.com Link: https://lkml.kernel.org/r/20240611024516.1375191-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Nhat Pham Cc: Barry Song Cc: Chengming Zhou Cc: Chris Li Cc: David Hildenbrand Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/zswap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index ce5e7bfe8f1e..bf83ae5e285d 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -36,6 +36,7 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); void zswap_folio_swapin(struct folio *folio); bool zswap_is_enabled(void); +bool zswap_never_enabled(void); #else struct zswap_lruvec_state {}; @@ -65,6 +66,11 @@ static inline bool zswap_is_enabled(void) return false; } +static inline bool zswap_never_enabled(void) +{ + return false; +} + #endif #endif /* _LINUX_ZSWAP_H */ -- cgit v1.2.3 From 15995a35247442aefa0ffe36a6dad51cb46b0918 Mon Sep 17 00:00:00 2001 From: Sourav Panda Date: Wed, 5 Jun 2024 22:27:51 +0000 Subject: mm: report per-page metadata information Today, we do not have any observability of per-page metadata and how much it takes away from the machine capacity. Thus, we want to describe the amount of memory that is going towards per-page metadata, which can vary depending on build configuration, machine architecture, and system use. This patch adds 2 fields to /proc/vmstat that can used as shown below: Accounting per-page metadata allocated by boot-allocator: /proc/vmstat:nr_memmap_boot * PAGE_SIZE Accounting per-page metadata allocated by buddy-allocator: /proc/vmstat:nr_memmap * PAGE_SIZE Accounting total Perpage metadata allocated on the machine: (/proc/vmstat:nr_memmap_boot + /proc/vmstat:nr_memmap) * PAGE_SIZE Utility for userspace: Observability: Describe the amount of memory overhead that is going to per-page metadata on the system at any given time since this overhead is not currently observable. Debugging: Tracking the changes or absolute value in struct pages can help detect anomalies as they can be correlated with other metrics in the machine (e.g., memtotal, number of huge pages, etc). page_ext overheads: Some kernel features such as page_owner page_table_check that use page_ext can be optionally enabled via kernel parameters. Having the total per-page metadata information helps users precisely measure impact. Furthermore, page-metadata metrics will reflect the amount of struct pages reliquished (or overhead reduced) when hugetlbfs pages are reserved which will vary depending on whether hugetlb vmemmap optimization is enabled or not. For background and results see: lore.kernel.org/all/20240220214558.3377482-1-souravpanda@google.com Link: https://lkml.kernel.org/r/20240605222751.1406125-1-souravpanda@google.com Signed-off-by: Sourav Panda Acked-by: David Rientjes Reviewed-by: Pasha Tatashin Cc: Alexey Dobriyan Cc: Bjorn Helgaas Cc: Chen Linxuan Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Ivan Babrou Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Kirill A. Shutemov Cc: Liam R. Howlett Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: "Rafael J. Wysocki" Cc: Randy Dunlap Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Tomas Mudrunka Cc: Vlastimil Babka Cc: Wei Xu Cc: Yang Yang Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ include/linux/vmstat.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 586a8f0104d7..cb7f265c2b96 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,6 +220,8 @@ enum node_stat_item { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, + NR_MEMMAP, /* page metadata allocated through buddy allocator */ + NR_MEMMAP_BOOT, /* page metadata allocated through boot allocator */ NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 735eae6e272c..16b0cfa80502 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -624,4 +624,8 @@ static inline void lruvec_stat_sub_folio(struct folio *folio, { lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } + +void __meminit mod_node_early_perpage_metadata(int nid, long delta); +void __meminit store_early_perpage_metadata(void); + #endif /* _LINUX_VMSTAT_H */ -- cgit v1.2.3 From 47179fe03588caa13a9bae642b058901709ddc55 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 12 Jun 2024 09:24:08 +0000 Subject: mm/hugetlb_cgroup: prepare cftypes based on template Unlike other cgroup subsystems, the hugetlb cgroup does not provide a static array of cftype that explicitly displays the properties, handling functions, etc., of each file. Instead, it dynamically creates the attribute of cftypes based on the hstate during the startup procedure. This reduces the readability of the code. To fix this issue, introduce two templates of cftypes, and rebuild the attributes according to the hstate to make it ready to be added to cgroup framework. Link: https://lkml.kernel.org/r/20240612092409.2027592-3-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Cc: Muchun Song Cc: Oscar Salvador Cc: kernel test robot From: Xiu Jianfeng Subject: mm/hugetlb_cgroup: register lockdep key for cftype Date: Tue, 18 Jun 2024 07:19:22 +0000 When CONFIG_DEBUG_LOCK_ALLOC is enabled, the following commands can trigger a bug, mount -t cgroup2 none /sys/fs/cgroup cd /sys/fs/cgroup echo "+hugetlb" > cgroup.subtree_control The log is as below: BUG: key ffff8880046d88d8 has not been registered! ------------[ cut here ]------------ DEBUG_LOCKS_WARN_ON(1) WARNING: CPU: 3 PID: 226 at kernel/locking/lockdep.c:4945 lockdep_init_map_type+0x185/0x220 Modules linked in: CPU: 3 PID: 226 Comm: bash Not tainted 6.10.0-rc4-next-20240617-g76db4c64526c #544 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:lockdep_init_map_type+0x185/0x220 Code: 00 85 c0 0f 84 6c ff ff ff 8b 3d 6a d1 85 01 85 ff 0f 85 5e ff ff ff 48 c7 c6 21 99 4a 82 48 c7 c7 60 29 49 82 e8 3b 2e f5 RSP: 0018:ffffc9000083fc30 EFLAGS: 00000282 RAX: 0000000000000000 RBX: ffffffff828dd820 RCX: 0000000000000027 RDX: ffff88803cd9cac8 RSI: 0000000000000001 RDI: ffff88803cd9cac0 RBP: ffff88800674fbb0 R08: ffffffff828ce248 R09: 00000000ffffefff R10: ffffffff8285e260 R11: ffffffff828b8eb8 R12: ffff8880046d88d8 R13: 0000000000000000 R14: 0000000000000000 R15: ffff8880067281c0 FS: 00007f68601ea740(0000) GS:ffff88803cd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005614f3ebc740 CR3: 000000000773a000 CR4: 00000000000006f0 Call Trace: ? __warn+0x77/0xd0 ? lockdep_init_map_type+0x185/0x220 ? report_bug+0x189/0x1a0 ? handle_bug+0x3c/0x70 ? exc_invalid_op+0x18/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? lockdep_init_map_type+0x185/0x220 __kernfs_create_file+0x79/0x100 cgroup_addrm_files+0x163/0x380 ? find_held_lock+0x2b/0x80 ? find_held_lock+0x2b/0x80 ? find_held_lock+0x2b/0x80 css_populate_dir+0x73/0x180 cgroup_apply_control_enable+0x12f/0x3a0 cgroup_subtree_control_write+0x30b/0x440 kernfs_fop_write_iter+0x13a/0x1f0 vfs_write+0x341/0x450 ksys_write+0x64/0xe0 do_syscall_64+0x4b/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f68602d9833 Code: 8b 15 61 26 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 64 8b 04 25 18 00 00 00 85 c0 75 14 b8 01 00 00 00 08 RSP: 002b:00007fff9bbdf8e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 00007f68602d9833 RDX: 0000000000000009 RSI: 00005614f3ebc740 RDI: 0000000000000001 RBP: 00005614f3ebc740 R08: 000000000000000a R09: 0000000000000008 R10: 00005614f3db6ba0 R11: 0000000000000246 R12: 0000000000000009 R13: 00007f68603bd6a0 R14: 0000000000000009 R15: 00007f68603b8880 For lockdep, there is a sanity check in lockdep_init_map_type(), the lock-class key must either have been allocated statically or must have been registered as a dynamic key. However the commit e18df2889ff9 ("mm/hugetlb_cgroup: prepare cftypes based on template") has changed the cftypes from static allocated objects to dynamic allocated objects, so the cft->lockdep_key must be registered proactively. [xiujianfeng@huawei.com: fix BUG()] Link: https://lkml.kernel.org/r/20240619015527.2212698-1-xiujianfeng@huawei.com Link: https://lkml.kernel.org/r/20240618071922.2127289-1-xiujianfeng@huawei.com Link: https://lore.kernel.org/all/602186b3-5ce3-41b3-90a3-134792cc2a48@samsung.com/ Fixes: e18df2889ff9 ("mm/hugetlb_cgroup: prepare cftypes based on template") Signed-off-by: Xiu Jianfeng Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202406181046.8d8b2492-oliver.sang@intel.com Tested-by: Marek Szyprowski Tested-by: SeongJae Park Closes: https://lore.kernel.org/20240618233608.400367-1-sj@kernel.org Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/cgroup-defs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ea48c861cd36..dc151cb0ef09 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -676,9 +676,7 @@ struct cftype { __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); -#ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; -#endif }; /* -- cgit v1.2.3 From b79d715c43354010775862cfcd2d01cb04d0de47 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 12 Jun 2024 09:24:09 +0000 Subject: mm/hugetlb_cgroup: switch to the new cftypes The previous patch has already reconstructed the cftype attributes based on the templates and saved them in dfl_cftypes and legacy_cftypes. then remove the old procedure and switch to the new cftypes. Link: https://lkml.kernel.org/r/20240612092409.2027592-4-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 279aca379b95..a951c0d06061 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -686,11 +686,6 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; -#ifdef CONFIG_CGROUP_HUGETLB - /* cgroup control files */ - struct cftype cgroup_files_dfl[8]; - struct cftype cgroup_files_legacy[10]; -#endif char name[HSTATE_NAME_LEN]; }; -- cgit v1.2.3 From ceb32d6aa93ce3282a724f3a0828b4b84d5035f1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 12 Jun 2024 15:18:24 +0800 Subject: mm/memory-failure: remove MF_MSG_SLAB Since commit 46df8e73a4a3 ("mm: free up PG_slab"), MF_MSG_SLAB becomes unused. Remove it. No functional change intended. Link: https://lkml.kernel.org/r/20240612071835.157004-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Borislav Petkov (AMD) Cc: David Hildenbrand Cc: kernel test robot Cc: Naoya Horiguchi Cc: Tony Luck Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7f864d5f52b7..8a38adaf5480 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4067,7 +4067,6 @@ enum mf_result { enum mf_action_page_type { MF_MSG_KERNEL, MF_MSG_KERNEL_HIGH_ORDER, - MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, -- cgit v1.2.3 From 3a78f77fd1fb82c32cb7971a8a97c5cbc83ab69e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 12 Jun 2024 15:18:32 +0800 Subject: mm/memory-failure: move some function declarations into internal.h There are some functions only used inside mm. Move them into internal.h. No functional change intended. Link: https://lkml.kernel.org/r/20240612071835.157004-11-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405251049.hxjwX7zO-lkp@intel.com/ Cc: Borislav Petkov (AMD) Cc: David Hildenbrand Cc: Naoya Horiguchi Cc: Tony Luck Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 ------- include/linux/page-flags.h | 5 ----- include/linux/rmap.h | 2 -- 3 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a38adaf5480..51fe207026f8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4000,7 +4000,6 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); -struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); #else static inline void memory_failure_queue(unsigned long pfn, int flags) { @@ -4021,12 +4020,6 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i) } #endif -#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM) -void add_to_kill_ksm(struct task_struct *tsk, struct page *p, - struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr); -#endif - #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) extern void memblk_nr_poison_inc(unsigned long pfn); extern void memblk_nr_poison_sub(unsigned long pfn, long i); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 813eea2efe26..b041bc058fb1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -616,11 +616,6 @@ PAGEFLAG_FALSE(Uncached, uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) -#define MAGIC_HWPOISON 0x48575053U /* HWPS */ -extern void SetPageHWPoisonTakenOff(struct page *page); -extern void ClearPageHWPoisonTakenOff(struct page *page); -extern bool take_page_off_buddy(struct page *page); -extern bool put_page_back_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf46787c8eba..694d3d886245 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -747,8 +747,6 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); -unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); - /* * rmap_walk_control: To control rmap traversing for specific needs * -- cgit v1.2.3 From e36287c6e12d00f2087dc0c4899d8a9c54e22e1c Mon Sep 17 00:00:00 2001 From: Hyeongtak Ji Date: Fri, 14 Jun 2024 12:00:05 +0900 Subject: mm/damon/sysfs-schemes: add target_nid on sysfs-schemes This patch adds target_nid under /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes// The 'target_nid' can be used as the destination node for DAMOS actions such as DAMOS_MIGRATE_{HOT,COLD} in the follow up patches. [sj@kernel.org: document target_nid file] Link: https://lkml.kernel.org/r/20240618213630.84846-3-sj@kernel.org Link: https://lkml.kernel.org/r/20240614030010.751-4-honggyu.kim@sk.com Signed-off-by: Hyeongtak Ji Signed-off-by: Honggyu Kim Signed-off-by: SeongJae Park Cc: Gregory Price Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Rakie Kim Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index f7da65e1ac04..21d6b69a015c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -374,6 +374,7 @@ struct damos_access_pattern { * @apply_interval_us: The time between applying the @action. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. + * @target_nid: Destination node if @action is "migrate_{hot,cold}". * @filters: Additional set of &struct damos_filter for &action. * @stat: Statistics of this scheme. * @list: List head for siblings. @@ -389,6 +390,10 @@ struct damos_access_pattern { * monitoring context are inactive, DAMON stops monitoring either, and just * repeatedly checks the watermarks. * + * @target_nid is used to set the migration target node for migrate_hot or + * migrate_cold actions, which means it's only meaningful when @action is either + * "migrate_hot" or "migrate_cold". + * * Before applying the &action to a memory region, &struct damon_operations * implementation could check pages of the region and skip &action to respect * &filters @@ -410,6 +415,9 @@ struct damos { /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; + union { + int target_nid; + }; struct list_head filters; struct damos_stat stat; struct list_head list; @@ -726,7 +734,8 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, unsigned long apply_interval_us, struct damos_quota *quota, - struct damos_watermarks *wmarks); + struct damos_watermarks *wmarks, + int target_nid); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); -- cgit v1.2.3 From ced816a76b8fd1288d48523cc48ed308964d12ed Mon Sep 17 00:00:00 2001 From: Honggyu Kim Date: Fri, 14 Jun 2024 12:00:06 +0900 Subject: mm/migrate: add MR_DAMON to migrate_reason The current patch series introduces DAMON based migration across NUMA nodes so it'd be better to have a new migrate_reason in trace events. Link: https://lkml.kernel.org/r/20240614030010.751-5-honggyu.kim@sk.com Signed-off-by: Honggyu Kim Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Cc: Gregory Price Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Hyeongtak Ji Cc: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Rakie Kim Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/migrate_mode.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index 9fb482bb7323..265c4328b36a 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -24,6 +24,7 @@ enum migrate_reason { MR_CONTIG_RANGE, MR_LONGTERM_PIN, MR_DEMOTION, + MR_DAMON, MR_TYPES }; -- cgit v1.2.3 From b51820ebea656be3b48bb16dcdc5ad3f203c4fd7 Mon Sep 17 00:00:00 2001 From: Honggyu Kim Date: Fri, 14 Jun 2024 12:00:07 +0900 Subject: mm/damon/paddr: introduce DAMOS_MIGRATE_COLD action for demotion This patch introduces DAMOS_MIGRATE_COLD action, which is similar to DAMOS_PAGEOUT, but migrate folios to the given 'target_nid' in the sysfs instead of swapping them out. The 'target_nid' sysfs knob informs the migration target node ID. Here is one of the example usage of this 'migrate_cold' action. $ cd /sys/kernel/mm/damon/admin/kdamonds/ $ cat contexts//schemes//action migrate_cold $ echo 2 > contexts//schemes//target_nid $ echo commit > state $ numactl -p 0 ./hot_cold 500M 600M & $ numastat -c -p hot_cold Per-node process memory usage (in MBs) PID Node 0 Node 1 Node 2 Total -------------- ------ ------ ------ ----- 701 (hot_cold) 501 0 601 1101 Since there are some common routines with pageout, many functions have similar logics between pageout and migrate cold. damon_pa_migrate_folio_list() is a minimized version of shrink_folio_list(). Link: https://lkml.kernel.org/r/20240614030010.751-6-honggyu.kim@sk.com Signed-off-by: Honggyu Kim Signed-off-by: Hyeongtak Ji Signed-off-by: SeongJae Park Cc: Gregory Price Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Rakie Kim Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 21d6b69a015c..56714b6eb0d7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -105,6 +105,7 @@ struct damon_target { * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. + * @DAMOS_MIGRATE_COLD: Migrate the regions prioritizing colder regions. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions * @@ -122,6 +123,7 @@ enum damos_action { DAMOS_NOHUGEPAGE, DAMOS_LRU_PRIO, DAMOS_LRU_DEPRIO, + DAMOS_MIGRATE_COLD, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; -- cgit v1.2.3 From b696722d784fb3610183281d2fd514b0efe97e3b Mon Sep 17 00:00:00 2001 From: Hyeongtak Ji Date: Fri, 14 Jun 2024 12:00:08 +0900 Subject: mm/damon/paddr: introduce DAMOS_MIGRATE_HOT action for promotion This patch introduces DAMOS_MIGRATE_HOT action, which is similar to DAMOS_MIGRATE_COLD, but proritizes hot pages. It migrates pages inside the given region to the 'target_nid' NUMA node in the sysfs. Here is one of the example usage of this 'migrate_hot' action. $ cd /sys/kernel/mm/damon/admin/kdamonds/ $ cat contexts//schemes//action migrate_hot $ echo 0 > contexts//schemes//target_nid $ echo commit > state $ numactl -p 2 ./hot_cold 500M 600M & $ numastat -c -p hot_cold Per-node process memory usage (in MBs) PID Node 0 Node 1 Node 2 Total -------------- ------ ------ ------ ----- 701 (hot_cold) 501 0 601 1101 Link: https://lkml.kernel.org/r/20240614030010.751-7-honggyu.kim@sk.com Signed-off-by: Hyeongtak Ji Signed-off-by: Honggyu Kim Signed-off-by: SeongJae Park Cc: Gregory Price Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Rakie Kim Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 56714b6eb0d7..3d62d98d6359 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -105,6 +105,7 @@ struct damon_target { * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. + * @DAMOS_MIGRATE_HOT: Migrate the regions prioritizing warmer regions. * @DAMOS_MIGRATE_COLD: Migrate the regions prioritizing colder regions. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions @@ -123,6 +124,7 @@ enum damos_action { DAMOS_NOHUGEPAGE, DAMOS_LRU_PRIO, DAMOS_LRU_DEPRIO, + DAMOS_MIGRATE_HOT, DAMOS_MIGRATE_COLD, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, -- cgit v1.2.3 From 3ad1dce6c30175dfd3da29d76853a69affbf7489 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 18 Jun 2024 11:17:58 -0700 Subject: mm/damon/core: implement DAMOS quota goals online commit function Patch series "mm/damon: introduce DAMON parameters online commit function". DAMON context struct (damon_ctx) contains user requests (parameters), internal status, and operation results. For flexible usages, DAMON API users are encouraged to manually manipulate the struct. That works well for simple use cases. However, it has turned out that it is not that simple at least for online parameters udpate. It is easy to forget properly maintaining internal status and operation results. Also, such manual manipulation for online tuning is implemented multiple times on DAMON API users including DAMON sysfs interface, DAMON_RECLAIM and DAMON_LRU_SORT. As a result, we have multiple sources of bugs for same problem. Actually we found and fixed a few bugs from online parameter updating of DAMON API users. Implement a function for online DAMON parameters update in core layer, and replace DAMON API users' manual manipulation code for the use case. The core layer function could still have bugs, but this change reduces the source of bugs for the problem to one place. This patch (of 12): Implement functions for supporting online DAMOS quota goals parameters update. The function receives two DAMOS quota structs. One is the struct that currently being used by a kdamond and therefore to be updated. The other one contains the parameters to be applied to the first one. The function applies the new parameters to the destination struct while keeping/updating the internal status. The function should be called from parameters-update safe place, like DAMON callbacks. Link: https://lkml.kernel.org/r/20240618181809.82078-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240618181809.82078-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 3d62d98d6359..ce12c9f1b4e4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -742,6 +742,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, int target_nid); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); +int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src); struct damon_target *damon_new_target(void); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); -- cgit v1.2.3 From 9cb3d0b9dfce6a3258d91e6d69e418d0b4cce46a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 18 Jun 2024 11:17:59 -0700 Subject: mm/damon/core: implement DAMON context commit function Implement functions for supporting online DAMON context level parameters update. The function receives two DAMON context structs. One is the struct that currently being used by a kdamond and therefore to be updated. The other one contains the parameters to be applied to the first one. The function applies the new parameters to the destination struct while keeping/updating the internal status and operation results. The function should be called from DAMON context-update-safe place, like DAMON callbacks. Link: https://lkml.kernel.org/r/20240618181809.82078-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index ce12c9f1b4e4..27c546bfc6d4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -756,6 +756,7 @@ void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); +int damon_commit_ctx(struct damon_ctx *old_ctx, struct damon_ctx *new_ctx); int damon_nr_running_ctxs(void); bool damon_is_registered_ops(enum damon_ops_id id); int damon_register_ops(struct damon_operations *ops); -- cgit v1.2.3 From 6d21dde7adc0a2deb9e0c6b06f42be3f88ca180d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 29 May 2024 13:18:59 +0200 Subject: mm: update _mapcount and page_type documentation Patch series "mm: page_type, zsmalloc and page_mapcount_reset()", v2. Wanting to remove the remaining abuser of _mapcount/page_type along with page_mapcount_reset(), I stumbled over zsmalloc, which is yet to be converted away from "struct page" [1]. Unfortunately, we cannot stop using the page_type field in zsmalloc code completely for its own purposes. All other fields in "struct page" are used one way or the other. Could we simply store a 2-byte offset value at the beginning of each page? Likely, but that will require a bit more work; and once we have memdesc we might want to move the offset in there (struct zsalloc?) again. ... but we can limit the abuse to 16 bit, glue it to a page type that must be set, and document it. page_has_type() will always successfully indicate such zsmalloc pages, and such zsmalloc pages only. We lose zsmalloc support for PAGE_SIZE > 64KB, which should be tolerable. We could use more bits from the page type, but 16 bit sounds like a good idea for now. So clarify the _mapcount/page_type documentation, use a proper page_type for zsmalloc, and remove page_mapcount_reset(). [1] https://lore.kernel.org/all/20231130101242.2590384-1-42.hyeyoo@gmail.com/ This patch (of 6): Let's make it clearer that _mapcount must no longer be used for own purposes, and how _mapcount and page_type behaves nowadays (also in the context of hugetlb folios, which are typed folios that will be mapped to user space). Move the documentation regarding "-1" over from page_mapcount_reset(), which we will remove next. Move "page_type" before "mapcount", to make it clearer what typed folios are. Link: https://lkml.kernel.org/r/20240529111904.2069608-1-david@redhat.com Link: https://lkml.kernel.org/r/20240529111904.2069608-2-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Sergey Senozhatsky [zram/zsmalloc workloads] Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 ----- include/linux/mm_types.h | 28 +++++++++++++++++----------- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 51fe207026f8..61a7f680f0ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1210,11 +1210,6 @@ static inline int folio_entire_mapcount(const struct folio *folio) return atomic_read(&folio->_entire_mapcount) + 1; } -/* - * The atomic page->_mapcount, starts from -1: so that transitions - * both from it and to it can be tracked, using atomic_inc_and_test - * and atomic_add_negative(-1). - */ static inline void page_mapcount_reset(struct page *page) { atomic_set(&(page)->_mapcount, -1); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index af3a0256fa93..278eade2ad4c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -46,9 +46,7 @@ struct mem_cgroup; * which is guaranteed to be aligned. If you use the same storage as * page->mapping, you must restore it to NULL before freeing the page. * - * If your page will not be mapped to userspace, you can also use the four - * bytes in the mapcount union, but you must call page_mapcount_reset() - * before freeing it. + * The mapcount field must not be used for own purposes. * * If you want to use the refcount field, it must be used in such a way * that other CPUs temporarily incrementing and then decrementing the @@ -152,18 +150,26 @@ struct page { union { /* This union is 4 bytes in size. */ /* - * If the page can be mapped to userspace, encodes the number - * of times this page is referenced by a page table. + * For head pages of typed folios, the value stored here + * allows for determining what this page is used for. The + * tail pages of typed folios will not store a type + * (page_type == _mapcount == -1). + * + * See page-flags.h for a list of page types which are currently + * stored here. */ - atomic_t _mapcount; + unsigned int page_type; /* - * If the page is neither PageSlab nor mappable to userspace, - * the value stored here may help determine what this page - * is used for. See page-flags.h for a list of page types - * which are currently stored here. + * For pages that are part of non-typed folios for which mappings + * are tracked via the RMAP, encodes the number of times this page + * is directly referenced by a page table. + * + * Note that the mapcount is always initialized to -1, so that + * transitions both from it and to it can be tracked, using + * atomic_inc_and_test() and atomic_add_negative(-1). */ - unsigned int page_type; + atomic_t _mapcount; }; /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ -- cgit v1.2.3 From 8db00ad5646171880239b7f10e333278f63d8fcf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 29 May 2024 13:19:00 +0200 Subject: mm: allow reuse of the lower 16 bit of the page type with an actual type As long as the owner sets a page type first, we can allow reuse of the lower 16 bit: sufficient to store an offset into a 64 KiB page, which is the maximum base page size in *common* configurations (ignoring the 256 KiB variant). Restrict it to the head page. We'll use that for zsmalloc next, to set a proper type while still reusing that field to store information (offset into a base page) that cannot go elsewhere for now. Let's reserve the lower 16 bit for that purpose and for catching mapcount underflows, and let's reduce PAGE_TYPE_BASE to a single bit. Note that we will still have to overflow the mapcount quite a lot until we would actually indicate a valid page type. Start handing out the type bits from highest to lowest, to make it clearer how many bits for types we have left. Out of 15 bit we can use for types, we currently use 6. If we run out of bits before we have better typing (e.g., memdesc), we can always investigate storing a value instead [1]. [1] https://lore.kernel.org/all/00ba1dff-7c05-46e8-b0d9-a78ac1cfc198@redhat.com/ [akpm@linux-foundation.org: fix PG_hugetlb typo, per David] Link: https://lkml.kernel.org/r/20240529111904.2069608-3-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Sergey Senozhatsky [zram/zsmalloc workloads] Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++++ include/linux/page-flags.h | 25 +++++++++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 278eade2ad4c..ef09c4eef6d3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -157,6 +157,11 @@ struct page { * * See page-flags.h for a list of page types which are currently * stored here. + * + * Owners of typed folios may reuse the lower 16 bit of the + * head page page_type field after setting the page type, + * but must reset these 16 bit to -1 before clearing the + * page type. */ unsigned int page_type; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b041bc058fb1..d221a6a14764 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -941,16 +941,21 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) */ enum pagetype { - PG_buddy = 0x00000080, - PG_offline = 0x00000100, - PG_table = 0x00000200, - PG_guard = 0x00000400, - PG_hugetlb = 0x00000800, - PG_slab = 0x00001000, - - PAGE_TYPE_BASE = 0xf0000000, - /* Reserve 0x0000007f to catch underflows of _mapcount */ - PAGE_MAPCOUNT_RESERVE = -128, + PG_buddy = 0x40000000, + PG_offline = 0x20000000, + PG_table = 0x10000000, + PG_guard = 0x08000000, + PG_hugetlb = 0x04000000, + PG_slab = 0x02000000, + + PAGE_TYPE_BASE = 0x80000000, + + /* + * Reserve 0xffff0000 - 0xfffffffe to catch _mapcount underflows and + * allow owners that set a type to reuse the lower 16 bit for their own + * purposes. + */ + PAGE_MAPCOUNT_RESERVE = ~0x0000ffff, }; #define PageType(page, flag) \ -- cgit v1.2.3 From 43d746dc49bb4c82034fce01a92fe67344d664cf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 29 May 2024 13:19:01 +0200 Subject: mm/zsmalloc: use a proper page type Let's clean it up: use a proper page type and store our data (offset into a page) in the lower 16 bit as documented. We won't be able to support 256 KiB base pages, which is acceptable. Teach Kconfig to handle that cleanly using a new CONFIG_HAVE_ZSMALLOC. Based on this, we should do a proper "struct zsdesc" conversion, as proposed in [1]. This removes the last _mapcount/page_type offender. [1] https://lore.kernel.org/all/20231130101242.2590384-1-42.hyeyoo@gmail.com/ Link: https://lkml.kernel.org/r/20240529111904.2069608-4-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Sergey Senozhatsky [zram/zsmalloc workloads] Reviewed-by: Sergey Senozhatsky Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d221a6a14764..0f7c7320391e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -947,6 +947,7 @@ enum pagetype { PG_guard = 0x08000000, PG_hugetlb = 0x04000000, PG_slab = 0x02000000, + PG_zsmalloc = 0x01000000, PAGE_TYPE_BASE = 0x80000000, @@ -1071,6 +1072,8 @@ FOLIO_TYPE_OPS(hugetlb, hugetlb) FOLIO_TEST_FLAG_FALSE(hugetlb) #endif +PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) + /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. -- cgit v1.2.3 From 11d5401b011e3557894d824dac210f0b18cc3911 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 29 May 2024 13:19:04 +0200 Subject: mm/mm_init: initialize page->_mapcount directly in __init_single_page() Let's simply reinitialize the page->_mapcount directly. We can now get rid of page_mapcount_reset(). Link: https://lkml.kernel.org/r/20240529111904.2069608-7-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Sergey Senozhatsky [zram/zsmalloc workloads] Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 61a7f680f0ba..4f75fee497f1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1210,11 +1210,6 @@ static inline int folio_entire_mapcount(const struct folio *folio) return atomic_read(&folio->_entire_mapcount) + 1; } -static inline void page_mapcount_reset(struct page *page) -{ - atomic_set(&(page)->_mapcount, -1); -} - static inline int folio_large_mapcount(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); -- cgit v1.2.3 From 2669324b81e5f67d409d3ad988da77c7c2a09045 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 4 Jun 2024 19:48:20 +0800 Subject: mm: remove page_maybe_dma_pinned() After the last user of page_maybe_dma_pinned() is converted to folio_maybe_dma_pinned(), remove page_maybe_dma_pinned() and update the document and comment. [wangkefeng.wang@huawei.com: fix pin_user_pages.rst underlining] Link: https://lkml.kernel.org/r/61b256c7-4989-44ec-83db-f34a1bd4be2d@huawei.com Link: https://lkml.kernel.org/r/20240604114822.2089819-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Cc: Daniel Vetter Cc: Helge Deller Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4f75fee497f1..cb4aa725252e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1918,8 +1918,8 @@ static inline struct folio *pfn_folio(unsigned long pfn) * * For more information, please see Documentation/core-api/pin_user_pages.rst. * - * Return: True, if it is likely that the page has been "dma-pinned". - * False, if the page is definitely not dma-pinned. + * Return: True, if it is likely that the folio has been "dma-pinned". + * False, if the folio is definitely not dma-pinned. */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { @@ -1938,11 +1938,6 @@ static inline bool folio_maybe_dma_pinned(struct folio *folio) GUP_PIN_COUNTING_BIAS; } -static inline bool page_maybe_dma_pinned(struct page *page) -{ - return folio_maybe_dma_pinned(page_folio(page)); -} - /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for an anon page on the src mm. -- cgit v1.2.3 From a929e0d10f3db1a53668f6b9845db27d7fb63759 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 4 Jun 2024 19:48:22 +0800 Subject: mm: remove page_mkclean() There are no more users of page_mkclean(), remove it and update the document and comment. Link: https://lkml.kernel.org/r/20240604114822.2089819-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Cc: Daniel Vetter Cc: Helge Deller Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- include/linux/rmap.h | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cb4aa725252e..101945baffc7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1577,7 +1577,7 @@ static inline void put_page(struct page *page) * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() - * provides safe operation for get_user_pages(), page_mkclean() and + * provides safe operation for get_user_pages(), folio_mkclean() and * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 694d3d886245..980fa5d75d69 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -802,8 +802,4 @@ static inline int folio_mkclean(struct folio *folio) } #endif /* CONFIG_MMU */ -static inline int page_mkclean(struct page *page) -{ - return folio_mkclean(page_folio(page)); -} #endif /* _LINUX_RMAP_H */ -- cgit v1.2.3 From 503b158fc30f203a1854c87183ca3467c6466001 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Jun 2024 11:09:37 +0200 Subject: mm/memory_hotplug: initialize memmap of !ZONE_DEVICE with PageOffline() instead of PageReserved() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We currently initialize the memmap such that PG_reserved is set and the refcount of the page is 1. In virtio-mem code, we have to manually clear that PG_reserved flag to make memory offlining with partially hotplugged memory blocks possible: has_unmovable_pages() would otherwise bail out on such pages. We want to avoid PG_reserved where possible and move to typed pages instead. Further, we want to further enlighten memory offlining code about PG_offline: offline pages in an online memory section. One example is handling managed page count adjustments in a cleaner way during memory offlining. So let's initialize the pages with PG_offline instead of PG_reserved. generic_online_page()->__free_pages_core() will now clear that flag before handing that memory to the buddy. Note that the page refcount is still 1 and would forbid offlining of such memory except when special care is take during GOING_OFFLINE as currently only implemented by virtio-mem. With this change, we can now get non-PageReserved() pages in the XEN balloon list. From what I can tell, that can already happen via decrease_reservation(), so that should be fine. HV-balloon should not really observe a change: partial online memory blocks still cannot get surprise-offlined, because the refcount of these PageOffline() pages is 1. Update virtio-mem, HV-balloon and XEN-balloon code to be aware that hotplugged pages are now PageOffline() instead of PageReserved() before they are handed over to the buddy. We'll leave the ZONE_DEVICE case alone for now. Note that self-hosted vmemmap pages will no longer be marked as reserved. This matches ordinary vmemmap pages allocated from the buddy during memory hotplug. Now, really only vmemmap pages allocated from memblock during early boot will be marked reserved. Existing PageReserved() checks seem to be handling all relevant cases correctly even after this change. Link: https://lkml.kernel.org/r/20240607090939.89524-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Oscar Salvador [generic memory-hotplug bits] Cc: Alexander Potapenko Cc: Dexuan Cui Cc: Dmitry Vyukov Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Juergen Gross Cc: "K. Y. Srinivasan" Cc: Marco Elver Cc: Michael S. Tsirkin Cc: Mike Rapoport (IBM) Cc: Oleksandr Tyshchenko Cc: Stefano Stabellini Cc: Wei Liu Cc: Xuan Zhuo Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0f7c7320391e..b23772b08cc1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -30,16 +30,11 @@ * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying * to read/write these pages might end badly. Don't touch! * - The zero page(s) - * - Pages not added to the page allocator when onlining a section because - * they were excluded via the online_page_callback() or because they are - * PG_hwpoison. * - Pages allocated in the context of kexec/kdump (loaded kernel image, * control pages, vmcoreinfo) * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are * not marked PG_reserved (as they might be in use by somebody else who does * not respect the caching strategy). - * - Pages part of an offline section (struct pages of offline sections should - * not be trusted as they will be initialized when first onlined). * - MCA pages on ia64 * - Pages holding CPU notes for POWER Firmware Assisted Dump * - Device memory (e.g. PMEM, DAX, HMM) @@ -1020,6 +1015,10 @@ PAGE_TYPE_OPS(Buddy, buddy, buddy) * The content of these pages is effectively stale. Such pages should not * be touched (read/write/dump/save) except by their owner. * + * When a memory block gets onlined, all pages are initialized with a + * refcount of 1 and PageOffline(). generic_online_page() will + * take care of clearing PageOffline(). + * * If a driver wants to allow to offline unmovable PageOffline() pages without * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the @@ -1027,8 +1026,7 @@ PAGE_TYPE_OPS(Buddy, buddy, buddy) * pages (now with a reference count of zero) are treated like free pages, * allowing the containing memory block to get offlined. A driver that * relies on this feature is aware that re-onlining the memory block will - * require to re-set the pages PageOffline() and not giving them to the - * buddy via online_page_callback_t. + * require not giving them to the buddy via generic_online_page(). * * There are drivers that mark a page PageOffline() and expect there won't be * any further access to page content. PFN walkers that read content of random -- cgit v1.2.3 From 50625744220c101705a989d7c57a6c16e945f3b1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Jun 2024 11:09:38 +0200 Subject: mm/memory_hotplug: skip adjust_managed_page_count() for PageOffline() pages when offlining MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We currently have a hack for virtio-mem in place to handle memory offlining with PageOffline pages for which we already adjusted the managed page count. Let's enlighten memory offlining code so we can get rid of that hack, and document the situation. Link: https://lkml.kernel.org/r/20240607090939.89524-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Oscar Salvador Cc: Alexander Potapenko Cc: Dexuan Cui Cc: Dmitry Vyukov Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Juergen Gross Cc: "K. Y. Srinivasan" Cc: Marco Elver Cc: Michael S. Tsirkin Cc: Mike Rapoport (IBM) Cc: Oleksandr Tyshchenko Cc: Stefano Stabellini Cc: Wei Liu Cc: Xuan Zhuo Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 4 ++-- include/linux/page-flags.h | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7a9ff464608d..ebe876930e78 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -175,8 +175,8 @@ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); -extern void __offline_isolated_pages(unsigned long start_pfn, - unsigned long end_pfn); +extern unsigned long __offline_isolated_pages(unsigned long start_pfn, + unsigned long end_pfn); typedef void (*online_page_callback_t)(struct page *page, unsigned int order); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b23772b08cc1..1d441908b726 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1023,11 +1023,15 @@ PAGE_TYPE_OPS(Buddy, buddy, buddy) * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline() - * pages (now with a reference count of zero) are treated like free pages, - * allowing the containing memory block to get offlined. A driver that + * pages (now with a reference count of zero) are treated like free (unmanaged) + * pages, allowing the containing memory block to get offlined. A driver that * relies on this feature is aware that re-onlining the memory block will * require not giving them to the buddy via generic_online_page(). * + * Memory offlining code will not adjust the managed page count for any + * PageOffline() pages, treating them like they were never exposed to the + * buddy using generic_online_page(). + * * There are drivers that mark a page PageOffline() and expect there won't be * any further access to page content. PFN walkers that read content of random * pages should check PageOffline() and synchronize with such drivers using -- cgit v1.2.3 From 15bde4abab734c687c1f81704886aba3a70c268e Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 18 Jun 2024 11:11:35 +1200 Subject: mm: extend rmap flags arguments for folio_add_new_anon_rmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: clarify folio_add_new_anon_rmap() and __folio_add_anon_rmap()", v2. This patchset is preparatory work for mTHP swapin. folio_add_new_anon_rmap() assumes that new anon rmaps are always exclusive. However, this assumption doesn’t hold true for cases like do_swap_page(), where a new anon might be added to the swapcache and is not necessarily exclusive. The patchset extends the rmap flags to allow folio_add_new_anon_rmap() to handle both exclusive and non-exclusive new anon folios. The do_swap_page() function is updated to use this extended API with rmap flags. Consequently, all new anon folios now consistently use folio_add_new_anon_rmap(). The special case for !folio_test_anon() in __folio_add_anon_rmap() can be safely removed. In conclusion, new anon folios always use folio_add_new_anon_rmap(), regardless of exclusivity. Old anon folios continue to use __folio_add_anon_rmap() via folio_add_anon_rmap_pmd() and folio_add_anon_rmap_ptes(). This patch (of 3): In the case of a swap-in, a new anonymous folio is not necessarily exclusive. This patch updates the rmap flags to allow a new anonymous folio to be treated as either exclusive or non-exclusive. To maintain the existing behavior, we always use EXCLUSIVE as the default setting. [akpm@linux-foundation.org: cleanup and constifications per David and akpm] [v-songbaohua@oppo.com: fix missing doc for flags of folio_add_new_anon_rmap()] Link: https://lkml.kernel.org/r/20240619210641.62542-1-21cnbao@gmail.com [v-songbaohua@oppo.com: enhance doc for extend rmap flags arguments for folio_add_new_anon_rmap] Link: https://lkml.kernel.org/r/20240622030256.43775-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-2-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: David Hildenbrand Tested-by: Shuai Yuan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 980fa5d75d69..0978c64f49d8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -244,7 +244,7 @@ void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, void folio_add_anon_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, - unsigned long address); + unsigned long address, rmap_t flags); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_add_file_rmap_pte(folio, page, vma) \ -- cgit v1.2.3 From 78fefd04c123493bbf28434768fa577b2153c79b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 18 Jun 2024 17:12:39 +0800 Subject: mm: memory: convert clear_huge_page() to folio_zero_user() Patch series "mm: improve clear and copy user folio", v2. Some folio conversions. An improvement is to move address alignment into the caller as it is only needed if we don't know which address will be accessed when clearing/copying user folios. This patch (of 4): Replace clear_huge_page() with folio_zero_user(), and take a folio instead of a page. Directly get number of pages by folio_nr_pages() to remove pages_per_huge_page argument, furthermore, move the address alignment from folio_zero_user() to the callers since the alignment is only needed when we don't know which address will be accessed. Link: https://lkml.kernel.org/r/20240618091242.2140164-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240618091242.2140164-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 101945baffc7..e2140ea6ae98 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4067,9 +4067,7 @@ enum mf_action_page_type { }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) -extern void clear_huge_page(struct page *page, - unsigned long addr_hint, - unsigned int pages_per_huge_page); +void folio_zero_user(struct folio *folio, unsigned long addr_hint); int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma); -- cgit v1.2.3 From dc9e6f7053dd540db2c9fba30c31a07de5edab01 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 31 May 2024 14:56:16 +0200 Subject: mm: read page_type using READ_ONCE KCSAN complains about possible data races: while we check for a page_type -- for example for sanity checks -- we might concurrently modify the mapcount that overlays page_type. Let's use READ_ONCE to avoid load tearing (shouldn't make a difference) and to make KCSAN happy. Likely, we might also want to use WRITE_ONCE for the writer side of page_type, if KCSAN ever complains about that. But we'll not mess with that for now. Note: nothing should really be broken besides wrong KCSAN complaints. The sanity check that triggers this was added in commit 68f0320824fa ("mm/rmap: convert folio_add_file_rmap_range() into folio_add_file_rmap_[pte|ptes|pmd]()"). Even before that similar races likely where possible, ever since we added page_type in commit 6e292b9be7f4 ("mm: split page_type out from _mapcount"). Link: https://lkml.kernel.org/r/20240531125616.2850153-1-david@redhat.com Signed-off-by: David Hildenbrand Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202405281431.c46a3be9-lkp@intel.com Reviewed-by: Oscar Salvador Reviewed-by: Miaohe Lin Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1d441908b726..5769fe6e4950 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -955,9 +955,9 @@ enum pagetype { }; #define PageType(page, flag) \ - ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + ((READ_ONCE(page->page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) #define folio_test_type(folio, flag) \ - ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + ((READ_ONCE(folio->page.page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) static inline int page_type_has_type(unsigned int page_type) { @@ -966,7 +966,7 @@ static inline int page_type_has_type(unsigned int page_type) static inline int page_has_type(const struct page *page) { - return page_type_has_type(page->page_type); + return page_type_has_type(READ_ONCE(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ -- cgit v1.2.3 From 6b1709d4b7fce27c6c79fc78c17c458f9848a4d8 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 21 Jun 2024 13:34:53 +0200 Subject: kmsan: expose kmsan_get_metadata() Each s390 CPU has lowcore pages associated with it. Each CPU sees its own lowcore at virtual address 0 through a hardware mechanism called prefixing. Additionally, all lowcores are mapped to non-0 virtual addresses stored in the lowcore_ptr[] array. When lowcore is accessed through virtual address 0, one needs to resolve metadata for lowcore_ptr[raw_smp_processor_id()]. Expose kmsan_get_metadata() to make it possible to do this from the arch code. Link: https://lkml.kernel.org/r/20240621113706.315500-10-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Reviewed-by: Alexander Potapenko Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Cc: Marco Elver Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt (Google) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index e0c23a32cdf0..fe6c2212bdb1 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -230,6 +230,15 @@ void kmsan_handle_urb(const struct urb *urb, bool is_out); */ void kmsan_unpoison_entry_regs(const struct pt_regs *regs); +/** + * kmsan_get_metadata() - Return a pointer to KMSAN shadow or origins. + * @addr: kernel address. + * @is_origin: whether to return origins or shadow. + * + * Return NULL if metadata cannot be found. + */ +void *kmsan_get_metadata(void *addr, bool is_origin); + #else static inline void kmsan_init_shadow(void) -- cgit v1.2.3 From ec3e837d8fd96c68599b2861dd412094b7bc335c Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 21 Jun 2024 13:34:55 +0200 Subject: kmsan: allow disabling KMSAN checks for the current task Like for KASAN, it's useful to temporarily disable KMSAN checks around, e.g., redzone accesses. Introduce kmsan_disable_current() and kmsan_enable_current(), which are similar to their KASAN counterparts. Make them reentrant in order to handle memory allocations in interrupt context. Repurpose the allow_reporting field for this. Link: https://lkml.kernel.org/r/20240621113706.315500-12-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Reviewed-by: Alexander Potapenko Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Cc: Marco Elver Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt (Google) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 24 ++++++++++++++++++++++++ include/linux/kmsan_types.h | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index fe6c2212bdb1..14b5ea6d3a43 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -239,6 +239,22 @@ void kmsan_unpoison_entry_regs(const struct pt_regs *regs); */ void *kmsan_get_metadata(void *addr, bool is_origin); +/** + * kmsan_enable_current(): Enable KMSAN for the current task. + * + * Each kmsan_enable_current() current call must be preceded by a + * kmsan_disable_current() call. These call pairs may be nested. + */ +void kmsan_enable_current(void); + +/** + * kmsan_disable_current(): Disable KMSAN for the current task. + * + * Each kmsan_disable_current() current call must be followed by a + * kmsan_enable_current() call. These call pairs may be nested. + */ +void kmsan_disable_current(void); + #else static inline void kmsan_init_shadow(void) @@ -338,6 +354,14 @@ static inline void kmsan_unpoison_entry_regs(const struct pt_regs *regs) { } +static inline void kmsan_enable_current(void) +{ +} + +static inline void kmsan_disable_current(void) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h index 929287981afe..dfc59918b3c0 100644 --- a/include/linux/kmsan_types.h +++ b/include/linux/kmsan_types.h @@ -31,7 +31,7 @@ struct kmsan_context_state { struct kmsan_ctx { struct kmsan_context_state cstate; int kmsan_in_runtime; - bool allow_reporting; + unsigned int depth; }; #endif /* _LINUX_KMSAN_TYPES_H */ -- cgit v1.2.3 From 1fdb3c7006d9914e4b070f7eee98dfbdf743ee16 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 21 Jun 2024 13:34:56 +0200 Subject: kmsan: introduce memset_no_sanitize_memory() Add a wrapper for memset() that prevents unpoisoning. This is useful for filling memory allocator redzones. Link: https://lkml.kernel.org/r/20240621113706.315500-13-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Reviewed-by: Alexander Potapenko Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Cc: Marco Elver Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt (Google) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 14b5ea6d3a43..7109644f4c19 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -255,6 +255,19 @@ void kmsan_enable_current(void); */ void kmsan_disable_current(void); +/** + * memset_no_sanitize_memory(): Fill memory without KMSAN instrumentation. + * @s: address of kernel memory to fill. + * @c: constant byte to fill the memory with. + * @n: number of bytes to fill. + * + * This is like memset(), but without KMSAN instrumentation. + */ +static inline void *memset_no_sanitize_memory(void *s, int c, size_t n) +{ + return __memset(s, c, n); +} + #else static inline void kmsan_init_shadow(void) @@ -362,6 +375,11 @@ static inline void kmsan_disable_current(void) { } +static inline void *memset_no_sanitize_memory(void *s, int c, size_t n) +{ + return memset(s, c, n); +} + #endif #endif /* _LINUX_KMSAN_H */ -- cgit v1.2.3 From e6553e2f79b2673b239c3dd47a88451119eb82d9 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 21 Jun 2024 13:35:00 +0200 Subject: kmsan: expose KMSAN_WARN_ON() KMSAN_WARN_ON() is required for implementing s390-specific KMSAN functions, but right now it's available only to the KMSAN internal functions. Expose it to subsystems through . Link: https://lkml.kernel.org/r/20240621113706.315500-17-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Reviewed-by: Alexander Potapenko Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Cc: Marco Elver Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt (Google) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 7109644f4c19..2b1432cc16d5 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -268,6 +268,29 @@ static inline void *memset_no_sanitize_memory(void *s, int c, size_t n) return __memset(s, c, n); } +extern bool kmsan_enabled; +extern int panic_on_kmsan; + +/* + * KMSAN performs a lot of consistency checks that are currently enabled by + * default. BUG_ON is normally discouraged in the kernel, unless used for + * debugging, but KMSAN itself is a debugging tool, so it makes little sense to + * recover if something goes wrong. + */ +#define KMSAN_WARN_ON(cond) \ + ({ \ + const bool __cond = WARN_ON(cond); \ + if (unlikely(__cond)) { \ + WRITE_ONCE(kmsan_enabled, false); \ + if (panic_on_kmsan) { \ + /* Can't call panic() here because */ \ + /* of uaccess checks. */ \ + BUG(); \ + } \ + } \ + __cond; \ + }) + #else static inline void kmsan_init_shadow(void) @@ -380,6 +403,8 @@ static inline void *memset_no_sanitize_memory(void *s, int c, size_t n) return memset(s, c, n); } +#define KMSAN_WARN_ON WARN_ON + #endif #endif /* _LINUX_KMSAN_H */ -- cgit v1.2.3 From ee86814b0562f18255b55c5e6a01a022895994cf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 20 Jun 2024 23:29:35 +0200 Subject: mm/migrate: move NUMA hinting fault folio isolation + checks under PTL Currently we always take a folio reference even if migration will not even be tried or isolation failed, requiring us to grab+drop an additional reference. Further, we end up calling folio_likely_mapped_shared() while the folio might have already been unmapped, because after we dropped the PTL, that can easily happen. We want to stop touching mapcounts and friends from such context, and only call folio_likely_mapped_shared() while the folio is still mapped: mapcount information is pretty much stale and unreliable otherwise. So let's move checks into numamigrate_isolate_folio(), rename that function to migrate_misplaced_folio_prepare(), and call that function from callsites where we call migrate_misplaced_folio(), but still with the PTL held. We can now stop taking temporary folio references, and really only take a reference if folio isolation succeeded. Doing the folio_likely_mapped_shared() + folio isolation under PT lock is now similar to how we handle MADV_PAGEOUT. While at it, combine the folio_is_file_lru() checks. [david@redhat.com: fix list_del() corruption] Link: https://lkml.kernel.org/r/8f85c31a-e603-4578-bf49-136dae0d4b69@redhat.com Link: https://lkml.kernel.org/r/20240626191129.658CFC32782@smtp.kernel.org Link: https://lkml.kernel.org/r/20240620212935.656243-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Baolin Wang Reviewed-by: Zi Yan Tested-by: Donet Tom Signed-off-by: Andrew Morton --- include/linux/migrate.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 517f70b70620..af2579ae93f2 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -140,9 +140,16 @@ const struct movable_operations *page_movable_ops(struct page *page) } #ifdef CONFIG_NUMA_BALANCING +int migrate_misplaced_folio_prepare(struct folio *folio, + struct vm_area_struct *vma, int node); int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, int node); #else +static inline int migrate_misplaced_folio_prepare(struct folio *folio, + struct vm_area_struct *vma, int node) +{ + return -EAGAIN; /* can't migrate now */ +} static inline int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, int node) { -- cgit v1.2.3 From bb82ac31ddcedd6a1e6ee30b7afec7acdef811e1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 25 Jun 2024 12:18:55 +0200 Subject: readahead: drop index argument of page_cache_async_readahead() The index argument of page_cache_async_readahead() is just folio->index so there's no point in passing is separately. Drop it. Link: https://lkml.kernel.org/r/20240625101909.12234-5-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Josef Bacik Tested-by: Zhang Peng Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e37e16ebff7a..eef99375dcf1 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1293,8 +1293,7 @@ void page_cache_sync_readahead(struct address_space *mapping, * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. - * @folio: The folio at @index which triggered the readahead call. - * @index: Index of first page to be read. + * @folio: The folio which triggered the readahead call. * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which @@ -1305,9 +1304,9 @@ void page_cache_sync_readahead(struct address_space *mapping, static inline void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, - struct folio *folio, pgoff_t index, unsigned long req_count) + struct folio *folio, unsigned long req_count) { - DEFINE_READAHEAD(ractl, file, ra, mapping, index); + DEFINE_READAHEAD(ractl, file, ra, mapping, folio->index); page_cache_async_ra(&ractl, folio, req_count); } -- cgit v1.2.3 From fa2690af573dfefb47ba6eef888797a64b6b5f3c Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 25 Jun 2024 13:53:50 -0700 Subject: mm: page_ref: remove folio_try_get_rcu() The below bug was reported on a non-SMP kernel: [ 275.267158][ T4335] ------------[ cut here ]------------ [ 275.267949][ T4335] kernel BUG at include/linux/page_ref.h:275! [ 275.268526][ T4335] invalid opcode: 0000 [#1] KASAN PTI [ 275.269001][ T4335] CPU: 0 PID: 4335 Comm: trinity-c3 Not tainted 6.7.0-rc4-00061-gefa7df3e3bb5 #1 [ 275.269787][ T4335] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [ 275.270679][ T4335] RIP: 0010:try_get_folio (include/linux/page_ref.h:275 (discriminator 3) mm/gup.c:79 (discriminator 3)) [ 275.272813][ T4335] RSP: 0018:ffffc90005dcf650 EFLAGS: 00010202 [ 275.273346][ T4335] RAX: 0000000000000246 RBX: ffffea00066e0000 RCX: 0000000000000000 [ 275.274032][ T4335] RDX: fffff94000cdc007 RSI: 0000000000000004 RDI: ffffea00066e0034 [ 275.274719][ T4335] RBP: ffffea00066e0000 R08: 0000000000000000 R09: fffff94000cdc006 [ 275.275404][ T4335] R10: ffffea00066e0037 R11: 0000000000000000 R12: 0000000000000136 [ 275.276106][ T4335] R13: ffffea00066e0034 R14: dffffc0000000000 R15: ffffea00066e0008 [ 275.276790][ T4335] FS: 00007fa2f9b61740(0000) GS:ffffffff89d0d000(0000) knlGS:0000000000000000 [ 275.277570][ T4335] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 275.278143][ T4335] CR2: 00007fa2f6c00000 CR3: 0000000134b04000 CR4: 00000000000406f0 [ 275.278833][ T4335] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 275.279521][ T4335] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 275.280201][ T4335] Call Trace: [ 275.280499][ T4335] [ 275.280751][ T4335] ? die (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:434 arch/x86/kernel/dumpstack.c:447) [ 275.281087][ T4335] ? do_trap (arch/x86/kernel/traps.c:112 arch/x86/kernel/traps.c:153) [ 275.281463][ T4335] ? try_get_folio (include/linux/page_ref.h:275 (discriminator 3) mm/gup.c:79 (discriminator 3)) [ 275.281884][ T4335] ? try_get_folio (include/linux/page_ref.h:275 (discriminator 3) mm/gup.c:79 (discriminator 3)) [ 275.282300][ T4335] ? do_error_trap (arch/x86/kernel/traps.c:174) [ 275.282711][ T4335] ? try_get_folio (include/linux/page_ref.h:275 (discriminator 3) mm/gup.c:79 (discriminator 3)) [ 275.283129][ T4335] ? handle_invalid_op (arch/x86/kernel/traps.c:212) [ 275.283561][ T4335] ? try_get_folio (include/linux/page_ref.h:275 (discriminator 3) mm/gup.c:79 (discriminator 3)) [ 275.283990][ T4335] ? exc_invalid_op (arch/x86/kernel/traps.c:264) [ 275.284415][ T4335] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:568) [ 275.284859][ T4335] ? try_get_folio (include/linux/page_ref.h:275 (discriminator 3) mm/gup.c:79 (discriminator 3)) [ 275.285278][ T4335] try_grab_folio (mm/gup.c:148) [ 275.285684][ T4335] __get_user_pages (mm/gup.c:1297 (discriminator 1)) [ 275.286111][ T4335] ? __pfx___get_user_pages (mm/gup.c:1188) [ 275.286579][ T4335] ? __pfx_validate_chain (kernel/locking/lockdep.c:3825) [ 275.287034][ T4335] ? mark_lock (kernel/locking/lockdep.c:4656 (discriminator 1)) [ 275.287416][ T4335] __gup_longterm_locked (mm/gup.c:1509 mm/gup.c:2209) [ 275.288192][ T4335] ? __pfx___gup_longterm_locked (mm/gup.c:2204) [ 275.288697][ T4335] ? __pfx_lock_acquire (kernel/locking/lockdep.c:5722) [ 275.289135][ T4335] ? __pfx___might_resched (kernel/sched/core.c:10106) [ 275.289595][ T4335] pin_user_pages_remote (mm/gup.c:3350) [ 275.290041][ T4335] ? __pfx_pin_user_pages_remote (mm/gup.c:3350) [ 275.290545][ T4335] ? find_held_lock (kernel/locking/lockdep.c:5244 (discriminator 1)) [ 275.290961][ T4335] ? mm_access (kernel/fork.c:1573) [ 275.291353][ T4335] process_vm_rw_single_vec+0x142/0x360 [ 275.291900][ T4335] ? __pfx_process_vm_rw_single_vec+0x10/0x10 [ 275.292471][ T4335] ? mm_access (kernel/fork.c:1573) [ 275.292859][ T4335] process_vm_rw_core+0x272/0x4e0 [ 275.293384][ T4335] ? hlock_class (arch/x86/include/asm/bitops.h:227 arch/x86/include/asm/bitops.h:239 include/asm-generic/bitops/instrumented-non-atomic.h:142 kernel/locking/lockdep.c:228) [ 275.293780][ T4335] ? __pfx_process_vm_rw_core+0x10/0x10 [ 275.294350][ T4335] process_vm_rw (mm/process_vm_access.c:284) [ 275.294748][ T4335] ? __pfx_process_vm_rw (mm/process_vm_access.c:259) [ 275.295197][ T4335] ? __task_pid_nr_ns (include/linux/rcupdate.h:306 (discriminator 1) include/linux/rcupdate.h:780 (discriminator 1) kernel/pid.c:504 (discriminator 1)) [ 275.295634][ T4335] __x64_sys_process_vm_readv (mm/process_vm_access.c:291) [ 275.296139][ T4335] ? syscall_enter_from_user_mode (kernel/entry/common.c:94 kernel/entry/common.c:112) [ 275.296642][ T4335] do_syscall_64 (arch/x86/entry/common.c:51 (discriminator 1) arch/x86/entry/common.c:82 (discriminator 1)) [ 275.297032][ T4335] ? __task_pid_nr_ns (include/linux/rcupdate.h:306 (discriminator 1) include/linux/rcupdate.h:780 (discriminator 1) kernel/pid.c:504 (discriminator 1)) [ 275.297470][ T4335] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4300 kernel/locking/lockdep.c:4359) [ 275.297988][ T4335] ? do_syscall_64 (arch/x86/include/asm/cpufeature.h:171 arch/x86/entry/common.c:97) [ 275.298389][ T4335] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4300 kernel/locking/lockdep.c:4359) [ 275.298906][ T4335] ? do_syscall_64 (arch/x86/include/asm/cpufeature.h:171 arch/x86/entry/common.c:97) [ 275.299304][ T4335] ? do_syscall_64 (arch/x86/include/asm/cpufeature.h:171 arch/x86/entry/common.c:97) [ 275.299703][ T4335] ? do_syscall_64 (arch/x86/include/asm/cpufeature.h:171 arch/x86/entry/common.c:97) [ 275.300115][ T4335] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) This BUG is the VM_BUG_ON(!in_atomic() && !irqs_disabled()) assertion in folio_ref_try_add_rcu() for non-SMP kernel. The process_vm_readv() calls GUP to pin the THP. An optimization for pinning THP instroduced by commit 57edfcfd3419 ("mm/gup: accelerate thp gup even for "pages != NULL"") calls try_grab_folio() to pin the THP, but try_grab_folio() is supposed to be called in atomic context for non-SMP kernel, for example, irq disabled or preemption disabled, due to the optimization introduced by commit e286781d5f2e ("mm: speculative page references"). The commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") is not actually the root cause although it was bisected to. It just makes the problem exposed more likely. The follow up discussion suggested the optimization for non-SMP kernel may be out-dated and not worth it anymore [1]. So removing the optimization to silence the BUG. However calling try_grab_folio() in GUP slow path actually is unnecessary, so the following patch will clean this up. [1] https://lore.kernel.org/linux-mm/821cf1d6-92b9-4ac4-bacc-d8f2364ac14f@paulmck-laptop/ Link: https://lkml.kernel.org/r/20240625205350.1777481-1-yang@os.amperecomputing.com Fixes: 57edfcfd3419 ("mm/gup: accelerate thp gup even for "pages != NULL"") Signed-off-by: Yang Shi Reported-by: kernel test robot Tested-by: Oliver Sang Acked-by: Peter Xu Acked-by: David Hildenbrand Cc: Christoph Lameter Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Rik van Riel Cc: Vivek Kasireddy Cc: [6.6+] Signed-off-by: Andrew Morton --- include/linux/page_ref.h | 49 ++---------------------------------------------- 1 file changed, 2 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 1acf5bac7f50..490d0ad6e56d 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -258,54 +258,9 @@ static inline bool folio_try_get(struct folio *folio) return folio_ref_add_unless(folio, 1, 0); } -static inline bool folio_ref_try_add_rcu(struct folio *folio, int count) -{ -#ifdef CONFIG_TINY_RCU - /* - * The caller guarantees the folio will not be freed from interrupt - * context, so (on !SMP) we only need preemption to be disabled - * and TINY_RCU does that for us. - */ -# ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic() && !irqs_disabled()); -# endif - VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio); - folio_ref_add(folio, count); -#else - if (unlikely(!folio_ref_add_unless(folio, count, 0))) { - /* Either the folio has been freed, or will be freed. */ - return false; - } -#endif - return true; -} - -/** - * folio_try_get_rcu - Attempt to increase the refcount on a folio. - * @folio: The folio. - * - * This is a version of folio_try_get() optimised for non-SMP kernels. - * If you are still holding the rcu_read_lock() after looking up the - * page and know that the page cannot have its refcount decreased to - * zero in interrupt context, you can use this instead of folio_try_get(). - * - * Example users include get_user_pages_fast() (as pages are not unmapped - * from interrupt context) and the page cache lookups (as pages are not - * truncated from interrupt context). We also know that pages are not - * frozen in interrupt context for the purposes of splitting or migration. - * - * You can also use this function if you're holding a lock that prevents - * pages being frozen & removed; eg the i_pages lock for the page cache - * or the mmap_lock or page table lock for page tables. In this case, - * it will always succeed, and you could have used a plain folio_get(), - * but it's sometimes more convenient to have a common function called - * from both locked and RCU-protected contexts. - * - * Return: True if the reference count was successfully incremented. - */ -static inline bool folio_try_get_rcu(struct folio *folio) +static inline bool folio_ref_try_add(struct folio *folio, int count) { - return folio_ref_try_add_rcu(folio, 1); + return folio_ref_add_unless(folio, count, 0); } static inline int page_ref_freeze(struct page *page, int count) -- cgit v1.2.3 From 82f0b6f041fad768c28b4ad05a683065412c226e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 25 Jun 2024 20:16:39 -0400 Subject: mm: prevent derefencing NULL ptr in pfn_section_valid() Commit 5ec8e8ea8b77 ("mm/sparsemem: fix race in accessing memory_section->usage") changed pfn_section_valid() to add a READ_ONCE() call around "ms->usage" to fix a race with section_deactivate() where ms->usage can be cleared. The READ_ONCE() call, by itself, is not enough to prevent NULL pointer dereference. We need to check its value before dereferencing it. Link: https://lkml.kernel.org/r/20240626001639.1350646-1-longman@redhat.com Fixes: 5ec8e8ea8b77 ("mm/sparsemem: fix race in accessing memory_section->usage") Signed-off-by: Waiman Long Cc: Charan Teja Kalla Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 586a8f0104d7..1dc6248feb83 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1979,8 +1979,9 @@ static inline int subsection_map_index(unsigned long pfn) static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); + struct mem_section_usage *usage = READ_ONCE(ms->usage); - return test_bit(idx, READ_ONCE(ms->usage)->subsection_map); + return usage ? test_bit(idx, usage->subsection_map) : 0; } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) -- cgit v1.2.3 From 099d90642a711caae377f53309abfe27e8724a8b Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 27 Jun 2024 10:39:49 +1000 Subject: mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray Patch series "mm/filemap: Limit page cache size to that supported by xarray", v2. Currently, xarray can't support arbitrary page cache size. More details can be found from the WARN_ON() statement in xas_split_alloc(). In our test whose code is attached below, we hit the WARN_ON() on ARM64 system where the base page size is 64KB and huge page size is 512MB. The issue was reported long time ago and some discussions on it can be found here [1]. [1] https://www.spinics.net/lists/linux-xfs/msg75404.html In order to fix the issue, we need to adjust MAX_PAGECACHE_ORDER to one supported by xarray and avoid PMD-sized page cache if needed. The code changes are suggested by David Hildenbrand. PATCH[1] adjusts MAX_PAGECACHE_ORDER to that supported by xarray PATCH[2-3] avoids PMD-sized page cache in the synchronous readahead path PATCH[4] avoids PMD-sized page cache for shmem files if needed Test program ============ # cat test.c #define _GNU_SOURCE #include #include #include #include #include #include #include #include #define TEST_XFS_FILENAME "/tmp/data" #define TEST_SHMEM_FILENAME "/dev/shm/data" #define TEST_MEM_SIZE 0x20000000 int main(int argc, char **argv) { const char *filename; int fd = 0; void *buf = (void *)-1, *p; int pgsize = getpagesize(); int ret; if (pgsize != 0x10000) { fprintf(stderr, "64KB base page size is required\n"); return -EPERM; } system("echo force > /sys/kernel/mm/transparent_hugepage/shmem_enabled"); system("rm -fr /tmp/data"); system("rm -fr /dev/shm/data"); system("echo 1 > /proc/sys/vm/drop_caches"); /* Open xfs or shmem file */ filename = TEST_XFS_FILENAME; if (argc > 1 && !strcmp(argv[1], "shmem")) filename = TEST_SHMEM_FILENAME; fd = open(filename, O_CREAT | O_RDWR | O_TRUNC); if (fd < 0) { fprintf(stderr, "Unable to open <%s>\n", filename); return -EIO; } /* Extend file size */ ret = ftruncate(fd, TEST_MEM_SIZE); if (ret) { fprintf(stderr, "Error %d to ftruncate()\n", ret); goto cleanup; } /* Create VMA */ buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (buf == (void *)-1) { fprintf(stderr, "Unable to mmap <%s>\n", filename); goto cleanup; } fprintf(stdout, "mapped buffer at 0x%p\n", buf); ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE); if (ret) { fprintf(stderr, "Unable to madvise(MADV_HUGEPAGE)\n"); goto cleanup; } /* Populate VMA */ ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_WRITE); if (ret) { fprintf(stderr, "Error %d to madvise(MADV_POPULATE_WRITE)\n", ret); goto cleanup; } /* Punch the file to enforce xarray split */ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, TEST_MEM_SIZE - pgsize, pgsize); if (ret) fprintf(stderr, "Error %d to fallocate()\n", ret); cleanup: if (buf != (void *)-1) munmap(buf, TEST_MEM_SIZE); if (fd > 0) close(fd); return 0; } # gcc test.c -o test # cat /proc/1/smaps | grep KernelPageSize | head -n 1 KernelPageSize: 64 kB # ./test shmem : ------------[ cut here ]------------ WARNING: CPU: 17 PID: 5253 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128 Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \ nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \ nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \ ip_set nf_tables rfkill nfnetlink vfat fat virtio_balloon \ drm fuse xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 \ virtio_net sha1_ce net_failover failover virtio_console virtio_blk \ dimlib virtio_mmio CPU: 17 PID: 5253 Comm: test Kdump: loaded Tainted: G W 6.10.0-rc5-gavin+ #12 Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024 pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) pc : xas_split_alloc+0xf8/0x128 lr : split_huge_page_to_list_to_order+0x1c4/0x720 sp : ffff80008a92f5b0 x29: ffff80008a92f5b0 x28: ffff80008a92f610 x27: ffff80008a92f728 x26: 0000000000000cc0 x25: 000000000000000d x24: ffff0000cf00c858 x23: ffff80008a92f610 x22: ffffffdfc0600000 x21: 0000000000000000 x20: 0000000000000000 x19: ffffffdfc0600000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000018000000000 x15: 3374004000000000 x14: 0000e00000000000 x13: 0000000000002000 x12: 0000000000000020 x11: 3374000000000000 x10: 3374e1c0ffff6000 x9 : ffffb463a84c681c x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff00011c976ce0 x5 : ffffb463aa47e378 x4 : 0000000000000000 x3 : 0000000000000cc0 x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000 Call trace: xas_split_alloc+0xf8/0x128 split_huge_page_to_list_to_order+0x1c4/0x720 truncate_inode_partial_folio+0xdc/0x160 shmem_undo_range+0x2bc/0x6a8 shmem_fallocate+0x134/0x430 vfs_fallocate+0x124/0x2e8 ksys_fallocate+0x4c/0xa0 __arm64_sys_fallocate+0x24/0x38 invoke_syscall.constprop.0+0x7c/0xd8 do_el0_svc+0xb4/0xd0 el0_svc+0x44/0x1d8 el0t_64_sync_handler+0x134/0x150 el0t_64_sync+0x17c/0x180 This patch (of 4): The largest page cache order can be HPAGE_PMD_ORDER (13) on ARM64 with 64KB base page size. The xarray entry with this order can't be split as the following error messages indicate. ------------[ cut here ]------------ WARNING: CPU: 35 PID: 7484 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128 Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \ nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \ nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \ ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm \ fuse xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 \ sha1_ce virtio_net net_failover virtio_console virtio_blk failover \ dimlib virtio_mmio CPU: 35 PID: 7484 Comm: test Kdump: loaded Tainted: G W 6.10.0-rc5-gavin+ #9 Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024 pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) pc : xas_split_alloc+0xf8/0x128 lr : split_huge_page_to_list_to_order+0x1c4/0x720 sp : ffff800087a4f6c0 x29: ffff800087a4f6c0 x28: ffff800087a4f720 x27: 000000001fffffff x26: 0000000000000c40 x25: 000000000000000d x24: ffff00010625b858 x23: ffff800087a4f720 x22: ffffffdfc0780000 x21: 0000000000000000 x20: 0000000000000000 x19: ffffffdfc0780000 x18: 000000001ff40000 x17: 00000000ffffffff x16: 0000018000000000 x15: 51ec004000000000 x14: 0000e00000000000 x13: 0000000000002000 x12: 0000000000000020 x11: 51ec000000000000 x10: 51ece1c0ffff8000 x9 : ffffbeb961a44d28 x8 : 0000000000000003 x7 : ffffffdfc0456420 x6 : ffff0000e1aa6eb8 x5 : 20bf08b4fe778fca x4 : ffffffdfc0456420 x3 : 0000000000000c40 x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000 Call trace: xas_split_alloc+0xf8/0x128 split_huge_page_to_list_to_order+0x1c4/0x720 truncate_inode_partial_folio+0xdc/0x160 truncate_inode_pages_range+0x1b4/0x4a8 truncate_pagecache_range+0x84/0xa0 xfs_flush_unmap_range+0x70/0x90 [xfs] xfs_file_fallocate+0xfc/0x4d8 [xfs] vfs_fallocate+0x124/0x2e8 ksys_fallocate+0x4c/0xa0 __arm64_sys_fallocate+0x24/0x38 invoke_syscall.constprop.0+0x7c/0xd8 do_el0_svc+0xb4/0xd0 el0_svc+0x44/0x1d8 el0t_64_sync_handler+0x134/0x150 el0t_64_sync+0x17c/0x180 Fix it by decreasing MAX_PAGECACHE_ORDER to the largest supported order by xarray. For this specific case, MAX_PAGECACHE_ORDER is dropped from 13 to 11 when CONFIG_BASE_SMALL is disabled. Link: https://lkml.kernel.org/r/20240627003953.1262512-1-gshan@redhat.com Link: https://lkml.kernel.org/r/20240627003953.1262512-2-gshan@redhat.com Fixes: 793917d997df ("mm/readahead: Add large folio readahead") Signed-off-by: Gavin Shan Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Darrick J. Wong Cc: Don Dutile Cc: Hugh Dickins Cc: Linus Torvalds Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: William Kucharski Cc: Zhenyu Zhang Cc: [5.18+] Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 59f1df0cde5a..a0a026d2d244 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -354,11 +354,18 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) * a good order (that's 1MB if you're using 4kB pages) */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER +#define PREFERRED_MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER #else -#define MAX_PAGECACHE_ORDER 8 +#define PREFERRED_MAX_PAGECACHE_ORDER 8 #endif +/* + * xas_split_alloc() does not support arbitrary orders. This implies no + * 512MB THP on ARM64 with 64KB base page size. + */ +#define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) +#define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) + /** * mapping_set_large_folios() - Indicate the file supports large folios. * @mapping: The file. -- cgit v1.2.3 From 5a4d8944d6b1e1aaaa83ea42c116b520b4ed0394 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 27 Jun 2024 13:17:37 -0700 Subject: cachestat: do not flush stats in recency check syzbot detects that cachestat() is flushing stats, which can sleep, in its RCU read section (see [1]). This is done in the workingset_test_recent() step (which checks if the folio's eviction is recent). Move the stat flushing step to before the RCU read section of cachestat, and skip stat flushing during the recency check. [1]: https://lore.kernel.org/cgroups/000000000000f71227061bdf97e0@google.com/ Link: https://lkml.kernel.org/r/20240627201737.3506959-1-nphamcs@gmail.com Fixes: b00684722262 ("mm: workingset: move the stats flush into workingset_test_recent()") Signed-off-by: Nhat Pham Reported-by: syzbot+b7f13b2d0cc156edf61a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/cgroups/000000000000f71227061bdf97e0@google.com/ Debugged-by: Johannes Weiner Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Al Viro Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Kairui Song Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yosry Ahmed Cc: [6.8+] Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index bd450023b9a4..e685e93ba354 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -354,7 +354,8 @@ static inline swp_entry_t page_swap_entry(struct page *page) } /* linux/mm/workingset.c */ -bool workingset_test_recent(void *shadow, bool file, bool *workingset); +bool workingset_test_recent(void *shadow, bool file, bool *workingset, + bool flush); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); -- cgit v1.2.3 From bd225530a4c717714722c3731442b78954c765b3 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 27 Jun 2024 16:27:05 -0600 Subject: mm/hugetlb_vmemmap: fix race with speculative PFN walkers While investigating HVO for THPs [1], it turns out that speculative PFN walkers like compaction can race with vmemmap modifications, e.g., CPU 1 (vmemmap modifier) CPU 2 (speculative PFN walker) ------------------------------- ------------------------------ Allocates an LRU folio page1 Sees page1 Frees page1 Allocates a hugeTLB folio page2 (page1 being a tail of page2) Updates vmemmap mapping page1 get_page_unless_zero(page1) Even though page1->_refcount is zero after HVO, get_page_unless_zero() can still try to modify this read-only field, resulting in a crash. An independent report [2] confirmed this race. There are two discussed approaches to fix this race: 1. Make RO vmemmap RW so that get_page_unless_zero() can fail without triggering a PF. 2. Use RCU to make sure get_page_unless_zero() either sees zero page->_refcount through the old vmemmap or non-zero page->_refcount through the new one. The second approach is preferred here because: 1. It can prevent illegal modifications to struct page[] that has been HVO'ed; 2. It can be generalized, in a way similar to ZERO_PAGE(), to fix similar races in other places, e.g., arch_remove_memory() on x86 [3], which frees vmemmap mapping offlined struct page[]. While adding synchronize_rcu(), the goal is to be surgical, rather than optimized. Specifically, calls to synchronize_rcu() on the error handling paths can be coalesced, but it is not done for the sake of Simplicity: noticeably, this fix removes ~50% more lines than it adds. According to the hugetlb_optimize_vmemmap section in Documentation/admin-guide/sysctl/vm.rst, enabling HVO makes allocating or freeing hugeTLB pages "~2x slower than before". Having synchronize_rcu() on top makes those operations even worse, and this also affects the user interface /proc/sys/vm/nr_overcommit_hugepages. This is *very* hard to trigger: 1. Most hugeTLB use cases I know of are static, i.e., reserved at boot time, because allocating at runtime is not reliable at all. 2. On top of that, someone has to be very unlucky to get tripped over above, because the race window is so small -- I wasn't able to trigger it with a stress testing that does nothing but that (with THPs though). [1] https://lore.kernel.org/20240229183436.4110845-4-yuzhao@google.com/ [2] https://lore.kernel.org/917FFC7F-0615-44DD-90EE-9F85F8EA9974@linux.dev/ [3] https://lore.kernel.org/be130a96-a27e-4240-ad78-776802f57cad@redhat.com/ Link: https://lkml.kernel.org/r/20240627222705.2974207-1-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Muchun Song Cc: David Hildenbrand Cc: Frank van der Linden Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/page_ref.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 490d0ad6e56d..8c236c651d1d 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -230,7 +230,13 @@ static inline int folio_ref_dec_return(struct folio *folio) static inline bool page_ref_add_unless(struct page *page, int nr, int u) { - bool ret = atomic_add_unless(&page->_refcount, nr, u); + bool ret = false; + + rcu_read_lock(); + /* avoid writing to the vmemmap area being remapped */ + if (!page_is_fake_head(page) && page_ref_count(page) != u) + ret = atomic_add_unless(&page->_refcount, nr, u); + rcu_read_unlock(); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); -- cgit v1.2.3 From 011f583781fa46699f1d4c4e9c39ad68f05ced2d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 24 Jun 2024 11:39:32 +0200 Subject: genirq/irq_sim: add an extended irq_sim initializer Currently users of the interrupt simulator don't have any way of being notified about interrupts from the simulated domain being requested or released. This causes a problem for one of the users - the GPIO simulator - which is unable to lock the pins as interrupts. Define a structure containing callbacks to be executed on various irq_sim-related events (for now: irq request and release) and provide an extended function for creating simulated interrupt domains that takes it and a pointer to custom user data (to be passed to said callbacks) as arguments. Reviewed-by: Linus Walleij Acked-by: Linus Walleij Link: https://lore.kernel.org/r/20240624093934.17089-2-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/irq_sim.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq_sim.h b/include/linux/irq_sim.h index ab831e5ae748..89b4d8ff274b 100644 --- a/include/linux/irq_sim.h +++ b/include/linux/irq_sim.h @@ -16,11 +16,28 @@ * requested like normal irqs and enqueued from process context. */ +struct irq_sim_ops { + int (*irq_sim_irq_requested)(struct irq_domain *domain, + irq_hw_number_t hwirq, void *data); + void (*irq_sim_irq_released)(struct irq_domain *domain, + irq_hw_number_t hwirq, void *data); +}; + struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, unsigned int num_irqs); struct irq_domain *devm_irq_domain_create_sim(struct device *dev, struct fwnode_handle *fwnode, unsigned int num_irqs); +struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode, + unsigned int num_irqs, + const struct irq_sim_ops *ops, + void *data); +struct irq_domain * +devm_irq_domain_create_sim_full(struct device *dev, + struct fwnode_handle *fwnode, + unsigned int num_irqs, + const struct irq_sim_ops *ops, + void *data); void irq_domain_remove_sim(struct irq_domain *domain); #endif /* _LINUX_IRQ_SIM_H */ -- cgit v1.2.3 From 162e06871e6dcde861ef608e0c00a8b6a2d35d43 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 4 Jul 2024 11:45:15 +0530 Subject: block: t10-pi: Return correct ref tag when queue has no integrity profile Commit c6e56cf6b2e7 ("block: move integrity information into queue_limits") changed the ref tag calculation logic. It would break if there is no integrity profile. This in turn causes read/write failures for such cases. Fixes: c6e56cf6b2e7 ("block: move integrity information into queue_limits") Signed-off-by: Anuj Gupta Link: https://lore.kernel.org/r/20240704061515.282343-1-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/t10-pi.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 1773610010eb..2c59fe3efcd4 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -39,8 +39,11 @@ struct t10_pi_tuple { static inline u32 t10_pi_ref_tag(struct request *rq) { - unsigned int shift = rq->q->limits.integrity.interval_exp; + unsigned int shift = ilog2(queue_logical_block_size(rq->q)); + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + rq->q->limits.integrity.interval_exp) + shift = rq->q->limits.integrity.interval_exp; return blk_rq_pos(rq) >> (shift - SECTOR_SHIFT) & 0xffffffff; } @@ -61,8 +64,11 @@ static inline u64 lower_48_bits(u64 n) static inline u64 ext_pi_ref_tag(struct request *rq) { - unsigned int shift = rq->q->limits.integrity.interval_exp; + unsigned int shift = ilog2(queue_logical_block_size(rq->q)); + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + rq->q->limits.integrity.interval_exp) + shift = rq->q->limits.integrity.interval_exp; return lower_48_bits(blk_rq_pos(rq) >> (shift - SECTOR_SHIFT)); } -- cgit v1.2.3 From c10bc5614ce0027fa2282ad11827629d81957a3a Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 3 Jul 2024 20:44:18 +0200 Subject: ata,scsi: Remove wrappers ata_sas_tport_{add,delete}() The ata_sas_tport_add() and ata_sas_tport_delete() wrappers only exist in order to export the internal libata functions which they wrap. Remove the wrappers and instead export the libata functions directly. Reviewed-by: Hannes Reinecke Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240703184418.723066-12-cassel@kernel.org Signed-off-by: Niklas Cassel --- include/linux/libata.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 7d3bd7c9664a..586f0116d1d7 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1250,8 +1250,8 @@ extern struct ata_port *ata_sas_port_alloc(struct ata_host *, struct ata_port_info *, struct Scsi_Host *); extern void ata_port_probe(struct ata_port *ap); extern void ata_port_free(struct ata_port *ap); -extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap); -extern void ata_sas_tport_delete(struct ata_port *ap); +extern int ata_tport_add(struct device *parent, struct ata_port *ap); +extern void ata_tport_delete(struct ata_port *ap); int ata_sas_device_configure(struct scsi_device *sdev, struct queue_limits *lim, struct ata_port *ap); extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap); -- cgit v1.2.3 From 2199d6ff565d66ccf0ff1ea1c6466c379e7cbb58 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 3 Jul 2024 20:44:19 +0200 Subject: ata: libata: Remove unused function declaration for ata_scsi_detect() Remove unused function declaration for ata_scsi_detect(). Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240703184418.723066-13-cassel@kernel.org Signed-off-by: Niklas Cassel --- include/linux/libata.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 586f0116d1d7..580971e11804 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1082,7 +1082,6 @@ extern int ata_host_activate(struct ata_host *host, int irq, const struct scsi_host_template *sht); extern void ata_host_detach(struct ata_host *host); extern void ata_host_init(struct ata_host *, struct device *, struct ata_port_operations *); -extern int ata_scsi_detect(struct scsi_host_template *sht); extern int ata_scsi_ioctl(struct scsi_device *dev, unsigned int cmd, void __user *arg); #ifdef CONFIG_COMPAT -- cgit v1.2.3 From 23262cce529e9f437ce2fa8fe4c2fa5b2b182318 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 3 Jul 2024 20:44:20 +0200 Subject: ata: libata-core: Remove support for decreasing the number of ports Commit f31871951b38 ("libata: separate out ata_host_alloc() and ata_host_register()") added ata_host_alloc(), where the API allowed a LLD to overallocate the number of ports supplied to ata_host_alloc(), as long as the LLD decreased host->n_ports before calling ata_host_register(). However, this functionally has never ever been used by a single LLD. Because of the current API design, the assignment of ap->print_id is deferred until registration time, which is bad, because that means that the ata_port_*() print functions cannot be used by a LLD until after registration time, which means that a LLD is forced to use a print function that is non-port specific, even for a port specific error. Remove the support for decreasing the number of ports, such that it will be possible to assign ap->print_id earlier. Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240703184418.723066-14-cassel@kernel.org Signed-off-by: Niklas Cassel --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 580971e11804..b7c5d3f33368 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1069,7 +1069,7 @@ extern int sata_std_hardreset(struct ata_link *link, unsigned int *class, unsigned long deadline); extern void ata_std_postreset(struct ata_link *link, unsigned int *classes); -extern struct ata_host *ata_host_alloc(struct device *dev, int max_ports); +extern struct ata_host *ata_host_alloc(struct device *dev, int n_ports); extern struct ata_host *ata_host_alloc_pinfo(struct device *dev, const struct ata_port_info * const * ppi, int n_ports); extern void ata_host_get(struct ata_host *host); -- cgit v1.2.3 From 1dd63a6b573ffb989ca618e60b6469f885454606 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 3 Jul 2024 20:44:22 +0200 Subject: ata: libata-core: Remove local_port_no struct member ap->local_port_no is simply ap->port_no + 1. Since ap->local_port_no can be derived from ap->port_no, there is no need for the ap->local_port_no struct member, so remove ap->local_port_no. Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240703184418.723066-16-cassel@kernel.org Signed-off-by: Niklas Cassel --- include/linux/libata.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index b7c5d3f33368..84a7bfbac9fa 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -814,7 +814,6 @@ struct ata_port { /* Flags that change dynamically, protected by ap->lock */ unsigned int pflags; /* ATA_PFLAG_xxx */ unsigned int print_id; /* user visible unique port ID */ - unsigned int local_port_no; /* host local port num */ unsigned int port_no; /* 0 based port no. inside the host */ #ifdef CONFIG_ATA_SFF -- cgit v1.2.3 From 0d3603acffe294ffacaf75b18b46f182b68a50df Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 3 Jul 2024 20:44:25 +0200 Subject: ata,scsi: Remove wrapper ata_sas_port_alloc() The ata_sas_port_alloc() wrapper mainly exists in order to export the internal libata function which it wraps. The secondary reason is that it initializes some ata_port struct members. However, ata_sas_port_alloc() is only used in a single location, sas_ata_init(), which already performs some ata_port struct member initialization, so it does not make sense to spread this initialization out over two separate locations. Thus, remove the wrapper and instead export the libata function directly, and move the libsas specific ata_port initialization to sas_ata_init(), which already does some ata_port initialization. Reviewed-by: Hannes Reinecke Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240703184418.723066-19-cassel@kernel.org Signed-off-by: Niklas Cassel --- include/linux/libata.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 84a7bfbac9fa..17394098bee9 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1244,9 +1244,8 @@ extern int sata_link_debounce(struct ata_link *link, extern int sata_link_scr_lpm(struct ata_link *link, enum ata_lpm_policy policy, bool spm_wakeup); extern int ata_slave_link_init(struct ata_port *ap); -extern struct ata_port *ata_sas_port_alloc(struct ata_host *, - struct ata_port_info *, struct Scsi_Host *); extern void ata_port_probe(struct ata_port *ap); +extern struct ata_port *ata_port_alloc(struct ata_host *host); extern void ata_port_free(struct ata_port *ap); extern int ata_tport_add(struct device *parent, struct ata_port *ap); extern void ata_tport_delete(struct ata_port *ap); -- cgit v1.2.3 From a1944676767e855869b6af8e1c7e185372feaf31 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Sun, 30 Jun 2024 21:47:39 +0200 Subject: misc: keba: Add basic KEBA CP500 system FPGA support The KEBA CP500 system FPGA is a PCIe device, which consists of multiple IP cores. Every IP core has its own auxiliary driver. The cp500 driver registers an auxiliary device for each device and the corresponding drivers are loaded by the Linux driver infrastructure. Currently 3 variants of this device exists. Every variant has its own PCI device ID, which is used to determine the list of available IP cores. In this first version only the auxiliary device for the I2C controller is registered. Besides the auxiliary device registration some other basic functions of the FPGA are implemented; e.g, FPGA version sysfs file, keep FPGA configuration on reset sysfs file, error message for errors on the internal AXI bus of the FPGA. Signed-off-by: Gerhard Engleder Link: https://lore.kernel.org/r/20240630194740.7137-2-gerhard@engleder-embedded.com Signed-off-by: Greg Kroah-Hartman --- include/linux/misc/keba.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 include/linux/misc/keba.h (limited to 'include/linux') diff --git a/include/linux/misc/keba.h b/include/linux/misc/keba.h new file mode 100644 index 000000000000..323b31a847c5 --- /dev/null +++ b/include/linux/misc/keba.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2024, KEBA Industrial Automation Gmbh */ + +#ifndef _LINUX_MISC_KEBA_H +#define _LINUX_MISC_KEBA_H + +#include + +struct i2c_board_info; + +/** + * struct keba_i2c_auxdev - KEBA I2C auxiliary device + * @auxdev: auxiliary device object + * @io: address range of I2C controller IO memory + * @info_size: number of I2C devices to be probed + * @info: I2C devices to be probed + */ +struct keba_i2c_auxdev { + struct auxiliary_device auxdev; + struct resource io; + int info_size; + struct i2c_board_info *info; +}; + +#endif /* _LINUX_MISC_KEBA_H */ -- cgit v1.2.3 From 4acab508eb03dc1827db6005764de29299e07569 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 3 Jul 2024 10:31:41 +0200 Subject: thermal: core: constify 'type' in devm_thermal_of_cooling_device_register() The 'type' string passed to thermal_of_cooling_device_register() is a 'const char *', so do the same in the devm interface. Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20240703083141.96013-1-krzysztof.kozlowski@linaro.org Signed-off-by: Rafael J. Wysocki --- include/linux/thermal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index f1155c0439c4..f732dab20368 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -261,7 +261,7 @@ thermal_of_cooling_device_register(struct device_node *np, const char *, void *, struct thermal_cooling_device * devm_thermal_of_cooling_device_register(struct device *dev, struct device_node *np, - char *type, void *devdata, + const char *type, void *devdata, const struct thermal_cooling_device_ops *ops); void thermal_cooling_device_update(struct thermal_cooling_device *); void thermal_cooling_device_unregister(struct thermal_cooling_device *); @@ -305,7 +305,7 @@ thermal_of_cooling_device_register(struct device_node *np, static inline struct thermal_cooling_device * devm_thermal_of_cooling_device_register(struct device *dev, struct device_node *np, - char *type, void *devdata, + const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return ERR_PTR(-ENODEV); -- cgit v1.2.3 From 14678219cf4093e897ab353fd78eab7994d1be7d Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:35 +0800 Subject: iommu: Introduce domain attachment handle Currently, when attaching a domain to a device or its PASID, domain is stored within the iommu group. It could be retrieved for use during the window between attachment and detachment. With new features introduced, there's a need to store more information than just a domain pointer. This information essentially represents the association between a domain and a device. For example, the SVA code already has a custom struct iommu_sva which represents a bond between sva domain and a PASID of a device. Looking forward, the IOMMUFD needs a place to store the iommufd_device pointer in the core, so that the device object ID could be quickly retrieved in the critical fault handling path. Introduce domain attachment handle that explicitly represents the attachment relationship between a domain and a device or its PASID. Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240702063444.105814-2-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- include/linux/iommu.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 17b3f36ad843..afc5af0069bb 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -989,12 +989,22 @@ struct iommu_fwspec { /* ATS is supported */ #define IOMMU_FWSPEC_PCI_RC_ATS (1 << 0) +/* + * An iommu attach handle represents a relationship between an iommu domain + * and a PASID or RID of a device. It is allocated and managed by the component + * that manages the domain and is stored in the iommu group during the time the + * domain is attached. + */ +struct iommu_attach_handle { + struct iommu_domain *domain; +}; + /** * struct iommu_sva - handle to a device-mm bond */ struct iommu_sva { + struct iommu_attach_handle handle; struct device *dev; - struct iommu_domain *domain; struct list_head handle_item; refcount_t users; }; @@ -1052,7 +1062,8 @@ int iommu_device_claim_dma_owner(struct device *dev, void *owner); void iommu_device_release_dma_owner(struct device *dev); int iommu_attach_device_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); + struct device *dev, ioasid_t pasid, + struct iommu_attach_handle *handle); void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); struct iommu_domain * @@ -1388,7 +1399,8 @@ static inline int iommu_device_claim_dma_owner(struct device *dev, void *owner) } static inline int iommu_attach_device_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_attach_handle *handle) { return -ENODEV; } -- cgit v1.2.3 From 3e7f57d1ef3f5fbed58974fae38d35e430f57d35 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:36 +0800 Subject: iommu: Remove sva handle list The struct sva_iommu represents an association between an SVA domain and a PASID of a device. It's stored in the iommu group's pasid array and also tracked by a list in the per-mm data structure. Removes duplicate tracking of sva_iommu by eliminating the list. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240702063444.105814-3-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- include/linux/iommu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index afc5af0069bb..87ebcc29020e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1005,14 +1005,12 @@ struct iommu_attach_handle { struct iommu_sva { struct iommu_attach_handle handle; struct device *dev; - struct list_head handle_item; refcount_t users; }; struct iommu_mm_data { u32 pasid; struct list_head sva_domains; - struct list_head sva_handles; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, -- cgit v1.2.3 From 06cdcc32d65759d42c6340700796e2906045b6a5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:37 +0800 Subject: iommu: Add attach handle to struct iopf_group Previously, the domain that a page fault targets is stored in an iopf_group, which represents a minimal set of page faults. With the introduction of attach handle, replace the domain with the handle so that the fault handler can obtain more information as needed when handling the faults. iommu_report_device_fault() is currently used for SVA page faults, which handles the page fault in an internal cycle. The domain is retrieved with iommu_get_domain_for_dev_pasid() if the pasid in the fault message is valid. This doesn't work in IOMMUFD case, where if the pasid table of a device is wholly managed by user space, there is no domain attached to the PASID of the device, and all page faults are forwarded through a NESTING domain attaching to RID. Add a static flag in iommu ops, which indicates if the IOMMU driver supports user-managed PASID tables. In the iopf deliver path, if no attach handle found for the iopf PASID, roll back to RID domain when the IOMMU driver supports this capability. iommu_get_domain_for_dev_pasid() is no longer used and can be removed. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240702063444.105814-4-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- include/linux/iommu.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 87ebcc29020e..910aec80886e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -127,7 +127,7 @@ struct iopf_group { /* list node for iommu_fault_param::faults */ struct list_head pending_node; struct work_struct work; - struct iommu_domain *domain; + struct iommu_attach_handle *attach_handle; /* The device's fault data parameter. */ struct iommu_fault_param *fault_param; }; @@ -547,6 +547,10 @@ static inline int __iommu_copy_struct_from_user_array( * @default_domain: If not NULL this will always be set as the default domain. * This should be an IDENTITY/BLOCKED/PLATFORM domain. * Do not use in new drivers. + * @user_pasid_table: IOMMU driver supports user-managed PASID table. There is + * no user domain for each PASID and the I/O page faults are + * forwarded through the user domain attached to the device + * RID. */ struct iommu_ops { bool (*capable)(struct device *dev, enum iommu_cap); @@ -590,6 +594,7 @@ struct iommu_ops { struct iommu_domain *blocked_domain; struct iommu_domain *release_domain; struct iommu_domain *default_domain; + u8 user_pasid_table:1; }; /** @@ -1064,9 +1069,6 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, struct iommu_attach_handle *handle); void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); -struct iommu_domain * -iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid, - unsigned int type); ioasid_t iommu_alloc_global_pasid(struct device *dev); void iommu_free_global_pasid(ioasid_t pasid); #else /* CONFIG_IOMMU_API */ @@ -1408,13 +1410,6 @@ static inline void iommu_detach_device_pasid(struct iommu_domain *domain, { } -static inline struct iommu_domain * -iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid, - unsigned int type) -{ - return NULL; -} - static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) { return IOMMU_PASID_INVALID; -- cgit v1.2.3 From a27bf2743cb80d3b36b5b43e8e2e702412c41668 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Jun 2024 16:55:35 +0800 Subject: iommu: Add iommu_paging_domain_alloc() interface Commit <17de3f5fdd35> ("iommu: Retire bus ops") removes iommu ops from bus. The iommu subsystem no longer relies on bus for operations. So the bus parameter in iommu_domain_alloc() is no longer relevant. Add a new interface named iommu_paging_domain_alloc(), which explicitly indicates the allocation of a paging domain for DMA managed by a kernel driver. The new interface takes a device pointer as its parameter, that better aligns with the current iommu subsystem. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20240610085555.88197-2-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- include/linux/iommu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 17b3f36ad843..58ea0935d355 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -780,6 +780,7 @@ extern bool iommu_present(const struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); extern bool iommu_group_has_isolated_msi(struct iommu_group *group); extern struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus); +struct iommu_domain *iommu_paging_domain_alloc(struct device *dev); extern void iommu_domain_free(struct iommu_domain *domain); extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); @@ -1086,6 +1087,11 @@ static inline struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus return NULL; } +static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) +{ + return ERR_PTR(-ENODEV); +} + static inline void iommu_domain_free(struct iommu_domain *domain) { } -- cgit v1.2.3 From 3f7c320916282c26812d70cfe8830abb9e4dc696 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 2 Jul 2024 12:40:48 +0100 Subject: iommu: Resolve fwspec ops automatically There's no real need for callers to resolve ops from a fwnode in order to then pass both to iommu_fwspec_init() - it's simpler and more sensible for that to resolve the ops itself. This in turn means we can centralise the notion of checking for a present driver, and enforce that fwspecs aren't allocated unless and until we know they will be usable. Also use this opportunity to modernise with some "new" helpers that arrived shortly after this code was first written; the generic fwnode_handle_get() clears up that ugly get/put mismatch, while of_fwnode_handle() can now abstract those open-coded dereferences. Tested-by: Jean-Philippe Brucker Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/0e2727adeb8cd73274425322f2f793561bdc927e.1719919669.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- include/linux/iommu.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 17b3f36ad843..81893aad9ee4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1005,11 +1005,9 @@ struct iommu_mm_data { struct list_head sva_handles; }; -int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, - const struct iommu_ops *ops); +int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids); -const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { @@ -1315,8 +1313,7 @@ static inline void iommu_device_unlink(struct device *dev, struct device *link) } static inline int iommu_fwspec_init(struct device *dev, - struct fwnode_handle *iommu_fwnode, - const struct iommu_ops *ops) + struct fwnode_handle *iommu_fwnode) { return -ENODEV; } @@ -1331,12 +1328,6 @@ static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids, return -ENODEV; } -static inline -const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode) -{ - return NULL; -} - static inline int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat) { -- cgit v1.2.3 From 3e36c15fc1cce65cccc93ed16f86d8ff9d2f9992 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 2 Jul 2024 12:40:51 +0100 Subject: iommu: Remove iommu_fwspec ops The ops in iommu_fwspec are only needed for the early configuration and probe process, and by now are easy enough to derive on-demand in those couple of places which need them, so remove the redundant stored copy. Tested-by: Jean-Philippe Brucker Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/55c1410b2cd09531eab4f8e2f18f92a0faa0ea75.1719919669.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- include/linux/iommu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 81893aad9ee4..11ae1750cb1d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -968,7 +968,6 @@ extern struct iommu_group *generic_single_device_group(struct device *dev); /** * struct iommu_fwspec - per-device IOMMU instance data - * @ops: ops for this device's IOMMU * @iommu_fwnode: firmware handle for this device's IOMMU * @flags: IOMMU_FWSPEC_* flags * @num_ids: number of associated device IDs @@ -979,7 +978,6 @@ extern struct iommu_group *generic_single_device_group(struct device *dev); * consumers. */ struct iommu_fwspec { - const struct iommu_ops *ops; struct fwnode_handle *iommu_fwnode; u32 flags; unsigned int num_ids; -- cgit v1.2.3 From 7640f1a44eba0cc1a41b312080c236f5c668cd50 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 3 Jul 2024 13:06:08 +0300 Subject: printk: Add match_devname_and_update_preferred_console() Let's add match_devname_and_update_preferred_console() for driver subsystems to call during init when the console is ready, and it's character device name is known. For now, we use it only for the serial layer to allow console=DEVNAME:0.0 style hardware based addressing for consoles. The earlier attempt on doing this caused a regression with the kernel command line console order as it added calling __add_preferred_console() again later on during init. A better approach was suggested by Petr where we add the deferred console to the console_cmdline[] and update it later on when the console is ready. Suggested-by: Petr Mladek Co-developed-by: Petr Mladek Signed-off-by: Tony Lindgren Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240703100615.118762-2-tony.lindgren@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/printk.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 65c5184470f1..7239976698e4 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -60,6 +60,10 @@ static inline const char *printk_skip_headers(const char *buffer) #define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT #define CONSOLE_LOGLEVEL_QUIET CONFIG_CONSOLE_LOGLEVEL_QUIET +int match_devname_and_update_preferred_console(const char *match, + const char *name, + const short idx); + extern int console_printk[]; #define console_loglevel (console_printk[0]) -- cgit v1.2.3 From 473b2cf9c4d1711146da1d8574c61e92c6672892 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 6 Jun 2024 12:56:35 +0530 Subject: PCI: endpoint: Introduce 'epc_deinit' event and notify the EPF drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As like the 'epc_init' event, that is used to signal the EPF drivers about the EPC initialization, let's introduce 'epc_deinit' event that is used to signal EPC deinitialization. The EPC deinitialization applies only when any sort of fundamental reset is supported by the endpoint controller as per the PCIe spec. Reference: PCIe r6.0, sec 4.2.5.9.1 and 6.6.1. Currently, some EPC drivers like pcie-qcom-ep and pcie-tegra194 support PERST# as the fundamental reset. So the 'deinit' event will be notified to the EPF drivers when PERST# assert happens in the above mentioned EPC drivers. The EPF drivers, on receiving the event through the epc_deinit() callback should reset the EPF state machine and also cleanup any configuration that got affected by the fundamental reset like BAR, DMA etc... This change also warrants skipping the cleanups in unbind() if already done in epc_deinit(). Link: https://lore.kernel.org/r/20240606-pci-deinit-v1-2-4395534520dc@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Krzysztof Wilczyński Reviewed-by: Niklas Cassel Reviewed-by: Siddharth Vadapalli Reviewed-by: Frank Li --- include/linux/pci-epc.h | 13 +++++++++++++ include/linux/pci-epf.h | 2 ++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 11115cd0fe5b..85bdf2adb760 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -197,6 +197,8 @@ struct pci_epc_features { #define to_pci_epc(device) container_of((device), struct pci_epc, dev) +#ifdef CONFIG_PCI_ENDPOINT + #define pci_epc_create(dev, ops) \ __pci_epc_create((dev), (ops), THIS_MODULE) #define devm_pci_epc_create(dev, ops) \ @@ -226,6 +228,7 @@ void pci_epc_linkup(struct pci_epc *epc); void pci_epc_linkdown(struct pci_epc *epc); void pci_epc_init_notify(struct pci_epc *epc); void pci_epc_notify_pending_init(struct pci_epc *epc, struct pci_epf *epf); +void pci_epc_deinit_notify(struct pci_epc *epc); void pci_epc_bus_master_enable_notify(struct pci_epc *epc); void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf, enum pci_epc_interface_type type); @@ -272,4 +275,14 @@ void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc, phys_addr_t *phys_addr, size_t size); void pci_epc_mem_free_addr(struct pci_epc *epc, phys_addr_t phys_addr, void __iomem *virt_addr, size_t size); + +#else +static inline void pci_epc_init_notify(struct pci_epc *epc) +{ +} + +static inline void pci_epc_deinit_notify(struct pci_epc *epc) +{ +} +#endif /* CONFIG_PCI_ENDPOINT */ #endif /* __LINUX_PCI_EPC_H */ diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index dc759eb7157c..0639d4dc8986 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -71,12 +71,14 @@ struct pci_epf_ops { /** * struct pci_epc_event_ops - Callbacks for capturing the EPC events * @epc_init: Callback for the EPC initialization complete event + * @epc_deinit: Callback for the EPC deinitialization event * @link_up: Callback for the EPC link up event * @link_down: Callback for the EPC link down event * @bus_master_enable: Callback for the EPC Bus Master Enable event */ struct pci_epc_event_ops { int (*epc_init)(struct pci_epf *epf); + void (*epc_deinit)(struct pci_epf *epf); int (*link_up)(struct pci_epf *epf); int (*link_down)(struct pci_epf *epf); int (*bus_master_enable)(struct pci_epf *epf); -- cgit v1.2.3 From e269d79c7d35aa3808b1f3c1737d63dab504ddc8 Mon Sep 17 00:00:00 2001 From: Denis Arefev Date: Thu, 13 Jun 2024 12:54:48 +0300 Subject: net: missing check virtio Two missing check in virtio_net_hdr_to_skb() allowed syzbot to crash kernels again 1. After the skb_segment function the buffer may become non-linear (nr_frags != 0), but since the SKBTX_SHARED_FRAG flag is not set anywhere the __skb_linearize function will not be executed, then the buffer will remain non-linear. Then the condition (offset >= skb_headlen(skb)) becomes true, which causes WARN_ON_ONCE in skb_checksum_help. 2. The struct sk_buff and struct virtio_net_hdr members must be mathematically related. (gso_size) must be greater than (needed) otherwise WARN_ON_ONCE. (remainder) must be greater than (needed) otherwise WARN_ON_ONCE. (remainder) may be 0 if division is without remainder. offset+2 (4191) > skb_headlen() (1116) WARNING: CPU: 1 PID: 5084 at net/core/dev.c:3303 skb_checksum_help+0x5e2/0x740 net/core/dev.c:3303 Modules linked in: CPU: 1 PID: 5084 Comm: syz-executor336 Not tainted 6.7.0-rc3-syzkaller-00014-gdf60cee26a2e #0 Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 11/10/2023 RIP: 0010:skb_checksum_help+0x5e2/0x740 net/core/dev.c:3303 Code: 89 e8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 52 01 00 00 44 89 e2 2b 53 74 4c 89 ee 48 c7 c7 40 57 e9 8b e8 af 8f dd f8 90 <0f> 0b 90 90 e9 87 fe ff ff e8 40 0f 6e f9 e9 4b fa ff ff 48 89 ef RSP: 0018:ffffc90003a9f338 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff888025125780 RCX: ffffffff814db209 RDX: ffff888015393b80 RSI: ffffffff814db216 RDI: 0000000000000001 RBP: ffff8880251257f4 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 000000000000045c R13: 000000000000105f R14: ffff8880251257f0 R15: 000000000000105d FS: 0000555555c24380(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000002000f000 CR3: 0000000023151000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ip_do_fragment+0xa1b/0x18b0 net/ipv4/ip_output.c:777 ip_fragment.constprop.0+0x161/0x230 net/ipv4/ip_output.c:584 ip_finish_output_gso net/ipv4/ip_output.c:286 [inline] __ip_finish_output net/ipv4/ip_output.c:308 [inline] __ip_finish_output+0x49c/0x650 net/ipv4/ip_output.c:295 ip_finish_output+0x31/0x310 net/ipv4/ip_output.c:323 NF_HOOK_COND include/linux/netfilter.h:303 [inline] ip_output+0x13b/0x2a0 net/ipv4/ip_output.c:433 dst_output include/net/dst.h:451 [inline] ip_local_out+0xaf/0x1a0 net/ipv4/ip_output.c:129 iptunnel_xmit+0x5b4/0x9b0 net/ipv4/ip_tunnel_core.c:82 ipip6_tunnel_xmit net/ipv6/sit.c:1034 [inline] sit_tunnel_xmit+0xed2/0x28f0 net/ipv6/sit.c:1076 __netdev_start_xmit include/linux/netdevice.h:4940 [inline] netdev_start_xmit include/linux/netdevice.h:4954 [inline] xmit_one net/core/dev.c:3545 [inline] dev_hard_start_xmit+0x13d/0x6d0 net/core/dev.c:3561 __dev_queue_xmit+0x7c1/0x3d60 net/core/dev.c:4346 dev_queue_xmit include/linux/netdevice.h:3134 [inline] packet_xmit+0x257/0x380 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3087 [inline] packet_sendmsg+0x24ca/0x5240 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 __sys_sendto+0x255/0x340 net/socket.c:2190 __do_sys_sendto net/socket.c:2202 [inline] __se_sys_sendto net/socket.c:2198 [inline] __x64_sys_sendto+0xe0/0x1b0 net/socket.c:2198 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Found by Linux Verification Center (linuxtesting.org) with Syzkaller Fixes: 0f6925b3e8da ("virtio_net: Do not pull payload in skb->head") Signed-off-by: Denis Arefev Message-Id: <20240613095448.27118-1-arefev@swemel.ru> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_net.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 4dfa9b69ca8d..d1d7825318c3 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -56,6 +56,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, unsigned int thlen = 0; unsigned int p_off = 0; unsigned int ip_proto; + u64 ret, remainder, gso_size; if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { @@ -98,6 +99,16 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset); u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16)); + if (hdr->gso_size) { + gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); + ret = div64_u64_rem(skb->len, gso_size, &remainder); + if (!(ret && (hdr->gso_size > needed) && + ((remainder > needed) || (remainder == 0)))) { + return -EINVAL; + } + skb_shinfo(skb)->tx_flags |= SKBFL_SHARED_FRAG; + } + if (!pskb_may_pull(skb, needed)) return -EINVAL; -- cgit v1.2.3 From a1cacb8a8e70c38ec0c78910c668abda99fcb780 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 24 Jun 2024 17:19:56 +0200 Subject: backlight: Add BACKLIGHT_POWER_ constants for power states Duplicate FB_BLANK_ constants as BACKLIGHT_POWER__ constants in the backlight header file. Allows backlight drivers to avoid including the fbdev header file and removes a compile-time dependency between the two subsystems. The new BACKLIGHT_POWER_ constants have the same values as their FB_BLANK_ counterparts. Hence UAPI and internal semantics do not change. The backlight drivers can be converted one by one. Each instance of FB_BLANK_UNBLANK becomes BACKLIGHT_POWER_ON, each of FB_BLANK_POWERDOWN becomes BACKLIGHT_POWER_OFF, and FB_BLANK_NORMAL becomes BACKLIGHT_POWER_REDUCED. Backlight code or drivers do not use FB_BLANK_VSYNC_SUSPEND and FB_BLANK_HSYNC_SUSPEND, so no new constants for these are being added. The semantics of FB_BLANK_NORMAL appear inconsistent. In fbdev, NORMAL means display off with sync enabled. In backlight code, this translates to turn the backlight off, but some drivers interpret it as backlight on. So we keep the current code as is, but mark BACKLIGHT_POWER_REDUCED as deprecated. Drivers should be fixed and the constant removed. This affects ams369fg06 and a few DRM panel drivers. v2: - rename BL_CORE_ power constants to BACKLIGHT_POWER_ (Sam) - fix documentation Signed-off-by: Thomas Zimmermann Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20240624152033.25016-2-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/backlight.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 19a1c0e22629..ea9c1bc8148e 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -209,15 +209,19 @@ struct backlight_properties { * attribute: /sys/class/backlight//bl_power * When the power property is updated update_status() is called. * - * The possible values are: (0: full on, 1 to 3: power saving - * modes; 4: full off), see FB_BLANK_XXX. + * The possible values are: (0: full on, 4: full off), see + * BACKLIGHT_POWER constants. * - * When the backlight device is enabled @power is set - * to FB_BLANK_UNBLANK. When the backlight device is disabled - * @power is set to FB_BLANK_POWERDOWN. + * When the backlight device is enabled, @power is set to + * BACKLIGHT_POWER_ON. When the backlight device is disabled, + * @power is set to BACKLIGHT_POWER_OFF. */ int power; +#define BACKLIGHT_POWER_ON (0) +#define BACKLIGHT_POWER_OFF (4) +#define BACKLIGHT_POWER_REDUCED (1) // deprecated; don't use in new code + /** * @type: The type of backlight supported. * @@ -346,7 +350,7 @@ static inline int backlight_enable(struct backlight_device *bd) if (!bd) return 0; - bd->props.power = FB_BLANK_UNBLANK; + bd->props.power = BACKLIGHT_POWER_ON; bd->props.state &= ~BL_CORE_FBBLANK; return backlight_update_status(bd); @@ -361,7 +365,7 @@ static inline int backlight_disable(struct backlight_device *bd) if (!bd) return 0; - bd->props.power = FB_BLANK_POWERDOWN; + bd->props.power = BACKLIGHT_POWER_OFF; bd->props.state |= BL_CORE_FBBLANK; return backlight_update_status(bd); @@ -380,7 +384,7 @@ static inline int backlight_disable(struct backlight_device *bd) */ static inline bool backlight_is_blank(const struct backlight_device *bd) { - return bd->props.power != FB_BLANK_UNBLANK || + return bd->props.power != BACKLIGHT_POWER_ON || bd->props.state & (BL_CORE_SUSPENDED | BL_CORE_FBBLANK); } -- cgit v1.2.3 From 326ae03d772de6e082d346466671c3b66de2962d Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 7 May 2024 17:53:56 +0100 Subject: mfd: idt8a340_reg: Start comments with '/*' Several comments in idt8a340_reg.h start with '/**', which denotes the start of a Kernel doc, but are otherwise not Kernel docs. Resolve this conflict by starting these comments with '/*' instead. Flagged by ./scripts/kernel-doc -none Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20240507-clockmatrix-kernel-doc-v2-1-3138d74192dd@kernel.org Signed-off-by: Lee Jones --- include/linux/mfd/idt8a340_reg.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/idt8a340_reg.h b/include/linux/mfd/idt8a340_reg.h index 0c706085c205..53a222605526 100644 --- a/include/linux/mfd/idt8a340_reg.h +++ b/include/linux/mfd/idt8a340_reg.h @@ -61,7 +61,7 @@ #define HW_Q8_CTRL_SPARE (0xa7d4) #define HW_Q11_CTRL_SPARE (0xa7ec) -/** +/* * Select FOD5 as sync_trigger for Q8 divider. * Transition from logic zero to one * sets trigger to sync Q8 divider. @@ -70,7 +70,7 @@ */ #define Q9_TO_Q8_SYNC_TRIG BIT(1) -/** +/* * Enable FOD5 as driver for clock and sync for Q8 divider. * Enable fanout buffer for FOD5. * @@ -78,7 +78,7 @@ */ #define Q9_TO_Q8_FANOUT_AND_CLOCK_SYNC_ENABLE_MASK (BIT(0) | BIT(2)) -/** +/* * Select FOD6 as sync_trigger for Q11 divider. * Transition from logic zero to one * sets trigger to sync Q11 divider. @@ -87,7 +87,7 @@ */ #define Q10_TO_Q11_SYNC_TRIG BIT(1) -/** +/* * Enable FOD6 as driver for clock and sync for Q11 divider. * Enable fanout buffer for FOD6. * -- cgit v1.2.3 From 13c151a919a876521c16810756764aa38683eb62 Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Thu, 13 Jun 2024 12:54:30 -0500 Subject: mfd: tps65912: Use devm helper functions to simplify probe This simplifies probe and also allows us to remove the remove callbacks from the core and interface drivers. Do that here. Signed-off-by: Andrew Davis Link: https://lore.kernel.org/r/20240613175430.57698-1-afd@ti.com Signed-off-by: Lee Jones --- include/linux/mfd/tps65912.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tps65912.h b/include/linux/mfd/tps65912.h index 860ec0a16c96..e5373c302722 100644 --- a/include/linux/mfd/tps65912.h +++ b/include/linux/mfd/tps65912.h @@ -314,6 +314,5 @@ struct tps65912 { extern const struct regmap_config tps65912_regmap_config; int tps65912_device_init(struct tps65912 *tps); -void tps65912_device_exit(struct tps65912 *tps); #endif /* __LINUX_MFD_TPS65912_H */ -- cgit v1.2.3 From d7636117ca976e217b6452c082e7f7c041f4019c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Jun 2024 22:14:16 +0300 Subject: mfd: lm3533: Move to new GPIO descriptor-based APIs Legacy GPIO APIs are subject to remove. Convert the driver to new APIs. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240605191458.2536819-1-andriy.shevchenko@linux.intel.com Signed-off-by: Lee Jones --- include/linux/mfd/lm3533.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/lm3533.h b/include/linux/mfd/lm3533.h index 77092f6363ad..69059a7a2ce5 100644 --- a/include/linux/mfd/lm3533.h +++ b/include/linux/mfd/lm3533.h @@ -16,6 +16,7 @@ DEVICE_ATTR(_name, S_IRUGO | S_IWUSR , show_##_name, store_##_name) struct device; +struct gpio_desc; struct regmap; struct lm3533 { @@ -23,7 +24,7 @@ struct lm3533 { struct regmap *regmap; - int gpio_hwen; + struct gpio_desc *hwen; int irq; unsigned have_als:1; @@ -69,8 +70,6 @@ enum lm3533_boost_ovp { }; struct lm3533_platform_data { - int gpio_hwen; - enum lm3533_boost_ovp boost_ovp; enum lm3533_boost_freq boost_freq; -- cgit v1.2.3 From caa93b7c25945d302689de07bd404655db93ae6e Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 3 Jul 2024 13:18:49 +0100 Subject: ethtool: move firmware flashing flag to struct ethtool_netdev_state Commit 31e0aa99dc02 ("ethtool: Veto some operations during firmware flashing process") added a flag module_fw_flash_in_progress to struct net_device. As this is ethtool related state, move it to the recently created struct ethtool_netdev_state, accessed via the 'ethtool' member of struct net_device. Suggested-by: Jakub Kicinski Signed-off-by: Edward Cree Reviewed-by: Michal Kubiak Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20240703121849.652893-1-edward.cree@amd.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 ++ include/linux/netdevice.h | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index f74bb0cf8ed1..3a99238ef895 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1107,11 +1107,13 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, * @rss_lock: Protects entries in @rss_ctx. May be taken from * within RTNL. * @wol_enabled: Wake-on-LAN is enabled + * @module_fw_flash_in_progress: Module firmware flashing is in progress. */ struct ethtool_netdev_state { struct xarray rss_ctx; struct mutex rss_lock; unsigned wol_enabled:1; + unsigned module_fw_flash_in_progress:1; }; struct phy_device; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3c719f0d5f5a..93558645c6d0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1989,8 +1989,6 @@ enum netdev_reg_state { * * @threaded: napi threaded mode is enabled * - * @module_fw_flash_in_progress: Module firmware flashing is in progress. - * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved * to another network namespace. @@ -2376,7 +2374,6 @@ struct net_device { bool proto_down; bool threaded; - unsigned module_fw_flash_in_progress:1; struct list_head net_notifier_list; #if IS_ENABLED(CONFIG_MACSEC) -- cgit v1.2.3 From a61809a33239821d70eba77bd0d6d13c29bbad0d Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Wed, 3 Jul 2024 18:33:14 +0300 Subject: tpm: Address !chip->auth in tpm_buf_append_name() Unless tpm_chip_bootstrap() was called by the driver, !chip->auth can cause a null derefence in tpm_buf_append_name(). Thus, address !chip->auth in tpm_buf_append_name() and remove the fallback implementation for !TCG_TPM2_HMAC. Cc: stable@vger.kernel.org # v6.10+ Reported-by: Stefan Berger Closes: https://lore.kernel.org/linux-integrity/20240617193408.1234365-1-stefanb@linux.ibm.com/ Fixes: d0a25bb961e6 ("tpm: Add HMAC session name/handle append") Tested-by: Michael Ellerman # ppc Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 21a67dc9efe8..4d3071e885a0 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -490,11 +490,22 @@ static inline void tpm_buf_append_empty_auth(struct tpm_buf *buf, u32 handle) { } #endif + +static inline struct tpm2_auth *tpm2_chip_auth(struct tpm_chip *chip) +{ #ifdef CONFIG_TCG_TPM2_HMAC + return chip->auth; +#else + return NULL; +#endif +} -int tpm2_start_auth_session(struct tpm_chip *chip); void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf, u32 handle, u8 *name); + +#ifdef CONFIG_TCG_TPM2_HMAC + +int tpm2_start_auth_session(struct tpm_chip *chip); void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf, u8 attributes, u8 *passphrase, int passphraselen); @@ -521,14 +532,6 @@ static inline int tpm2_start_auth_session(struct tpm_chip *chip) static inline void tpm2_end_auth_session(struct tpm_chip *chip) { } -static inline void tpm_buf_append_name(struct tpm_chip *chip, - struct tpm_buf *buf, - u32 handle, u8 *name) -{ - tpm_buf_append_u32(buf, handle); - /* count the number of handles in the upper bits of flags */ - buf->handles++; -} static inline void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf, u8 attributes, u8 *passphrase, -- cgit v1.2.3 From 7ca110f2679b7d1f3ac1afc90e6ffbf0af3edf0d Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Wed, 3 Jul 2024 18:47:46 +0300 Subject: tpm: Address !chip->auth in tpm_buf_append_hmac_session*() Unless tpm_chip_bootstrap() was called by the driver, !chip->auth can cause a null derefence in tpm_buf_hmac_session*(). Thus, address !chip->auth in tpm_buf_hmac_session*() and remove the fallback implementation for !TCG_TPM2_HMAC. Cc: stable@vger.kernel.org # v6.9+ Reported-by: Stefan Berger Closes: https://lore.kernel.org/linux-integrity/20240617193408.1234365-1-stefanb@linux.ibm.com/ Fixes: 1085b8276bb4 ("tpm: Add the rest of the session HMAC API") Tested-by: Michael Ellerman # ppc Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 68 ++++++++++++++++------------------------------------- 1 file changed, 20 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 4d3071e885a0..e93ee8d936a9 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -502,10 +502,6 @@ static inline struct tpm2_auth *tpm2_chip_auth(struct tpm_chip *chip) void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf, u32 handle, u8 *name); - -#ifdef CONFIG_TCG_TPM2_HMAC - -int tpm2_start_auth_session(struct tpm_chip *chip); void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf, u8 attributes, u8 *passphrase, int passphraselen); @@ -515,9 +511,27 @@ static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip, u8 *passphrase, int passphraselen) { - tpm_buf_append_hmac_session(chip, buf, attributes, passphrase, - passphraselen); + struct tpm_header *head; + int offset; + + if (tpm2_chip_auth(chip)) { + tpm_buf_append_hmac_session(chip, buf, attributes, passphrase, passphraselen); + } else { + offset = buf->handles * 4 + TPM_HEADER_SIZE; + head = (struct tpm_header *)buf->data; + + /* + * If the only sessions are optional, the command tag must change to + * TPM2_ST_NO_SESSIONS. + */ + if (tpm_buf_length(buf) == offset) + head->tag = cpu_to_be16(TPM2_ST_NO_SESSIONS); + } } + +#ifdef CONFIG_TCG_TPM2_HMAC + +int tpm2_start_auth_session(struct tpm_chip *chip); void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf); int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf, int rc); @@ -532,48 +546,6 @@ static inline int tpm2_start_auth_session(struct tpm_chip *chip) static inline void tpm2_end_auth_session(struct tpm_chip *chip) { } -static inline void tpm_buf_append_hmac_session(struct tpm_chip *chip, - struct tpm_buf *buf, - u8 attributes, u8 *passphrase, - int passphraselen) -{ - /* offset tells us where the sessions area begins */ - int offset = buf->handles * 4 + TPM_HEADER_SIZE; - u32 len = 9 + passphraselen; - - if (tpm_buf_length(buf) != offset) { - /* not the first session so update the existing length */ - len += get_unaligned_be32(&buf->data[offset]); - put_unaligned_be32(len, &buf->data[offset]); - } else { - tpm_buf_append_u32(buf, len); - } - /* auth handle */ - tpm_buf_append_u32(buf, TPM2_RS_PW); - /* nonce */ - tpm_buf_append_u16(buf, 0); - /* attributes */ - tpm_buf_append_u8(buf, 0); - /* passphrase */ - tpm_buf_append_u16(buf, passphraselen); - tpm_buf_append(buf, passphrase, passphraselen); -} -static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip, - struct tpm_buf *buf, - u8 attributes, - u8 *passphrase, - int passphraselen) -{ - int offset = buf->handles * 4 + TPM_HEADER_SIZE; - struct tpm_header *head = (struct tpm_header *) buf->data; - - /* - * if the only sessions are optional, the command tag - * must change to TPM2_ST_NO_SESSIONS - */ - if (tpm_buf_length(buf) == offset) - head->tag = cpu_to_be16(TPM2_ST_NO_SESSIONS); -} static inline void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf) { -- cgit v1.2.3 From 87024f5837485c2f9541283747428df54c0f9183 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 24 Jun 2024 17:58:55 -0700 Subject: mm: memcg: rename soft limit reclaim-related functions Rename exported function related to the softlimit reclaim to have memcg1_ prefix. Link: https://lkml.kernel.org/r/20240625005906.106920-4-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7403dd5926eb..83c8327455d8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1121,9 +1121,9 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void split_page_memcg(struct page *head, int old_order, int new_order); -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned); +unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned); #else /* CONFIG_MEMCG */ @@ -1572,9 +1572,9 @@ static inline void split_page_memcg(struct page *head, int old_order, int new_or } static inline -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned) +unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) { return 0; } -- cgit v1.2.3 From 66d60c428b23c5b171218985b6880958fb7edbe3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 24 Jun 2024 17:58:58 -0700 Subject: mm: memcg: move legacy memcg event code into memcontrol-v1.c Cgroup v1's memory controller contains a pretty complicated event notifications mechanism which is not used on cgroup v2. Let's move the corresponding code into memcontrol-v1.c. Please, note, that mem_cgroup_event_ratelimit() remains in memcontrol.c, otherwise it would require exporting too many details on memcg stats outside of memcontrol.c. Link: https://lkml.kernel.org/r/20240625005906.106920-7-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 83c8327455d8..588179d29849 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -69,18 +69,6 @@ struct mem_cgroup_id { refcount_t ref; }; -/* - * Per memcg event counter is incremented at every pagein/pageout. With THP, - * it will be incremented by the number of pages. This counter is used - * to trigger some periodic events. This is straightforward and better - * than using jiffies etc. to handle periodic memcg event. - */ -enum mem_cgroup_events_target { - MEM_CGROUP_TARGET_THRESH, - MEM_CGROUP_TARGET_SOFTLIMIT, - MEM_CGROUP_NTARGETS, -}; - struct memcg_vmstats_percpu; struct memcg_vmstats; struct lruvec_stats_percpu; -- cgit v1.2.3 From 6f1173d6845974aeb654a7f070c0c9f21283e1b3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 24 Jun 2024 17:59:04 -0700 Subject: mm: memcg: group cgroup v1 memcg related declarations Group all cgroup v1-related declarations at the end of memcontrol.h and mm/memcontrol-v1.h with an intention to put them all together under a config option later on. It should make things easier to follow and maintain too. Link: https://lkml.kernel.org/r/20240625005906.106920-13-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 144 ++++++++++++++++++++++++--------------------- 1 file changed, 76 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 588179d29849..a70d64ed04f5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -950,39 +950,13 @@ static inline void mem_cgroup_exit_user_fault(void) current->in_user_fault = 0; } -static inline bool task_in_memcg_oom(struct task_struct *p) -{ - return p->memcg_in_oom; -} - -bool mem_cgroup_oom_synchronize(bool wait); struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); -void folio_memcg_lock(struct folio *folio); -void folio_memcg_unlock(struct folio *folio); - void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val); -/* try to stablize folio_memcg() for all the pages in a memcg */ -static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) -{ - rcu_read_lock(); - - if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) - return true; - - rcu_read_unlock(); - return false; -} - -static inline void mem_cgroup_unlock_pages(void) -{ - rcu_read_unlock(); -} - /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) @@ -1109,10 +1083,6 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void split_page_memcg(struct page *head, int old_order, int new_order); -unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned); - #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1423,26 +1393,6 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } -static inline void folio_memcg_lock(struct folio *folio) -{ -} - -static inline void folio_memcg_unlock(struct folio *folio) -{ -} - -static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) -{ - /* to match folio_memcg_rcu() */ - rcu_read_lock(); - return true; -} - -static inline void mem_cgroup_unlock_pages(void) -{ - rcu_read_unlock(); -} - static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } @@ -1455,16 +1405,6 @@ static inline void mem_cgroup_exit_user_fault(void) { } -static inline bool task_in_memcg_oom(struct task_struct *p) -{ - return false; -} - -static inline bool mem_cgroup_oom_synchronize(bool wait) -{ - return false; -} - static inline struct mem_cgroup *mem_cgroup_get_oom_group( struct task_struct *victim, struct mem_cgroup *oom_domain) { @@ -1558,14 +1498,6 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } - -static inline -unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, - gfp_t gfp_mask, - unsigned long *total_scanned) -{ - return 0; -} #endif /* CONFIG_MEMCG */ /* @@ -1916,4 +1848,80 @@ static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) } #endif + +/* Cgroup v1-related declarations */ + +#ifdef CONFIG_MEMCG +unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned); + +bool mem_cgroup_oom_synchronize(bool wait); + +static inline bool task_in_memcg_oom(struct task_struct *p) +{ + return p->memcg_in_oom; +} + +void folio_memcg_lock(struct folio *folio); +void folio_memcg_unlock(struct folio *folio); + +/* try to stablize folio_memcg() for all the pages in a memcg */ +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + rcu_read_lock(); + + if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) + return true; + + rcu_read_unlock(); + return false; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + +#else /* CONFIG_MEMCG */ +static inline +unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + return 0; +} + +static inline void folio_memcg_lock(struct folio *folio) +{ +} + +static inline void folio_memcg_unlock(struct folio *folio) +{ +} + +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + /* to match folio_memcg_rcu() */ + rcu_read_lock(); + return true; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + +static inline bool task_in_memcg_oom(struct task_struct *p) +{ + return false; +} + +static inline bool mem_cgroup_oom_synchronize(bool wait) +{ + return false; +} + +#endif /* CONFIG_MEMCG */ + #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.2.3 From e93d4166b40a84df83c4d5cb4c709d022808ef9b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 24 Jun 2024 17:59:05 -0700 Subject: mm: memcg: put cgroup v1-specific code under a config option Put legacy cgroup v1 memory controller code under a new CONFIG_MEMCG_V1 config option. The option is turned off by default. Nobody except those who are still using cgroup v1 should turn it on. If the option is not set, memory controller can still be mounted under cgroup v1, but none of memcg-specific control files are present. Please note, that not all cgroup v1's memory controller code is guarded yet (but most of it), it's a subject for some follow-up work. Thanks to Michal Hocko for providing a better Kconfig option description. [roman.gushchin@linux.dev: better config option description provided by Michal] Link: https://lkml.kernel.org/r/ZnxXNtvqllc9CDoo@google.com Link: https://lkml.kernel.org/r/20240625005906.106920-14-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a70d64ed04f5..796cfa842346 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1851,7 +1851,7 @@ static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) /* Cgroup v1-related declarations */ -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); @@ -1883,7 +1883,7 @@ static inline void mem_cgroup_unlock_pages(void) rcu_read_unlock(); } -#else /* CONFIG_MEMCG */ +#else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1922,6 +1922,6 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } -#endif /* CONFIG_MEMCG */ +#endif /* CONFIG_MEMCG_V1 */ #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.2.3 From 410abb20acaea9679fc1b2276d47e70fba331451 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 3 Jan 2024 08:48:36 -0800 Subject: mm: add defines for min/max swappiness Patch series "Add swappiness argument to memory.reclaim", v6. This patch proposes augmenting the memory.reclaim interface with a swappiness= argument that overrides the swappiness value for that instance of proactive reclaim. Userspace proactive reclaimers use the memory.reclaim interface to trigger reclaim. The memory.reclaim interface does not allow for any way to effect the balance of file vs anon during proactive reclaim. The only approach is to adjust the vm.swappiness setting. However, there are a few reasons we look to control the balance of file vs anon during proactive reclaim, separately from reactive reclaim: * Swapout should be limited to manage SSD write endurance. In near-OOM situations we are fine with lots of swap-out to avoid OOMs. As these are typically rare events, they have relatively little impact on write endurance. However, proactive reclaim runs continuously and so its impact on SSD write endurance is more significant. Therefore it is desireable to control swap-out for proactive reclaim separately from reactive reclaim * Some userspace OOM killers like systemd-oomd[1] support OOM killing on swap exhaustion. This makes sense if the swap exhaustion is triggered due to reactive reclaim but less so if it is triggered due to proactive reclaim (e.g. one could see OOMs when free memory is ample but anon is just particularly cold). Therefore, it's desireable to have proactive reclaim reduce or stop swap-out before the threshold at which OOM killing occurs. In the case of Meta's Senpai proactive reclaimer, we adjust vm.swappiness before writes to memory.reclaim[2]. This has been in production for nearly two years and has addressed our needs to control proactive vs reactive reclaim behavior but is still not ideal for a number of reasons: * vm.swappiness is a global setting, adjusting it can race/interfere with other system administration that wishes to control vm.swappiness. In our case, we need to disable Senpai before adjusting vm.swappiness. * vm.swappiness is stateful - so a crash or restart of Senpai can leave a misconfigured setting. This requires some additional management to record the "desired" setting and ensure Senpai always adjusts to it. With this patch, we avoid these downsides of adjusting vm.swappiness globally. Previously, this exact interface addition was proposed by Yosry[3]. In response, Roman proposed instead an interface to specify precise file/anon/slab reclaim amounts[4]. More recently Huan also proposed this as well[5] and others similarly questioned if this was the proper interface. Previous proposals sought to use this to allow proactive reclaimers to effectively perform a custom reclaim algorithm by issuing proactive reclaim with different settings to control file vs anon reclaim (e.g. to only reclaim anon from some applications). Responses argued that adjusting swappiness is a poor interface for custom reclaim. In contrast, I argue in favor of a swappiness setting not as a way to implement custom reclaim algorithms but rather to bias the balance of anon vs file due to differences of proactive vs reactive reclaim. In this context, swappiness is the existing interface for controlling this balance and this patch simply allows for it to be configured differently for proactive vs reactive reclaim. Specifying explicit amounts of anon vs file pages to reclaim feels inappropriate for this prupose. Proactive reclaimers are un-aware of the relative age of file vs anon for a cgroup which makes it difficult to manage proactive reclaim of different memory pools. A proactive reclaimer would need some amount of anon reclaim attempts separate from the amount of file reclaim attempts which seems brittle given that it's difficult to observe the impact. [1]https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html [2]https://github.com/facebookincubator/oomd/blob/main/src/oomd/plugins/Senpai.cpp#L585-L598 [3]https://lore.kernel.org/linux-mm/CAJD7tkbDpyoODveCsnaqBBMZEkDvshXJmNdbk51yKSNgD7aGdg@mail.gmail.com/ [4]https://lore.kernel.org/linux-mm/YoPHtHXzpK51F%2F1Z@carbon/ [5]https://lore.kernel.org/lkml/20231108065818.19932-1-link@vivo.com/ This patch (of 2): We use the constants 0 and 200 in a few places in the mm code when referring to the min and max swappiness. This patch adds MIN_SWAPPINESS and MAX_SWAPPINESS #defines to improve clarity. There are no functional changes. Link: https://lkml.kernel.org/r/20240103164841.2800183-1-schatzberg.dan@gmail.com Link: https://lkml.kernel.org/r/20240103164841.2800183-2-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Acked-by: David Rientjes Acked-by: Chris Li Reviewed-by: Nhat Pham Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shakeel Butt Cc: Tejun Heo Cc: Yosry Ahmed Cc: Yue Zhao Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index d33ce740b695..e97db0fa7659 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -404,6 +404,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) +#define MIN_SWAPPINESS 0 +#define MAX_SWAPPINESS 200 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, -- cgit v1.2.3 From 68cd9050d871e4db5433420b5ceb32f5512d18bc Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 3 Jan 2024 08:48:37 -0800 Subject: mm: add swappiness= arg to memory.reclaim Allow proactive reclaimers to submit an additional swappiness= argument to memory.reclaim. This overrides the global or per-memcg swappiness setting for that reclaim attempt. For example: echo "2M swappiness=0" > /sys/fs/cgroup/memory.reclaim will perform reclaim on the rootcg with a swappiness setting of 0 (no swap) regardless of the vm.swappiness sysctl setting. Userspace proactive reclaimers use the memory.reclaim interface to trigger reclaim. The memory.reclaim interface does not allow for any way to effect the balance of file vs anon during proactive reclaim. The only approach is to adjust the vm.swappiness setting. However, there are a few reasons we look to control the balance of file vs anon during proactive reclaim, separately from reactive reclaim: * Swapout should be limited to manage SSD write endurance. In near-OOM situations we are fine with lots of swap-out to avoid OOMs. As these are typically rare events, they have relatively little impact on write endurance. However, proactive reclaim runs continuously and so its impact on SSD write endurance is more significant. Therefore it is desireable to control swap-out for proactive reclaim separately from reactive reclaim * Some userspace OOM killers like systemd-oomd[1] support OOM killing on swap exhaustion. This makes sense if the swap exhaustion is triggered due to reactive reclaim but less so if it is triggered due to proactive reclaim (e.g. one could see OOMs when free memory is ample but anon is just particularly cold). Therefore, it's desireable to have proactive reclaim reduce or stop swap-out before the threshold at which OOM killing occurs. In the case of Meta's Senpai proactive reclaimer, we adjust vm.swappiness before writes to memory.reclaim[2]. This has been in production for nearly two years and has addressed our needs to control proactive vs reactive reclaim behavior but is still not ideal for a number of reasons: * vm.swappiness is a global setting, adjusting it can race/interfere with other system administration that wishes to control vm.swappiness. In our case, we need to disable Senpai before adjusting vm.swappiness. * vm.swappiness is stateful - so a crash or restart of Senpai can leave a misconfigured setting. This requires some additional management to record the "desired" setting and ensure Senpai always adjusts to it. With this patch, we avoid these downsides of adjusting vm.swappiness globally. [1]https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html [2]https://github.com/facebookincubator/oomd/blob/main/src/oomd/plugins/Senpai.cpp#L585-L598 Link: https://lkml.kernel.org/r/20240103164841.2800183-3-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Suggested-by: Yosry Ahmed Acked-by: Michal Hocko Acked-by: David Rientjes Acked-by: Chris Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shakeel Butt Cc: Tejun Heo Cc: Yue Zhao Cc: Zefan Li Cc: Nhat Pham Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index e97db0fa7659..3df75d62a835 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -409,7 +409,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options); + unsigned int reclaim_options, + int *swappiness); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, -- cgit v1.2.3 From 773e9ae77fe777ac09739d8c32b32fff147f2d66 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Jun 2024 21:03:10 +0000 Subject: mm: memcg: factor out legacy socket memory accounting code Move out the legacy cgroup v1 socket memory accounting code into mm/memcontrol-v1.c. This commit introduces three new functions: memcg1_tcpmem_active(), memcg1_charge_skmem() and memcg1_uncharge_skmem(), which contain all cgroup v1-specific code and become trivial if CONFIG_MEMCG_V1 isn't set. Note, that !!memcg->tcpmem_pressure check in mem_cgroup_under_socket_pressure() can't be easily moved into memcontrol-v1.h without including memcontrol-v1.h from memcontrol.h which isn't a good idea, so it's better to just #ifdef it. Link: https://lkml.kernel.org/r/20240628210317.272856-3-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 796cfa842346..44ab6394c9ed 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1650,8 +1650,10 @@ void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { +#ifdef CONFIG_MEMCG_V1 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return !!memcg->tcpmem_pressure; +#endif /* CONFIG_MEMCG_V1 */ do { if (time_before(jiffies, READ_ONCE(memcg->socket_pressure))) return true; -- cgit v1.2.3 From 94b7e5bf09b08aa4cce4625d0a4b307399b77e2c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Jun 2024 21:03:14 +0000 Subject: mm: memcg: put memcg1-specific struct mem_cgroup's members under CONFIG_MEMCG_V1 Put memcg1-specific members of struct mem_cgroup under the CONFIG_MEMCG_V1 config option. Also group them close to the end of struct mem_cgroup just before the dynamic per-node part. Link: https://lkml.kernel.org/r/20240628210317.272856-7-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 103 +++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 44ab6394c9ed..107b0c5d6eab 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -188,10 +188,6 @@ struct mem_cgroup { struct page_counter memsw; /* v1 only */ }; - /* Legacy consumer-oriented counters */ - struct page_counter kmem; /* v1 only */ - struct page_counter tcpmem; /* v1 only */ - /* Range enforcement for interrupt charges */ struct work_struct high_work; @@ -205,8 +201,6 @@ struct mem_cgroup { bool zswap_writeback; #endif - unsigned long soft_limit; - /* vmpressure notifications */ struct vmpressure vmpressure; @@ -215,13 +209,7 @@ struct mem_cgroup { */ bool oom_group; - /* protected by memcg_oom_lock */ - bool oom_lock; - int under_oom; - - int swappiness; - /* OOM-Killer disable */ - int oom_kill_disable; + int swappiness; /* memory.events and memory.events.local */ struct cgroup_file events_file; @@ -230,27 +218,6 @@ struct mem_cgroup { /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; - /* protect arrays of thresholds */ - struct mutex thresholds_lock; - - /* thresholds for memory usage. RCU-protected */ - struct mem_cgroup_thresholds thresholds; - - /* thresholds for mem+swap usage. RCU-protected */ - struct mem_cgroup_thresholds memsw_thresholds; - - /* For oom notifier event fd */ - struct list_head oom_notify; - - /* - * Should we move charges of a task when a task is moved into this - * mem_cgroup ? And what type of charges should we move ? - */ - unsigned long move_charge_at_immigrate; - /* taken only while moving_account > 0 */ - spinlock_t move_lock; - unsigned long move_lock_flags; - CACHELINE_PADDING(_pad1_); /* memory.stat */ @@ -267,10 +234,6 @@ struct mem_cgroup { */ unsigned long socket_pressure; - /* Legacy tcp memory accounting */ - bool tcpmem_active; - int tcpmem_pressure; - #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; /* @@ -284,14 +247,6 @@ struct mem_cgroup { struct list_head objcg_list; #endif - CACHELINE_PADDING(_pad2_); - - /* - * set > 0 if pages under this cgroup are moving to other cgroup. - */ - atomic_t moving_account; - struct task_struct *move_lock_task; - struct memcg_vmstats_percpu __percpu *vmstats_percpu; #ifdef CONFIG_CGROUP_WRITEBACK @@ -300,10 +255,6 @@ struct mem_cgroup { struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif - /* List of events which userspace want to receive */ - struct list_head event_list; - spinlock_t event_list_lock; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif @@ -313,6 +264,58 @@ struct mem_cgroup { struct lru_gen_mm_list mm_list; #endif +#ifdef CONFIG_MEMCG_V1 + /* Legacy consumer-oriented counters */ + struct page_counter kmem; /* v1 only */ + struct page_counter tcpmem; /* v1 only */ + + unsigned long soft_limit; + + /* protected by memcg_oom_lock */ + bool oom_lock; + int under_oom; + + /* OOM-Killer disable */ + int oom_kill_disable; + + /* protect arrays of thresholds */ + struct mutex thresholds_lock; + + /* thresholds for memory usage. RCU-protected */ + struct mem_cgroup_thresholds thresholds; + + /* thresholds for mem+swap usage. RCU-protected */ + struct mem_cgroup_thresholds memsw_thresholds; + + /* For oom notifier event fd */ + struct list_head oom_notify; + + /* + * Should we move charges of a task when a task is moved into this + * mem_cgroup ? And what type of charges should we move ? + */ + unsigned long move_charge_at_immigrate; + /* taken only while moving_account > 0 */ + spinlock_t move_lock; + unsigned long move_lock_flags; + + /* Legacy tcp memory accounting */ + bool tcpmem_active; + int tcpmem_pressure; + + CACHELINE_PADDING(_pad2_); + + /* + * set > 0 if pages under this cgroup are moving to other cgroup. + */ + atomic_t moving_account; + struct task_struct *move_lock_task; + + /* List of events which userspace want to receive */ + struct list_head event_list; + spinlock_t event_list_lock; +#endif /* CONFIG_MEMCG_V1 */ + struct mem_cgroup_per_node *nodeinfo[]; }; -- cgit v1.2.3 From 98c9daf5ae6be008f78c07b744bcff7bcc6e98da Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Jun 2024 21:03:15 +0000 Subject: mm: memcg: guard memcg1-specific members of struct mem_cgroup_per_node Put memcg1-specific members of struct mem_cgroup_per_node under the CONFIG_MEMCG_V1 config option. Link: https://lkml.kernel.org/r/20240628210317.272856-8-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 107b0c5d6eab..c7ef628ee882 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -92,6 +92,7 @@ struct mem_cgroup_per_node { struct lruvec_stats *lruvec_stats; struct shrinker_info __rcu *shrinker_info; +#ifdef CONFIG_MEMCG_V1 /* * Memcg-v1 only stuff in middle as buffer between read mostly fields * and update often fields to avoid false sharing. Once v1 stuff is @@ -102,6 +103,7 @@ struct mem_cgroup_per_node { unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; +#endif /* Fields which get updated often at the end. */ struct lruvec lruvec; -- cgit v1.2.3 From 1c3a0b3d0b11fb98c40360f849563185218c2dd4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Jun 2024 21:03:16 +0000 Subject: mm: memcg: put struct task_struct::memcg_in_oom under CONFIG_MEMCG_V1 The memcg_in_oom field of the struct task_struct is not used by the cgroup v2's memory controller, so it can be happily compiled out if CONFIG_MEMCG_V1 is not set. Link: https://lkml.kernel.org/r/20240628210317.272856-9-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/sched.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 61591ac6eab6..2a16023e8620 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1447,9 +1447,11 @@ struct task_struct { unsigned int kcov_softirq; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; +#endif +#ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; -- cgit v1.2.3 From 1419ff984aad4c08d121793f71a520b432ead88a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Jun 2024 21:03:17 +0000 Subject: mm: memcg: put struct task_struct::in_user_fault under CONFIG_MEMCG_V1 The struct task_struct's in_user_fault member is not used by the cgroup v2's memory controller, so it can be put under the CONFIG_MEMCG_V1 config option. To do so, mem_cgroup_enter_user_fault() and mem_cgroup_exit_user_fault() are moved under the CONFIG_MEMCG_V1 option as well. Link: https://lkml.kernel.org/r/20240628210317.272856-10-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 40 ++++++++++++++++++++-------------------- include/linux/sched.h | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c7ef628ee882..d0c9365ff039 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -943,18 +943,6 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg); -static inline void mem_cgroup_enter_user_fault(void) -{ - WARN_ON(current->in_user_fault); - current->in_user_fault = 1; -} - -static inline void mem_cgroup_exit_user_fault(void) -{ - WARN_ON(!current->in_user_fault); - current->in_user_fault = 0; -} - struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); @@ -1402,14 +1390,6 @@ static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } -static inline void mem_cgroup_enter_user_fault(void) -{ -} - -static inline void mem_cgroup_exit_user_fault(void) -{ -} - static inline struct mem_cgroup *mem_cgroup_get_oom_group( struct task_struct *victim, struct mem_cgroup *oom_domain) { @@ -1890,6 +1870,18 @@ static inline void mem_cgroup_unlock_pages(void) rcu_read_unlock(); } +static inline void mem_cgroup_enter_user_fault(void) +{ + WARN_ON(current->in_user_fault); + current->in_user_fault = 1; +} + +static inline void mem_cgroup_exit_user_fault(void) +{ + WARN_ON(!current->in_user_fault); + current->in_user_fault = 0; +} + #else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -1929,6 +1921,14 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } +static inline void mem_cgroup_enter_user_fault(void) +{ +} + +static inline void mem_cgroup_exit_user_fault(void) +{ +} + #endif /* CONFIG_MEMCG_V1 */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a16023e8620..a7770c566c4d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -934,7 +934,7 @@ struct task_struct { #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN -- cgit v1.2.3 From 81e7706345f06e1e97a092f59697b7e20a0ee868 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Jul 2024 14:28:14 +0900 Subject: dm: handle REQ_OP_ZONE_RESET_ALL This commit implements processing of the REQ_OP_ZONE_RESET_ALL operation for zoned mapped devices. Given that this operation always has a BIO sector of 0 and a 0 size, processing through the regular BIO __split_and_process_bio() function does not work because this function would always select the first target. Instead, handling of this operation is implemented using the function __send_zone_reset_all(). Similarly to the __send_empty_flush() function, the new __send_zone_reset_all() function manually goes through all targets of a mapped device table doing the following: 1) If the target can natively support REQ_OP_ZONE_RESET_ALL, __send_duplicate_bios() is used to forward the reset all operation to the target. This case is handled with the __send_zone_reset_all_native() function. 2) For other targets, the function __send_zone_reset_all_emulated() is executed to emulate the execution of REQ_OP_ZONE_RESET_ALL using regular REQ_OP_ZONE_RESET operations. Targets that can natively support REQ_OP_ZONE_RESET_ALL are identified using the new target field zone_reset_all_supported. This boolean is set to true in for targets that have reliable zone limits, that is, targets that map all sequential write required zones of their zoned device(s). Setting this field is handled in dm_set_zones_restrictions() and device_get_zone_resource_limits(). For targets with unreliable zone limits, REQ_OP_ZONE_RESET_ALL must be emulated (case 2 above). This is implemented with __send_zone_reset_all_emulated() and is similar to the block layer function blkdev_zone_reset_all_emulated(): first a report zones is done for the zones of the target to identify zones that need reset, that is, any sequential write required zone that is not already empty. This is done using a bitmap and the function dm_zone_get_reset_bitmap() which sets to 1 the bit corresponding to a zone that needs reset. Next, this zone bitmap is inspected and a clone BIO modified to use the REQ_OP_ZONE_RESET operation issued for any zone with its bit set in the zone bitmap. This implementation is more efficient than what the block layer does with blkdev_zone_reset_all_emulated(), which is always used for DM zoned devices currently: as we can natively use REQ_OP_ZONE_RESET_ALL on targets mapping all sequential write required zones, resetting all zones of a zoned mapped device can be much faster compared to always emulating this operation using regular per-zone reset. In the worst case, this implementation is as-efficient as the block layer emulation. This reduction in the time it takes to reset all zones of a zoned mapped device depends directly on the mapped device targets mapping (reliable zone limits or not). Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240704052816.623865-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/device-mapper.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 82b2195efaca..15d28164bbbd 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -357,6 +357,13 @@ struct dm_target { */ bool discards_supported:1; + /* + * Automatically set by dm-core if this target supports + * REQ_OP_ZONE_RESET_ALL. Otherwise, this operation will be emulated + * using REQ_OP_ZONE_RESET. Target drivers must not set this manually. + */ + bool zone_reset_all_supported:1; + /* * Set if this target requires that discards be split on * 'max_discard_sectors' boundaries. -- cgit v1.2.3 From f2a7bea23710fceb99dac6da4ef82c3cc8932f7f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Jul 2024 14:28:15 +0900 Subject: block: Remove REQ_OP_ZONE_RESET_ALL emulation Now that device mapper can handle resetting all zones of a mapped zoned device using REQ_OP_ZONE_RESET_ALL, all zoned block device drivers support this operation. With this, the request queue feature BLK_FEAT_ZONE_RESETALL is not necessary and the emulation code in blk-zone.c can be removed. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240704052816.623865-5-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4d0d4b83bc74..dc250d8070d2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -318,9 +318,6 @@ typedef unsigned int __bitwise blk_features_t; /* is a zoned device */ #define BLK_FEAT_ZONED ((__force blk_features_t)(1u << 10)) -/* supports Zone Reset All */ -#define BLK_FEAT_ZONE_RESETALL ((__force blk_features_t)(1u << 11)) - /* supports PCI(e) p2p requests */ #define BLK_FEAT_PCI_P2PDMA ((__force blk_features_t)(1u << 12)) @@ -618,8 +615,6 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL)) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) -#define blk_queue_zone_resetall(q) \ - ((q)->limits.features & BLK_FEAT_ZONE_RESETALL) #define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) #define blk_queue_pci_p2pdma(q) ((q)->limits.features & BLK_FEAT_PCI_P2PDMA) #ifdef CONFIG_BLK_RQ_ALLOC_TIME -- cgit v1.2.3 From bf86bcdb40123ee99669ee91b67e023669433a1a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 18:51:20 +0200 Subject: blk-lib: check for kill signal in ioctl BLKZEROOUT Zeroout can access a significant capacity and take longer than the user expected. A user may change their mind about wanting to run that command and attempt to kill the process and do something else with their device. But since the task is uninterruptable, they have to wait for it to finish, which could be many hours. Add a new BLKDEV_ZERO_KILLABLE flag for blkdev_issue_zeroout that checks for a fatal signal at each iteration so the user doesn't have to wait for their regretted operation to complete naturally. Heavily based on an earlier patch from Keith Busch. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240701165219.1571322-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dc250d8070d2..02e04df27282 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1095,6 +1095,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ +#define BLKDEV_ZERO_KILLABLE (1 << 2) /* interruptible by fatal signals */ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, -- cgit v1.2.3 From f37bee9508885ce5edd4875c744e1af28cfac76c Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Mon, 1 Jul 2024 21:28:32 +0300 Subject: net: pcs: xpcs: Move native device ID macro to linux/pcs/pcs-xpcs.h One of the next commits will alter the DW XPCS driver to support setting a custom device ID for the particular MDIO-device detected on the platform. The generic DW XPCS ID can be used as a custom ID as well in case if the DW XPCS-device was erroneously synthesized with no or some undefined ID. In addition to that having all supported DW XPCS device IDs defined in a single place will improve the code maintainability and readability. Note while at it rename the macros to being shorter and looking alike to the already defined NXP XPCS ID macro. Signed-off-by: Serge Semin Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/pcs/pcs-xpcs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index da3a6c30f6d2..8dfe90295f12 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -12,6 +12,8 @@ #define NXP_SJA1105_XPCS_ID 0x00000010 #define NXP_SJA1110_XPCS_ID 0x00000020 +#define DW_XPCS_ID 0x7996ced0 +#define DW_XPCS_ID_MASK 0xffffffff /* AN mode */ #define DW_AN_C73 1 -- cgit v1.2.3 From 71b200b388ef2ff1b547114f17dc1fd088bfc2c6 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Mon, 1 Jul 2024 21:28:34 +0300 Subject: net: pcs: xpcs: Convert xpcs_id to dw_xpcs_desc A structure with the PCS/PMA MMD IDs data is being introduced in one of the next commits. In order to prevent the names ambiguity let's convert the xpcs_id structure name to dw_xpcs_desc. The later version is more suitable since the structure content is indeed the device descriptor containing the data and callbacks required for the driver to correctly set the device up. Signed-off-by: Serge Semin Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/pcs/pcs-xpcs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index 8dfe90295f12..e706bd16b986 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -28,11 +28,11 @@ /* dev_flag */ #define DW_DEV_TXGBE BIT(0) -struct xpcs_id; +struct dw_xpcs_desc; struct dw_xpcs { + const struct dw_xpcs_desc *desc; struct mdio_device *mdiodev; - const struct xpcs_id *id; struct phylink_pcs pcs; phy_interface_t interface; int dev_flag; -- cgit v1.2.3 From bcac735cf653ef8dc6d7c08ed311e0a77f2665f0 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Mon, 1 Jul 2024 21:28:36 +0300 Subject: net: pcs: xpcs: Introduce DW XPCS info structure The being introduced structure will preserve the PCS and PMA IDs retrieved from the respective DW XPCS MMDs or potentially pre-defined by the client drivers. (The later change will be introduced later in the framework of the commit adding the memory-mapped DW XPCS devices support.) The structure fields are filled in in the xpcs_get_id() function, which used to be responsible for the PCS Device ID getting only. Besides of the PCS ID the method now fetches the PMA/PMD IDs too from MMD 1, which used to be done in xpcs_dev_flag(). The retrieved PMA ID will be from now utilized for the PMA-specific tweaks like it was introduced for the Wangxun TxGBE PCS in the commit f629acc6f210 ("net: pcs: xpcs: support to switch mode for Wangxun NICs"). Note 1. The xpcs_get_id() error-handling semantics has been changed. From now the error number will be returned from the function. There is no point in the next IOs or saving 0xffs and then looping over the actual device IDs if device couldn't be reached. -ENODEV will be returned if the very first IO operation failed thus indicating that no device could be found. Note 2. The PCS and PMA IDs macros have been converted to enum'es. The enum'es will be populated later in another commit with the virtual IDs identifying the DW XPCS devices which have some platform-specifics, but have been synthesized with the default PCS/PMA ID. Signed-off-by: Serge Semin Signed-off-by: David S. Miller --- include/linux/pcs/pcs-xpcs.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index e706bd16b986..1dc60f5e653f 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -9,11 +9,7 @@ #include #include - -#define NXP_SJA1105_XPCS_ID 0x00000010 -#define NXP_SJA1110_XPCS_ID 0x00000020 -#define DW_XPCS_ID 0x7996ced0 -#define DW_XPCS_ID_MASK 0xffffffff +#include /* AN mode */ #define DW_AN_C73 1 @@ -22,20 +18,30 @@ #define DW_AN_C37_1000BASEX 4 #define DW_10GBASER 5 -/* device vendor OUI */ -#define DW_OUI_WX 0x0018fc80 +struct dw_xpcs_desc; -/* dev_flag */ -#define DW_DEV_TXGBE BIT(0) +enum dw_xpcs_pcs_id { + NXP_SJA1105_XPCS_ID = 0x00000010, + NXP_SJA1110_XPCS_ID = 0x00000020, + DW_XPCS_ID = 0x7996ced0, + DW_XPCS_ID_MASK = 0xffffffff, +}; -struct dw_xpcs_desc; +enum dw_xpcs_pma_id { + WX_TXGBE_XPCS_PMA_10G_ID = 0x0018fc80, +}; + +struct dw_xpcs_info { + u32 pcs; + u32 pma; +}; struct dw_xpcs { + struct dw_xpcs_info info; const struct dw_xpcs_desc *desc; struct mdio_device *mdiodev; struct phylink_pcs pcs; phy_interface_t interface; - int dev_flag; }; int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface); -- cgit v1.2.3 From f6bb3e9d98c2e8d70587d5ddaf9426ef30d7865c Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Mon, 1 Jul 2024 21:28:38 +0300 Subject: net: pcs: xpcs: Add Synopsys DW xPCS platform device driver Synopsys DesignWare XPCS IP-core can be synthesized with the device CSRs being accessible over the MCI or APB3 interface instead of the MDIO bus (see the CSR_INTERFACE HDL parameter). Thus all the PCS registers can be just memory mapped and be a subject of the standard MMIO operations of course taking into account the peculiarities of the Clause C45 CSRs mapping. From that perspective the DW XPCS devices would look as just normal platform devices for the kernel. On the other hand in order to have the DW XPCS devices handled by the pcs-xpcs.c driver they need to be registered in the framework of the MDIO-subsystem. So the suggested change is about providing a DW XPCS platform device driver registering a virtual MDIO-bus with a single MDIO-device representing the DW XPCS device. DW XPCS platform device is supposed to be described by the respective compatible string "snps,dw-xpcs" (or with the PMA-specific compatible string), CSRs memory space and optional peripheral bus and reference clock sources. Depending on the INDIRECT_ACCESS IP-core synthesize parameter the memory-mapped reg-space can be represented as either directly or indirectly mapped Clause 45 space. In the former case the particular address is determined based on the MMD device and the registers offset (5 + 16 bits all together) within the device reg-space. In the later case there is only 8 lower address bits are utilized for the registers mapping (255 CSRs). The upper bits are supposed to be written into the respective viewport CSR in order to select the respective MMD sub-page. Note, only the peripheral bus clock source is requested in the platform device probe procedure. The core and pad clocks handling has been implemented in the framework of the xpcs_create() method intentionally since the clocks-related setups are supposed to be performed later, during the DW XPCS main configuration procedures. (For instance they will be required for the DW Gen5 10G PMA configuration.) Signed-off-by: Serge Semin Signed-off-by: David S. Miller --- include/linux/pcs/pcs-xpcs.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index 1dc60f5e653f..813be644647f 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -7,6 +7,8 @@ #ifndef __LINUX_PCS_XPCS_H #define __LINUX_PCS_XPCS_H +#include +#include #include #include #include @@ -21,6 +23,7 @@ struct dw_xpcs_desc; enum dw_xpcs_pcs_id { + DW_XPCS_ID_NATIVE = 0, NXP_SJA1105_XPCS_ID = 0x00000010, NXP_SJA1110_XPCS_ID = 0x00000020, DW_XPCS_ID = 0x7996ced0, @@ -28,6 +31,14 @@ enum dw_xpcs_pcs_id { }; enum dw_xpcs_pma_id { + DW_XPCS_PMA_ID_NATIVE = 0, + DW_XPCS_PMA_GEN1_3G_ID, + DW_XPCS_PMA_GEN2_3G_ID, + DW_XPCS_PMA_GEN2_6G_ID, + DW_XPCS_PMA_GEN4_3G_ID, + DW_XPCS_PMA_GEN4_6G_ID, + DW_XPCS_PMA_GEN5_10G_ID, + DW_XPCS_PMA_GEN5_12G_ID, WX_TXGBE_XPCS_PMA_10G_ID = 0x0018fc80, }; @@ -36,10 +47,17 @@ struct dw_xpcs_info { u32 pma; }; +enum dw_xpcs_clock { + DW_XPCS_CORE_CLK, + DW_XPCS_PAD_CLK, + DW_XPCS_NUM_CLKS, +}; + struct dw_xpcs { struct dw_xpcs_info info; const struct dw_xpcs_desc *desc; struct mdio_device *mdiodev; + struct clk_bulk_data clks[DW_XPCS_NUM_CLKS]; struct phylink_pcs pcs; phy_interface_t interface; }; -- cgit v1.2.3 From 9cad7275463ab9e52aeae8d2ecb169afee6876f3 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Mon, 1 Jul 2024 21:28:39 +0300 Subject: net: pcs: xpcs: Add fwnode-based descriptor creation method It's now possible to have the DW XPCS device defined as a standard platform device for instance in the platform DT-file. Although that functionality is useless unless there is a way to have the device found by the client drivers (STMMAC/DW *MAC, NXP SJA1105 Eth Switch, etc). Provide such ability by means of the xpcs_create_fwnode() method. It needs to be called with the device DW XPCS fwnode instance passed. That node will be then used to find the MDIO-device instance in order to create the DW XPCS descriptor. Note the method semantics and name is similar to what has been recently introduced in the Lynx PCS driver. Signed-off-by: Serge Semin Signed-off-by: David S. Miller --- include/linux/pcs/pcs-xpcs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index 813be644647f..b4a4eb6c8866 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -8,6 +8,7 @@ #define __LINUX_PCS_XPCS_H #include +#include #include #include #include @@ -72,6 +73,8 @@ int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable); struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr, phy_interface_t interface); +struct dw_xpcs *xpcs_create_fwnode(struct fwnode_handle *fwnode, + phy_interface_t interface); void xpcs_destroy(struct dw_xpcs *xpcs); #endif /* __LINUX_PCS_XPCS_H */ -- cgit v1.2.3 From 351066bad6ade5ad0f5f90fde2dc759759959969 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Mon, 1 Jul 2024 21:28:40 +0300 Subject: net: stmmac: Create DW XPCS device with particular address Currently the only STMMAC platform driver using the DW XPCS code is the Intel mGBE device driver. (It can be determined by finding all the drivers having the stmmac_mdio_bus_data::has_xpcs flag set.) At the same time the low-level platform driver masks out the DW XPCS MDIO-address from being auto-detected as PHY by the MDIO subsystem core. Seeing the PCS MDIO ID is known the procedure of the DW XPCS device creation can be simplified by dropping the loop over all the MDIO IDs. From now the DW XPCS device descriptor will be created for the MDIO-bus address pre-defined by the platform drivers via the stmmac_mdio_bus_data::pcs_mask field. Note besides this shall speed up a bit the Intel mGBE probing. Signed-off-by: Serge Semin Signed-off-by: David S. Miller --- include/linux/stmmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 9c54f82901a1..84e13bd5df28 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -82,7 +82,7 @@ struct stmmac_priv; struct stmmac_mdio_bus_data { unsigned int phy_mask; - unsigned int has_xpcs; + unsigned int pcs_mask; unsigned int default_an_inband; int *irqs; int probed_phy_irq; -- cgit v1.2.3 From e46296002113b4556baffd5dc68c4f9e22dae13a Mon Sep 17 00:00:00 2001 From: "Kory Maincent (Dent Project)" Date: Thu, 4 Jul 2024 10:11:56 +0200 Subject: net: ethtool: pse-pd: Expand C33 PSE status with class, power and extended state This update expands the status information provided by ethtool for PSE c33. It includes details such as the detected class, current power delivered, and extended state information. Reviewed-by: Oleksij Rempel Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20240704-feature_poe_power_cap-v6-1-320003204264@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 15 +++++++++++++++ include/linux/pse-pd/pse.h | 8 ++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 3a99238ef895..3e70f5d9e0bb 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1273,4 +1273,19 @@ struct ethtool_forced_speed_map { void ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size); + +/* C33 PSE extended state and substate. */ +struct ethtool_c33_pse_ext_state_info { + enum ethtool_c33_pse_ext_state c33_pse_ext_state; + union { + enum ethtool_c33_pse_ext_substate_error_condition error_condition; + enum ethtool_c33_pse_ext_substate_mr_pse_enable mr_pse_enable; + enum ethtool_c33_pse_ext_substate_option_detect_ted option_detect_ted; + enum ethtool_c33_pse_ext_substate_option_vport_lim option_vport_lim; + enum ethtool_c33_pse_ext_substate_ovld_detected ovld_detected; + enum ethtool_c33_pse_ext_substate_power_not_available power_not_available; + enum ethtool_c33_pse_ext_substate_short_detected short_detected; + u32 __c33_pse_ext_substate; + }; +}; #endif /* _LINUX_ETHTOOL_H */ diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 6eec24ffa866..38b9308e5e7a 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -36,12 +36,20 @@ struct pse_control_config { * functions. IEEE 802.3-2022 30.9.1.1.2 aPSEAdminState * @c33_pw_status: power detection status of the PSE. * IEEE 802.3-2022 30.9.1.1.5 aPSEPowerDetectionStatus: + * @c33_pw_class: detected class of a powered PD + * IEEE 802.3-2022 30.9.1.1.8 aPSEPowerClassification + * @c33_actual_pw: power currently delivered by the PSE in mW + * IEEE 802.3-2022 30.9.1.1.23 aPSEActualPower + * @c33_ext_state_info: extended state information of the PSE */ struct pse_control_status { enum ethtool_podl_pse_admin_state podl_admin_state; enum ethtool_podl_pse_pw_d_status podl_pw_status; enum ethtool_c33_pse_admin_state c33_admin_state; enum ethtool_c33_pse_pw_d_status c33_pw_status; + u32 c33_pw_class; + u32 c33_actual_pw; + struct ethtool_c33_pse_ext_state_info c33_ext_state_info; }; /** -- cgit v1.2.3 From 4a83abcef5f4f36be4e04ba18edd64226f6f4a19 Mon Sep 17 00:00:00 2001 From: "Kory Maincent (Dent Project)" Date: Thu, 4 Jul 2024 10:11:59 +0200 Subject: net: pse-pd: Add new power limit get and set c33 features This patch add a way to get and set the power limit of a PSE PI. For that it uses regulator API callbacks wrapper like get_voltage() and get/set_current_limit() as power is simply V * I. We used mW unit as defined by the IEEE 802.3-2022 standards. set_current_limit() uses the voltage return by get_voltage() and the desired power limit to calculate the current limit. get_voltage() callback is then mandatory to set the power limit. get_current_limit() callback is by default looking at a driver callback and fallback to extracting the current limit from _pse_ethtool_get_status() if the driver does not set its callback. We prefer let the user the choice because ethtool_get_status return much more information than the current limit. expand pse status with c33_pw_limit_ranges to return the ranges available to configure the power limit. Reviewed-by: Sai Krishna Acked-by: Oleksij Rempel Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20240704-feature_poe_power_cap-v6-4-320003204264@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 38b9308e5e7a..591a53e082e6 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -9,6 +9,9 @@ #include #include +/* Maximum current in uA according to IEEE 802.3-2022 Table 145-1 */ +#define MAX_PI_CURRENT 1920000 + struct phy_device; struct pse_controller_dev; @@ -41,6 +44,12 @@ struct pse_control_config { * @c33_actual_pw: power currently delivered by the PSE in mW * IEEE 802.3-2022 30.9.1.1.23 aPSEActualPower * @c33_ext_state_info: extended state information of the PSE + * @c33_avail_pw_limit: available power limit of the PSE in mW + * IEEE 802.3-2022 145.2.5.4 pse_avail_pwr + * @c33_pw_limit_ranges: supported power limit configuration range. The driver + * is in charge of the memory allocation. + * @c33_pw_limit_nb_ranges: number of supported power limit configuration + * ranges */ struct pse_control_status { enum ethtool_podl_pse_admin_state podl_admin_state; @@ -50,6 +59,9 @@ struct pse_control_status { u32 c33_pw_class; u32 c33_actual_pw; struct ethtool_c33_pse_ext_state_info c33_ext_state_info; + u32 c33_avail_pw_limit; + struct ethtool_c33_pse_pw_limit_range *c33_pw_limit_ranges; + u32 c33_pw_limit_nb_ranges; }; /** @@ -61,6 +73,14 @@ struct pse_control_status { * May also return negative errno. * @pi_enable: Configure the PSE PI as enabled. * @pi_disable: Configure the PSE PI as disabled. + * @pi_get_voltage: Return voltage similarly to get_voltage regulator + * callback. + * @pi_get_current_limit: Get the configured current limit similarly to + * get_current_limit regulator callback. + * @pi_set_current_limit: Configure the current limit similarly to + * set_current_limit regulator callback. + * Should not return an error in case of MAX_PI_CURRENT + * current value set. */ struct pse_controller_ops { int (*ethtool_get_status)(struct pse_controller_dev *pcdev, @@ -70,6 +90,11 @@ struct pse_controller_ops { int (*pi_is_enabled)(struct pse_controller_dev *pcdev, int id); int (*pi_enable)(struct pse_controller_dev *pcdev, int id); int (*pi_disable)(struct pse_controller_dev *pcdev, int id); + int (*pi_get_voltage)(struct pse_controller_dev *pcdev, int id); + int (*pi_get_current_limit)(struct pse_controller_dev *pcdev, + int id); + int (*pi_set_current_limit)(struct pse_controller_dev *pcdev, + int id, int max_uA); }; struct module; @@ -156,6 +181,11 @@ int pse_ethtool_get_status(struct pse_control *psec, int pse_ethtool_set_config(struct pse_control *psec, struct netlink_ext_ack *extack, const struct pse_control_config *config); +int pse_ethtool_set_pw_limit(struct pse_control *psec, + struct netlink_ext_ack *extack, + const unsigned int pw_limit); +int pse_ethtool_get_pw_limit(struct pse_control *psec, + struct netlink_ext_ack *extack); bool pse_has_podl(struct pse_control *psec); bool pse_has_c33(struct pse_control *psec); @@ -185,6 +215,19 @@ static inline int pse_ethtool_set_config(struct pse_control *psec, return -EOPNOTSUPP; } +static inline int pse_ethtool_set_pw_limit(struct pse_control *psec, + struct netlink_ext_ack *extack, + const unsigned int pw_limit) +{ + return -EOPNOTSUPP; +} + +static inline int pse_ethtool_get_pw_limit(struct pse_control *psec, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + static inline bool pse_has_podl(struct pse_control *psec) { return false; -- cgit v1.2.3 From 30d7b6727724ce3729f2cb5b8be985d2d1931d2b Mon Sep 17 00:00:00 2001 From: "Kory Maincent (Dent Project)" Date: Thu, 4 Jul 2024 10:12:00 +0200 Subject: net: ethtool: Add new power limit get and set features This patch expands the status information provided by ethtool for PSE c33 with available power limit and available power limit ranges. It also adds a call to pse_ethtool_set_pw_limit() to configure the PSE control power limit. Reviewed-by: Oleksij Rempel Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20240704-feature_poe_power_cap-v6-5-320003204264@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 3e70f5d9e0bb..e213b5508da6 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1288,4 +1288,9 @@ struct ethtool_c33_pse_ext_state_info { u32 __c33_pse_ext_substate; }; }; + +struct ethtool_c33_pse_pw_limit_range { + u32 min; + u32 max; +}; #endif /* _LINUX_ETHTOOL_H */ -- cgit v1.2.3 From 4bf1ea3fc914faef37d8c793b7144d4765ff4a00 Mon Sep 17 00:00:00 2001 From: Devin Bayer Date: Fri, 28 Jun 2024 10:46:03 +0200 Subject: platform/x86: asus-wmi: support the disable camera LED on F10 of Zenbook 2023 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a sysfs entry for the LED on F10 above the crossed out camera icon on 2023 Zenbooks. Signed-off-by: Devin Bayer Link: https://lore.kernel.org/r/20240628084603.217106-1-dev@doubly.so Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/asus-wmi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 3eb5cd6773ad..0aeeae1c1943 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -51,6 +51,10 @@ #define ASUS_WMI_DEVID_LED6 0x00020016 #define ASUS_WMI_DEVID_MICMUTE_LED 0x00040017 +/* Disable Camera LED */ +#define ASUS_WMI_DEVID_CAMERA_LED_NEG 0x00060078 /* 0 = on (unused) */ +#define ASUS_WMI_DEVID_CAMERA_LED 0x00060079 /* 1 = on */ + /* Backlight and Brightness */ #define ASUS_WMI_DEVID_ALS_ENABLE 0x00050001 /* Ambient Light Sensor */ #define ASUS_WMI_DEVID_BACKLIGHT 0x00050011 -- cgit v1.2.3 From 28bdacbcb36d093e23734acccecd139f5fc05f67 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 26 Jun 2024 16:53:23 +0800 Subject: mm: move memory_failure_queue() into copy_mc_[user]_highpage() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: migrate: support poison recover from migrate folio", v5. The folio migration is widely used in kernel, memory compaction, memory hotplug, soft offline page, numa balance, memory demote/promotion, etc, but once access a poisoned source folio when migrating, the kernel will panic. There is a mechanism in the kernel to recover from uncorrectable memory errors, ARCH_HAS_COPY_MC(eg, Machine Check Safe Memory Copy on x86), which is already used in NVDIMM or core-mm paths(eg, CoW, khugepaged, coredump, ksm copy), see copy_mc_to_{user,kernel}, copy_mc_{user_}highpage callers. This series of patches provide the recovery mechanism from folio copy for the widely used folio migration. Please note, because folio migration is no guarantee of success, so we could chose to make folio migration tolerant of memory failures, adding folio_mc_copy() which is a #MC versions of folio_copy(), once accessing a poisoned source folio, we could return error and make the folio migration fail, and this could avoid the similar panic shown below. CPU: 1 PID: 88343 Comm: test_softofflin Kdump: loaded Not tainted 6.6.0 pc : copy_page+0x10/0xc0 lr : copy_highpage+0x38/0x50 ... Call trace: copy_page+0x10/0xc0 folio_copy+0x78/0x90 migrate_folio_extra+0x54/0xa0 move_to_new_folio+0xd8/0x1f0 migrate_folio_move+0xb8/0x300 migrate_pages_batch+0x528/0x788 migrate_pages_sync+0x8c/0x258 migrate_pages+0x440/0x528 soft_offline_in_use_page+0x2ec/0x3c0 soft_offline_page+0x238/0x310 soft_offline_page_store+0x6c/0xc0 dev_attr_store+0x20/0x40 sysfs_kf_write+0x4c/0x68 kernfs_fop_write_iter+0x130/0x1c8 new_sync_write+0xa4/0x138 vfs_write+0x238/0x2d8 ksys_write+0x74/0x110 This patch (of 5): There is a memory_failure_queue() call after copy_mc_[user]_highpage(), see callers, eg, CoW/KSM page copy, it is used to mark the source page as h/w poisoned and unmap it from other tasks, and the upcomming poison recover from migrate folio will do the similar thing, so let's move the memory_failure_queue() into the copy_mc_[user]_highpage() instead of adding it into each user, this should also enhance the handling of poisoned page in khugepaged. Link: https://lkml.kernel.org/r/20240626085328.608006-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240626085328.608006-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Reviewed-by: Miaohe Lin Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Tony Luck Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/highmem.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index fa6891e06316..930a591b9b61 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -352,6 +352,9 @@ static inline int copy_mc_user_highpage(struct page *to, struct page *from, kunmap_local(vto); kunmap_local(vfrom); + if (ret) + memory_failure_queue(page_to_pfn(from), 0); + return ret; } @@ -368,6 +371,9 @@ static inline int copy_mc_highpage(struct page *to, struct page *from) kunmap_local(vto); kunmap_local(vfrom); + if (ret) + memory_failure_queue(page_to_pfn(from), 0); + return ret; } #else -- cgit v1.2.3 From 02f4ee5a144cef6b26421cb42cca64bb4138d459 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 26 Jun 2024 16:53:24 +0800 Subject: mm: add folio_mc_copy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a #MC variant of folio_copy() which uses copy_mc_highpage() to support #MC handled during folio copy, it will be used in folio migration soon. Link: https://lkml.kernel.org/r/20240626085328.608006-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Reviewed-by: Miaohe Lin Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Tony Luck Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e2140ea6ae98..a2e3ebead410 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1291,6 +1291,7 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); +int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -- cgit v1.2.3 From 3f59493713534b61abcbe2e57582cf7e31a42d51 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 26 Jun 2024 16:53:28 +0800 Subject: mm: migrate: remove folio_migrate_copy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The folio_migrate_copy() is just a wrapper of folio_copy() and folio_migrate_flags(), it is simple and only aio use it for now, unfold it and remove folio_migrate_copy(). Link: https://lkml.kernel.org/r/20240626085328.608006-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Tony Luck Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/migrate.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index af2579ae93f2..644be30b69c8 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -76,7 +76,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); -void folio_migrate_copy(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count); -- cgit v1.2.3 From 25f76c3db2f08428b5acd082a52787164001eb6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 6 Jul 2024 09:52:17 +0200 Subject: block: add a bvec_phys helper Get callers out of poking into bvec internals a bit more. Not a huge win right now, but with the proposed new DMA mapping API we might end up with a lot more of this otherwise. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240706075228.2350978-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index bd1e361b351c..f41c7f0ef91e 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -280,4 +280,18 @@ static inline void *bvec_virt(struct bio_vec *bvec) return page_address(bvec->bv_page) + bvec->bv_offset; } +/** + * bvec_phys - return the physical address for a bvec + * @bvec: bvec to return the physical address for + */ +static inline phys_addr_t bvec_phys(const struct bio_vec *bvec) +{ + /* + * Note this open codes page_to_phys because page_to_phys is defined in + * , which we don't want to pull in here. If it ever moves to + * a sensible place we should start using it. + */ + return PFN_PHYS(page_to_pfn(bvec->bv_page)) + bvec->bv_offset; +} + #endif /* __LINUX_BVEC_H */ -- cgit v1.2.3 From cd6193877c603f4b0c3c7e5607ffa3d52815403f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 1 Jul 2024 09:35:33 +0200 Subject: x86/efistub: Enable SMBIOS protocol handling for x86 The smbios.c source file is not currently included in the x86 build, and before we can do so, it needs some tweaks to build correctly in combination with the EFI mixed mode support. Reviewed-by: Lukas Wunner Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 418e555459da..2a539816a436 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -74,10 +74,10 @@ typedef void *efi_handle_t; */ typedef guid_t efi_guid_t __aligned(__alignof__(u32)); -#define EFI_GUID(a, b, c, d...) (efi_guid_t){ { \ +#define EFI_GUID(a, b, c, d...) ((efi_guid_t){ { \ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ (b) & 0xff, ((b) >> 8) & 0xff, \ - (c) & 0xff, ((c) >> 8) & 0xff, d } } + (c) & 0xff, ((c) >> 8) & 0xff, d } }) /* * Generic EFI table header -- cgit v1.2.3 From 71e49eccdca6328eecc335ed8f5557bd0ed70fc6 Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Sun, 30 Jun 2024 19:24:54 +0000 Subject: x86/efistub: Call Apple set_os protocol on dual GPU Intel Macs 0c18184de990 ("platform/x86: apple-gmux: support MMIO gmux on T2 Macs") brought support for T2 Macs in apple-gmux. But in order to use dual GPU, the integrated GPU has to be enabled. On such dual GPU EFI Macs, the EFI stub needs to report that it is booting macOS in order to prevent the firmware from disabling the iGPU. This patch is also applicable for some non T2 Intel Macs. Based on this patch for GRUB by Andreas Heider : https://lists.gnu.org/archive/html/grub-devel/2013-12/msg00442.html Credits also goto Kerem Karabay for helping porting the patch to the Linux kernel. Cc: Orlando Chamberlain Signed-off-by: Aditya Garg [ardb: limit scope using list of DMI matches provided by Lukas and Orlando] Reviewed-by: Lukas Wunner Tested-by: Aditya Garg Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 2a539816a436..3a6c04a9f9aa 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -385,6 +385,7 @@ void efi_native_runtime_setup(void); #define EFI_MEMORY_ATTRIBUTES_TABLE_GUID EFI_GUID(0xdcfa911d, 0x26eb, 0x469f, 0xa2, 0x20, 0x38, 0xb7, 0xdc, 0x46, 0x12, 0x20) #define EFI_CONSOLE_OUT_DEVICE_GUID EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) #define APPLE_PROPERTIES_PROTOCOL_GUID EFI_GUID(0x91bd12fe, 0xf6c3, 0x44fb, 0xa5, 0xb7, 0x51, 0x22, 0xab, 0x30, 0x3a, 0xe0) +#define APPLE_SET_OS_PROTOCOL_GUID EFI_GUID(0xc5c5da95, 0x7d5c, 0x45e6, 0xb2, 0xf1, 0x3f, 0xd5, 0x2b, 0xb1, 0x00, 0x77) #define EFI_TCG2_PROTOCOL_GUID EFI_GUID(0x607f766c, 0x7455, 0x42be, 0x93, 0x0b, 0xe4, 0xd7, 0x6d, 0xb2, 0x72, 0x0f) #define EFI_TCG2_FINAL_EVENTS_TABLE_GUID EFI_GUID(0x1e2ed096, 0x30e2, 0x4254, 0xbd, 0x89, 0x86, 0x3b, 0xbe, 0xf8, 0x23, 0x25) #define EFI_LOAD_FILE_PROTOCOL_GUID EFI_GUID(0x56ec3091, 0x954c, 0x11d2, 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) -- cgit v1.2.3 From f21711bbdbf0d95a389bfaad54ce444b46830d58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 6 Jul 2024 13:13:42 +0200 Subject: regmap-irq: handle const struct regmap_irq_sub_irq_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct instances supplied by the drivers are never modified. Handle them as const in the regmap core allowing the drivers to put them into .rodata. Also add a new entry to const_structs.checkpatch to make sure future instances of this struct already enter the tree as const. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20240706-regmap-const-structs-v1-2-d08c776da787@weissschuh.net Signed-off-by: Mark Brown --- include/linux/regmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index a6bc2980a98b..2da1cfc52233 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1607,7 +1607,7 @@ struct regmap_irq_chip { unsigned int main_status; unsigned int num_main_status_bits; - struct regmap_irq_sub_irq_map *sub_reg_offsets; + const struct regmap_irq_sub_irq_map *sub_reg_offsets; int num_main_regs; unsigned int status_base; -- cgit v1.2.3 From f45b94ffc5f1204b35b5c695ed265b1385951616 Mon Sep 17 00:00:00 2001 From: Varadarajan Narayanan Date: Tue, 30 Apr 2024 12:12:09 +0530 Subject: interconnect: icc-clk: Specify master/slave ids Presently, icc-clk driver autogenerates the master and slave ids. However, devices with multiple nodes on the interconnect could have other constraints and may not match with the auto generated node ids. Hence, modify the driver to use the master/slave ids provided by the caller instead of auto generating. Also, update clk-cbf-8996 accordingly. Acked-by: Georgi Djakov Signed-off-by: Varadarajan Narayanan Link: https://lore.kernel.org/r/20240430064214.2030013-2-quic_varada@quicinc.com Signed-off-by: Bjorn Andersson --- include/linux/interconnect-clk.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interconnect-clk.h b/include/linux/interconnect-clk.h index 0cd80112bea5..170898faaacb 100644 --- a/include/linux/interconnect-clk.h +++ b/include/linux/interconnect-clk.h @@ -11,6 +11,8 @@ struct device; struct icc_clk_data { struct clk *clk; const char *name; + unsigned int master_id; + unsigned int slave_id; }; struct icc_provider *icc_clk_register(struct device *dev, -- cgit v1.2.3 From d3153113619216e87038a20bebf82582f9be10e7 Mon Sep 17 00:00:00 2001 From: Varadarajan Narayanan Date: Tue, 30 Apr 2024 12:12:11 +0530 Subject: interconnect: icc-clk: Add devm_icc_clk_register Wrap icc_clk_register to create devm_icc_clk_register to be able to release the resources properly. Acked-by: Georgi Djakov Reviewed-by: Dmitry Baryshkov Signed-off-by: Varadarajan Narayanan Link: https://lore.kernel.org/r/20240430064214.2030013-4-quic_varada@quicinc.com Signed-off-by: Bjorn Andersson --- include/linux/interconnect-clk.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interconnect-clk.h b/include/linux/interconnect-clk.h index 170898faaacb..9bcee3e9c56c 100644 --- a/include/linux/interconnect-clk.h +++ b/include/linux/interconnect-clk.h @@ -19,6 +19,8 @@ struct icc_provider *icc_clk_register(struct device *dev, unsigned int first_id, unsigned int num_clocks, const struct icc_clk_data *data); +int devm_icc_clk_register(struct device *dev, unsigned int first_id, + unsigned int num_clocks, const struct icc_clk_data *data); void icc_clk_unregister(struct icc_provider *provider); #endif -- cgit v1.2.3 From 7e86845a0346efc95fddaa97ce5cd6a8bda8c71c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 4 Jun 2024 15:45:24 -0400 Subject: rpcrdma: Implement generic device removal Commit e87a911fed07 ("nvme-rdma: use ib_client API to detect device removal") explains the benefits of handling device removal outside of the CM event handler. Sketch in an IB device removal notification mechanism that can be used by both the client and server side RPC-over-RDMA transport implementations. Suggested-by: Sagi Grimberg Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- include/linux/sunrpc/rdma_rn.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 include/linux/sunrpc/rdma_rn.h (limited to 'include/linux') diff --git a/include/linux/sunrpc/rdma_rn.h b/include/linux/sunrpc/rdma_rn.h new file mode 100644 index 000000000000..7d032ca057af --- /dev/null +++ b/include/linux/sunrpc/rdma_rn.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * * Copyright (c) 2024, Oracle and/or its affiliates. + */ + +#ifndef _LINUX_SUNRPC_RDMA_RN_H +#define _LINUX_SUNRPC_RDMA_RN_H + +#include + +/** + * rpcrdma_notification - request removal notification + */ +struct rpcrdma_notification { + void (*rn_done)(struct rpcrdma_notification *rn); + u32 rn_index; +}; + +int rpcrdma_rn_register(struct ib_device *device, + struct rpcrdma_notification *rn, + void (*done)(struct rpcrdma_notification *rn)); +void rpcrdma_rn_unregister(struct ib_device *device, + struct rpcrdma_notification *rn); +int rpcrdma_ib_client_register(void); +void rpcrdma_ib_client_unregister(void); + +#endif /* _LINUX_SUNRPC_RDMA_RN_H */ -- cgit v1.2.3 From 820620516993c151c88e9b59edc79ed12d1c31cd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 16 Jun 2024 21:21:19 -0400 Subject: NFSv4: Clean up open delegation return structure Instead of having the fields open coded in the struct nfs_openres, add a separate structure for them so that we can reuse that code for the WANT_DELEGATION case. Signed-off-by: Trond Myklebust Signed-off-by: Lance Shelton Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d09b9773b20c..682559e19d9d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -449,6 +449,22 @@ struct stateowner_id { __u32 uniquifier; }; +struct nfs4_open_delegation { + __u32 open_delegation_type; + union { + struct { + fmode_t type; + __u32 do_recall; + nfs4_stateid stateid; + unsigned long pagemod_limit; + }; + struct { + __u32 why_no_delegation; + __u32 will_notify; + }; + }; +}; + /* * Arguments to the open call. */ @@ -490,13 +506,10 @@ struct nfs_openres { struct nfs_fattr * f_attr; struct nfs_seqid * seqid; const struct nfs_server *server; - fmode_t delegation_type; - nfs4_stateid delegation; - unsigned long pagemod_limit; - __u32 do_recall; __u32 attrset[NFS4_BITMAP_SIZE]; struct nfs4_string *owner; struct nfs4_string *group_owner; + struct nfs4_open_delegation delegation; __u32 access_request; __u32 access_supported; __u32 access_result; -- cgit v1.2.3 From 6a68aed602d7b770552c7f890d0c30c53725c7b8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 16 Jun 2024 21:21:21 -0400 Subject: NFSv4: Add new attribute delegation definitions Add the attribute delegation XDR definitions from the spec. Signed-off-by: Tom Haynes Signed-off-by: Trond Myklebust Signed-off-by: Lance Shelton Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs4.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 0d896ce296ce..c074e0ac390f 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -367,6 +367,8 @@ enum open_delegation_type4 { NFS4_OPEN_DELEGATE_READ = 1, NFS4_OPEN_DELEGATE_WRITE = 2, NFS4_OPEN_DELEGATE_NONE_EXT = 3, /* 4.1 */ + NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG = 4, + NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG = 5, }; enum why_no_delegation4 { /* new to v4.1 */ @@ -507,6 +509,11 @@ enum { FATTR4_XATTR_SUPPORT = 82, }; +enum { + FATTR4_TIME_DELEG_ACCESS = 84, + FATTR4_TIME_DELEG_MODIFY = 85, +}; + /* * The following internal definitions enable processing the above * attribute bits within 32-bit word boundaries. @@ -586,6 +593,8 @@ enum { #define FATTR4_WORD2_SECURITY_LABEL BIT(FATTR4_SEC_LABEL - 64) #define FATTR4_WORD2_MODE_UMASK BIT(FATTR4_MODE_UMASK - 64) #define FATTR4_WORD2_XATTR_SUPPORT BIT(FATTR4_XATTR_SUPPORT - 64) +#define FATTR4_WORD2_TIME_DELEG_ACCESS BIT(FATTR4_TIME_DELEG_ACCESS - 64) +#define FATTR4_WORD2_TIME_DELEG_MODIFY BIT(FATTR4_TIME_DELEG_MODIFY - 64) /* MDS threshold bitmap bits */ #define THRESHOLD_RD (1UL << 0) -- cgit v1.2.3 From 90f9ae74422d7e7447cb0ea21e1227142b58c52a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 16 Jun 2024 21:21:22 -0400 Subject: NFSv4: Plumb in XDR support for the new delegation-only setattr op We want to send the updated atime and mtime as part of the delegreturn compound. Add a special structure to hold those variables. Signed-off-by: Trond Myklebust Signed-off-by: Lance Shelton Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 682559e19d9d..f40be64ce942 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -622,6 +622,13 @@ struct nfs_release_lockowner_res { struct nfs4_sequence_res seq_res; }; +struct nfs4_delegattr { + struct timespec64 atime; + struct timespec64 mtime; + bool atime_set; + bool mtime_set; +}; + struct nfs4_delegreturnargs { struct nfs4_sequence_args seq_args; const struct nfs_fh *fhandle; @@ -629,6 +636,7 @@ struct nfs4_delegreturnargs { const u32 *bitmask; u32 bitmask_store[NFS_BITMASK_SZ]; struct nfs4_layoutreturn_args *lr_args; + struct nfs4_delegattr *sattr_args; }; struct nfs4_delegreturnres { @@ -637,6 +645,8 @@ struct nfs4_delegreturnres { struct nfs_server *server; struct nfs4_layoutreturn_res *lr_res; int lr_ret; + bool sattr_res; + int sattr_ret; }; /* -- cgit v1.2.3 From 4201916f2ab13577d45876f4bc784be55e4a83da Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 16 Jun 2024 21:21:24 -0400 Subject: NFSv4: Add a flags argument to the 'have_delegation' callback This argument will be used to allow the caller to specify whether or not they need to know that this is an attribute delegation. Signed-off-by: Trond Myklebust Signed-off-by: Lance Shelton Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index f40be64ce942..51611583af51 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1830,7 +1830,7 @@ struct nfs_rpc_ops { int open_flags, struct iattr *iattr, int *); - int (*have_delegation)(struct inode *, fmode_t); + int (*have_delegation)(struct inode *, fmode_t, int); struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); struct nfs_client *(*init_client) (struct nfs_client *, const struct nfs_client_initdata *); -- cgit v1.2.3 From 86e1c54d152e2799671e149d6e4d1c1a4a2be857 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 16 Jun 2024 21:21:26 -0400 Subject: NFSv4: Add recovery of attribute delegations After a reboot of the NFSv4.2 server, the recovery code needs to specify whether the delegation to be recovered is an attribute delegation or not. Signed-off-by: Trond Myklebust Signed-off-by: Lance Shelton Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 51611583af51..d8cfa956d24c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -484,7 +484,7 @@ struct nfs_openargs { nfs4_verifier verifier; /* EXCLUSIVE */ }; nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ - fmode_t delegation_type; /* CLAIM_PREVIOUS */ + __u32 delegation_type; /* CLAIM_PREVIOUS */ } u; const struct qstr * name; const struct nfs_server *server; /* Needed for ID mapping */ -- cgit v1.2.3 From dcb3c20f741993efbd95be78aab93fac29516323 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 16 Jun 2024 21:21:27 -0400 Subject: NFSv4: Add a capability for delegated attributes Cache whether or not the server may have support for delegated attributes in a capability flag. Signed-off-by: Trond Myklebust Signed-off-by: Lance Shelton Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 92de074e63b9..5a76a87cd924 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -278,6 +278,7 @@ struct nfs_server { #define NFS_CAP_LGOPEN (1U << 5) #define NFS_CAP_CASE_INSENSITIVE (1U << 6) #define NFS_CAP_CASE_PRESERVING (1U << 7) +#define NFS_CAP_DELEGTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) #define NFS_CAP_UIDGID_NOMAP (1U << 15) #define NFS_CAP_STATEID_NFSV41 (1U << 16) -- cgit v1.2.3 From 707f13b3d081ea811c40014e7b61f2c487ee9a4f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 16 Jun 2024 21:21:32 -0400 Subject: NFSv4: Add support for the FATTR4_OPEN_ARGUMENTS attribute Query the server for the OPEN arguments that it supports so that we can figure out which extensions we can use. Signed-off-by: Trond Myklebust Signed-off-by: Lance Shelton Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs4.h | 2 ++ include/linux/nfs_xdr.h | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index c074e0ac390f..f9df88091c6d 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -512,6 +512,7 @@ enum { enum { FATTR4_TIME_DELEG_ACCESS = 84, FATTR4_TIME_DELEG_MODIFY = 85, + FATTR4_OPEN_ARGUMENTS = 86, }; /* @@ -595,6 +596,7 @@ enum { #define FATTR4_WORD2_XATTR_SUPPORT BIT(FATTR4_XATTR_SUPPORT - 64) #define FATTR4_WORD2_TIME_DELEG_ACCESS BIT(FATTR4_TIME_DELEG_ACCESS - 64) #define FATTR4_WORD2_TIME_DELEG_MODIFY BIT(FATTR4_TIME_DELEG_MODIFY - 64) +#define FATTR4_WORD2_OPEN_ARGUMENTS BIT(FATTR4_OPEN_ARGUMENTS - 64) /* MDS threshold bitmap bits */ #define THRESHOLD_RD (1UL << 0) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d8cfa956d24c..af510a7ec46a 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1213,6 +1213,14 @@ struct nfs4_statfs_res { struct nfs_fsstat *fsstat; }; +struct nfs4_open_caps { + u32 oa_share_access[1]; + u32 oa_share_deny[1]; + u32 oa_share_access_want[1]; + u32 oa_open_claim[1]; + u32 oa_createmode[1]; +}; + struct nfs4_server_caps_arg { struct nfs4_sequence_args seq_args; struct nfs_fh *fhandle; @@ -1229,6 +1237,7 @@ struct nfs4_server_caps_res { u32 fh_expire_type; u32 case_insensitive; u32 case_preserving; + struct nfs4_open_caps open_caps; }; #define NFS4_PATHNAME_MAXCOMPONENTS 512 -- cgit v1.2.3 From d2a00cceb93a4df2afaceb836297628d0aeec7ad Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 16 Jun 2024 21:21:33 -0400 Subject: NFSv4: Detect support for OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION If the server supports the NFSv4.2 protocol extension to optimise away returning a stateid when it returns a delegation, then we cache that information in another capability flag. Signed-off-by: Trond Myklebust Signed-off-by: Lance Shelton Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 5a76a87cd924..fe5b1a8bd723 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -278,6 +278,7 @@ struct nfs_server { #define NFS_CAP_LGOPEN (1U << 5) #define NFS_CAP_CASE_INSENSITIVE (1U << 6) #define NFS_CAP_CASE_PRESERVING (1U << 7) +#define NFS_CAP_OPEN_XOR (1U << 12) #define NFS_CAP_DELEGTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) #define NFS_CAP_UIDGID_NOMAP (1U << 15) -- cgit v1.2.3 From adb4b42d19aea91826621a8d0bac94cf2c08f8bc Mon Sep 17 00:00:00 2001 From: Lance Shelton Date: Sun, 16 Jun 2024 21:21:36 -0400 Subject: Return the delegation when deleting sillyrenamed files Add a callback to return the delegation in order to allow generic NFS code to return the delegation when appropriate. Signed-off-by: Trond Myklebust Signed-off-by: Lance Shelton Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index af510a7ec46a..01efacae4634 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1840,6 +1840,7 @@ struct nfs_rpc_ops { struct iattr *iattr, int *); int (*have_delegation)(struct inode *, fmode_t, int); + int (*return_delegation)(struct inode *); struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); struct nfs_client *(*init_client) (struct nfs_client *, const struct nfs_client_initdata *); -- cgit v1.2.3 From 924cf3c91fe29c7ebd1b9d56e10f513c1bd7d4bd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 13 Jun 2024 01:00:46 -0400 Subject: NFSv4.1: constify the stateid argument in nfs41_test_stateid() Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 01efacae4634..45623af3e7b8 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1438,7 +1438,7 @@ struct nfs41_secinfo_no_name_args { struct nfs41_test_stateid_args { struct nfs4_sequence_args seq_args; - nfs4_stateid *stateid; + nfs4_stateid stateid; }; struct nfs41_test_stateid_res { -- cgit v1.2.3 From 5468fc8298a97ca504aca8ac0c7b9e6fd7c1b588 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 13 Jun 2024 01:00:55 -0400 Subject: NFSv4/pNFS: Do layout state recovery upon reboot Some pNFS implementations, such as flexible files, want the client to send the layout stats and layout errors that may have incurred while the metadata server was booting. To do so, the client sends a layoutreturn with an all-zero stateid while the server is in grace during reboot recovery. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index fe5b1a8bd723..ba9df1848b35 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -278,6 +278,7 @@ struct nfs_server { #define NFS_CAP_LGOPEN (1U << 5) #define NFS_CAP_CASE_INSENSITIVE (1U << 6) #define NFS_CAP_CASE_PRESERVING (1U << 7) +#define NFS_CAP_REBOOT_LAYOUTRETURN (1U << 8) #define NFS_CAP_OPEN_XOR (1U << 12) #define NFS_CAP_DELEGTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) -- cgit v1.2.3 From 2f1f31042ef07719a0d5cb4784b8a32d20c13110 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 1 Jul 2024 12:50:48 +0200 Subject: nfs: Block on write congestion Commit 6df25e58532b ("nfs: remove reliance on bdi congestion") introduced NFS-private solution for limiting number of writes outstanding against a particular server. Unlike previous bdi congestion this algorithm actually works and limits number of outstanding writeback pages to nfs_congestion_kb which scales with amount of client's memory and is capped at 256 MB. As a result some workloads such as random buffered writes over NFS got slower (from ~170 MB/s to ~126 MB/s). The fio command to reproduce is: fio --direct=0 --ioengine=sync --thread --invalidate=1 --group_reporting=1 --runtime=300 --fallocate=posix --ramp_time=10 --new_group --rw=randwrite --size=64256m --numjobs=4 --bs=4k --fsync_on_close=1 --end_fsync=1 This happens because the client sends ~256 MB worth of dirty pages to the server and any further background writeback request is ignored until the number of writeback pages gets below the threshold of 192 MB. By the time this happens and clients decides to trigger another round of writeback, the server often has no pages to write and the disk is idle. To fix this problem and make the client react faster to eased congestion of the server by blocking waiting for congestion to resolve instead of aborting writeback. This improves the random 4k buffered write throughput to 184 MB/s. Reviewed-by: Sagi Grimberg Reviewed-by: Jeff Layton Signed-off-by: Jan Kara Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ba9df1848b35..1df86ab98c77 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -140,6 +140,7 @@ struct nfs_server { struct rpc_clnt * client_acl; /* ACL RPC client handle */ struct nlm_host *nlm_host; /* NLM client handle */ struct nfs_iostats __percpu *io_stats; /* I/O statistics */ + wait_queue_head_t write_congestion_wait; /* wait until write congestion eases */ atomic_long_t writeback; /* number of writeback pages */ unsigned int write_congested;/* flag set when writeback gets too high */ unsigned int flags; /* various flags */ -- cgit v1.2.3 From 7e8e78a0ba00c88f0ded86de64bdddc82e06b196 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 07:26:48 +0200 Subject: nfs: remove dead code for the old swap over NFS implementation Remove the code testing folio_test_swapcache either explicitly or implicitly in pagemap.h headers, as is now handled using the direct I/O path and not the buffered I/O path that these helpers are located in. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- include/linux/nfs_page.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 1c315f854ea8..7bc31df457ea 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -208,8 +208,8 @@ static inline struct inode *nfs_page_to_inode(const struct nfs_page *req) struct folio *folio = nfs_page_to_folio(req); if (folio == NULL) - return page_file_mapping(req->wb_page)->host; - return folio_file_mapping(folio)->host; + return req->wb_page->mapping->host; + return folio->mapping->host; } /** -- cgit v1.2.3 From 9eb7c484db1ae993648fc9b9d48a295f4d99afb8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 07:26:50 +0200 Subject: nfs: simplify nfs_folio_find_and_lock_request nfs_folio_find_and_lock_request and the nfs_page_group_lock_head helper called by it spend quite some effort to deal with head vs subrequests. But given that only the head request can be stashed in the folio private data, non of that is required. Fold the locking logic from nfs_page_group_lock_head into nfs_folio_find_and_lock_request and simplify the result based on the invariant that we always find the head request in the folio private data. Signed-off-by: Christoph Hellwig Signed-off-by: Anna Schumaker --- include/linux/nfs_page.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 7bc31df457ea..e799d93626f1 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -155,7 +155,6 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); -extern struct nfs_page *nfs_page_group_lock_head(struct nfs_page *req); extern int nfs_page_group_lock_subrequests(struct nfs_page *head); extern void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, -- cgit v1.2.3 From 25edbcac6e32eab345e470d56ca9974a577b878b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 07:26:52 +0200 Subject: nfs: fold nfs_page_group_lock_subrequests into nfs_lock_and_join_requests Fold nfs_page_group_lock_subrequests into nfs_lock_and_join_requests to prepare for future changes to this code, and move the helpers to write.c as well. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- include/linux/nfs_page.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index e799d93626f1..63eed97a18ad 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -155,7 +155,6 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); -extern int nfs_page_group_lock_subrequests(struct nfs_page *head); extern void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, struct inode *inode); -- cgit v1.2.3 From f1b7c7552cbcf89e56b15ff481f3d19b53046291 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 07:26:53 +0200 Subject: nfs: move nfs_wait_on_request to write.c nfs_wait_on_request is now only used in write.c. Move it there and mark it static. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- include/linux/nfs_page.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 63eed97a18ad..169b4ae30ff4 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -152,7 +152,6 @@ extern void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t); extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req); -extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); extern void nfs_join_page_group(struct nfs_page *head, -- cgit v1.2.3 From 8e0c8d23952f338180d19718195d4f9fd10a1809 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 13 Jun 2024 14:34:30 -0400 Subject: sunrpc: fix up the special handling of sv_nrpools == 1 Only pooled services take a reference to the svc_pool_map. The sunrpc code has always used the sv_nrpools value to detect whether the service is pooled. The problem there is that nfsd is a pooled service, but when it's running in "global" pool_mode, it doesn't take a reference to the pool map because it has a sv_nrpools value of 1. This means that we have two separate codepaths for starting the server, depending on whether it's pooled or not. Fix this by adding a new flag to the svc_serv, that indicates whether the serv is pooled. With this we can have the nfsd service unconditionally take a reference, regardless of pool_mode. Note that this is a behavior change for /sys/module/sunrpc/parameters/pool_mode. Usually this file does not allow you to change the pool-mode while there are nfsd threads running, but if the pool-mode is "global" it's allowed. My assumption is that this is a bug, since it probably should never have worked this way. This patch changes the behavior such that you get back EBUSY even when nfsd is running in global mode. I think this is more reasonable behavior, and given that most people set this today using the module parameter, it's doubtful anyone will notice. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 23617da0e565..d0433e1642b3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -85,6 +85,7 @@ struct svc_serv { char * sv_name; /* service name */ unsigned int sv_nrpools; /* number of thread pools */ + bool sv_is_pooled; /* is this a pooled service? */ struct svc_pool * sv_pools; /* array of thread pools */ int (*sv_threadfn)(void *data); -- cgit v1.2.3 From 5f71f3c325534da55dfda487a67208e2e0f0cd1b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 13 Jun 2024 14:34:33 -0400 Subject: sunrpc: refactor pool_mode setting code Allow the pool_mode setting code to be called from internal callers so we can call it from a new netlink op. Add a new svc_pool_map_get function to return the current setting. Change the existing module parameter handling to use the new interfaces under the hood. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index d0433e1642b3..a7d0406b9ef5 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -399,6 +399,8 @@ struct svc_procedure { /* * Function prototypes. */ +int sunrpc_set_pool_mode(const char *val); +int sunrpc_get_pool_mode(char *val, size_t size); int svc_rpcb_setup(struct svc_serv *serv, struct net *net); void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net); int svc_bind(struct svc_serv *serv, struct net *net); -- cgit v1.2.3 From 1f9a8286bc0c3df7d789ea625d9d9db3d7779f2d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 14:58:03 +0000 Subject: uaccess: always export _copy_[from|to]_user with CONFIG_RUST Rust code needs to be able to access _copy_from_user and _copy_to_user so that it can skip the check_copy_size check in cases where the length is known at compile-time, mirroring the logic for when C code will skip check_copy_size. To do this, we ensure that exported versions of these methods are available when CONFIG_RUST is enabled. Alice has verified that this patch passes the CONFIG_TEST_USER_COPY test on x86 using the Android cuttlefish emulator. Signed-off-by: Arnd Bergmann Tested-by: Alice Ryhl Reviewed-by: Boqun Feng Reviewed-by: Kees Cook Signed-off-by: Alice Ryhl Acked-by: Andrew Morton Link: https://lore.kernel.org/r/20240528-alice-mm-v7-2-78222c31b8f4@google.com Signed-off-by: Miguel Ojeda --- include/linux/uaccess.h | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 3064314f4832..d8e4105a2f21 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -138,13 +139,26 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) return raw_copy_to_user(to, from, n); } -#ifdef INLINE_COPY_FROM_USER +/* + * Architectures that #define INLINE_COPY_TO_USER use this function + * directly in the normal copy_to/from_user(), the other ones go + * through an extern _copy_to/from_user(), which expands the same code + * here. + * + * Rust code always uses the extern definition. + */ static inline __must_check unsigned long -_copy_from_user(void *to, const void __user *from, unsigned long n) +_inline_copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { + /* + * Ensure that bad access_ok() speculation will not + * lead to nasty side effects *after* the copy is + * finished: + */ + barrier_nospec(); instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); @@ -153,14 +167,11 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) memset(to + (n - res), 0, res); return res; } -#else extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); -#endif -#ifdef INLINE_COPY_TO_USER static inline __must_check unsigned long -_copy_to_user(void __user *to, const void *from, unsigned long n) +_inline_copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) @@ -171,25 +182,32 @@ _copy_to_user(void __user *to, const void *from, unsigned long n) } return n; } -#else extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); -#endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { - if (check_copy_size(to, n, false)) - n = _copy_from_user(to, from, n); - return n; + if (!check_copy_size(to, n, false)) + return n; +#ifdef INLINE_COPY_FROM_USER + return _inline_copy_from_user(to, from, n); +#else + return _copy_from_user(to, from, n); +#endif } static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { - if (check_copy_size(from, n, true)) - n = _copy_to_user(to, from, n); - return n; + if (!check_copy_size(from, n, true)) + return n; + +#ifdef INLINE_COPY_TO_USER + return _inline_copy_to_user(to, from, n); +#else + return _copy_to_user(to, from, n); +#endif } #ifndef copy_mc_to_kernel -- cgit v1.2.3 From 14498e993fb77adce75f0106162902b2f8b1d480 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 3 Jul 2024 14:37:50 -0700 Subject: Input: make events() method return number of events processed In preparation to consolidating filtering and event processing in the input core change events() method to return number of events processed by it. Reviewed-by: Jeff LaBundy Reviewed-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240703213756.3375978-4-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- include/linux/input.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index c22ac465254b..89a0be6ee0e2 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -275,7 +275,8 @@ struct input_handle; * it may not sleep * @events: event sequence handler. This method is being called by * input core with interrupts disabled and dev->event_lock - * spinlock held and so it may not sleep + * spinlock held and so it may not sleep. The method must return + * number of events passed to it. * @filter: similar to @event; separates normal event handlers from * "filters". * @match: called after comparing device's id with handler's id_table @@ -312,8 +313,8 @@ struct input_handler { void *private; void (*event)(struct input_handle *handle, unsigned int type, unsigned int code, int value); - void (*events)(struct input_handle *handle, - const struct input_value *vals, unsigned int count); + unsigned int (*events)(struct input_handle *handle, + struct input_value *vals, unsigned int count); bool (*filter)(struct input_handle *handle, unsigned int type, unsigned int code, int value); bool (*match)(struct input_handler *handler, struct input_dev *dev); int (*connect)(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id); -- cgit v1.2.3 From 6badc62f8fa4a1165f3f440943045b73c7a47735 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 27 May 2024 18:14:40 +0200 Subject: of: dynamic: Constify parameter in of_changeset_add_prop_string_array() The str_array parameter has no reason to be an un-const array. Indeed, elements of the 'str_array' array are not changed by the code. Constify the 'str_array' array parameter. With this const qualifier added, the following construction is allowed: static const char * const tab_str[] = { "string1", "string2" }; of_changeset_add_prop_string_array(..., tab_str, ARRAY_SIZE(tab_str)); Signed-off-by: Herve Codina Link: https://lore.kernel.org/r/20240527161450.326615-14-herve.codina@bootlin.com Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index a0bedd038a05..ee9a385a13db 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1639,7 +1639,7 @@ int of_changeset_add_prop_string(struct of_changeset *ocs, int of_changeset_add_prop_string_array(struct of_changeset *ocs, struct device_node *np, const char *prop_name, - const char **str_array, size_t sz); + const char * const *str_array, size_t sz); int of_changeset_add_prop_u32_array(struct of_changeset *ocs, struct device_node *np, const char *prop_name, -- cgit v1.2.3 From f2b388d63e6c7c6151bb295e5cbf48a2ac91e51a Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 27 May 2024 18:14:42 +0200 Subject: of: dynamic: Introduce of_changeset_add_prop_bool() APIs to add some properties in a changeset exist but nothing to add a DT boolean property (i.e. a property without any values). Fill this lack with of_changeset_add_prop_bool(). Signed-off-by: Herve Codina Link: https://lore.kernel.org/r/20240527161450.326615-16-herve.codina@bootlin.com Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index ee9a385a13db..13cf7a43b473 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1652,6 +1652,9 @@ static inline int of_changeset_add_prop_u32(struct of_changeset *ocs, return of_changeset_add_prop_u32_array(ocs, np, prop_name, &val, 1); } +int of_changeset_add_prop_bool(struct of_changeset *ocs, struct device_node *np, + const char *prop_name); + #else /* CONFIG_OF_DYNAMIC */ static inline int of_reconfig_notifier_register(struct notifier_block *nb) { -- cgit v1.2.3 From b4b1ddc9dfe997a5f492fa3a36487f8e7a5de30d Mon Sep 17 00:00:00 2001 From: Lizhe Date: Thu, 4 Jul 2024 12:23:55 +0530 Subject: cpufreq: Make cpufreq_driver->exit() return void The cpufreq core doesn't check the return type of the exit() callback and there is not much the core can do on failures at that point. Just drop the returned value and make it return void. Signed-off-by: Lizhe [ Viresh: Reworked the patches to fix all missing changes together. ] Signed-off-by: Viresh Kumar Reviewed-by: AngeloGioacchino Del Regno # Mediatek Acked-by: Sudeep Holla # scpi, scmi, vexpress Acked-by: Mario Limonciello # amd Reviewed-by: Florian Fainelli # bmips Acked-by: Rafael J. Wysocki Acked-by: Kevin Hilman # omap --- include/linux/cpufreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 20f7e98ee8af..e3b3face9134 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -396,7 +396,7 @@ struct cpufreq_driver { int (*online)(struct cpufreq_policy *policy); int (*offline)(struct cpufreq_policy *policy); - int (*exit)(struct cpufreq_policy *policy); + void (*exit)(struct cpufreq_policy *policy); int (*suspend)(struct cpufreq_policy *policy); int (*resume)(struct cpufreq_policy *policy); -- cgit v1.2.3 From 4aa99c71e42ad60178c1154ec24e3df9c684fb67 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 24 Jun 2024 19:01:17 +0200 Subject: jbd2: make jbd2_journal_get_max_txn_bufs() internal There's no reason to have jbd2_journal_get_max_txn_bufs() public function. Currently all users are internal and can use journal->j_max_transaction_buffers instead. This saves some unnecessary recomputations of the limit as a bonus which becomes important as this function gets more complex in the following patch. CC: stable@vger.kernel.org Signed-off-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240624170127.3253-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ab04c1c27fae..f91b930abe20 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1660,11 +1660,6 @@ int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); int jbd2_fc_release_bufs(journal_t *journal); -static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal) -{ - return (journal->j_total_len - journal->j_fc_wbufsize) / 4; -} - /* * is_journal_abort * -- cgit v1.2.3 From e3a00a23781c1f2fcda98a7aecaac515558e7a35 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 24 Jun 2024 19:01:18 +0200 Subject: jbd2: precompute number of transaction descriptor blocks Instead of computing the number of descriptor blocks a transaction can have each time we need it (which is currently when starting each transaction but will become more frequent later) precompute the number once during journal initialization together with maximum transaction size. We perform the precomputation whenever journal feature set is updated similarly as for computation of journal->j_revoke_records_per_block. CC: stable@vger.kernel.org Signed-off-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240624170127.3253-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f91b930abe20..b900c642210c 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1085,6 +1085,13 @@ struct journal_s */ int j_revoke_records_per_block; + /** + * @j_transaction_overhead: + * + * Number of blocks each transaction needs for its own bookkeeping + */ + int j_transaction_overhead_buffers; + /** * @j_commit_interval: * -- cgit v1.2.3 From fe3d508ba95bc63adba661355115be340275c0d1 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 8 Jul 2024 09:16:48 +0000 Subject: block: Validate logical block size in blk_validate_limits() Some drivers validate that their own logical block size. It is no harm to always do this, so validate in blk_validate_limits(). This allows us to remove the validation in most of those drivers. Add a comment to blk_validate_block_size() to inform users that self- validation of LBS is usually unnecessary. Signed-off-by: John Garry Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240708091651.177447-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 02e04df27282..dce4a6bf7307 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -268,6 +268,7 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } +/* blk_validate_limits() validates bsize, so drivers don't usually need to */ static inline int blk_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) -- cgit v1.2.3 From 97c4264f62a73664a2934203346a3e04c109b8ec Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Tue, 2 Jul 2024 08:35:09 +0200 Subject: soc: samsung: exynos-pmu: add support for PMU_ALIVE non atomic registers Not all registers in PMU_ALIVE block support atomic set/clear operations. GS101_SYSIP_DAT0 and GS101_SYSTEM_CONFIGURATION registers are two regs where attempting atomic access fails. As documentation on exactly which registers support atomic operations is not forthcoming. We default to atomic access, unless the register is explicitly added to the tensor_is_atomic() function. Update the comment to reflect this as well. Reviewed-by: Will McVicker Tested-by: Will McVicker Signed-off-by: Peter Griffin Link: https://lore.kernel.org/r/20240628223506.1237523-4-peter.griffin@linaro.org Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240702063514.6215-2-krzysztof.kozlowski@linaro.org Signed-off-by: Arnd Bergmann --- include/linux/soc/samsung/exynos-regs-pmu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index aa840ed043e1..f411c176536d 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -657,4 +657,8 @@ #define EXYNOS5433_PAD_RETENTION_UFS_OPTION (0x3268) #define EXYNOS5433_PAD_RETENTION_FSYSGENIO_OPTION (0x32A8) +/* For Tensor GS101 */ +#define GS101_SYSIP_DAT0 (0x810) +#define GS101_SYSTEM_CONFIGURATION (0x3A00) + #endif /* __LINUX_SOC_EXYNOS_REGS_PMU_H */ -- cgit v1.2.3 From ef6dfcbcbbf7d9a414feb44aa093d2d8592a2e20 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 13 Feb 2024 23:02:20 +0100 Subject: mfd: tmio: Remove obsolete platform_data With commit 8971bb812e3c ("mfd: remove toshiba tmio drivers"), all users of platform data for NAND and framebuffers are gone. So, remove definitions from the header, too. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240213220221.2380-9-wsa+renesas@sang-engineering.com Signed-off-by: Lee Jones --- include/linux/mfd/tmio.h | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index eace8ea6cda0..bc53323293a3 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -103,31 +103,4 @@ struct tmio_mmc_data { void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); }; - -/* - * data for the NAND controller - */ -struct tmio_nand_data { - struct nand_bbt_descr *badblock_pattern; - struct mtd_partition *partition; - unsigned int num_partitions; - const char *const *part_parsers; -}; - -#define FBIO_TMIO_ACC_WRITE 0x7C639300 -#define FBIO_TMIO_ACC_SYNC 0x7C639301 - -struct tmio_fb_data { - int (*lcd_set_power)(struct platform_device *fb_dev, - bool on); - int (*lcd_mode)(struct platform_device *fb_dev, - const struct fb_videomode *mode); - int num_modes; - struct fb_videomode *modes; - - /* in mm: size of screen */ - int height; - int width; -}; - #endif -- cgit v1.2.3 From 6bec678b9872271ce2e0879c360ad7a1a524c7fa Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 13 Feb 2024 23:02:21 +0100 Subject: mfd: tmio: Remove obsolete io accessors Since commit 568494db6809 ("mtd: remove tmio_nand driver") and commit aceae7848624 ("fbdev: remove tmiofb driver"), these accessors have no users anymore. Remove them. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240213220221.2380-10-wsa+renesas@sang-engineering.com Signed-off-by: Lee Jones --- include/linux/mfd/tmio.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index bc53323293a3..4223315d2b2a 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -10,31 +10,6 @@ #include #include -#define tmio_ioread8(addr) readb(addr) -#define tmio_ioread16(addr) readw(addr) -#define tmio_ioread16_rep(r, b, l) readsw(r, b, l) -#define tmio_ioread32(addr) \ - (((u32)readw((addr))) | (((u32)readw((addr) + 2)) << 16)) - -#define tmio_iowrite8(val, addr) writeb((val), (addr)) -#define tmio_iowrite16(val, addr) writew((val), (addr)) -#define tmio_iowrite16_rep(r, b, l) writesw(r, b, l) -#define tmio_iowrite32(val, addr) \ - do { \ - writew((val), (addr)); \ - writew((val) >> 16, (addr) + 2); \ - } while (0) - -#define sd_config_write8(base, shift, reg, val) \ - tmio_iowrite8((val), (base) + ((reg) << (shift))) -#define sd_config_write16(base, shift, reg, val) \ - tmio_iowrite16((val), (base) + ((reg) << (shift))) -#define sd_config_write32(base, shift, reg, val) \ - do { \ - tmio_iowrite16((val), (base) + ((reg) << (shift))); \ - tmio_iowrite16((val) >> 16, (base) + ((reg + 2) << (shift))); \ - } while (0) - /* tmio MMC platform flags */ /* * Some controllers can support a 2-byte block size when the bus width -- cgit v1.2.3 From d411ccbe103d665a31861987e2f1b36944ab63f2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 13 Feb 2024 23:02:23 +0100 Subject: mfd: tmio: Update include files Remove meanwhile unneeded includes, only add types.h for dma_addr_t. Also, remove an obsolete forward declaration while here. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240213220221.2380-12-wsa+renesas@sang-engineering.com Signed-off-by: Lee Jones --- include/linux/mfd/tmio.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 4223315d2b2a..f71d4e507dcb 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -2,13 +2,8 @@ #ifndef MFD_TMIO_H #define MFD_TMIO_H -#include -#include -#include -#include -#include #include -#include +#include /* tmio MMC platform flags */ /* @@ -59,8 +54,6 @@ /* Some controllers have a CBSY bit */ #define TMIO_MMC_HAVE_CBSY BIT(11) -struct dma_chan; - /* * data for the MMC controller */ -- cgit v1.2.3 From 763135b819ad3e3e0301b66094d50923e64092a7 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 13 Feb 2024 23:02:24 +0100 Subject: mfd: tmio: Sanitize comments Reformat the comments to utilize the maximum line length and use single line comments where appropriate. Remove superfluous comments, too. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240213220221.2380-13-wsa+renesas@sang-engineering.com Signed-off-by: Lee Jones --- include/linux/mfd/tmio.h | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index f71d4e507dcb..1cf418643da9 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -5,23 +5,23 @@ #include #include -/* tmio MMC platform flags */ +/* TMIO MMC platform flags */ + /* - * Some controllers can support a 2-byte block size when the bus width - * is configured in 4-bit mode. + * Some controllers can support a 2-byte block size when the bus width is + * configured in 4-bit mode. */ #define TMIO_MMC_BLKSZ_2BYTES BIT(1) -/* - * Some controllers can support SDIO IRQ signalling. - */ + +/* Some controllers can support SDIO IRQ signalling */ #define TMIO_MMC_SDIO_IRQ BIT(2) /* Some features are only available or tested on R-Car Gen2 or later */ #define TMIO_MMC_MIN_RCAR2 BIT(3) /* - * Some controllers require waiting for the SD bus to become - * idle before writing to some registers. + * Some controllers require waiting for the SD bus to become idle before + * writing to some registers. */ #define TMIO_MMC_HAS_IDLE_WAIT BIT(4) @@ -32,31 +32,21 @@ */ #define TMIO_MMC_USE_BUSY_TIMEOUT BIT(5) -/* - * Some controllers have CMD12 automatically - * issue/non-issue register - */ +/* Some controllers have CMD12 automatically issue/non-issue register */ #define TMIO_MMC_HAVE_CMD12_CTRL BIT(7) /* Controller has some SDIO status bits which must be 1 */ #define TMIO_MMC_SDIO_STATUS_SETBITS BIT(8) -/* - * Some controllers have a 32-bit wide data port register - */ +/* Some controllers have a 32-bit wide data port register */ #define TMIO_MMC_32BIT_DATA_PORT BIT(9) -/* - * Some controllers allows to set SDx actual clock - */ +/* Some controllers allows to set SDx actual clock */ #define TMIO_MMC_CLK_ACTUAL BIT(10) /* Some controllers have a CBSY bit */ #define TMIO_MMC_HAVE_CBSY BIT(11) -/* - * data for the MMC controller - */ struct tmio_mmc_data { void *chan_priv_tx; void *chan_priv_rx; -- cgit v1.2.3 From 70b46487b1558eb25ea6e533c905131b10596dc0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 13 Feb 2024 23:02:25 +0100 Subject: mfd: tmio: Move header to platform_data All the MFD components are gone from the header meanwhile. Only the MMC relevant data is left which makes it a platform_data for the MMC controller. Move the header to the now fitting directory. Signed-off-by: Wolfram Sang Acked-by: Ulf Hansson # For MMC Acked-by: John Paul Adrian Glaubitz Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240213220221.2380-14-wsa+renesas@sang-engineering.com Signed-off-by: Lee Jones --- include/linux/mfd/tmio.h | 64 -------------------------------------- include/linux/platform_data/tmio.h | 64 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 64 deletions(-) delete mode 100644 include/linux/mfd/tmio.h create mode 100644 include/linux/platform_data/tmio.h (limited to 'include/linux') diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h deleted file mode 100644 index 1cf418643da9..000000000000 --- a/include/linux/mfd/tmio.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef MFD_TMIO_H -#define MFD_TMIO_H - -#include -#include - -/* TMIO MMC platform flags */ - -/* - * Some controllers can support a 2-byte block size when the bus width is - * configured in 4-bit mode. - */ -#define TMIO_MMC_BLKSZ_2BYTES BIT(1) - -/* Some controllers can support SDIO IRQ signalling */ -#define TMIO_MMC_SDIO_IRQ BIT(2) - -/* Some features are only available or tested on R-Car Gen2 or later */ -#define TMIO_MMC_MIN_RCAR2 BIT(3) - -/* - * Some controllers require waiting for the SD bus to become idle before - * writing to some registers. - */ -#define TMIO_MMC_HAS_IDLE_WAIT BIT(4) - -/* - * Use the busy timeout feature. Probably all TMIO versions support it. Yet, - * we don't have documentation for old variants, so we enable only known good - * variants with this flag. Can be removed once all variants are known good. - */ -#define TMIO_MMC_USE_BUSY_TIMEOUT BIT(5) - -/* Some controllers have CMD12 automatically issue/non-issue register */ -#define TMIO_MMC_HAVE_CMD12_CTRL BIT(7) - -/* Controller has some SDIO status bits which must be 1 */ -#define TMIO_MMC_SDIO_STATUS_SETBITS BIT(8) - -/* Some controllers have a 32-bit wide data port register */ -#define TMIO_MMC_32BIT_DATA_PORT BIT(9) - -/* Some controllers allows to set SDx actual clock */ -#define TMIO_MMC_CLK_ACTUAL BIT(10) - -/* Some controllers have a CBSY bit */ -#define TMIO_MMC_HAVE_CBSY BIT(11) - -struct tmio_mmc_data { - void *chan_priv_tx; - void *chan_priv_rx; - unsigned int hclk; - unsigned long capabilities; - unsigned long capabilities2; - unsigned long flags; - u32 ocr_mask; /* available voltages */ - dma_addr_t dma_rx_offset; - unsigned int max_blk_count; - unsigned short max_segs; - void (*set_pwr)(struct platform_device *host, int state); - void (*set_clk_div)(struct platform_device *host, int state); -}; -#endif diff --git a/include/linux/platform_data/tmio.h b/include/linux/platform_data/tmio.h new file mode 100644 index 000000000000..1cf418643da9 --- /dev/null +++ b/include/linux/platform_data/tmio.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MFD_TMIO_H +#define MFD_TMIO_H + +#include +#include + +/* TMIO MMC platform flags */ + +/* + * Some controllers can support a 2-byte block size when the bus width is + * configured in 4-bit mode. + */ +#define TMIO_MMC_BLKSZ_2BYTES BIT(1) + +/* Some controllers can support SDIO IRQ signalling */ +#define TMIO_MMC_SDIO_IRQ BIT(2) + +/* Some features are only available or tested on R-Car Gen2 or later */ +#define TMIO_MMC_MIN_RCAR2 BIT(3) + +/* + * Some controllers require waiting for the SD bus to become idle before + * writing to some registers. + */ +#define TMIO_MMC_HAS_IDLE_WAIT BIT(4) + +/* + * Use the busy timeout feature. Probably all TMIO versions support it. Yet, + * we don't have documentation for old variants, so we enable only known good + * variants with this flag. Can be removed once all variants are known good. + */ +#define TMIO_MMC_USE_BUSY_TIMEOUT BIT(5) + +/* Some controllers have CMD12 automatically issue/non-issue register */ +#define TMIO_MMC_HAVE_CMD12_CTRL BIT(7) + +/* Controller has some SDIO status bits which must be 1 */ +#define TMIO_MMC_SDIO_STATUS_SETBITS BIT(8) + +/* Some controllers have a 32-bit wide data port register */ +#define TMIO_MMC_32BIT_DATA_PORT BIT(9) + +/* Some controllers allows to set SDx actual clock */ +#define TMIO_MMC_CLK_ACTUAL BIT(10) + +/* Some controllers have a CBSY bit */ +#define TMIO_MMC_HAVE_CBSY BIT(11) + +struct tmio_mmc_data { + void *chan_priv_tx; + void *chan_priv_rx; + unsigned int hclk; + unsigned long capabilities; + unsigned long capabilities2; + unsigned long flags; + u32 ocr_mask; /* available voltages */ + dma_addr_t dma_rx_offset; + unsigned int max_blk_count; + unsigned short max_segs; + void (*set_pwr)(struct platform_device *host, int state); + void (*set_clk_div)(struct platform_device *host, int state); +}; +#endif -- cgit v1.2.3 From 95f6454da936e4e6418facddf66596442a842d74 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Mon, 24 Jun 2024 10:18:05 +0530 Subject: PM: domains: Allow devices attached to genpd to be managed by HW Some power-domains may be capable of relying on the HW to control the power for a device that's hooked up to it. Typically, for these kinds of configurations the consumer driver should be able to change the behavior of power domain at runtime, control the power domain in SW mode for certain configurations and handover the control to HW mode for other usecases. To allow a consumer driver to change the behaviour of the PM domain for its device, let's provide a new function, dev_pm_genpd_set_hwmode(). Moreover, let's add a corresponding optional genpd callback, ->set_hwmode_dev(), which the genpd provider should implement if it can support switching between HW controlled mode and SW controlled mode. Similarly, add the dev_pm_genpd_get_hwmode() to allow consumers to read the current mode and its corresponding optional genpd callback, ->get_hwmode_dev(), which the genpd provider can also implement to synchronize the initial HW mode state in genpd_add_device() by reading back the mode from the hardware. Signed-off-by: Ulf Hansson Signed-off-by: Abel Vesa Signed-off-by: Jagadeesh Kona Reviewed-by: Dmitry Baryshkov Reviewed-by: Dhruva Gole Reviewed-by: Taniya Das Link: https://lore.kernel.org/r/20240624044809.17751-2-quic_jkona@quicinc.com --- include/linux/pm_domain.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index f24546a3d3db..015751b64746 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -175,6 +175,10 @@ struct generic_pm_domain { int (*set_performance_state)(struct generic_pm_domain *genpd, unsigned int state); struct gpd_dev_ops dev_ops; + int (*set_hwmode_dev)(struct generic_pm_domain *domain, + struct device *dev, bool enable); + bool (*get_hwmode_dev)(struct generic_pm_domain *domain, + struct device *dev); int (*attach_dev)(struct generic_pm_domain *domain, struct device *dev); void (*detach_dev)(struct generic_pm_domain *domain, @@ -237,6 +241,7 @@ struct generic_pm_domain_data { unsigned int performance_state; unsigned int default_pstate; unsigned int rpm_pstate; + bool hw_mode; void *data; }; @@ -267,6 +272,8 @@ int dev_pm_genpd_remove_notifier(struct device *dev); void dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next); ktime_t dev_pm_genpd_get_next_hrtimer(struct device *dev); void dev_pm_genpd_synced_poweroff(struct device *dev); +int dev_pm_genpd_set_hwmode(struct device *dev, bool enable); +bool dev_pm_genpd_get_hwmode(struct device *dev); extern struct dev_power_governor simple_qos_governor; extern struct dev_power_governor pm_domain_always_on_gov; @@ -340,6 +347,16 @@ static inline ktime_t dev_pm_genpd_get_next_hrtimer(struct device *dev) static inline void dev_pm_genpd_synced_poweroff(struct device *dev) { } +static inline int dev_pm_genpd_set_hwmode(struct device *dev, bool enable) +{ + return -EOPNOTSUPP; +} + +static inline bool dev_pm_genpd_get_hwmode(struct device *dev) +{ + return false; +} + #define simple_qos_governor (*(struct dev_power_governor *)(NULL)) #define pm_domain_always_on_gov (*(struct dev_power_governor *)(NULL)) #endif -- cgit v1.2.3 From 68cbd415dd4b9c5b9df69f0f091879e56bf5907a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 21 Jun 2024 11:15:58 +0200 Subject: task_work: s/task_work_cancel()/task_work_cancel_func()/ A proper task_work_cancel() API that actually cancels a callback and not *any* callback pointing to a given function is going to be needed for perf events event freeing. Do the appropriate rename to prepare for that. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240621091601.18227-2-frederic@kernel.org --- include/linux/task_work.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 795ef5a68429..23ab01ae185e 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -30,7 +30,7 @@ int task_work_add(struct task_struct *task, struct callback_head *twork, struct callback_head *task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data); -struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t); +struct callback_head *task_work_cancel_func(struct task_struct *, task_work_func_t); void task_work_run(void); static inline void exit_task_work(struct task_struct *task) -- cgit v1.2.3 From f409530e4db9dd11b88cb7703c97c8f326ff6566 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 21 Jun 2024 11:15:59 +0200 Subject: task_work: Introduce task_work_cancel() again Re-introduce task_work_cancel(), this time to cancel an actual callback and not *any* callback pointing to a given function. This is going to be needed for perf events event freeing. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240621091601.18227-3-frederic@kernel.org --- include/linux/task_work.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 23ab01ae185e..26b8a47f41fc 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -31,6 +31,7 @@ int task_work_add(struct task_struct *task, struct callback_head *twork, struct callback_head *task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data); struct callback_head *task_work_cancel_func(struct task_struct *, task_work_func_t); +bool task_work_cancel(struct task_struct *task, struct callback_head *cb); void task_work_run(void); static inline void exit_task_work(struct task_struct *task) -- cgit v1.2.3 From 3a5465418f5fd970e86a86c7f4075be262682840 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 21 Jun 2024 11:16:01 +0200 Subject: perf: Fix event leak upon exec and file release The perf pending task work is never waited upon the matching event release. In the case of a child event, released via free_event() directly, this can potentially result in a leaked event, such as in the following scenario that doesn't even require a weak IRQ work implementation to trigger: schedule() prepare_task_switch() =======> perf_event_overflow() event->pending_sigtrap = ... irq_work_queue(&event->pending_irq) <======= perf_event_task_sched_out() event_sched_out() event->pending_sigtrap = 0; atomic_long_inc_not_zero(&event->refcount) task_work_add(&event->pending_task) finish_lock_switch() =======> perf_pending_irq() //do nothing, rely on pending task work <======= begin_new_exec() perf_event_exit_task() perf_event_exit_event() // If is child event free_event() WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1) // event is leaked Similar scenarios can also happen with perf_event_remove_on_exec() or simply against concurrent perf_event_release(). Fix this with synchonizing against the possibly remaining pending task work while freeing the event, just like is done with remaining pending IRQ work. This means that the pending task callback neither need nor should hold a reference to the event, preventing it from ever beeing freed. Fixes: 517e6a301f34 ("perf: Fix perf_pending_task() UaF") Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240621091601.18227-5-frederic@kernel.org --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a5304ae8c654..393fb13733b0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -786,6 +786,7 @@ struct perf_event { struct irq_work pending_irq; struct callback_head pending_task; unsigned int pending_work; + struct rcuwait pending_work_wait; atomic_t event_limit; -- cgit v1.2.3 From 466e4d801cd438a1ab2c8a2cce1bef6b65c31bbb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Jul 2024 19:03:36 +0200 Subject: task_work: Add TWA_NMI_CURRENT as an additional notify mode. Adding task_work from NMI context requires the following: - The kasan_record_aux_stack() is not NMU safe and must be avoided. - Using TWA_RESUME is NMI safe. If the NMI occurs while the CPU is in userland then it will continue in userland and not invoke the `work' callback. Add TWA_NMI_CURRENT as an additional notify mode. In this mode skip kasan and use irq_work in hardirq-mode to for needed interrupt. Set TIF_NOTIFY_RESUME within the irq_work callback due to k[ac]san instrumentation in test_and_set_bit() which does not look NMI safe in case of a report. Suggested-by: Peter Zijlstra Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240704170424.1466941-3-bigeasy@linutronix.de --- include/linux/task_work.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 26b8a47f41fc..cf5e7e891a77 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -18,6 +18,7 @@ enum task_work_notify_mode { TWA_RESUME, TWA_SIGNAL, TWA_SIGNAL_NO_IPI, + TWA_NMI_CURRENT, }; static inline bool task_work_pending(struct task_struct *task) -- cgit v1.2.3 From c5d93d23a26012a98b77f9e354ab9b3afd420059 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Jul 2024 19:03:37 +0200 Subject: perf: Enqueue SIGTRAP always via task_work. A signal is delivered by raising irq_work() which works from any context including NMI. irq_work() can be delayed if the architecture does not provide an interrupt vector. In order not to lose a signal, the signal is injected via task_work during event_sched_out(). Instead going via irq_work, the signal could be added directly via task_work. The signal is sent to current and can be enqueued on its return path to userland. Queue signal via task_work and consider possible NMI context. Remove perf_event::pending_sigtrap and and use perf_event::pending_work instead. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marco Elver Tested-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/20240704170424.1466941-4-bigeasy@linutronix.de --- include/linux/perf_event.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 393fb13733b0..ea0d82418d85 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -781,7 +781,6 @@ struct perf_event { unsigned int pending_wakeup; unsigned int pending_kill; unsigned int pending_disable; - unsigned int pending_sigtrap; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; struct callback_head pending_task; @@ -963,7 +962,7 @@ struct perf_event_context { struct rcu_head rcu_head; /* - * Sum (event->pending_sigtrap + event->pending_work) + * Sum (event->pending_work + event->pending_work) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. -- cgit v1.2.3 From 0d40a6d83e3e6751f1107ba33587262d937c969f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Jul 2024 19:03:39 +0200 Subject: perf: Move swevent_htable::recursion into task_struct. The swevent_htable::recursion counter is used to avoid creating an swevent while an event is processed to avoid recursion. The counter is per-CPU and preemption must be disabled to have a stable counter. perf_pending_task() disables preemption to access the counter and then signal. This is problematic on PREEMPT_RT because sending a signal uses a spinlock_t which must not be acquired in atomic on PREEMPT_RT because it becomes a sleeping lock. The atomic context can be avoided by moving the counter into the task_struct. There is a 4 byte hole between futex_state (usually always on) and the following perf pointer (perf_event_ctxp). After the recursion lost some weight it fits perfectly. Move swevent_htable::recursion into task_struct. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marco Elver Link: https://lore.kernel.org/r/20240704170424.1466941-6-bigeasy@linutronix.de --- include/linux/perf_event.h | 6 ------ include/linux/sched.h | 7 +++++++ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ea0d82418d85..99a7ea1d29ed 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -970,12 +970,6 @@ struct perf_event_context { local_t nr_pending; }; -/* - * Number of contexts where an event can trigger: - * task, softirq, hardirq, nmi. - */ -#define PERF_NR_CONTEXTS 4 - struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; diff --git a/include/linux/sched.h b/include/linux/sched.h index 61591ac6eab6..afb1087f5831 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -734,6 +734,12 @@ enum perf_event_task_context { perf_nr_task_contexts, }; +/* + * Number of contexts where an event can trigger: + * task, softirq, hardirq, nmi. + */ +#define PERF_NR_CONTEXTS 4 + struct wake_q_node { struct wake_q_node *next; }; @@ -1256,6 +1262,7 @@ struct task_struct { unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS + u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; -- cgit v1.2.3 From 2b84def990d388bed4afe4f21ae383a01991046c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Jul 2024 19:03:41 +0200 Subject: perf: Split __perf_pending_irq() out of perf_pending_irq() perf_pending_irq() invokes perf_event_wakeup() and __perf_pending_irq(). The former is in charge of waking any tasks which waits to be woken up while the latter disables perf-events. The irq_work perf_pending_irq(), while this an irq_work, the callback is invoked in thread context on PREEMPT_RT. This is needed because all the waking functions (wake_up_all(), kill_fasync()) acquire sleep locks which must not be used with disabled interrupts. Disabling events, as done by __perf_pending_irq(), expects a hardirq context and disabled interrupts. This requirement is not fulfilled on PREEMPT_RT. Split functionality based on perf_event::pending_disable into irq_work named `pending_disable_irq' and invoke it in hardirq context on PREEMPT_RT. Rename the split out callback to perf_pending_disable(). Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marco Elver Tested-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/20240704170424.1466941-8-bigeasy@linutronix.de --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 99a7ea1d29ed..65ece0d5b4b6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -783,6 +783,7 @@ struct perf_event { unsigned int pending_disable; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; + struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; struct rcuwait pending_work_wait; -- cgit v1.2.3 From f70080c5bc39716f434e7c0888662aa077eb4405 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Wed, 26 Jun 2024 13:26:46 +0300 Subject: vdpa/mlx5: Add support for modifying the virtio_version VQ field This is done in preparation for the pre-creation of hardware virtqueues at device add time. Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Message-Id: <20240626-stage-vdpa-vq-precreate-v2-10-560c491078df@nvidia.com> Signed-off-by: Michael S. Tsirkin --- include/linux/mlx5/mlx5_ifc_vdpa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h index 40371c916cf9..34f27c01cec9 100644 --- a/include/linux/mlx5/mlx5_ifc_vdpa.h +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -148,6 +148,7 @@ enum { MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS = (u64)1 << 6, MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX = (u64)1 << 7, MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX = (u64)1 << 8, + MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION = (u64)1 << 10, MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY = (u64)1 << 11, MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY = (u64)1 << 14, }; -- cgit v1.2.3 From cdc3c7eaae695738b37f374866950602ec5af9c2 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Wed, 26 Jun 2024 13:26:47 +0300 Subject: vdpa/mlx5: Add support for modifying the VQ features field This is done in preparation for the pre-creation of hardware virtqueues at device add time. Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Message-Id: <20240626-stage-vdpa-vq-precreate-v2-11-560c491078df@nvidia.com> Signed-off-by: Michael S. Tsirkin --- include/linux/mlx5/mlx5_ifc_vdpa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h index 34f27c01cec9..58dfa2ee7c83 100644 --- a/include/linux/mlx5/mlx5_ifc_vdpa.h +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -150,6 +150,7 @@ enum { MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX = (u64)1 << 8, MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION = (u64)1 << 10, MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY = (u64)1 << 11, + MLX5_VIRTQ_MODIFY_MASK_QUEUE_FEATURES = (u64)1 << 12, MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY = (u64)1 << 14, }; -- cgit v1.2.3 From 444b89875fc0937ece181fa865c75c9d22649986 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 28 Jun 2024 11:08:48 -0700 Subject: ARM: spitz: Use software nodes to describe MMC GPIOs Convert Spitz to use software nodes for specifying GPIOs for the MMC. Signed-off-by: Dmitry Torokhov Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20240628180852.1738922-9-dmitry.torokhov@gmail.com Signed-off-by: Arnd Bergmann --- include/linux/platform_data/mmc-pxamci.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/mmc-pxamci.h b/include/linux/platform_data/mmc-pxamci.h index 7e44e84e7150..652f323b5ecc 100644 --- a/include/linux/platform_data/mmc-pxamci.h +++ b/include/linux/platform_data/mmc-pxamci.h @@ -7,6 +7,7 @@ struct device; struct mmc_host; +struct property_entry; struct pxamci_platform_data { unsigned int ocr_mask; /* available voltages */ @@ -18,7 +19,8 @@ struct pxamci_platform_data { bool gpio_card_ro_invert; /* gpio ro is inverted */ }; -extern void pxa_set_mci_info(struct pxamci_platform_data *info); +extern void pxa_set_mci_info(const struct pxamci_platform_data *info, + const struct property_entry *props); extern void pxa3xx_set_mci2_info(struct pxamci_platform_data *info); extern void pxa3xx_set_mci3_info(struct pxamci_platform_data *info); -- cgit v1.2.3 From d05374dee295d771261d382f97c0c23cb15aa104 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 5 Jul 2024 21:44:50 +0200 Subject: thermal: core: Change passive_delay and polling_delay data type It is better to use unsigned int as the data type for the passive_delay and polling_delay arguments of thermal_zone_device_register_with_trips() because they are implicitly cast to unsigned int anyway in thermal_set_delay_jiffies() and if they happen to be negative at that point, the resulting behavior may not be as desired. Update the thermal_zone_device_register_with_trips() definition accordingly. Signed-off-by: Rafael J. Wysocki Acked-by: Daniel Lezcano Link: https://patch.msgid.link/5803791.DvuYhMxLoT@rjwysocki.net Signed-off-by: Rafael J. Wysocki --- include/linux/thermal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index f732dab20368..cd0045915af5 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -221,7 +221,8 @@ struct thermal_zone_device *thermal_zone_device_register_with_trips( int num_trips, void *devdata, const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp, - int passive_delay, int polling_delay); + unsigned int passive_delay, + unsigned int polling_delay); struct thermal_zone_device *thermal_tripless_zone_device_register( const char *type, -- cgit v1.2.3 From 463b86fed2b25ddb5f576376bfea00134dd23030 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Jul 2024 16:39:55 +0200 Subject: thermal: helpers: Introduce thermal_trip_is_bound_to_cdev() Introduce a new helper function thermal_trip_is_bound_to_cdev() for checking whether or not a given trip point has been bound to a given cooling device. The primary user of it will be the Tegra thermal driver. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/13545762.uLZWGnKmhe@rjwysocki.net --- include/linux/thermal.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index cd0045915af5..5a0b66bd34f0 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -270,6 +270,9 @@ struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name); int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp); int thermal_zone_get_slope(struct thermal_zone_device *tz); int thermal_zone_get_offset(struct thermal_zone_device *tz); +bool thermal_trip_is_bound_to_cdev(struct thermal_zone_device *tz, + const struct thermal_trip *trip, + struct thermal_cooling_device *cdev); int thermal_zone_device_enable(struct thermal_zone_device *tz); int thermal_zone_device_disable(struct thermal_zone_device *tz); -- cgit v1.2.3 From d1fbf18a0f9403df2edeffa1c7f5a2d66e82c20a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Jul 2024 16:41:03 +0200 Subject: thermal: trip: Add conversion macros for thermal trip priv field Some drivers will need to store integers in the priv field of struct thermal_trip, so add conversion macros for doing this in a consistent way and switch over the int340x_thermal driver that already does it and uses custom conversion functions to using the new macros. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/3297884.aeNJFYEL58@rjwysocki.net --- include/linux/thermal.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 5a0b66bd34f0..3bd1b361ec34 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -79,6 +79,9 @@ struct thermal_trip { #define THERMAL_TRIP_FLAG_RW (THERMAL_TRIP_FLAG_RW_TEMP | \ THERMAL_TRIP_FLAG_RW_HYST) +#define THERMAL_TRIP_PRIV_TO_INT(_val_) (uintptr_t)(_val_) +#define THERMAL_INT_TO_TRIP_PRIV(_val_) (void *)(uintptr_t)(_val_) + struct thermal_zone_device; struct thermal_zone_device_ops { -- cgit v1.2.3 From ca52aa4c60f76566601b42e935b8a78f0fb4f8eb Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 8 Jul 2024 20:05:29 -0500 Subject: spi: add defer_optimize_message controller flag Adding spi_optimize_message() broke the spi-mux driver because it calls spi_async() from it's transfer_one_message() callback. This resulted in passing an incorrectly optimized message to the controller. For example, if the underlying controller has an optimize_message() callback, this would have not been called and can cause a crash when the underlying controller driver tries to transfer the message. Also, since the spi-mux driver swaps out the controller pointer by replacing msg->spi, __spi_unoptimize_message() was being called with a different controller than the one used in __spi_optimize_message(). This could cause a crash when attempting to free the message resources when __spi_unoptimize_message() is called in spi_finalize_current_message() since it is being called with a controller that did not allocate the resources. This is fixed by adding a defer_optimize_message flag for controllers. This flag causes all of the spi_[maybe_][un]optimize_message() calls to be a no-op (other than attaching a pointer to the spi device to the message). This allows the spi-mux driver to pass an unmodified message to spi_async() in spi_mux_transfer_one_message() after the spi device has been swapped out. This causes __spi_optimize_message() and __spi_unoptimize_message() to be called only once per message and with the correct/same controller in each case. Reported-by: Oleksij Rempel Closes: https://lore.kernel.org/linux-spi/Zn6HMrYG2b7epUxT@pengutronix.de/ Reported-by: Marc Kleine-Budde Closes: https://lore.kernel.org/linux-spi/20240628-awesome-discerning-bear-1621f9-mkl@pengutronix.de/ Fixes: 7b1d87af14d9 ("spi: add spi_optimize_message() APIs") Signed-off-by: David Lechner Link: https://patch.msgid.link/20240708-spi-mux-fix-v1-2-6c8845193128@baylibre.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 98fdef6e28f2..67b9a15a5330 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -533,6 +533,9 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @queue_empty: signal green light for opportunistically skipping the queue * for spi_sync transfers. * @must_async: disable all fast paths in the core + * @defer_optimize_message: set to true if controller cannot pre-optimize messages + * and needs to defer the optimization step until the message is actually + * being transferred * * Each SPI controller can communicate with one or more @spi_device * children. These make a small bus, sharing MOSI, MISO and SCK signals @@ -776,6 +779,7 @@ struct spi_controller { /* Flag for enabling opportunistic skipping of the queue in spi_sync */ bool queue_empty; bool must_async; + bool defer_optimize_message; }; static inline void *spi_controller_get_devdata(struct spi_controller *ctlr) -- cgit v1.2.3 From 07838f7fd529c8a6de44b601d4b7057e6c8d36ed Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:40 +0800 Subject: iommufd: Add iommufd fault object An iommufd fault object provides an interface for delivering I/O page faults to user space. These objects are created and destroyed by user space, and they can be associated with or dissociated from hardware page table objects during page table allocation or destruction. User space interacts with the fault object through a file interface. This interface offers a straightforward and efficient way for user space to handle page faults. It allows user space to read fault messages sequentially and respond to them by writing to the same file. The file interface supports reading messages in poll mode, so it's recommended that user space applications use io_uring to enhance read and write efficiency. A fault object can be associated with any iopf-capable iommufd_hw_pgtable during the pgtable's allocation. All I/O page faults triggered by devices when accessing the I/O addresses of an iommufd_hw_pgtable are routed through the fault object to user space. Similarly, user space's responses to these page faults are routed back to the iommu device driver through the same fault object. Link: https://lore.kernel.org/r/20240702063444.105814-7-baolu.lu@linux.intel.com Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 910aec80886e..73bc3aee95a1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -124,12 +124,16 @@ struct iopf_fault { struct iopf_group { struct iopf_fault last_fault; struct list_head faults; + size_t fault_count; /* list node for iommu_fault_param::faults */ struct list_head pending_node; struct work_struct work; struct iommu_attach_handle *attach_handle; /* The device's fault data parameter. */ struct iommu_fault_param *fault_param; + /* Used by handler provider to hook the group on its own lists. */ + struct list_head node; + u32 cookie; }; /** -- cgit v1.2.3 From 5a5aa3c3769051c02a2210cc1b7e12a0833b76c9 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 4 Jul 2024 06:25:05 -0700 Subject: sched.h: always_inline alloc_tag_{save|restore} to fix modpost warnings Mark alloc_tag_{save|restore} as always_inline to fix the following modpost warnings: WARNING: modpost: vmlinux: section mismatch in reference: alloc_tag_save+0x1c (section: .text.unlikely) -> initcall_level_names (section: .init.data) WARNING: modpost: vmlinux: section mismatch in reference: alloc_tag_restore+0x3c (section: .text.unlikely) -> initcall_level_names (section: .init.data) The warnings happen when these functions are called from an __init function and they don't get inlined (remain in the .text section) while the value returned by get_current() points into .init.data section. Assuming get_current() always returns a valid address, this situation can happen only during init stage and accessing .init.data from .text section during that stage should pose no issues. Link: https://lkml.kernel.org/r/20240704132506.1011978-1-surenb@google.com Fixes: 22d407b164ff ("lib: add allocation tagging support for memory allocation profiling") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407032306.gi9nZsBi-lkp@intel.com/ Cc: Kent Overstreet Cc: Chris Zankel Cc: Ingo Molnar Cc: Juri Lelli Cc: Max Filippov Cc: Peter Zijlstra Cc: Vincent Guittot Signed-off-by: Andrew Morton --- include/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 61591ac6eab6..a5f4b48fca18 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2192,13 +2192,13 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } extern void sched_set_stop_task(int cpu, struct task_struct *stop); #ifdef CONFIG_MEM_ALLOC_PROFILING -static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) +static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { swap(current->alloc_tag, tag); return tag; } -static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) +static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) { #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); -- cgit v1.2.3 From 1b9469cf15976a7cb7378caaa8a1772e7901514d Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Thu, 13 Jun 2024 13:50:21 +0200 Subject: PCI: Move struct pci_devres.pinned bit to struct pci_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bit describing whether the PCI device is currently pinned is stored in struct pci_devres. To clean up and simplify the PCI devres API, it's better if this information is stored in struct pci_dev. This will later permit simplifying pcim_enable_device(). Move the 'pinned' boolean bit to struct pci_dev. Restructure bits in struct pci_dev so the pm / pme fields are next to each other. Link: https://lore.kernel.org/r/20240613115032.29098-9-pstanner@redhat.com Signed-off-by: Philipp Stanner Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index fb004fd4e889..0c19f0717899 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -367,10 +367,11 @@ struct pci_dev { this is D0-D3, D0 being fully functional, and D3 being off. */ u8 pm_cap; /* PM capability offset */ - unsigned int imm_ready:1; /* Supports Immediate Readiness */ unsigned int pme_support:5; /* Bitmask of states from which PME# can be generated */ unsigned int pme_poll:1; /* Poll device's PME status bit */ + unsigned int pinned:1; /* Whether this dev is pinned */ + unsigned int imm_ready:1; /* Supports Immediate Readiness */ unsigned int d1_support:1; /* Low power state D1 is supported */ unsigned int d2_support:1; /* Low power state D2 is supported */ unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ -- cgit v1.2.3 From 7296f2301a057493e97b07739213c6e864f76891 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 8 Jul 2024 12:41:00 -0700 Subject: swiotlb: reduce swiotlb pool lookups With CONFIG_SWIOTLB_DYNAMIC enabled, each round-trip map/unmap pair in the swiotlb results in 6 calls to swiotlb_find_pool(). In multiple places, the pool is found and used in one function, and then must be found again in the next function that is called because only the tlb_addr is passed as an argument. These are the six call sites: dma_direct_map_page: 1. swiotlb_map -> swiotlb_tbl_map_single -> swiotlb_bounce dma_direct_unmap_page: 2. dma_direct_sync_single_for_cpu -> is_swiotlb_buffer 3. dma_direct_sync_single_for_cpu -> swiotlb_sync_single_for_cpu -> swiotlb_bounce 4. is_swiotlb_buffer 5. swiotlb_tbl_unmap_single -> swiotlb_del_transient 6. swiotlb_tbl_unmap_single -> swiotlb_release_slots Reduce the number of calls by finding the pool at a higher level, and passing it as an argument instead of searching again. A key change is for is_swiotlb_buffer() to return a pool pointer instead of a boolean, and then pass this pool pointer to subsequent swiotlb functions. There are 9 occurrences of is_swiotlb_buffer() used to test if a buffer is a swiotlb buffer before calling a swiotlb function. To reduce code duplication in getting the pool pointer and passing it as an argument, introduce inline wrappers for this pattern. The generated code is essentially unchanged. Since is_swiotlb_buffer() no longer returns a boolean, rename some functions to reflect the change: * swiotlb_find_pool() becomes __swiotlb_find_pool() * is_swiotlb_buffer() becomes swiotlb_find_pool() * is_xen_swiotlb_buffer() becomes xen_swiotlb_find_pool() With these changes, a round-trip map/unmap pair requires only 2 pool lookups (listed using the new names and wrappers): dma_direct_unmap_page: 1. dma_direct_sync_single_for_cpu -> swiotlb_find_pool 2. swiotlb_tbl_unmap_single -> swiotlb_find_pool These changes come from noticing the inefficiencies in a code review, not from performance measurements. With CONFIG_SWIOTLB_DYNAMIC, __swiotlb_find_pool() is not trivial, and it uses an RCU read lock, so avoiding the redundant calls helps performance in a hot path. When CONFIG_SWIOTLB_DYNAMIC is *not* set, the code size reduction is minimal and the perf benefits are likely negligible, but no harm is done. No functional change is intended. Signed-off-by: Michael Kelley Reviewed-by: Petr Tesarik Signed-off-by: Christoph Hellwig --- include/linux/scatterlist.h | 2 +- include/linux/swiotlb.h | 105 ++++++++++++++++++++++++++------------------ 2 files changed, 63 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 77df3d7b18a6..e61d164622db 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -332,7 +332,7 @@ static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) * Description: * Returns true if the scatterlist was marked for SWIOTLB bouncing. Not all * elements may have been bounced, so the caller would have to check - * individual SG entries with is_swiotlb_buffer(). + * individual SG entries with swiotlb_find_pool(). */ static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) { diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 14bc10c1bb23..3dae0f592063 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -42,24 +42,6 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, int (*remap)(void *tlb, unsigned long nslabs)); extern void __init swiotlb_update_mem_attributes(void); -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, - size_t mapping_size, - unsigned int alloc_aligned_mask, enum dma_data_direction dir, - unsigned long attrs); - -extern void swiotlb_tbl_unmap_single(struct device *hwdev, - phys_addr_t tlb_addr, - size_t mapping_size, - enum dma_data_direction dir, - unsigned long attrs); - -void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir); -void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir); -dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys, - size_t size, enum dma_data_direction dir, unsigned long attrs); - #ifdef CONFIG_SWIOTLB /** @@ -143,37 +125,27 @@ struct io_tlb_mem { #endif }; -#ifdef CONFIG_SWIOTLB_DYNAMIC - -struct io_tlb_pool *swiotlb_find_pool(struct device *dev, phys_addr_t paddr); - -#else - -static inline struct io_tlb_pool *swiotlb_find_pool(struct device *dev, - phys_addr_t paddr) -{ - return &dev->dma_io_tlb_mem->defpool; -} - -#endif +struct io_tlb_pool *__swiotlb_find_pool(struct device *dev, phys_addr_t paddr); /** - * is_swiotlb_buffer() - check if a physical address belongs to a swiotlb + * swiotlb_find_pool() - find swiotlb pool to which a physical address belongs * @dev: Device which has mapped the buffer. * @paddr: Physical address within the DMA buffer. * - * Check if @paddr points into a bounce buffer. + * Find the swiotlb pool that @paddr points into. * * Return: - * * %true if @paddr points into a bounce buffer - * * %false otherwise + * * pool address if @paddr points into a bounce buffer + * * NULL if @paddr does not point into a bounce buffer. As such, this function + * can be used to determine if @paddr denotes a swiotlb bounce buffer. */ -static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) +static inline struct io_tlb_pool *swiotlb_find_pool(struct device *dev, + phys_addr_t paddr) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; if (!mem) - return false; + return NULL; #ifdef CONFIG_SWIOTLB_DYNAMIC /* @@ -182,16 +154,19 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) * If a SWIOTLB address is checked on another CPU, then it was * presumably loaded by the device driver from an unspecified private * data structure. Make sure that this load is ordered before reading - * dev->dma_uses_io_tlb here and mem->pools in swiotlb_find_pool(). + * dev->dma_uses_io_tlb here and mem->pools in __swiotlb_find_pool(). * * This barrier pairs with smp_mb() in swiotlb_find_slots(). */ smp_rmb(); - return READ_ONCE(dev->dma_uses_io_tlb) && - swiotlb_find_pool(dev, paddr); + if (READ_ONCE(dev->dma_uses_io_tlb)) + return __swiotlb_find_pool(dev, paddr); #else - return paddr >= mem->defpool.start && paddr < mem->defpool.end; + if (paddr >= mem->defpool.start && paddr < mem->defpool.end) + return &mem->defpool; #endif + + return NULL; } static inline bool is_swiotlb_force_bounce(struct device *dev) @@ -219,9 +194,10 @@ static inline void swiotlb_dev_init(struct device *dev) { } -static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) +static inline struct io_tlb_pool *swiotlb_find_pool(struct device *dev, + phys_addr_t paddr) { - return false; + return NULL; } static inline bool is_swiotlb_force_bounce(struct device *dev) { @@ -260,6 +236,49 @@ static inline phys_addr_t default_swiotlb_limit(void) } #endif /* CONFIG_SWIOTLB */ +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, + size_t mapping_size, unsigned int alloc_aligned_mask, + enum dma_data_direction dir, unsigned long attrs); +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs); + +void __swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, + size_t mapping_size, enum dma_data_direction dir, + unsigned long attrs, struct io_tlb_pool *pool); +static inline void swiotlb_tbl_unmap_single(struct device *dev, + phys_addr_t addr, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct io_tlb_pool *pool = swiotlb_find_pool(dev, addr); + + if (unlikely(pool)) + __swiotlb_tbl_unmap_single(dev, addr, size, dir, attrs, pool); +} + +void __swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir, + struct io_tlb_pool *pool); +static inline void swiotlb_sync_single_for_device(struct device *dev, + phys_addr_t addr, size_t size, enum dma_data_direction dir) +{ + struct io_tlb_pool *pool = swiotlb_find_pool(dev, addr); + + if (unlikely(pool)) + __swiotlb_sync_single_for_device(dev, addr, size, dir, pool); +} + +void __swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir, + struct io_tlb_pool *pool); +static inline void swiotlb_sync_single_for_cpu(struct device *dev, + phys_addr_t addr, size_t size, enum dma_data_direction dir) +{ + struct io_tlb_pool *pool = swiotlb_find_pool(dev, addr); + + if (unlikely(pool)) + __swiotlb_sync_single_for_cpu(dev, addr, size, dir, pool); +} + extern void swiotlb_print_info(void); #ifdef CONFIG_DMA_RESTRICTED_POOL -- cgit v1.2.3 From ab7a880263c30b1675850a584c206770f5545c2f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 8 Jul 2024 10:15:45 +0200 Subject: driver core: make driver_[create|remove]_file take a const * The functions driver_create_file() and driver_remove_file() do not modify the struct device_driver structure directly, so they are safe to be marked as a constant pointer type. Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/2024070844-volley-hatchling-c812@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/device/driver.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index 7738f458995f..dceb36f1c42c 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -146,9 +146,9 @@ struct driver_attribute { #define DRIVER_ATTR_WO(_name) \ struct driver_attribute driver_attr_##_name = __ATTR_WO(_name) -int __must_check driver_create_file(struct device_driver *driver, +int __must_check driver_create_file(const struct device_driver *driver, const struct driver_attribute *attr); -void driver_remove_file(struct device_driver *driver, +void driver_remove_file(const struct device_driver *driver, const struct driver_attribute *attr); int driver_set_override(struct device *dev, const char **override, -- cgit v1.2.3 From f8fb469147e7db57e3f78d46f3f427705b4a1935 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 8 Jul 2024 10:15:46 +0200 Subject: driver core: make driver_find_device() take a const * The function driver_find_device() does not modify the struct device_driver structure directly, so it is safe to be marked as a constant pointer type. As that is fixed up, also change the function signature on the inline functions that call this, which are: driver_find_device_by_name() driver_find_device_by_of_node() driver_find_device_by_devt() driver_find_next_device() driver_find_device_by_acpi_dev() Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/2024070849-broken-front-9eb5@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/device/driver.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index dceb36f1c42c..1fc8b68786de 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -155,7 +155,7 @@ int driver_set_override(struct device *dev, const char **override, const char *s, size_t len); int __must_check driver_for_each_device(struct device_driver *drv, struct device *start, void *data, int (*fn)(struct device *dev, void *)); -struct device *driver_find_device(struct device_driver *drv, +struct device *driver_find_device(const struct device_driver *drv, struct device *start, const void *data, int (*match)(struct device *dev, const void *data)); @@ -165,7 +165,7 @@ struct device *driver_find_device(struct device_driver *drv, * @drv: the driver we're iterating * @name: name of the device to match */ -static inline struct device *driver_find_device_by_name(struct device_driver *drv, +static inline struct device *driver_find_device_by_name(const struct device_driver *drv, const char *name) { return driver_find_device(drv, NULL, name, device_match_name); @@ -178,7 +178,7 @@ static inline struct device *driver_find_device_by_name(struct device_driver *dr * @np: of_node pointer to match. */ static inline struct device * -driver_find_device_by_of_node(struct device_driver *drv, +driver_find_device_by_of_node(const struct device_driver *drv, const struct device_node *np) { return driver_find_device(drv, NULL, np, device_match_of_node); @@ -203,13 +203,13 @@ driver_find_device_by_fwnode(struct device_driver *drv, * @drv: the driver we're iterating * @devt: devt pointer to match. */ -static inline struct device *driver_find_device_by_devt(struct device_driver *drv, +static inline struct device *driver_find_device_by_devt(const struct device_driver *drv, dev_t devt) { return driver_find_device(drv, NULL, &devt, device_match_devt); } -static inline struct device *driver_find_next_device(struct device_driver *drv, +static inline struct device *driver_find_next_device(const struct device_driver *drv, struct device *start) { return driver_find_device(drv, start, NULL, device_match_any); @@ -223,14 +223,14 @@ static inline struct device *driver_find_next_device(struct device_driver *drv, * @adev: ACPI_COMPANION device to match. */ static inline struct device * -driver_find_device_by_acpi_dev(struct device_driver *drv, +driver_find_device_by_acpi_dev(const struct device_driver *drv, const struct acpi_device *adev) { return driver_find_device(drv, NULL, adev, device_match_acpi_dev); } #else static inline struct device * -driver_find_device_by_acpi_dev(struct device_driver *drv, const void *adev) +driver_find_device_by_acpi_dev(const struct device_driver *drv, const void *adev) { return NULL; } -- cgit v1.2.3 From c9add2e607a1ca63cc84bd1a7ab04d513096f37e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 10 Jul 2024 09:34:14 +0200 Subject: zorro: make match function take a const pointer In commit d69d80484598 ("driver core: have match() callback in struct bus_type take a const *"), the match callback for busses was changed to take a const pointer to struct device_driver. Unfortunately I missed fixing up the zorro code, and was only noticed after-the-fact by the kernel test robot. Resolve this issue by properly changing the zorro_bus_match() function. Cc: Geert Uytterhoeven Fixes: d69d80484598 ("driver core: have match() callback in struct bus_type take a const *") Reported-by: kernel test robot Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240710073413.495541-2-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/zorro.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/zorro.h b/include/linux/zorro.h index db7416ed6057..f36c8d39553d 100644 --- a/include/linux/zorro.h +++ b/include/linux/zorro.h @@ -52,7 +52,7 @@ struct zorro_driver { struct device_driver driver; }; -#define to_zorro_driver(drv) container_of(drv, struct zorro_driver, driver) +#define to_zorro_driver(drv) container_of_const(drv, struct zorro_driver, driver) #define zorro_for_each_dev(dev) \ -- cgit v1.2.3 From 396a27e91265a6632be17bebacb6743f0b9447be Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Jul 2024 15:45:00 +0200 Subject: dm: Remove max_write_zeroes_granularity The max_write_zeroes_granularity boolean of struct dm_target is used in __process_abnormal_io() but never set by any target. Remove this field and the dead code using it. Signed-off-by: Damien Le Moal Signed-off-by: Mikulas Patocka --- include/linux/device-mapper.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 3611b230d0aa..5b7e96653ec6 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -369,12 +369,6 @@ struct dm_target { */ bool max_secure_erase_granularity:1; - /* - * Set if this target requires that write_zeroes be split on - * 'max_write_zeroes_sectors' boundaries. - */ - bool max_write_zeroes_granularity:1; - /* * Set if we need to limit the number of in-flight bios when swapping. */ -- cgit v1.2.3 From 9d45db03acf9cee4f83148c403d105b1a38a0f23 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Jul 2024 15:45:25 +0200 Subject: dm: Remove max_secure_erase_granularity The max_secure_erase_granularity boolean of struct dm_target is used in __process_abnormal_io() but never set by any target. Remove this field and the dead code using it. Signed-off-by: Damien Le Moal Signed-off-by: Mikulas Patocka --- include/linux/device-mapper.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 5b7e96653ec6..4ba2e73993bd 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -363,12 +363,6 @@ struct dm_target { */ bool max_discard_granularity:1; - /* - * Set if this target requires that secure_erases be split on - * 'max_secure_erase_sectors' boundaries. - */ - bool max_secure_erase_granularity:1; - /* * Set if we need to limit the number of in-flight bios when swapping. */ -- cgit v1.2.3 From a21f9edb13b0d8066775cbd5efa7261e41871182 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Thu, 4 Jul 2024 16:17:15 +0200 Subject: dm: factor out helper function from dm_get_device Factor out a helper function, dm_devt_from_path(), from dm_get_device() for use in dm targets. Signed-off-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- include/linux/device-mapper.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 4ba2e73993bd..384649a61bfa 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -179,6 +179,11 @@ int dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, struct dm_dev **result); void dm_put_device(struct dm_target *ti, struct dm_dev *d); +/* + * Helper function for getting devices + */ +int dm_devt_from_path(const char *path, dev_t *dev_p); + /* * Information about a target type */ -- cgit v1.2.3 From 2bb6b10ebe5d42c119827b57ee8123175b7ecc3d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 9 Jul 2024 08:49:56 -0700 Subject: usb: gadget: Use u16 types for 16-bit fields Since the beginning of time, struct usb_ep::maxpacket was a bitfield, and when new 16-bit members were added, the convention was followed: 1da177e4c3f41 (Linus Torvalds 2005-04-16 236) unsigned maxpacket:16; e117e742d3106 (Robert Baldyga 2013-12-13 237) unsigned maxpacket_limit:16; a59d6b91cbca5 (Tatyana Brokhman 2011-06-28 238) unsigned max_streams:16; However, there is no need for this as a simple u16 can be used instead, simplifying the struct and the resulting compiler binary output. Switch to u16 for all three, and rearrange struct slightly to minimize holes. No change in the final size of the struct results; the 2 byte gap is just moved to the end, as seen with pahole: - /* XXX 2 bytes hole, try to pack */ ... /* size: 72, cachelines: 2, members: 15 */ ... + /* padding: 2 */ Changing this simplifies future introspection[1] of maxpacket's type during allocations: drivers/usb/gadget/function/f_tcm.c:330:24: error: 'typeof' applied to a bit-field 330 | fu->cmd.buf = kmalloc(fu->ep_out->maxpacket, GFP_KERNEL); Link: https://lore.kernel.org/all/202407090928.6UaOAZAJ-lkp@intel.com [1] Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20240709154953.work.953-kees@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/gadget.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 56dda8e1562d..df33333650a0 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -229,18 +229,18 @@ struct usb_ep { const char *name; const struct usb_ep_ops *ops; + const struct usb_endpoint_descriptor *desc; + const struct usb_ss_ep_comp_descriptor *comp_desc; struct list_head ep_list; struct usb_ep_caps caps; bool claimed; bool enabled; - unsigned maxpacket:16; - unsigned maxpacket_limit:16; - unsigned max_streams:16; unsigned mult:2; unsigned maxburst:5; u8 address; - const struct usb_endpoint_descriptor *desc; - const struct usb_ss_ep_comp_descriptor *comp_desc; + u16 maxpacket; + u16 maxpacket_limit; + u16 max_streams; }; /*-------------------------------------------------------------------------*/ -- cgit v1.2.3 From a5f81642a7228489292f842a106e33c558121e8b Mon Sep 17 00:00:00 2001 From: Kerem Karabay Date: Sat, 6 Jul 2024 12:03:23 +0000 Subject: USB: core: add 'shutdown' callback to usb_driver Currently there is no standardized method for USB drivers to handle shutdown events. This patch simplifies running code on shutdown for USB devices by adding a shutdown callback to usb_driver. Signed-off-by: Kerem Karabay Signed-off-by: Aditya Garg Link: https://lore.kernel.org/r/7AAC1BF4-8B60-448D-A3C1-B7E80330BE42@live.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 1913a13833f2..832997a9da0a 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1171,6 +1171,7 @@ extern ssize_t usb_show_dynids(struct usb_dynids *dynids, char *buf); * post_reset method is called. * @post_reset: Called by usb_reset_device() after the device * has been reset + * @shutdown: Called at shut-down time to quiesce the device. * @id_table: USB drivers use ID table to support hotplugging. * Export this with MODULE_DEVICE_TABLE(usb,...). This must be set * or your driver's probe function will never get called. @@ -1222,6 +1223,8 @@ struct usb_driver { int (*pre_reset)(struct usb_interface *intf); int (*post_reset)(struct usb_interface *intf); + void (*shutdown)(struct usb_interface *intf); + const struct usb_device_id *id_table; const struct attribute_group **dev_groups; -- cgit v1.2.3 From af46fe8c41de5b79358c3007e3854dda9c61c7dc Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 10 Jul 2024 09:44:52 +0200 Subject: dio: Have dio_bus_match() callback take a const * MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/dio/dio-driver.c:128:11: error: initialization of ‘int (*)(struct device *, const struct device_driver *)’ from incompatible pointer type ‘int (*)(struct device *, struct device_driver *)’ [-Werror=incompatible-pointer-types] 128 | .match = dio_bus_match, | ^~~~~~~~~~~~~ drivers/dio/dio-driver.c:128:11: note: (near initialization for ‘dio_bus_type.match’) Reported-by: noreply@ellerman.id.au Fixes: d69d804845985c29 ("driver core: have match() callback in struct bus_type take a const *") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240710074452.2841173-1-geert@linux-m68k.org [ added dio.h change - gregkh ] Signed-off-by: Greg Kroah-Hartman --- include/linux/dio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dio.h b/include/linux/dio.h index 2b5923909f96..464331c4c4a7 100644 --- a/include/linux/dio.h +++ b/include/linux/dio.h @@ -93,7 +93,7 @@ struct dio_driver { struct device_driver driver; }; -#define to_dio_driver(drv) container_of(drv, struct dio_driver, driver) +#define to_dio_driver(drv) container_of_const(drv, struct dio_driver, driver) /* DIO/DIO-II boards all have the following 8bit registers. * These are offsets from the base of the device. -- cgit v1.2.3 From 29f1c1ae6d2fff3bf4f89d265f4a1a7c8ab78a8e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2024 22:12:09 -0400 Subject: closures: fix closure_sync + closure debugging originally, stack closures were only used synchronously, and with the original implementation of closure_sync() the ref never hit 0; thus, closure_put_after_sub() assumes that if the ref hits 0 it's on the debug list, in debug mode. that's no longer true with the current implementation of closure_sync, so we need a new magic so closure_debug_destroy() doesn't pop an assert. Signed-off-by: Kent Overstreet --- include/linux/closure.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/closure.h b/include/linux/closure.h index 59b8c06b11ff..2af44427107d 100644 --- a/include/linux/closure.h +++ b/include/linux/closure.h @@ -159,6 +159,7 @@ struct closure { #ifdef CONFIG_DEBUG_CLOSURES #define CLOSURE_MAGIC_DEAD 0xc054dead #define CLOSURE_MAGIC_ALIVE 0xc054a11e +#define CLOSURE_MAGIC_STACK 0xc05451cc unsigned int magic; struct list_head all; @@ -323,12 +324,18 @@ static inline void closure_init_stack(struct closure *cl) { memset(cl, 0, sizeof(struct closure)); atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +#ifdef CONFIG_DEBUG_CLOSURES + cl->magic = CLOSURE_MAGIC_STACK; +#endif } static inline void closure_init_stack_release(struct closure *cl) { memset(cl, 0, sizeof(struct closure)); atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +#ifdef CONFIG_DEBUG_CLOSURES + cl->magic = CLOSURE_MAGIC_STACK; +#endif } /** -- cgit v1.2.3 From 33c651a3fba9a3d380cb0e35ea207e048d39bed9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 7 Jun 2024 18:00:13 +0200 Subject: pwm: Make use of a symbol namespace for the core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define all pwm core's symbols in the namespace "PWM". The necessary module import statement is just added to the main header, this way every file that knows about the public functions automatically has this namespace available. Thanks to Biju Das for pointing out a cut'n'paste failure in my initial patch. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240607160012.1206874-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 60b92c2c75ef..812c550de60c 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -4,9 +4,12 @@ #include #include +#include #include #include +MODULE_IMPORT_NS(PWM); + struct pwm_chip; /** -- cgit v1.2.3 From d6f66e292676db976c4d42df2631427236c36fdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 7 Jun 2024 10:44:17 +0200 Subject: pwm: Make pwm_request_from_chip() private to the core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The last user of this function outside of core.c is gone, so it can be made static. Signed-off-by: Uwe Kleine-König Reviewed-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20240607084416.897777-8-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 812c550de60c..2e225f3b4282 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -410,10 +410,6 @@ void pwmchip_remove(struct pwm_chip *chip); int __devm_pwmchip_add(struct device *dev, struct pwm_chip *chip, struct module *owner); #define devm_pwmchip_add(dev, chip) __devm_pwmchip_add(dev, chip, THIS_MODULE) -struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip, - unsigned int index, - const char *label); - struct pwm_device *of_pwm_xlate_with_flags(struct pwm_chip *chip, const struct of_phandle_args *args); struct pwm_device *of_pwm_single_xlate(struct pwm_chip *chip, @@ -508,14 +504,6 @@ static inline int devm_pwmchip_add(struct device *dev, struct pwm_chip *chip) return -EINVAL; } -static inline struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip, - unsigned int index, - const char *label) -{ - might_sleep(); - return ERR_PTR(-ENODEV); -} - static inline struct pwm_device *pwm_get(struct device *dev, const char *consumer) { -- cgit v1.2.3 From a96d3659c9437673dbef5c05cba31a3c0b5a0a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 7 Jun 2024 12:42:31 +0200 Subject: pwm: Remove wrong implementation details from pwm_ops's documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When .get_state() is called is an implementation detail that implementors and users shouldn't care about and rely on. Additionally it's wrong, because with PWM_DEBUG enabled it is called more often. Just drop the wrong statement. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/611ba758d7e9fb2695e96b23cb7ceeefb6ba8513.1717756902.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 2e225f3b4282..75ad0d2fd949 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -252,9 +252,7 @@ struct pwm_capture { * @free: optional hook for freeing a PWM * @capture: capture and report PWM signal * @apply: atomically apply a new PWM config - * @get_state: get the current PWM state. This function is only - * called once per PWM device when the PWM chip is - * registered. + * @get_state: get the current PWM state. */ struct pwm_ops { int (*request)(struct pwm_chip *chip, struct pwm_device *pwm); -- cgit v1.2.3 From da804fa9bc718e4210ec27cc0457ecc1eb073a14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 14 Jun 2024 17:39:00 +0200 Subject: pwm: Drop pwm_apply_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is not supposed to be used any more since commit c748a6d77c06 ("pwm: Rename pwm_apply_state() to pwm_apply_might_sleep()") that is included in v6.8-rc1. Two kernel releases should be enough for everyone to adapt, so drop the old function that was introduced as a compatibility stub for the transition. Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 75ad0d2fd949..f8c2dc12dbd3 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -563,13 +563,6 @@ static inline void pwm_apply_args(struct pwm_device *pwm) pwm_apply_might_sleep(pwm, &state); } -/* only for backwards-compatibility, new code should not use this */ -static inline int pwm_apply_state(struct pwm_device *pwm, - const struct pwm_state *state) -{ - return pwm_apply_might_sleep(pwm, state); -} - struct pwm_lookup { struct list_head list; const char *provider; -- cgit v1.2.3 From 2cb13dec8c5e5e104fd2f71c2dee761d6ed9a333 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 20 Jun 2024 15:53:34 +0200 Subject: cache: add __cacheline_group_{begin, end}_aligned() (+ couple more) __cacheline_group_begin(), unfortunately, doesn't align the group anyhow. If it is wanted, then you need to do something like __cacheline_group_begin(grp) __aligned(ALIGN) which isn't really convenient nor compact. Add the _aligned() counterparts to align the groups automatically to either the specified alignment (optional) or ``SMP_CACHE_BYTES``. Note that the actual struct layout will then be (on x64 with 64-byte CL): struct x { u32 y; // offset 0, size 4, padding 56 __cacheline_group_begin__grp; // offset 64, size 0 u32 z; // offset 64, size 4, padding 4 __cacheline_group_end__grp; // offset 72, size 0 __cacheline_group_pad__grp; // offset 72, size 0, padding 56 u32 w; // offset 128 }; The end marker is aligned to long, so that you can assert the struct size more strictly, but the offset of the next field in the structure will be aligned to the group alignment, so that the next field won't fall into the group it's not intended to. Add __LARGEST_ALIGN definition and LARGEST_ALIGN() macro. __LARGEST_ALIGN is the value to which the compilers align fields when __aligned_largest is specified. Sometimes, it might be needed to get this value outside of variable definitions. LARGEST_ALIGN() is macro which just aligns a value to __LARGEST_ALIGN. Also add SMP_CACHE_ALIGN(), similar to L1_CACHE_ALIGN(), but using ``SMP_CACHE_BYTES`` instead of ``L1_CACHE_BYTES`` as the former also accounts L2, needed in some cases. Signed-off-by: Alexander Lobakin Reviewed-by: Przemek Kitszel Signed-off-by: Tony Nguyen --- include/linux/cache.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cache.h b/include/linux/cache.h index 0ecb17bb6883..ca2a05682a54 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -13,6 +13,32 @@ #define SMP_CACHE_BYTES L1_CACHE_BYTES #endif +/** + * SMP_CACHE_ALIGN - align a value to the L2 cacheline size + * @x: value to align + * + * On some architectures, L2 ("SMP") CL size is bigger than L1, and sometimes, + * this needs to be accounted. + * + * Return: aligned value. + */ +#ifndef SMP_CACHE_ALIGN +#define SMP_CACHE_ALIGN(x) ALIGN(x, SMP_CACHE_BYTES) +#endif + +/* + * ``__aligned_largest`` aligns a field to the value most optimal for the + * target architecture to perform memory operations. Get the actual value + * to be able to use it anywhere else. + */ +#ifndef __LARGEST_ALIGN +#define __LARGEST_ALIGN sizeof(struct { long x; } __aligned_largest) +#endif + +#ifndef LARGEST_ALIGN +#define LARGEST_ALIGN(x) ALIGN(x, __LARGEST_ALIGN) +#endif + /* * __read_mostly is used to keep rarely changing variables out of frequently * updated cachelines. Its use should be reserved for data that is used @@ -95,6 +121,39 @@ __u8 __cacheline_group_end__##GROUP[0] #endif +/** + * __cacheline_group_begin_aligned - declare an aligned group start + * @GROUP: name of the group + * @...: optional group alignment + * + * The following block inside a struct: + * + * __cacheline_group_begin_aligned(grp); + * field a; + * field b; + * __cacheline_group_end_aligned(grp); + * + * will always be aligned to either the specified alignment or + * ``SMP_CACHE_BYTES``. + */ +#define __cacheline_group_begin_aligned(GROUP, ...) \ + __cacheline_group_begin(GROUP) \ + __aligned((__VA_ARGS__ + 0) ? : SMP_CACHE_BYTES) + +/** + * __cacheline_group_end_aligned - declare an aligned group end + * @GROUP: name of the group + * @...: optional alignment (same as was in __cacheline_group_begin_aligned()) + * + * Note that the end marker is aligned to sizeof(long) to allow more precise + * size assertion. It also declares a padding at the end to avoid next field + * falling into this cacheline. + */ +#define __cacheline_group_end_aligned(GROUP, ...) \ + __cacheline_group_end(GROUP) __aligned(sizeof(long)); \ + struct { } __cacheline_group_pad__##GROUP \ + __aligned((__VA_ARGS__ + 0) ? : SMP_CACHE_BYTES) + #ifndef CACHELINE_ASSERT_GROUP_MEMBER #define CACHELINE_ASSERT_GROUP_MEMBER(TYPE, GROUP, MEMBER) \ BUILD_BUG_ON(!(offsetof(TYPE, MEMBER) >= \ -- cgit v1.2.3 From 584e86e14c59d36688633002613792923620d8c0 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Wed, 10 Jul 2024 11:36:38 +0100 Subject: firmware: cs_dsp: Make wmfw and bin filename arguments const char * The wmfw_filename and bin_filename strings passed into cs_dsp_power_up() and cs_dsp_adsp1_power_up() should be const char *. Signed-off-by: Richard Fitzgerald Reviewed-by: Charles Keepax Link: https://lore.kernel.org/r/20240710103640.78197-3-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index 82687e07a7c2..8078dc377948 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -213,13 +213,13 @@ int cs_dsp_adsp2_init(struct cs_dsp *dsp); int cs_dsp_halo_init(struct cs_dsp *dsp); int cs_dsp_adsp1_power_up(struct cs_dsp *dsp, - const struct firmware *wmfw_firmware, char *wmfw_filename, - const struct firmware *coeff_firmware, char *coeff_filename, + const struct firmware *wmfw_firmware, const char *wmfw_filename, + const struct firmware *coeff_firmware, const char *coeff_filename, const char *fw_name); void cs_dsp_adsp1_power_down(struct cs_dsp *dsp); int cs_dsp_power_up(struct cs_dsp *dsp, - const struct firmware *wmfw_firmware, char *wmfw_filename, - const struct firmware *coeff_firmware, char *coeff_filename, + const struct firmware *wmfw_firmware, const char *wmfw_filename, + const struct firmware *coeff_firmware, const char *coeff_filename, const char *fw_name); void cs_dsp_power_down(struct cs_dsp *dsp); int cs_dsp_run(struct cs_dsp *dsp); -- cgit v1.2.3 From dc0e5ca8856dc6a97e3b117879dfb2b52bda06b6 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Wed, 10 Jul 2024 11:36:40 +0100 Subject: firmware: cs_dsp: Rename fw_ver to wmfw_ver Rename the confusingly named struct member fw_ver to wmfw_ver. It contains the wmfw format version of the loaded wmfw file. This commit also contains an update to wm_adsp for the new name. Signed-off-by: Richard Fitzgerald Reviewed-by: Charles Keepax Link: https://lore.kernel.org/r/20240710103640.78197-5-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index 8078dc377948..2e649810169f 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -167,7 +167,7 @@ struct cs_dsp { const struct cs_dsp_region *mem; int num_mems; - int fw_ver; + int wmfw_ver; bool booted; bool running; -- cgit v1.2.3 From 3c1ff93b4deea502cd8b0869839557cab2a28b71 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 9 Jul 2024 18:56:20 -0700 Subject: regmap: Implement regmap_multi_reg_read() regmap_multi_reg_read() is similar to regmap_bilk_read() but reads from an array of non-sequential registers. Signed-off-by: Guenter Roeck Link: https://lore.kernel.org/r/20240710015622.1960522-2-linux@roeck-us.net Signed-off-by: Mark Brown --- include/linux/regmap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index a6bc2980a98b..9c1ddbf82719 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1237,6 +1237,8 @@ int regmap_noinc_read(struct regmap *map, unsigned int reg, void *val, size_t val_len); int regmap_bulk_read(struct regmap *map, unsigned int reg, void *val, size_t val_count); +int regmap_multi_reg_read(struct regmap *map, unsigned int *reg, void *val, + size_t val_count); int regmap_update_bits_base(struct regmap *map, unsigned int reg, unsigned int mask, unsigned int val, bool *change, bool async, bool force); -- cgit v1.2.3 From 9fa001cf3bb0598aad09d15b2896f7db9ef87637 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 1 Jul 2024 18:59:31 +0000 Subject: mm: memcg: drop obsolete cache line padding in struct mem_cgroup After the grouping of the cgroup v1-related fields and the corresponding reorganization of the struct mem_cgroup, the existing cache line padding doesn't make much sense anymore. Let's drop it for now and put back to new places, if necessary. Link: https://lkml.kernel.org/r/20240701185932.704807-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Suggested-by: Shakeel Butt Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d0c9365ff039..8b5b3ddeba05 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -220,8 +220,6 @@ struct mem_cgroup { /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; - CACHELINE_PADDING(_pad1_); - /* memory.stat */ struct memcg_vmstats *vmstats; @@ -305,8 +303,6 @@ struct mem_cgroup { bool tcpmem_active; int tcpmem_pressure; - CACHELINE_PADDING(_pad2_); - /* * set > 0 if pages under this cgroup are moving to other cgroup. */ -- cgit v1.2.3 From 6df13230b612af81ce04f20bb37a02e58ef71925 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 1 Jul 2024 18:59:32 +0000 Subject: mm: memcg: add cache line padding to mem_cgroup_per_node Memcg v1-specific fields serve a buffer function between read-mostly and update often parts of the mem_cgroup_per_node structure. If CONFIG_MEMCG_V1 is not set and these fields are not present, an explicit cacheline padding is needed. Link: https://lkml.kernel.org/r/20240701185932.704807-2-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Suggested-by: Shakeel Butt Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8b5b3ddeba05..60418934827c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -95,14 +95,16 @@ struct mem_cgroup_per_node { #ifdef CONFIG_MEMCG_V1 /* * Memcg-v1 only stuff in middle as buffer between read mostly fields - * and update often fields to avoid false sharing. Once v1 stuff is - * moved in a separate struct, an explicit padding is needed. + * and update often fields to avoid false sharing. If v1 stuff is + * not present, an explicit padding is needed. */ struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; +#else + CACHELINE_PADDING(_pad1_); #endif /* Fields which get updated often at the end. */ -- cgit v1.2.3 From 3a3b7fec3974f954600844e41d773c00857ef48a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 1 Jul 2024 11:31:15 -0400 Subject: mm: remove CONFIG_MEMCG_KMEM CONFIG_MEMCG_KMEM used to be a user-visible option for whether slab tracking is enabled. It has been default-enabled and equivalent to CONFIG_MEMCG for almost a decade. We've only grown more kernel memory accounting sites since, and there is no imaginable cgroup usecase going forward that wants to track user pages but not the multitude of user-drivable kernel allocations. Link: https://lkml.kernel.org/r/20240701153148.452230-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: David Hildenbrand Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/bpf.h | 4 ++-- include/linux/list_lru.h | 2 +- include/linux/memcontrol.h | 22 +++++----------------- include/linux/sched.h | 3 +-- include/linux/slab.h | 12 ++++++------ 5 files changed, 15 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5e694a308081..b8637555c9c2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -275,7 +275,7 @@ struct bpf_map { u32 btf_value_type_id; u32 btf_vmlinux_value_type_id; struct btf *btf; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; @@ -2252,7 +2252,7 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 792b67ceb631..5099a8ccd5f4 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -50,7 +50,7 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct list_head list; int shrinker_id; bool memcg_aware; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 60418934827c..7e2eb091049a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -195,7 +195,7 @@ struct mem_cgroup { /* Range enforcement for interrupt charges */ struct work_struct high_work; -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#ifdef CONFIG_ZSWAP unsigned long zswap_max; /* @@ -236,7 +236,6 @@ struct mem_cgroup { */ unsigned long socket_pressure; -#ifdef CONFIG_MEMCG_KMEM int kmemcg_id; /* * memcg->objcg is wiped out as a part of the objcg repaprenting @@ -247,7 +246,6 @@ struct mem_cgroup { struct obj_cgroup *orig_objcg; /* list of inherited objcgs, protected by objcg_lock */ struct list_head objcg_list; -#endif struct memcg_vmstats_percpu __percpu *vmstats_percpu; @@ -532,7 +530,6 @@ retry: return memcg; } -#ifdef CONFIG_MEMCG_KMEM /* * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. * @folio: Pointer to the folio. @@ -548,15 +545,6 @@ static inline bool folio_memcg_kmem(struct folio *folio) return folio->memcg_data & MEMCG_DATA_KMEM; } - -#else -static inline bool folio_memcg_kmem(struct folio *folio) -{ - return false; -} - -#endif - static inline bool PageMemcgKmem(struct page *page) { return folio_memcg_kmem(page_folio(page)); @@ -1488,7 +1476,7 @@ static inline void split_page_memcg(struct page *head, int old_order, int new_or * if MEMCG_DATA_OBJEXTS is set. */ struct slabobj_ext { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING @@ -1663,7 +1651,7 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg, } #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); @@ -1806,9 +1794,9 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +#if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); diff --git a/include/linux/sched.h b/include/linux/sched.h index a7770c566c4d..82da65131a6b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1457,9 +1457,8 @@ struct task_struct { /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; -#endif -#ifdef CONFIG_MEMCG_KMEM + /* Cache for current->cgroups->memcg->objcg lookups: */ struct obj_cgroup *objcg; #endif diff --git a/include/linux/slab.h b/include/linux/slab.h index 7247e217e21b..a332dd2fa6cd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -41,7 +41,7 @@ enum _slab_flag_bits { #ifdef CONFIG_FAILSLAB _SLAB_FAILSLAB, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG _SLAB_ACCOUNT, #endif #ifdef CONFIG_KASAN_GENERIC @@ -171,7 +171,7 @@ enum _slab_flag_bits { # define SLAB_FAILSLAB __SLAB_FLAG_UNUSED #endif /* Account to memcg */ -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG # define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT) #else # define SLAB_ACCOUNT __SLAB_FLAG_UNUSED @@ -407,7 +407,7 @@ enum kmalloc_cache_type { #ifndef CONFIG_ZONE_DMA KMALLOC_DMA = KMALLOC_NORMAL, #endif -#ifndef CONFIG_MEMCG_KMEM +#ifndef CONFIG_MEMCG KMALLOC_CGROUP = KMALLOC_NORMAL, #endif KMALLOC_RANDOM_START = KMALLOC_NORMAL, @@ -420,7 +420,7 @@ enum kmalloc_cache_type { #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG KMALLOC_CGROUP, #endif NR_KMALLOC_TYPES @@ -435,7 +435,7 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; #define KMALLOC_NOT_NORMAL_BITS \ (__GFP_RECLAIMABLE | \ (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ - (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0)) + (IS_ENABLED(CONFIG_MEMCG) ? __GFP_ACCOUNT : 0)) extern unsigned long random_kmalloc_seed; @@ -463,7 +463,7 @@ static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigne */ if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA)) return KMALLOC_DMA; - if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE)) + if (!IS_ENABLED(CONFIG_MEMCG) || (flags & __GFP_RECLAIMABLE)) return KMALLOC_RECLAIM; else return KMALLOC_CGROUP; -- cgit v1.2.3 From 259043e3b730e0aa6408bff27af7edf7a5c9101c Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sun, 30 Jun 2024 11:22:31 +1200 Subject: mm: zswap: fix zswap_never_enabled() for CONFIG_ZSWAP==N If CONFIG_ZSWAP is set to N, it means zswap cannot be enabled. zswap_never_enabled() should return true. The only effect of this issue is that with Barry's latest large folio swapin patches for zram ("mm: support mTHP swap-in for zRAM-like swapfile"), we will always fallback to order-0 swapin, even mistakenly when !CONFIG_ZSWAP. Basically this bug makes Barry's in progress patches not work at all. The API was created to inform the mm core that zswap has never been enabled, allowing the mm core to perform mTHP swap-in. This is a transitional solution until zswap supports mTHP. If zswap has been enabled, performing mTHP swap-in will result in corrupted data. You may find the answer in the mTHP swap-in series: https://lore.kernel.org/linux-mm/CAJD7tkZ4FQr6HZpduOdvmqgg_-whuZYE-Bz5O2t6yzw6Yg+v1A@mail.gmail.com/ Link: https://lkml.kernel.org/r/20240629232231.42394-1-21cnbao@gmail.com Fixes: 0300e17d67c3 ("mm: zswap: add zswap_never_enabled()") Signed-off-by: Barry Song Reviewed-by: Chengming Zhou Acked-by: Yosry Ahmed Acked-by: Chris Li Acked-by: David Hildenbrand Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/zswap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index bf83ae5e285d..6cecb4a4f68b 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -68,7 +68,7 @@ static inline bool zswap_is_enabled(void) static inline bool zswap_never_enabled(void) { - return false; + return true; } #endif -- cgit v1.2.3 From a43fe27d650375cd9e5ea915c538f6f9eabd185e Mon Sep 17 00:00:00 2001 From: Xiao Wang Date: Fri, 21 Jun 2024 13:47:07 +0800 Subject: riscv: Optimize crc32 with Zbc extension As suggested by the B-ext spec, the Zbc (carry-less multiplication) instructions can be used to accelerate CRC calculations. Currently, the crc32 is the most widely used crc function inside kernel, so this patch focuses on the optimization of just the crc32 APIs. Compared with the current table-lookup based optimization, Zbc based optimization can also achieve large stride during CRC calculation loop, meantime, it avoids the memory access latency of the table-lookup based implementation and it reduces memory footprint. If Zbc feature is not supported in a runtime environment, then the table-lookup based implementation would serve as fallback via alternative mechanism. By inspecting the vmlinux built by gcc v12.2.0 with default optimization level (-O2), we can see below instruction count change for each 8-byte stride in the CRC32 loop: rv64: crc32_be (54->31), crc32_le (54->13), __crc32c_le (54->13) rv32: crc32_be (50->32), crc32_le (50->16), __crc32c_le (50->16) The compile target CPU is little endian, extra effort is needed for byte swapping for the crc32_be API, thus, the instruction count change is not as significant as that in the *_le cases. This patch is tested on QEMU VM with the kernel CRC32 selftest for both rv64 and rv32. Running the CRC32 selftest on a real hardware (SpacemiT K1) with Zbc extension shows 65% and 125% performance improvement respectively on crc32_test() and crc32c_test(). Signed-off-by: Xiao Wang Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240621054707.1847548-1-xiao.w.wang@intel.com Signed-off-by: Palmer Dabbelt --- include/linux/crc32.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 9e8a032c1788..87f788c0d607 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -9,7 +9,9 @@ #include u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len); +u32 __pure crc32_le_base(u32 crc, unsigned char const *p, size_t len); u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len); +u32 __pure crc32_be_base(u32 crc, unsigned char const *p, size_t len); /** * crc32_le_combine - Combine two crc32 check values into one. For two @@ -37,6 +39,7 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) } u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len); +u32 __pure __crc32c_le_base(u32 crc, unsigned char const *p, size_t len); /** * __crc32c_le_combine - Combine two crc32c check values into one. For two -- cgit v1.2.3 From bed6b0317441d82c32506750ccd868d83850e6f4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Jun 2024 11:16:03 +0800 Subject: f2fs: clean up addrs_per_{inode,block}() Introduce a new help addrs_per_page() to wrap common code from addrs_per_inode() and addrs_per_block() for cleanup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 41d1d71c36ff..01bee2b289c2 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -259,15 +259,14 @@ struct node_footer { #define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ get_extra_isize(inode)) #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ -#define ADDRS_PER_INODE(inode) addrs_per_inode(inode) +#define ADDRS_PER_INODE(inode) addrs_per_page(inode, true) /* Address Pointers in a Direct Block */ #define DEF_ADDRS_PER_BLOCK ((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32)) -#define ADDRS_PER_BLOCK(inode) addrs_per_block(inode) +#define ADDRS_PER_BLOCK(inode) addrs_per_page(inode, false) /* Node IDs in an Indirect Block */ #define NIDS_PER_BLOCK ((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32)) -#define ADDRS_PER_PAGE(page, inode) \ - (IS_INODE(page) ? ADDRS_PER_INODE(inode) : ADDRS_PER_BLOCK(inode)) +#define ADDRS_PER_PAGE(page, inode) (addrs_per_page(inode, IS_INODE(page))) #define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1) #define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2) -- cgit v1.2.3 From a93c2e5fe766825d6264823fd1dca9aae747cf5d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 6 Jul 2024 13:20:01 +0200 Subject: i2c: reword i2c_algorithm according to newest specification Start changing the wording of the I2C main header wrt. the newest I2C v7 and SMBus 3.2 specifications and replace "master/slave" with more appropriate terms. The first step renames the members of struct i2c_algorithm. Once all in-tree users are converted, the anonymous union will go away again. All this work will also pave the way for finally seperating the monolithic header into more fine-grained headers like "i2c/clients.h" etc. Signed-off-by: Wolfram Sang Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti --- include/linux/i2c.h | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 424acb98c7c2..9d45b7b912dd 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -548,10 +548,18 @@ struct i2c_algorithm { * master_xfer should return the number of messages successfully * processed, or a negative value on error */ - int (*master_xfer)(struct i2c_adapter *adap, struct i2c_msg *msgs, - int num); - int (*master_xfer_atomic)(struct i2c_adapter *adap, + union { + int (*xfer)(struct i2c_adapter *adap, struct i2c_msg *msgs, + int num); + int (*master_xfer)(struct i2c_adapter *adap, struct i2c_msg *msgs, + int num); + }; + union { + int (*xfer_atomic)(struct i2c_adapter *adap, struct i2c_msg *msgs, int num); + int (*master_xfer_atomic)(struct i2c_adapter *adap, + struct i2c_msg *msgs, int num); + }; int (*smbus_xfer)(struct i2c_adapter *adap, u16 addr, unsigned short flags, char read_write, u8 command, int size, union i2c_smbus_data *data); @@ -563,8 +571,14 @@ struct i2c_algorithm { u32 (*functionality)(struct i2c_adapter *adap); #if IS_ENABLED(CONFIG_I2C_SLAVE) - int (*reg_slave)(struct i2c_client *client); - int (*unreg_slave)(struct i2c_client *client); + union { + int (*reg_target)(struct i2c_client *client); + int (*reg_slave)(struct i2c_client *client); + }; + union { + int (*unreg_target)(struct i2c_client *client); + int (*unreg_slave)(struct i2c_client *client); + }; #endif }; -- cgit v1.2.3 From a808878308a8041ae10a151d69e2d22f94cae9f4 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 14 Apr 2024 11:05:25 +0300 Subject: driver core: auxiliary bus: show auxiliary device IRQs PCI subfunctions (SF) are anchored on the auxiliary bus. PCI physical and virtual functions are anchored on the PCI bus. The irq information of each such function is visible to users via sysfs directory "msi_irqs" containing files for each irq entry. However, for PCI SFs such information is unavailable. Due to this users have no visibility on IRQs used by the SFs. Secondly, an SF can be multi function device supporting rdma, netdevice and more. Without irq information at the bus level, the user is unable to view or use the affinity of the SF IRQs. Hence to match to the equivalent PCI PFs and VFs, add "irqs" directory, for supporting auxiliary devices, containing file for each irq entry. For example: $ ls /sys/bus/auxiliary/devices/mlx5_core.sf.1/irqs/ 50 51 52 53 54 55 56 57 58 Cc: Simon Horman Reviewed-by: Przemek Kitszel Reviewed-by: Parav Pandit Reviewed-by: Greg Kroah-Hartman Signed-off-by: Shay Drory Signed-off-by: Saeed Mahameed --- v9-v10: - remove Przemek RB - add name field to auxiliary_irq_info (Greg and Przemek) - handle bogus IRQ in auxiliary_device_sysfs_irq_remove (Greg) v8-v9: - add Przemek RB - use guard() in auxiliary_irq_dir_prepare (Paolo) v7-v8: - use cleanup.h for info and name fields (Greg) - correct error flow in auxiliary_irq_dir_prepare (Przemek) - add documentation for new fields of auxiliary_device (Simon) v6-v7: - dynamically creating irqs directory when first irq file created (Greg) - removed irqs flag and simplified the dev_add() API (Greg) - move sysfs related new code to a new auxiliary_sysfs.c file (Greg) v5-v6: - removed concept of shared and exclusive and hence global xarray (Greg) v4-v5: - restore global mutex and replace refcount_t with simple integer (Greg) v3->4: - remove global mutex (Przemek) v2->v3: - fix function declaration in case SYSFS isn't defined v1->v2: - move #ifdefs from drivers/base/auxiliary.c to include/linux/auxiliary_bus.h (Greg) - use EXPORT_SYMBOL_GPL instead of EXPORT_SYMBOL (Greg) - Fix kzalloc(ref) to kzalloc(*ref) (Simon) - Add return description in auxiliary_device_sysfs_irq_add() kdoc (Simon) - Fix auxiliary_irq_mode_show doc (kernel test boot) --- include/linux/auxiliary_bus.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index de21d9d24a95..3ba4487c9cd9 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -58,6 +58,9 @@ * in * @name: Match name found by the auxiliary device driver, * @id: unique identitier if multiple devices of the same name are exported, + * @irqs: irqs xarray contains irq indices which are used by the device, + * @lock: Synchronize irq sysfs creation, + * @irq_dir_exists: whether "irqs" directory exists, * * An auxiliary_device represents a part of its parent device's functionality. * It is given a name that, combined with the registering drivers @@ -139,6 +142,11 @@ struct auxiliary_device { struct device dev; const char *name; u32 id; + struct { + struct xarray irqs; + struct mutex lock; /* Synchronize irq sysfs creation */ + bool irq_dir_exists; + } sysfs; }; /** @@ -212,8 +220,24 @@ int auxiliary_device_init(struct auxiliary_device *auxdev); int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname); #define auxiliary_device_add(auxdev) __auxiliary_device_add(auxdev, KBUILD_MODNAME) +#ifdef CONFIG_SYSFS +int auxiliary_device_sysfs_irq_add(struct auxiliary_device *auxdev, int irq); +void auxiliary_device_sysfs_irq_remove(struct auxiliary_device *auxdev, + int irq); +#else /* CONFIG_SYSFS */ +static inline int +auxiliary_device_sysfs_irq_add(struct auxiliary_device *auxdev, int irq) +{ + return 0; +} + +static inline void +auxiliary_device_sysfs_irq_remove(struct auxiliary_device *auxdev, int irq) {} +#endif + static inline void auxiliary_device_uninit(struct auxiliary_device *auxdev) { + mutex_destroy(&auxdev->sysfs.lock); put_device(&auxdev->dev); } -- cgit v1.2.3 From ad78e05d654567e0a96f91d5db198469ddc2d4fb Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Thu, 13 Jun 2024 13:50:25 +0200 Subject: PCI: Add managed pcim_iomap_range() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only managed mapping function currently is pcim_iomap() which doesn't allow for mapping an area starting at a certain offset, which many drivers want. Add pcim_iomap_range() as an exported function. Link: https://lore.kernel.org/r/20240613115032.29098-13-pstanner@redhat.com Signed-off-by: Philipp Stanner Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0c19f0717899..98893a89bb5b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2303,6 +2303,8 @@ int pcim_iomap_regions(struct pci_dev *pdev, int mask, const char *name); int pcim_iomap_regions_request_all(struct pci_dev *pdev, int mask, const char *name); void pcim_iounmap_regions(struct pci_dev *pdev, int mask); +void __iomem *pcim_iomap_range(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long len); extern int pci_pci_problems; #define PCIPCI_FAIL 1 /* No PCI PCI DMA */ -- cgit v1.2.3 From 13cabc47f8ae69d24653f32c28399d493fde0a56 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 10 Jul 2024 04:30:28 -0700 Subject: netdevice: define and allocate &net_device _properly_ In fact, this structure contains a flexible array at the end, but historically its size, alignment etc., is calculated manually. There are several instances of the structure embedded into other structures, but also there's ongoing effort to remove them and we could in the meantime declare &net_device properly. Declare the array explicitly, use struct_size() and store the array size inside the structure, so that __counted_by() can be applied. Don't use PTR_ALIGN(), as SLUB itself tries its best to ensure the allocated buffer is aligned to what the user expects. Also, change its alignment from %NETDEV_ALIGN to the cacheline size as per several suggestions on the netdev ML. bloat-o-meter for vmlinux: free_netdev 445 440 -5 netdev_freemem 24 - -24 alloc_netdev_mqs 1481 1450 -31 On x86_64 with several NICs of different vendors, I was never able to get a &net_device pointer not aligned to the cacheline size after the change. Signed-off-by: Alexander Lobakin Signed-off-by: Breno Leitao Reviewed-by: Przemek Kitszel Reviewed-by: Eric Dumazet Reviewed-by: Kees Cook Link: https://patch.msgid.link/20240710113036.2125584-1-leitao@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93558645c6d0..607009150b5f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1819,7 +1819,8 @@ enum netdev_reg_state { * @priv_flags: Like 'flags' but invisible to userspace, * see if.h for the definitions * @gflags: Global flags ( kept as legacy ) - * @padded: How much padding added by alloc_netdev() + * @priv_len: Size of the ->priv flexible array + * @priv: Flexible array containing private data * @operstate: RFC2863 operstate * @link_mode: Mapping policy to operstate * @if_port: Selectable AUI, TP, ... @@ -2199,10 +2200,10 @@ struct net_device { unsigned short neigh_priv_len; unsigned short dev_id; unsigned short dev_port; - unsigned short padded; + int irq; + u32 priv_len; spinlock_t addr_list_lock; - int irq; struct netdev_hw_addr_list uc; struct netdev_hw_addr_list mc; @@ -2406,7 +2407,10 @@ struct net_device { /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */ struct dim_irq_moder *irq_moder; -}; + + u8 priv[] ____cacheline_aligned + __counted_by(priv_len); +} ____cacheline_aligned; #define to_net_dev(d) container_of(d, struct net_device, dev) /* @@ -2596,7 +2600,7 @@ void dev_net_set(struct net_device *dev, struct net *net) */ static inline void *netdev_priv(const struct net_device *dev) { - return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN); + return (void *)dev->priv; } /* Set the sysfs physical device reference for the network logical device @@ -3127,7 +3131,6 @@ static inline void unregister_netdevice(struct net_device *dev) int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); -void netdev_freemem(struct net_device *dev); void init_dummy_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, -- cgit v1.2.3 From 887c4cf5594a073fd60c0df84150eb06d78c6406 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 11 Jul 2024 10:11:12 -0700 Subject: efi: Rename efi_early_memdesc_ptr() to efi_memdesc_ptr() The "early" part of the helper's name isn't accurate[1]. Drop it in preparation for adding a new (not early) usage. Suggested-by: Ard Biesheuvel Link: https://lore.kernel.org/lkml/CAMj1kXEyDjH0uu3Z4eBesV3PEnKGi5ArXXMp7R-hn8HdRytiPg@mail.gmail.com [1] Signed-off-by: Kees Cook Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 3a6c04a9f9aa..67b2fa80c8b9 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -784,7 +784,7 @@ extern int efi_memattr_apply_permissions(struct mm_struct *mm, efi_memattr_perm_setter fn); /* - * efi_early_memdesc_ptr - get the n-th EFI memmap descriptor + * efi_memdesc_ptr - get the n-th EFI memmap descriptor * @map: the start of efi memmap * @desc_size: the size of space for each EFI memmap descriptor * @n: the index of efi memmap descriptor @@ -802,7 +802,7 @@ extern int efi_memattr_apply_permissions(struct mm_struct *mm, * during bootup since for_each_efi_memory_desc_xxx() is available after the * kernel initializes the EFI subsystem to set up struct efi_memory_map. */ -#define efi_early_memdesc_ptr(map, desc_size, n) \ +#define efi_memdesc_ptr(map, desc_size, n) \ (efi_memory_desc_t *)((void *)(map) + ((n) * (desc_size))) /* Iterate through an efi_memory_map */ -- cgit v1.2.3 From 4a2ebb082297f41803742729642961532e54079e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 11 Jul 2024 10:11:13 -0700 Subject: efi: Replace efi_memory_attributes_table_t 0-sized array with flexible array While efi_memory_attributes_table_t::entry isn't used directly as an array, it is used as a base for pointer arithmetic. The type is wrong as it's not technically an array of efi_memory_desc_t's; they could be larger. Regardless, leave the type unchanged and remove the old style "0" array size. Additionally replace the open-coded entry offset code with the existing efi_memdesc_ptr() helper. Signed-off-by: Kees Cook Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 67b2fa80c8b9..6bf3c4fe8511 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -608,7 +608,11 @@ typedef struct { u32 num_entries; u32 desc_size; u32 flags; - efi_memory_desc_t entry[0]; + /* + * There are @num_entries following, each of size @desc_size bytes, + * including an efi_memory_desc_t header. See efi_memdesc_ptr(). + */ + efi_memory_desc_t entry[]; } efi_memory_attributes_table_t; typedef struct { -- cgit v1.2.3 From 0728c810873e1a94e8b767f9809af940c9307d60 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Jul 2024 16:42:33 +0200 Subject: thermal: trip: Pass trip pointer to .set_trip_temp() thermal zone callback Out of several drivers implementing the .set_trip_temp() thermal zone operation, three don't actually use the trip ID argument passed to it, two call __thermal_zone_get_trip() to get a struct thermal_trip corresponding to the given trip ID, and the other use the trip ID as an index into their own data structures with the assumption that it will always match the ordering of entries in the trips table passed to the core during thermal zone registration, which is fragile and not really guaranteed. Even though the trip IDs used by the core are in fact their indices in the trips table passed to it by the thermal zone creator, that is purely a matter of convenience and should not be relied on for correctness. For this reason, modify trip_point_temp_store() to pass a (const) trip pointer to .set_trip_temp() and adjust the drivers implementing it accordingly. This helps to simplify the drivers invoking __thermal_zone_get_trip() from their .set_trip_temp() callback functions because they will not need to do it now and the other drivers can store their internal trip indices in the priv field in struct thermal_trip and their .set_trip_temp() callback functions can get those indices from there. The intel_quark_dts thermal driver can instead use the trip type to determine the requisite trip index. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/8392906.T7Z3S40VBb@rjwysocki.net [ rjw: Add missing colon and 2 empty code lines ] [ rjw: Add missing change in imx_thermal.c and adjust the changelog ] [ rjw: Drop an unused local variable ] Signed-off-by: Rafael J. Wysocki --- include/linux/thermal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 3bd1b361ec34..cdd9c722cda2 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -93,7 +93,8 @@ struct thermal_zone_device_ops { int (*set_trips) (struct thermal_zone_device *, int, int); int (*change_mode) (struct thermal_zone_device *, enum thermal_device_mode); - int (*set_trip_temp) (struct thermal_zone_device *, int, int); + int (*set_trip_temp) (struct thermal_zone_device *, + const struct thermal_trip *, int); int (*get_crit_temp) (struct thermal_zone_device *, int *); int (*set_emul_temp) (struct thermal_zone_device *, int); int (*get_trend) (struct thermal_zone_device *, -- cgit v1.2.3 From 5b674baa596e624fde8bf62b9a3d8a26eef399b2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Jul 2024 16:44:27 +0200 Subject: thermal: trip: Fold __thermal_zone_get_trip() into its caller Because __thermal_zone_get_trip() is only called by thermal_zone_get_trip() now, fold the former into the latter. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/22339769.EfDdHjke4D@rjwysocki.net --- include/linux/thermal.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index cdd9c722cda2..25fbf960b474 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -202,8 +202,6 @@ static inline void devm_thermal_of_zone_unregister(struct device *dev, } #endif -int __thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id, - struct thermal_trip *trip); int thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id, struct thermal_trip *trip); int for_each_thermal_trip(struct thermal_zone_device *tz, -- cgit v1.2.3 From 4bdc3eaa102b6bedb0800f76f53eca516d5cf20c Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Wed, 10 Jul 2024 16:35:21 +1200 Subject: clocksource/drivers/realtek: Add timer driver for rtl-otto platforms The timer/counter block on the Realtek SoCs provides up to 5 timers. It also includes a watchdog timer which is handled by the realtek_otto_wdt.c driver. One timer will be used per CPU as a local clock event generator. An additional timer will be used as an overal stable clocksource. Signed-off-by: Markus Stockhausen Signed-off-by: Sander Vanheule Signed-off-by: Chris Packham Link: https://lore.kernel.org/r/20240710043524.1535151-8-chris.packham@alliedtelesis.co.nz Signed-off-by: Daniel Lezcano --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 7a5785f405b6..56b744dc1317 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -171,6 +171,7 @@ enum cpuhp_state { CPUHP_AP_ARMADA_TIMER_STARTING, CPUHP_AP_MIPS_GIC_TIMER_STARTING, CPUHP_AP_ARC_TIMER_STARTING, + CPUHP_AP_REALTEK_TIMER_STARTING, CPUHP_AP_RISCV_TIMER_STARTING, CPUHP_AP_CLINT_TIMER_STARTING, CPUHP_AP_CSKY_TIMER_STARTING, -- cgit v1.2.3 From 27e6a24a4cf3d25421c0f6ebb7c39f45fc14d20f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jul 2024 13:56:54 -0400 Subject: mm, virt: merge AS_UNMOVABLE and AS_INACCESSIBLE The flags AS_UNMOVABLE and AS_INACCESSIBLE were both added just for guest_memfd; AS_UNMOVABLE is already in existing versions of Linux, while AS_INACCESSIBLE was acked for inclusion in 6.11. But really, they are the same thing: only guest_memfd uses them, at least for now, and guest_memfd pages are unmovable because they should not be accessed by the CPU. So merge them into one; use the AS_INACCESSIBLE name which is more comprehensive. At the same time, this fixes an embarrassing bug where AS_INACCESSIBLE was used as a bit mask, despite it being just a bit index. The bug was mostly benign, because AS_INACCESSIBLE's bit representation (1010) corresponded to setting AS_UNEVICTABLE (which is already set) and AS_ENOSPC (except no async writes can happen on the guest_memfd). So the AS_INACCESSIBLE flag simply had no effect. Fixes: 1d23040caa8b ("KVM: guest_memfd: Use AS_INACCESSIBLE when creating guest_memfd inode") Fixes: c72ceafbd12c ("mm: Introduce AS_INACCESSIBLE for encrypted/confidential memory") Cc: linux-mm@kvack.org Acked-by: Vlastimil Babka Acked-by: David Hildenbrand Tested-by: Michael Roth Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- include/linux/pagemap.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ce7bac8f81da..e05585eda771 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -208,8 +208,8 @@ enum mapping_flags { AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ AS_STABLE_WRITES, /* must wait for writeback before modifying folio contents */ - AS_UNMOVABLE, /* The mapping cannot be moved, ever */ - AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping */ + AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping, + including to move the mapping */ }; /** @@ -310,20 +310,20 @@ static inline void mapping_clear_stable_writes(struct address_space *mapping) clear_bit(AS_STABLE_WRITES, &mapping->flags); } -static inline void mapping_set_unmovable(struct address_space *mapping) +static inline void mapping_set_inaccessible(struct address_space *mapping) { /* - * It's expected unmovable mappings are also unevictable. Compaction + * It's expected inaccessible mappings are also unevictable. Compaction * migrate scanner (isolate_migratepages_block()) relies on this to * reduce page locking. */ set_bit(AS_UNEVICTABLE, &mapping->flags); - set_bit(AS_UNMOVABLE, &mapping->flags); + set_bit(AS_INACCESSIBLE, &mapping->flags); } -static inline bool mapping_unmovable(struct address_space *mapping) +static inline bool mapping_inaccessible(struct address_space *mapping) { - return test_bit(AS_UNMOVABLE, &mapping->flags); + return test_bit(AS_INACCESSIBLE, &mapping->flags); } static inline gfp_t mapping_gfp_mask(struct address_space * mapping) -- cgit v1.2.3 From bc1a5cd002116552db4c3541e91f8a5b1b0cf65d Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 10 Apr 2024 15:07:28 -0700 Subject: KVM: Add KVM_PRE_FAULT_MEMORY vcpu ioctl to pre-populate guest memory Add a new ioctl KVM_PRE_FAULT_MEMORY in the KVM common code. It iterates on the memory range and calls the arch-specific function. The implementation is optional and enabled by a Kconfig symbol. Suggested-by: Sean Christopherson Signed-off-by: Isaku Yamahata Reviewed-by: Rick Edgecombe Message-ID: <819322b8f25971f2b9933bfa4506e618508ad782.1712785629.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7b57878c8c18..c3c922bf077f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2477,4 +2477,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); #endif +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range); +#endif + #endif -- cgit v1.2.3 From 9b2bc6b9a264b863a2273c02db5ee9e214e0a526 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 12 Jul 2024 12:31:32 +0100 Subject: iommu: Move IOMMU_DIRTY_NO_CLEAR define Fixes the compile issue when CONFIG_IOMMU_API is not set. Fixes: 4fe88fd8b4ae ("iommu/io-pgtable-arm: Add read_and_clear_dirty() support") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407121602.HL9ih1it-lkp@intel.com/ Signed-off-by: Shameer Kolothum Reviewed-by: Joao Martins Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240712113132.45100-1-shameerali.kolothum.thodi@huawei.com Signed-off-by: Will Deacon --- include/linux/iommu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7bc8dff7cf6d..104ce84647d4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -317,6 +317,9 @@ enum iommu_dev_features { #define IOMMU_PASID_INVALID (-1U) typedef unsigned int ioasid_t; +/* Read but do not clear any dirty bits */ +#define IOMMU_DIRTY_NO_CLEAR (1 << 0) + #ifdef CONFIG_IOMMU_API /** @@ -353,9 +356,6 @@ struct iommu_dirty_bitmap { struct iommu_iotlb_gather *gather; }; -/* Read but do not clear any dirty bits */ -#define IOMMU_DIRTY_NO_CLEAR (1 << 0) - /** * struct iommu_dirty_ops - domain specific dirty tracking operations * @set_dirty_tracking: Enable or Disable dirty tracking on the iommu domain -- cgit v1.2.3 From 617069741dfbb141bc4574531a5dbbbad8ddf197 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 10 Jul 2024 20:53:12 +0200 Subject: dm: introduce the target flag mempool_needs_integrity This commit introduces the dm target flag mempool_needs_integrity. When the flag is set, device mapper will call bioset_integrity_create on it's bio sets. The target can then call bio_integrity_alloc on the bios allocated from the table's mempool. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- include/linux/device-mapper.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 384649a61bfa..1363f87fbba6 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -405,6 +405,12 @@ struct dm_target { * support it. */ bool flush_bypasses_map:1; + + /* + * Set if the target calls bio_integrity_alloc on bios received + * in the map method. + */ + bool mempool_needs_integrity:1; }; void *dm_per_bio_data(struct bio *bio, size_t data_size); -- cgit v1.2.3 From 6a26f9c68901797261bc145975a02f85be0c1d8f Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 11 Jul 2024 10:14:57 +0000 Subject: cgroup/misc: Introduce misc.events.local Currently the event counting provided by misc.events is hierarchical, it's not practical if user is only concerned with events of a specified cgroup. Therefore, introduce misc.events.local collect events specific to the given cgroup. This is analogous to memory.events.local and pids.events.local. Signed-off-by: Xiu Jianfeng Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 618392d41975..49eef10c8e59 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -40,6 +40,7 @@ struct misc_res { atomic64_t watermark; atomic64_t usage; atomic64_t events; + atomic64_t events_local; }; /** @@ -53,6 +54,8 @@ struct misc_cg { /* misc.events */ struct cgroup_file events_file; + /* misc.events.local */ + struct cgroup_file events_local_file; struct misc_res res[MISC_CG_RES_TYPES]; }; -- cgit v1.2.3 From f7866c35873377313ff94398f17d425b28b71de1 Mon Sep 17 00:00:00 2001 From: Tengda Wu Date: Thu, 11 Jul 2024 22:58:18 +0800 Subject: bpf: Fix null pointer dereference in resolve_prog_type() for BPF_PROG_TYPE_EXT When loading a EXT program without specifying `attr->attach_prog_fd`, the `prog->aux->dst_prog` will be null. At this time, calling resolve_prog_type() anywhere will result in a null pointer dereference. Example stack trace: [ 8.107863] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000004 [ 8.108262] Mem abort info: [ 8.108384] ESR = 0x0000000096000004 [ 8.108547] EC = 0x25: DABT (current EL), IL = 32 bits [ 8.108722] SET = 0, FnV = 0 [ 8.108827] EA = 0, S1PTW = 0 [ 8.108939] FSC = 0x04: level 0 translation fault [ 8.109102] Data abort info: [ 8.109203] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 8.109399] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 8.109614] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 8.109836] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000101354000 [ 8.110011] [0000000000000004] pgd=0000000000000000, p4d=0000000000000000 [ 8.112624] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP [ 8.112783] Modules linked in: [ 8.113120] CPU: 0 PID: 99 Comm: may_access_dire Not tainted 6.10.0-rc3-next-20240613-dirty #1 [ 8.113230] Hardware name: linux,dummy-virt (DT) [ 8.113390] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 8.113429] pc : may_access_direct_pkt_data+0x24/0xa0 [ 8.113746] lr : add_subprog_and_kfunc+0x634/0x8e8 [ 8.113798] sp : ffff80008283b9f0 [ 8.113813] x29: ffff80008283b9f0 x28: ffff800082795048 x27: 0000000000000001 [ 8.113881] x26: ffff0000c0bb2600 x25: 0000000000000000 x24: 0000000000000000 [ 8.113897] x23: ffff0000c1134000 x22: 000000000001864f x21: ffff0000c1138000 [ 8.113912] x20: 0000000000000001 x19: ffff0000c12b8000 x18: ffffffffffffffff [ 8.113929] x17: 0000000000000000 x16: 0000000000000000 x15: 0720072007200720 [ 8.113944] x14: 0720072007200720 x13: 0720072007200720 x12: 0720072007200720 [ 8.113958] x11: 0720072007200720 x10: 0000000000f9fca4 x9 : ffff80008021f4e4 [ 8.113991] x8 : 0101010101010101 x7 : 746f72705f6d656d x6 : 000000001e0e0f5f [ 8.114006] x5 : 000000000001864f x4 : ffff0000c12b8000 x3 : 000000000000001c [ 8.114020] x2 : 0000000000000002 x1 : 0000000000000000 x0 : 0000000000000000 [ 8.114126] Call trace: [ 8.114159] may_access_direct_pkt_data+0x24/0xa0 [ 8.114202] bpf_check+0x3bc/0x28c0 [ 8.114214] bpf_prog_load+0x658/0xa58 [ 8.114227] __sys_bpf+0xc50/0x2250 [ 8.114240] __arm64_sys_bpf+0x28/0x40 [ 8.114254] invoke_syscall.constprop.0+0x54/0xf0 [ 8.114273] do_el0_svc+0x4c/0xd8 [ 8.114289] el0_svc+0x3c/0x140 [ 8.114305] el0t_64_sync_handler+0x134/0x150 [ 8.114331] el0t_64_sync+0x168/0x170 [ 8.114477] Code: 7100707f 54000081 f9401c00 f9403800 (b9400403) [ 8.118672] ---[ end trace 0000000000000000 ]--- One way to fix it is by forcing `attach_prog_fd` non-empty when bpf_prog_load(). But this will lead to `libbpf_probe_bpf_prog_type` API broken which use verifier log to probe prog type and will log nothing if we reject invalid EXT prog before bpf_check(). Another way is by adding null check in resolve_prog_type(). The issue was introduced by commit 4a9c7bbe2ed4 ("bpf: Resolve to prog->aux->dst_prog->type only for BPF_PROG_TYPE_EXT") which wanted to correct type resolution for BPF_PROG_TYPE_TRACING programs. Before that, the type resolution of BPF_PROG_TYPE_EXT prog actually follows the logic below: prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type; It implies that when EXT program is not yet attached to `dst_prog`, the prog type should be EXT itself. This code worked fine in the past. So just keep using it. Fix this by returning `prog->type` for BPF_PROG_TYPE_EXT if `dst_prog` is not present in resolve_prog_type(). Fixes: 4a9c7bbe2ed4 ("bpf: Resolve to prog->aux->dst_prog->type only for BPF_PROG_TYPE_EXT") Signed-off-by: Tengda Wu Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20240711145819.254178-2-wutengda@huaweicloud.com --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e98ba5a5cf79..6503c85b10a3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -856,7 +856,7 @@ static inline u32 type_flag(u32 type) /* only use after check_attach_btf_id() */ static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) { - return prog->type == BPF_PROG_TYPE_EXT ? + return (prog->type == BPF_PROG_TYPE_EXT && prog->aux->dst_prog) ? prog->aux->dst_prog->type : prog->type; } -- cgit v1.2.3 From 6cc040542ba7b2c60e5119cd04d841fcf048c872 Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Sun, 23 Jun 2024 23:36:09 -0700 Subject: mm/gup: introduce unpin_folio/unpin_folios helpers Patch series "mm/gup: Introduce memfd_pin_folios() for pinning memfd folios", v16. Currently, some drivers (e.g, Udmabuf) that want to longterm-pin the pages/folios associated with a memfd, do so by simply taking a reference on them. This is not desirable because the pages/folios may reside in Movable zone or CMA block. Therefore, having drivers use memfd_pin_folios() API ensures that the folios are appropriately pinned via FOLL_PIN for longterm DMA. This patchset also introduces a few helpers and converts the Udmabuf driver to use folios and memfd_pin_folios() API to longterm-pin the folios for DMA. Two new Udmabuf selftests are also included to test the driver and the new API. This patch (of 9): These helpers are the folio versions of unpin_user_page/unpin_user_pages. They are currently only useful for unpinning folios pinned by memfd_pin_folios() or other associated routines. However, they could find new uses in the future, when more and more folio-only helpers are added to GUP. We should probably sanity check the folio as part of unpin similar to how it is done in unpin_user_page/unpin_user_pages but we cannot cleanly do that at the moment without also checking the subpage. Therefore, sanity checking needs to be added to these routines once we have a way to determine if any given folio is anon-exclusive (via a per folio AnonExclusive flag). Link: https://lkml.kernel.org/r/20240624063952.1572359-1-vivek.kasireddy@intel.com Link: https://lkml.kernel.org/r/20240624063952.1572359-2-vivek.kasireddy@intel.com Signed-off-by: Vivek Kasireddy Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Acked-by: Dave Airlie Acked-by: Gerd Hoffmann Cc: Matthew Wilcox Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Peter Xu Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dongwon Kim Cc: Hugh Dickins Cc: Junxiao Chang Cc: Oscar Salvador Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a2e3ebead410..7b84379e3a1b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1584,11 +1584,13 @@ static inline void put_page(struct page *page) #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); +void unpin_folio(struct folio *folio); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); +void unpin_folios(struct folio **folios, unsigned long nfolios); static inline bool is_cow_mapping(vm_flags_t flags) { -- cgit v1.2.3 From 89c1905d9c140372b7f50ef48f42378cf85d9bc5 Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Sun, 23 Jun 2024 23:36:11 -0700 Subject: mm/gup: introduce memfd_pin_folios() for pinning memfd folios For drivers that would like to longterm-pin the folios associated with a memfd, the memfd_pin_folios() API provides an option to not only pin the folios via FOLL_PIN but also to check and migrate them if they reside in movable zone or CMA block. This API currently works with memfds but it should work with any files that belong to either shmemfs or hugetlbfs. Files belonging to other filesystems are rejected for now. The folios need to be located first before pinning them via FOLL_PIN. If they are found in the page cache, they can be immediately pinned. Otherwise, they need to be allocated using the filesystem specific APIs and then pinned. [akpm@linux-foundation.org: improve the CONFIG_MMU=n situation, per SeongJae] [vivek.kasireddy@intel.com: return -EINVAL if the end offset is greater than the size of memfd] Link: https://lkml.kernel.org/r/IA0PR11MB71850525CBC7D541CAB45DF1F8DB2@IA0PR11MB7185.namprd11.prod.outlook.com Link: https://lkml.kernel.org/r/20240624063952.1572359-4-vivek.kasireddy@intel.com Signed-off-by: Vivek Kasireddy Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe (v2) Reviewed-by: David Hildenbrand (v3) Reviewed-by: Christoph Hellwig (v6) Acked-by: Dave Airlie Acked-by: Gerd Hoffmann Cc: Matthew Wilcox (Oracle) Cc: Daniel Vetter Cc: Hugh Dickins Cc: Peter Xu Cc: Dongwon Kim Cc: Junxiao Chang Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Mike Kravetz Cc: Oscar Salvador Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/memfd.h | 5 +++++ include/linux/mm.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index e7abf6fa4c52..3f2cf339ceaf 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -6,11 +6,16 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); +struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { return -EINVAL; } +static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) +{ + return ERR_PTR(-EINVAL); +} #endif #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b84379e3a1b..5f1075d19600 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2500,6 +2500,9 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, + struct folio **folios, unsigned int max_folios, + pgoff_t *offset); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); -- cgit v1.2.3 From f216c845f3c772e54d27fe209fd300b10e7bf54a Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 28 Jun 2024 21:07:49 +0800 Subject: mm: add per-order mTHP split counters Patch series "mm: introduce per-order mTHP split counters", v3. At present, the split counters in THP statistics no longer include PTE-mapped mTHP. Therefore, we want to introduce per-order mTHP split counters to monitor the frequency of mTHP splits. This will assist developers in better analyzing and optimizing system performance. /sys/kernel/mm/transparent_hugepage/hugepages-/stats split split_failed split_deferred This patch (of 2): Currently, the split counters in THP statistics no longer include PTE-mapped mTHP. Therefore, we propose introducing per-order mTHP split counters to monitor the frequency of mTHP splits. This will help developers better analyze and optimize system performance. /sys/kernel/mm/transparent_hugepage/hugepages-/stats split split_failed split_deferred [ioworker0@gmail.com: make things more readable, per Barry and Baolin] Link: https://lkml.kernel.org/r/20240704012905.42971-2-ioworker0@gmail.com [ioworker0@gmail.com: use == for `order' test, per David] Link: https://lkml.kernel.org/r/20240705113119.82210-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240704012905.42971-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240704012905.42971-2-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240628130750.73097-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240628130750.73097-2-ioworker0@gmail.com Signed-off-by: Mingzhe Yang Signed-off-by: Lance Yang Reviewed-by: Ryan Roberts Acked-by: Barry Song Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Bang Li Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 212cca384d7e..cee3c5da8f0e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -284,6 +284,9 @@ enum mthp_stat_item { MTHP_STAT_FILE_ALLOC, MTHP_STAT_FILE_FALLBACK, MTHP_STAT_FILE_FALLBACK_CHARGE, + MTHP_STAT_SPLIT, + MTHP_STAT_SPLIT_FAILED, + MTHP_STAT_SPLIT_DEFERRED, __MTHP_STAT_COUNT }; -- cgit v1.2.3 From 18d095b2556e5e1292003c8e9f5d845ed42ef89b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Jul 2024 15:51:19 +0200 Subject: mm: define __pte_leaf_size() to also take a PMD entry On powerpc 8xx, when a page is 8M size, the information is in the PMD entry. So allow architectures to provide __pte_leaf_size() instead of pte_leaf_size() and provide the PMD entry to that function. When __pte_leaf_size() is not defined, define it as a pte_leaf_size() so that architectures not interested in the PMD arguments are not impacted. Only define a default pte_leaf_size() when __pte_leaf_size() is not defined to make sure nobody adds new calls to pte_leaf_size() in the core. Link: https://lkml.kernel.org/r/c7c008f0a314bf8029ad7288fdc908db1ec7e449.1719928057.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Oscar Salvador Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2f32eaccf0b9..2a6a3cccfc36 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1907,9 +1907,12 @@ typedef unsigned int pgtbl_mod_mask; #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif +#ifndef __pte_leaf_size #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif +#define __pte_leaf_size(x,y) pte_leaf_size(y) +#endif /* * We always define pmd_pfn for all archs as it's used in lots of generic -- cgit v1.2.3 From e6c0c03245b14d6c205814aee67128257d0bea84 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Jul 2024 15:51:20 +0200 Subject: mm: provide mm_struct and address to huge_ptep_get() On powerpc 8xx huge_ptep_get() will need to know whether the given ptep is a PTE entry or a PMD entry. This cannot be known with the PMD entry itself because there is no easy way to know it from the content of the entry. So huge_ptep_get() will need to know either the size of the page or get the pmd. In order to be consistent with huge_ptep_get_and_clear(), give mm and address to huge_ptep_get(). Link: https://lkml.kernel.org/r/cc00c70dd384298796a4e1b25d6c4eb306d3af85.1719928057.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Oscar Salvador Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/swapops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index a5c560a2f8c2..cb468e418ea1 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -334,7 +334,7 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); +extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); #else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -359,7 +359,7 @@ static inline int is_migration_entry(swp_entry_t swp) static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, - pte_t *pte) { } + unsigned long addr, pte_t *pte) { } static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; -- cgit v1.2.3 From 8268614b408be6b76c1cee6f67de7deb0d6593b3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Jul 2024 15:51:35 +0200 Subject: mm: remove CONFIG_ARCH_HAS_HUGEPD powerpc was the only user of CONFIG_ARCH_HAS_HUGEPD and doesn't use it anymore, so remove all related code. Link: https://lkml.kernel.org/r/4b10c54c794780b955f3ad6c657d0199dd792146.1719928057.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Acked-by: Oscar Salvador Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a951c0d06061..ecd0ceeea206 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -20,12 +20,6 @@ struct user_struct; struct mmu_gather; struct node; -#ifndef CONFIG_ARCH_HAS_HUGEPD -typedef struct { unsigned long pd; } hugepd_t; -#define is_hugepd(hugepd) (0) -#define __hugepd(x) ((hugepd_t) { (x) }) -#endif - void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From 3b0ba54d5f8ff60553c01d3ec3c607ab7bb3b452 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 3 Jul 2024 10:42:25 -0700 Subject: mm: add comments for allocation helpers explaining why they are macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A number of allocation helper functions were converted into macros to account them at the call sites. Add a comment for each converted allocation helper explaining why it has to be a macro and why we typecast the return value wherever required. The patch also moves acpi_os_acquire_object() closer to other allocation helpers to group them together under the same comment. The patch has no functional changes. Link: https://lkml.kernel.org/r/20240703174225.3891393-1-surenb@google.com Fixes: 2c321f3f70bc ("mm: change inlined allocation helpers to account at the call site") Signed-off-by: Suren Baghdasaryan Suggested-by: Andrew Morton Cc: Christian König Cc: Christoph Hellwig Cc: Jan Kara Cc: Kent Overstreet Cc: Thorsten Blum Signed-off-by: Andrew Morton --- include/linux/bpf.h | 4 ++++ include/linux/dma-fence-chain.h | 4 ++++ include/linux/hid_bpf.h | 5 +++++ include/linux/jbd2.h | 10 ++++++++++ include/linux/skbuff.h | 8 ++++++++ include/linux/skmsg.h | 5 +++++ 6 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b8637555c9c2..50b82ff4551a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2261,6 +2261,10 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags); #else +/* + * These specialized allocators have to be macros for their allocations to be + * accounted separately (to have separate alloc_tag). + */ #define bpf_map_kmalloc_node(_map, _size, _flags, _node) \ kmalloc_node(_size, _flags, _node) #define bpf_map_kzalloc(_map, _size, _flags) \ diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h index ad9e2506c2f4..68c3c1e41014 100644 --- a/include/linux/dma-fence-chain.h +++ b/include/linux/dma-fence-chain.h @@ -85,6 +85,10 @@ dma_fence_chain_contained(struct dma_fence *fence) * dma_fence_chain_alloc * * Returns a new struct dma_fence_chain object or NULL on failure. + * + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. */ #define dma_fence_chain_alloc() \ ((struct dma_fence_chain *)kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL)) diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index eec2592dec12..99a3edb6cf07 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -152,6 +152,11 @@ static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; } static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {} static inline void hid_bpf_destroy_device(struct hid_device *hid) {} static inline void hid_bpf_device_init(struct hid_device *hid) {} +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. + */ #define call_hid_bpf_rdesc_fixup(_hdev, _rdesc, _size) \ ((u8 *)kmemdup(_rdesc, *(_size), GFP_KERNEL)) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ab04c1c27fae..a280f42c8c76 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1588,6 +1588,11 @@ void jbd2_journal_put_journal_head(struct journal_head *jh); */ extern struct kmem_cache *jbd2_handle_cache; +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. + */ #define jbd2_alloc_handle(_gfp_flags) \ ((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, _gfp_flags)) @@ -1602,6 +1607,11 @@ static inline void jbd2_free_handle(handle_t *handle) */ extern struct kmem_cache *jbd2_inode_cache; +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. + */ #define jbd2_alloc_inode(_gfp_flags) \ ((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, _gfp_flags)) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1c2902eaebd3..fd51f2de51da 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3400,6 +3400,10 @@ static inline struct page *__dev_alloc_pages_noprof(gfp_t gfp_mask, } #define __dev_alloc_pages(...) alloc_hooks(__dev_alloc_pages_noprof(__VA_ARGS__)) +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). + */ #define dev_alloc_pages(_order) __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, _order) /** @@ -3416,6 +3420,10 @@ static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask) } #define __dev_alloc_page(...) alloc_hooks(__dev_alloc_page_noprof(__VA_ARGS__)) +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). + */ #define dev_alloc_page() dev_alloc_pages(0) /** diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c9efda9df285..d9b03e0746e7 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -414,6 +414,11 @@ void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). The typecast is + * intentional to enforce typesafety. + */ #define sk_psock_init_link() \ ((struct sk_psock_link *)kzalloc(sizeof(struct sk_psock_link), \ GFP_ATOMIC | __GFP_NOWARN)) -- cgit v1.2.3 From a8585ac68621983587f1701b7567978fcbcd9573 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 3 Jul 2024 13:25:10 +0200 Subject: mm/page_counter: move calculating protection values to page_counter It's a lot of math, and there is nothing memcontrol specific about it. This makes it easier to use inside of the drm cgroup controller. [akpm@linux-foundation.org: fix kerneldoc, per Jeff Johnson] Link: https://lkml.kernel.org/r/20240703112510.36424-1-maarten.lankhorst@linux.intel.com Signed-off-by: Maarten Lankhorst Acked-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Michal Hocko Cc: Johannes Weiner Cc: Muchun Song Cc: Jeff Johnson Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 8cd858d912c4..904c52f97284 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -81,4 +81,8 @@ static inline void page_counter_reset_watermark(struct page_counter *counter) counter->watermark = page_counter_read(counter); } +void page_counter_calculate_protection(struct page_counter *root, + struct page_counter *counter, + bool recursive_protection); + #endif /* _LINUX_PAGE_COUNTER_H */ -- cgit v1.2.3 From 823430c8e9d98c5865af518c782d0493b76aa511 Mon Sep 17 00:00:00 2001 From: "Ho-Ren (Jack) Chuang" Date: Thu, 4 Jul 2024 07:26:44 +0000 Subject: memory tier: consolidate the initialization of memory tiers The current memory tier initialization process is distributed across two different functions, memory_tier_init() and memory_tier_late_init(). This design is hard to maintain. Thus, this patch is proposed to reduce the possible code paths by consolidating different initialization patches into one. The earlier discussion with Jonathan and Ying is listed here: https://lore.kernel.org/lkml/20240405150244.00004b49@Huawei.com/ If we want to put these two initializations together, they must be placed together in the later function. Because only at that time, the HMAT information will be ready, adist between nodes can be calculated, and memory tiering can be established based on the adist. So we position the initialization at memory_tier_init() to the memory_tier_late_init() call. Moreover, it's natural to keep memory_tier initialization in drivers at device_initcall() level. If we simply move the set_node_memory_tier() from memory_tier_init() to late_initcall(), it will result in HMAT not registering the mt_adistance_algorithm callback function, because set_node_memory_tier() is not performed during the memory tiering initialization phase, leading to a lack of correct default_dram information. Therefore, we introduced a nodemask to pass the information of the default DRAM nodes. The reason for not choosing to reuse default_dram_type->nodes is that it is not clean enough. So in the end, we use a __initdata variable, which is a variable that is released once initialization is complete, including both CPU and memory nodes for HMAT to iterate through. Link: https://lkml.kernel.org/r/20240704072646.437579-1-horen.chuang@linux.dev Signed-off-by: Ho-Ren (Jack) Chuang Suggested-by: Jonathan Cameron Reviewed-by: "Huang, Ying" Reviewed-by: Jonathan Cameron Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Dan Williams Cc: Dave Jiang Cc: Gregory Price Cc: Len Brown Cc: Michal Hocko Cc: Rafael J. Wysocki Cc: Ravi Jonnalagadda Cc: SeongJae Park Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 0d70788558f4..0dc0cf2863e2 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -38,6 +38,7 @@ struct access_coordinate; #ifdef CONFIG_NUMA extern bool numa_demotion_enabled; extern struct memory_dev_type *default_dram_type; +extern nodemask_t default_dram_nodes; struct memory_dev_type *alloc_memory_type(int adistance); void put_memory_type(struct memory_dev_type *memtype); void init_node_memory_type(int node, struct memory_dev_type *default_type); @@ -76,6 +77,7 @@ static inline bool node_is_toptier(int node) #define numa_demotion_enabled false #define default_dram_type NULL +#define default_dram_nodes NODE_MASK_NONE /* * CONFIG_NUMA implementation returns non NULL error. */ -- cgit v1.2.3 From 00f58104202c472e487f0866fbd38832523fd4f9 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 4 Jul 2024 10:10:50 +0100 Subject: mm: fix khugepaged activation policy Since the introduction of mTHP, the docuementation has stated that khugepaged would be enabled when any mTHP size is enabled, and disabled when all mTHP sizes are disabled. There are 2 problems with this; 1. this is not what was implemented by the code and 2. this is not the desirable behavior. Desirable behavior is for khugepaged to be enabled when any PMD-sized THP is enabled, anon or file. (Note that file THP is still controlled by the top-level control so we must always consider that, as well as the PMD-size mTHP control for anon). khugepaged only supports collapsing to PMD-sized THP so there is no value in enabling it when PMD-sized THP is disabled. So let's change the code and documentation to reflect this policy. Further, per-size enabled control modification events were not previously forwarded to khugepaged to give it an opportunity to start or stop. Consequently the following was resulting in khugepaged eroneously not being activated: echo never > /sys/kernel/mm/transparent_hugepage/enabled echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled [ryan.roberts@arm.com: v3] Link: https://lkml.kernel.org/r/20240705102849.2479686-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240705102849.2479686-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240704091051.2411934-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Fixes: 3485b88390b0 ("mm: thp: introduce multi-size THP sysfs interface") Closes: https://lore.kernel.org/linux-mm/7a0bbe69-1e3d-4263-b206-da007791a5c4@redhat.com/ Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Barry Song Cc: Jonathan Corbet Cc: Lance Yang Cc: Yang Shi Cc: Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cee3c5da8f0e..acb6ac24a07e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -128,18 +128,6 @@ static inline bool hugepage_global_always(void) (1< Date: Fri, 5 Jul 2024 11:23:09 +0800 Subject: mm: thp: support "THPeligible" semantics for mTHP with anonymous shmem After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for anonymous shmem"), we can configure different policies through the multi-size THP sysfs interface for anonymous shmem. But currently "THPeligible" indicates only whether the mapping is eligible for allocating THP-pages as well as the THP is PMD mappable or not for anonymous shmem, we need to support semantics for mTHP with anonymous shmem similar to those for mTHP with anonymous memory. Link: https://lkml.kernel.org/r/20240705032309.24933-1-libang.li@antgroup.com Signed-off-by: Bang Li Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 3fb18f7eb73e..1d06b1e5408a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -113,12 +113,21 @@ int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags); +unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge); #else static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags) { return false; } +static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge) +{ + return 0; +} #endif #ifdef CONFIG_SHMEM -- cgit v1.2.3 From 63d9866ab01ffd0d0835d5564107283a4afc0a38 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 10 Jul 2024 10:55:01 +0100 Subject: mm: shmem: rename mTHP shmem counters The legacy PMD-sized THP counters at /proc/vmstat include thp_file_alloc, thp_file_fallback and thp_file_fallback_charge, which rather confusingly refer to shmem THP and do not include any other types of file pages. This is inconsistent since in most other places in the kernel, THP counters are explicitly separated for anon, shmem and file flavours. However, we are stuck with it since it constitutes a user ABI. Recently, commit 66f44583f9b6 ("mm: shmem: add mTHP counters for anonymous shmem") added equivalent mTHP stats for shmem, keeping the same "file_" prefix in the names. But in future, we may want to add extra stats to cover actual file pages, at which point, it would all become very confusing. So let's take the opportunity to rename these new counters "shmem_" before the change makes it upstream and the ABI becomes immutable. While we are at it, let's improve the documentation for the legacy counters to make it clear that they count shmem pages only. Link: https://lkml.kernel.org/r/20240710095503.3193901-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Baolin Wang Reviewed-by: Lance Yang Reviewed-by: Zi Yan Reviewed-by: Barry Song Acked-by: David Hildenbrand Cc: Daniel Gomez Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index acb6ac24a07e..cff002be83eb 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -269,9 +269,9 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, - MTHP_STAT_FILE_ALLOC, - MTHP_STAT_FILE_FALLBACK, - MTHP_STAT_FILE_FALLBACK_CHARGE, + MTHP_STAT_SHMEM_ALLOC, + MTHP_STAT_SHMEM_FALLBACK, + MTHP_STAT_SHMEM_FALLBACK_CHARGE, MTHP_STAT_SPLIT, MTHP_STAT_SPLIT_FAILED, MTHP_STAT_SPLIT_DEFERRED, -- cgit v1.2.3 From 7a7127aa33c9f5b7b54ffa80619f644c5e000846 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 7 Jul 2024 01:05:05 +0900 Subject: init: remove unused __MEMINIT* macros These macros are not used anywhere. Link: https://lkml.kernel.org/r/20240706160511.2331061-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Reviewed-by: Wei Yang Signed-off-by: Andrew Morton --- include/linux/init.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 58cef4c2e59a..b2e9dfff8691 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -99,10 +99,6 @@ #define __INITRODATA .section ".init.rodata","a",%progbits #define __FINITDATA .previous -#define __MEMINIT .section ".meminit.text", "ax" -#define __MEMINITDATA .section ".meminit.data", "aw" -#define __MEMINITRODATA .section ".meminit.rodata", "a" - /* silence warnings when references are OK */ #define __REF .section ".ref.text", "ax" #define __REFDATA .section ".ref.data", "aw" -- cgit v1.2.3 From 73db3abdca58c8a014ec4c88cf5ef925cbf63669 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 7 Jul 2024 01:05:06 +0900 Subject: init/modpost: conditionally check section mismatch to __meminit* This reverts commit eb8f689046b8 ("Use separate sections for __dev/ _cpu/__mem code/data"). Check section mismatch to __meminit* only when CONFIG_MEMORY_HOTPLUG=n. With this change, the linker script and modpost become simpler, and we can get rid of the __ref annotations from the memory hotplug code. [sfr@canb.auug.org.au: remove MEM_KEEP from arch/powerpc/kernel/vmlinux.lds.S] Link: https://lkml.kernel.org/r/20240710093213.2aefb25f@canb.auug.org.au Link: https://lkml.kernel.org/r/20240706160511.2331061-2-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Signed-off-by: Stephen Rothwell Reviewed-by: Wei Yang Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/init.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index b2e9dfff8691..ee1309473bc6 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -84,11 +84,15 @@ #define __exit __section(".exit.text") __exitused __cold notrace -/* Used for MEMORY_HOTPLUG */ -#define __meminit __section(".meminit.text") __cold notrace \ - __latent_entropy -#define __meminitdata __section(".meminit.data") -#define __meminitconst __section(".meminit.rodata") +#ifdef CONFIG_MEMORY_HOTPLUG +#define __meminit +#define __meminitdata +#define __meminitconst +#else +#define __meminit __init +#define __meminitdata __initdata +#define __meminitconst __initconst +#endif /* For assembly routines */ #define __HEAD .section ".head.text","ax" -- cgit v1.2.3 From d69ba6bbaf1f606ac354e925571a54d025e32aae Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jul 2024 15:07:03 -0700 Subject: net: ethtool: let drivers remove lost RSS contexts RSS contexts may get lost from a device, in various extreme circumstances. Specifically if the firmware leaks resources and resets, or crashes and either recovers in partially working state or the crash causes a different FW version to run - creating the context again may fail. Drivers should do their absolute best to prevent this from happening. When it does, however, telling user that a context exists, when it can't possibly be used any more is counter productive. Add a helper for drivers to discard contexts. Print an error, in the future netlink notification will also be sent. More robust approaches were proposed, like keeping the contexts but marking them as "dead" (but possibly resurrected by next reset). That may be better but it's unclear at this stage whether the effort is worth the benefits. Reviewed-by: Pavan Chebbi Link: https://patch.msgid.link/20240711220713.283778-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e213b5508da6..89da0254ccd4 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -210,6 +210,8 @@ static inline size_t ethtool_rxfh_context_size(u32 indir_size, u32 key_size, return struct_size_t(struct ethtool_rxfh_context, data, flex_len); } +void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id); + /* declare a link mode bitmap */ #define __ETHTOOL_DECLARE_LINK_MODE_MASK(name) \ DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS) -- cgit v1.2.3 From 28c8757a792bbbc76407777bd0303862daa75057 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jul 2024 15:07:04 -0700 Subject: net: ethtool: let drivers declare max size of RSS indir table and key Some drivers (bnxt but I think also mlx5 from ML discussions) change the size of the indirection table depending on the number of Rx rings. Decouple the max table size from the size of the currently used table, so that we can reserve space in the context for table growth. Static members in ethtool_ops are good enough for now, we can add callbacks to read the max size more dynamically if someone needs that. Reviewed-by: Pavan Chebbi Link: https://patch.msgid.link/20240711220713.283778-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 89da0254ccd4..a1ee76936f53 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -181,6 +181,7 @@ struct ethtool_rxfh_context { /* private: driver private data, indirection table, and hash key are * stored sequentially in @data area. Use below helpers to access. */ + u32 key_off; u8 data[] __aligned(sizeof(void *)); }; @@ -196,18 +197,7 @@ static inline u32 *ethtool_rxfh_context_indir(struct ethtool_rxfh_context *ctx) static inline u8 *ethtool_rxfh_context_key(struct ethtool_rxfh_context *ctx) { - return (u8 *)(ethtool_rxfh_context_indir(ctx) + ctx->indir_size); -} - -static inline size_t ethtool_rxfh_context_size(u32 indir_size, u32 key_size, - u16 priv_size) -{ - size_t indir_bytes = array_size(indir_size, sizeof(u32)); - size_t flex_len; - - flex_len = size_add(size_add(indir_bytes, key_size), - ALIGN(priv_size, sizeof(u32))); - return struct_size_t(struct ethtool_rxfh_context, data, flex_len); + return &ctx->data[ctx->key_off]; } void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id); @@ -723,6 +713,10 @@ struct ethtool_rxfh_param { * contexts. * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor * RSS. + * @rxfh_indir_space: max size of RSS indirection tables, if indirection table + * size as returned by @get_rxfh_indir_size may change during lifetime + * of the device. Leave as 0 if the table size is constant. + * @rxfh_key_space: same as @rxfh_indir_space, but for the key. * @rxfh_priv_size: size of the driver private data area the core should * allocate for an RSS context (in &struct ethtool_rxfh_context). * @rxfh_max_context_id: maximum (exclusive) supported RSS context ID. If this @@ -940,6 +934,8 @@ struct ethtool_ops { u32 cap_link_lanes_supported:1; u32 cap_rss_ctx_supported:1; u32 cap_rss_sym_xor_supported:1; + u32 rxfh_indir_space; + u16 rxfh_key_space; u16 rxfh_priv_size; u32 rxfh_max_context_id; u32 supported_coalesce_params; -- cgit v1.2.3 From 5d89b5bdbce3937c86f05ffe19455c3068fd94f7 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 12 Jul 2024 11:52:40 +0200 Subject: i2c: document new callbacks in i2c_algorithm When updating the callbacks, adding their kernel-doc was forgotten. Add it now. Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/r/20240712165527.75e4ddc9@canb.auug.org.au Fixes: a93c2e5fe766 ("i2c: reword i2c_algorithm according to newest specification") Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 9d45b7b912dd..e9cc14b1f9a1 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -513,10 +513,10 @@ i2c_register_board_info(int busnum, struct i2c_board_info const *info, /** * struct i2c_algorithm - represent I2C transfer method - * @master_xfer: Issue a set of i2c transactions to the given I2C adapter + * @xfer: Issue a set of i2c transactions to the given I2C adapter * defined by the msgs array, with num messages available to transfer via * the adapter specified by adap. - * @master_xfer_atomic: same as @master_xfer. Yet, only using atomic context + * @xfer_atomic: same as @xfer. Yet, only using atomic context * so e.g. PMICs can be accessed very late before shutdown. Optional. * @smbus_xfer: Issue smbus transactions to the given I2C adapter. If this * is not present, then the bus layer will try and convert the SMBus calls @@ -525,27 +525,33 @@ i2c_register_board_info(int busnum, struct i2c_board_info const *info, * so e.g. PMICs can be accessed very late before shutdown. Optional. * @functionality: Return the flags that this algorithm/adapter pair supports * from the ``I2C_FUNC_*`` flags. - * @reg_slave: Register given client to I2C slave mode of this adapter - * @unreg_slave: Unregister given client from I2C slave mode of this adapter + * @reg_target: Register given client to local target mode of this adapter + * @unreg_target: Unregister given client from local target mode of this adapter + * + * @master_xfer: deprecated, use @xfer + * @master_xfer_atomic: deprecated, use @xfer_atomic + * @reg_slave: deprecated, use @reg_target + * @unreg_slave: deprecated, use @unreg_target + * * * The following structs are for those who like to implement new bus drivers: * i2c_algorithm is the interface to a class of hardware solutions which can * be addressed using the same bus algorithms - i.e. bit-banging or the PCF8584 * to name two of the most common. * - * The return codes from the ``master_xfer{_atomic}`` fields should indicate the + * The return codes from the ``xfer{_atomic}`` fields should indicate the * type of error code that occurred during the transfer, as documented in the * Kernel Documentation file Documentation/i2c/fault-codes.rst. Otherwise, the * number of messages executed should be returned. */ struct i2c_algorithm { /* - * If an adapter algorithm can't do I2C-level access, set master_xfer + * If an adapter algorithm can't do I2C-level access, set xfer * to NULL. If an adapter algorithm can do SMBus access, set * smbus_xfer. If set to NULL, the SMBus protocol is simulated * using common I2C messages. * - * master_xfer should return the number of messages successfully + * xfer should return the number of messages successfully * processed, or a negative value on error */ union { -- cgit v1.2.3 From 63c6e08eac8e7e2ad2e883c968e58e23e1dc8c70 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 11 Jul 2024 17:33:07 -0700 Subject: net/mlx5: IFC updates for SF max IO EQs Expose a new cap sf_eq_usage. The vhca_resource_manager can write this cap, indicating the SF driver should use max_num_eqs_24b to determine how many EQs to use. Will be used in the next patch, to indicate to the SF driver from the PF that the user has set the max io eqs via devlink. So the SF driver can later query the proper max eq value from the new cap. devlink port function set pci/0000:08:00.0/32768 max_io_eqs 32 Signed-off-by: Daniel Jurgens Reviewed-by: William Tu Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Link: https://patch.msgid.link/20240712003310.355106-2-saeed@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fdad0071d599..360d42f041b0 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1994,7 +1994,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 migration_tracking_state[0x1]; u8 reserved_at_ca[0x6]; u8 migration_in_chunks[0x1]; - u8 reserved_at_d1[0xf]; + u8 reserved_at_d1[0x1]; + u8 sf_eq_usage[0x1]; + u8 reserved_at_d3[0xd]; u8 cross_vhca_object_to_object_supported[0x20]; -- cgit v1.2.3 From 084ebf7ca83e6cb743784f2eecc654193ce064fb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 21 Jun 2024 13:50:43 -0700 Subject: execve: Keep bprm->argmin behind CONFIG_MMU When argmin was added in commit 655c16a8ce9c ("exec: separate MM_ANONPAGES and RLIMIT_STACK accounting"), it was intended only for validating stack limits on CONFIG_MMU[1]. All checking for reaching the limit (argmin) is wrapped in CONFIG_MMU ifdef checks, though setting argmin was not. That argmin is only supposed to be used under CONFIG_MMU was rediscovered recently[2], and I don't want to trip over this again. Move argmin's declaration into the existing CONFIG_MMU area, and add helpers functions so the MMU tests can be consolidated. Link: https://lore.kernel.org/all/20181126122307.GA1660@redhat.com [1] Link: https://lore.kernel.org/all/202406211253.7037F69@keescook/ [2] Link: https://lore.kernel.org/r/20240621205046.4001362-1-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/binfmts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 70f97f685bff..e6c00e860951 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -19,13 +19,13 @@ struct linux_binprm { #ifdef CONFIG_MMU struct vm_area_struct *vma; unsigned long vma_pages; + unsigned long argmin; /* rlimit marker for copy_strings() */ #else # define MAX_ARG_PAGES 32 struct page *page[MAX_ARG_PAGES]; #endif struct mm_struct *mm; unsigned long p; /* current top of mem */ - unsigned long argmin; /* rlimit marker for copy_strings() */ unsigned int /* Should an execfd be passed to userspace? */ have_execfd:1, -- cgit v1.2.3 From 872bb37f6829d4f7f3ed5afe2786add3d4384b4b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 2 Jul 2024 14:16:16 -0700 Subject: randomize_kstack: Improve stack alignment codegen The codgen for adding architecture-specific stack alignment to the effective alloca() usage is somewhat inefficient and allows a bit to get carried beyond the desired entropy range. This isn't really a problem, but it's unexpected and the codegen is kind of bad. Quoting Mark[1], the disassembly for arm64's invoke_syscall() looks like: // offset = raw_cpu_read(kstack_offset) mov x4, sp adrp x0, kstack_offset mrs x5, tpidr_el1 add x0, x0, #:lo12:kstack_offset ldr w0, [x0, x5] // offset = KSTACK_OFFSET_MAX(offset) and x0, x0, #0x3ff // alloca(offset) add x0, x0, #0xf and x0, x0, #0x7f0 sub sp, x4, x0 ... which in C would be: offset = raw_cpu_read(kstack_offset) offset &= 0x3ff; // [0x0, 0x3ff] offset += 0xf; // [0xf, 0x40e] offset &= 0x7f0; // [0x0, ... so when *all* bits [3:0] are 0, they'll have no impact, and when *any* of bits [3:0] are 1 they'll trigger a carry into bit 4, which could ripple all the way up and spill into bit 10. Switch the masking in KSTACK_OFFSET_MAX() to explicitly clear the bottom bits to avoid the rounding by using 0b1111110000 instead of 0b1111111111: // offset = raw_cpu_read(kstack_offset) mov x4, sp adrp x0, 0 mrs x5, tpidr_el1 add x0, x0, #:lo12:kstack_offset ldr w0, [x0, x5] // offset = KSTACK_OFFSET_MAX(offset) and x0, x0, #0x3f0 // alloca(offset) sub sp, x4, x0 Suggested-by: Mark Rutland Link: https://lore.kernel.org/lkml/ZnVfOnIuFl2kNWkT@J2N7QTR9R3/ [1] Link: https://lore.kernel.org/r/20240702211612.work.576-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/randomize_kstack.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index 6d92b68efbf6..1d982dbdd0d0 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -32,13 +32,19 @@ DECLARE_PER_CPU(u32, kstack_offset); #endif /* - * Use, at most, 10 bits of entropy. We explicitly cap this to keep the - * "VLA" from being unbounded (see above). 10 bits leaves enough room for - * per-arch offset masks to reduce entropy (by removing higher bits, since - * high entropy may overly constrain usable stack space), and for - * compiler/arch-specific stack alignment to remove the lower bits. + * Use, at most, 6 bits of entropy (on 64-bit; 8 on 32-bit). This cap is + * to keep the "VLA" from being unbounded (see above). Additionally clear + * the bottom 4 bits (on 64-bit systems, 2 for 32-bit), since stack + * alignment will always be at least word size. This makes the compiler + * code gen better when it is applying the actual per-arch alignment to + * the final offset. The resulting randomness is reasonable without overly + * constraining usable stack space. */ -#define KSTACK_OFFSET_MAX(x) ((x) & 0x3FF) +#ifdef CONFIG_64BIT +#define KSTACK_OFFSET_MAX(x) ((x) & 0b1111110000) +#else +#define KSTACK_OFFSET_MAX(x) ((x) & 0b1111111100) +#endif /** * add_random_kstack_offset - Increase stack utilization by previously -- cgit v1.2.3 From 1a616c2fe96b357894b74b41787d4ea6987f6199 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Dec 2023 20:34:17 -0500 Subject: lockdep: lockdep_set_notrack_class() Add a new helper to disable lockdep tracking entirely for a given class. This is needed for bcachefs, which takes too many btree node locks for lockdep to track. Instead, we have a single lockdep_map for "btree_trans has any btree nodes locked", which makes more since given that we have centralized lock management and a cycle detector. Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Cc: Waiman Long Cc: Boqun Feng Signed-off-by: Kent Overstreet --- include/linux/lockdep.h | 4 ++++ include/linux/lockdep_types.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 08b0d1d9d78b..b76f1bcd2f7f 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -181,6 +181,9 @@ static inline void lockdep_init_map(struct lockdep_map *lock, const char *name, #define lockdep_set_novalidate_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock) +#define lockdep_set_notrack_class(lock) \ + lockdep_set_class_and_name(lock, &__lockdep_no_track__, #lock) + /* * Compare locking classes */ @@ -338,6 +341,7 @@ static inline void lockdep_set_selftest_task(struct task_struct *task) #define lockdep_set_subclass(lock, sub) do { } while (0) #define lockdep_set_novalidate_class(lock) do { } while (0) +#define lockdep_set_notrack_class(lock) do { } while (0) /* * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 70d30d40ea4a..9f361d3ab9d9 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -80,6 +80,7 @@ struct lock_class_key { }; extern struct lock_class_key __lockdep_no_validate__; +extern struct lock_class_key __lockdep_no_track__; struct lock_trace; -- cgit v1.2.3 From ff253875ff3bab75214c4e94352956dc02a0d965 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kamil=20Hor=C3=A1k=20=282N=29?= Date: Fri, 12 Jul 2024 17:07:07 +0200 Subject: net: phy: bcm54811: Add LRE registers definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the definitions of LRE registers for Broadcom BCM5481x PHY Signed-off-by: Kamil Horák (2N) Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20240712150709.3134474-3-kamilh@axis.com Signed-off-by: Jakub Kicinski --- include/linux/brcmphy.h | 88 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 1394ba302367..028b3e00378e 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -271,12 +271,100 @@ #define BCM5482_SSD_SGMII_SLAVE_EN 0x0002 /* Slave mode enable */ #define BCM5482_SSD_SGMII_SLAVE_AD 0x0001 /* Slave auto-detection */ +/* BroadR-Reach LRE Registers. */ +#define MII_BCM54XX_LRECR 0x00 /* LRE Control Register */ +#define MII_BCM54XX_LRESR 0x01 /* LRE Status Register */ +#define MII_BCM54XX_LREPHYSID1 0x02 /* LRE PHYS ID 1 */ +#define MII_BCM54XX_LREPHYSID2 0x03 /* LRE PHYS ID 2 */ +#define MII_BCM54XX_LREANAA 0x04 /* LDS Auto-Negotiation Advertised Ability */ +#define MII_BCM54XX_LREANAC 0x05 /* LDS Auto-Negotiation Advertised Control */ +#define MII_BCM54XX_LREANPT 0x06 /* LDS Ability Next Page Transmit */ +#define MII_BCM54XX_LRELPA 0x07 /* LDS Link Partner Ability */ +#define MII_BCM54XX_LRELPNPM 0x08 /* LDS Link Partner Next Page Message */ +#define MII_BCM54XX_LRELPNPC 0x09 /* LDS Link Partner Next Page Control */ +#define MII_BCM54XX_LRELDSE 0x0a /* LDS Expansion Register */ +#define MII_BCM54XX_LREES 0x0f /* LRE Extended Status */ + +/* LRE control register. */ +#define LRECR_RESET 0x8000 /* Reset to default state */ +#define LRECR_LOOPBACK 0x4000 /* Internal Loopback */ +#define LRECR_LDSRES 0x2000 /* Restart LDS Process */ +#define LRECR_LDSEN 0x1000 /* LDS Enable */ +#define LRECR_PDOWN 0x0800 /* Enable low power state */ +#define LRECR_ISOLATE 0x0400 /* Isolate data paths from MII */ +#define LRECR_SPEED100 0x0200 /* Select 100 Mbps */ +#define LRECR_SPEED10 0x0000 /* Select 10 Mbps */ +#define LRECR_4PAIRS 0x0020 /* Select 4 Pairs */ +#define LRECR_2PAIRS 0x0010 /* Select 2 Pairs */ +#define LRECR_1PAIR 0x0000 /* Select 1 Pair */ +#define LRECR_MASTER 0x0008 /* Force Master when LDS disabled */ +#define LRECR_SLAVE 0x0000 /* Force Slave when LDS disabled */ + +/* LRE status register. */ +#define LRESR_100_1PAIR 0x2000 /* Can do 100Mbps 1 Pair */ +#define LRESR_100_4PAIR 0x1000 /* Can do 100Mbps 4 Pairs */ +#define LRESR_100_2PAIR 0x0800 /* Can do 100Mbps 2 Pairs */ +#define LRESR_10_2PAIR 0x0400 /* Can do 10Mbps 2 Pairs */ +#define LRESR_10_1PAIR 0x0200 /* Can do 10Mbps 1 Pair */ +#define LRESR_ESTATEN 0x0100 /* Extended Status in R15 */ +#define LRESR_RESV 0x0080 /* Unused... */ +#define LRESR_MFPS 0x0040 /* Can suppress Management Frames Preamble */ +#define LRESR_LDSCOMPLETE 0x0020 /* LDS Auto-negotiation complete */ +#define LRESR_8023 0x0010 /* Has IEEE 802.3 Support */ +#define LRESR_LDSABILITY 0x0008 /* LDS auto-negotiation capable */ +#define LRESR_LSTATUS 0x0004 /* Link status */ +#define LRESR_JCD 0x0002 /* Jabber detected */ +#define LRESR_ERCAP 0x0001 /* Ext-reg capability */ + +/* LDS Auto-Negotiation Advertised Ability. */ +#define LREANAA_PAUSE_ASYM 0x8000 /* Can pause asymmetrically */ +#define LREANAA_PAUSE 0x4000 /* Can pause */ +#define LREANAA_100_1PAIR 0x0020 /* Can do 100Mbps 1 Pair */ +#define LREANAA_100_4PAIR 0x0010 /* Can do 100Mbps 4 Pair */ +#define LREANAA_100_2PAIR 0x0008 /* Can do 100Mbps 2 Pair */ +#define LREANAA_10_2PAIR 0x0004 /* Can do 10Mbps 2 Pair */ +#define LREANAA_10_1PAIR 0x0002 /* Can do 10Mbps 1 Pair */ + +#define LRE_ADVERTISE_FULL (LREANAA_100_1PAIR | LREANAA_100_4PAIR | \ + LREANAA_100_2PAIR | LREANAA_10_2PAIR | \ + LREANAA_10_1PAIR) + +#define LRE_ADVERTISE_ALL LRE_ADVERTISE_FULL + +/* LDS Link Partner Ability. */ +#define LRELPA_PAUSE_ASYM 0x8000 /* Supports asymmetric pause */ +#define LRELPA_PAUSE 0x4000 /* Supports pause capability */ +#define LRELPA_100_1PAIR 0x0020 /* 100Mbps 1 Pair capable */ +#define LRELPA_100_4PAIR 0x0010 /* 100Mbps 4 Pair capable */ +#define LRELPA_100_2PAIR 0x0008 /* 100Mbps 2 Pair capable */ +#define LRELPA_10_2PAIR 0x0004 /* 10Mbps 2 Pair capable */ +#define LRELPA_10_1PAIR 0x0002 /* 10Mbps 1 Pair capable */ + +/* LDS Expansion register. */ +#define LDSE_DOWNGRADE 0x8000 /* Can do LDS Speed Downgrade */ +#define LDSE_MASTER 0x4000 /* Master / Slave */ +#define LDSE_PAIRS_MASK 0x3000 /* Pair Count Mask */ +#define LDSE_PAIRS_SHIFT 12 +#define LDSE_4PAIRS (2 << LDSE_PAIRS_SHIFT) /* 4 Pairs Connection */ +#define LDSE_2PAIRS (1 << LDSE_PAIRS_SHIFT) /* 2 Pairs Connection */ +#define LDSE_1PAIR (0 << LDSE_PAIRS_SHIFT) /* 1 Pair Connection */ +#define LDSE_CABLEN_MASK 0x0FFF /* Cable Length Mask */ + /* BCM54810 Registers */ #define BCM54810_EXP_BROADREACH_LRE_MISC_CTL (MII_BCM54XX_EXP_SEL_ER + 0x90) #define BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN (1 << 0) #define BCM54810_SHD_CLK_CTL 0x3 #define BCM54810_SHD_CLK_CTL_GTXCLK_EN (1 << 9) +/* BCM54811 Registers */ +#define BCM54811_EXP_BROADREACH_LRE_OVERLAY_CTL (MII_BCM54XX_EXP_SEL_ER + 0x9A) +/* Access Control Override Enable */ +#define BCM54811_EXP_BROADREACH_LRE_OVERLAY_CTL_EN BIT(15) +/* Access Control Override Value */ +#define BCM54811_EXP_BROADREACH_LRE_OVERLAY_CTL_OVERRIDE_VAL BIT(14) +/* Access Control Value */ +#define BCM54811_EXP_BROADREACH_LRE_OVERLAY_CTL_VAL BIT(13) + /* BCM54612E Registers */ #define BCM54612E_EXP_SPARE0 (MII_BCM54XX_EXP_SEL_ETC + 0x34) #define BCM54612E_LED4_CLK125OUT_EN (1 << 1) -- cgit v1.2.3 From a52c6330ff2fe1163333fa6609bdc6e8763ec286 Mon Sep 17 00:00:00 2001 From: "Alex Shi (Tencent)" Date: Fri, 12 Jul 2024 12:14:35 +0800 Subject: mm/memcg: alignment memcg_data define condition commit 21c690a349ba ("mm: introduce slabobj_ext to support slab object extensions") changed the folio/page->memcg_data define condition from MEMCG to SLAB_OBJ_EXT. This action make memcg_data exposed while !MEMCG. As Vlastimil Babka suggested, let's add _unused_slab_obj_exts for SLAB_MATCH for slab.obj_exts while !MEMCG. That could resolve the match issue, clean up the feature logical. Signed-off-by: Alex Shi (Tencent) Cc: Randy Dunlap Cc: Yoann Congal Cc: Masahiro Yamada Cc: Petr Mladek Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/mm_types.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index af3a0256fa93..a199c48bc462 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -169,8 +169,10 @@ struct page { /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ atomic_t _refcount; -#ifdef CONFIG_SLAB_OBJ_EXT +#ifdef CONFIG_MEMCG unsigned long memcg_data; +#elif defined(CONFIG_SLAB_OBJ_EXT) + unsigned long _unused_slab_obj_exts; #endif /* @@ -298,6 +300,7 @@ typedef struct { * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). * @_deferred_list: Folios to be split under memory pressure. + * @_unused_slab_obj_exts: Placeholder to match obj_exts in struct slab. * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -332,8 +335,10 @@ struct folio { }; atomic_t _mapcount; atomic_t _refcount; -#ifdef CONFIG_SLAB_OBJ_EXT +#ifdef CONFIG_MEMCG unsigned long memcg_data; +#elif defined(CONFIG_SLAB_OBJ_EXT) + unsigned long _unused_slab_obj_exts; #endif #if defined(WANT_PAGE_VIRTUAL) void *virtual; -- cgit v1.2.3 From f0eb154c39471bf881422e8ac23e4c037289ece9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 5 Jul 2024 10:31:54 +0100 Subject: irqchip/gic-v4: Substitute vmovp_lock for a per-VM lock vmovp_lock is abused in a number of cases to serialise updates to vlpi_count[] and deal with map/unmap of a VM to ITSs. Instead, provide a per-VM lock and revisit the use of vlpi_count[] so that it is always wrapped in this per-VM vmapp_lock. This reduces the potential contention on a concurrent VMOVP command, and paves the way for subsequent VPE locking that holding vmovp_lock actively prevents due to the lock ordering. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Nianyao Tang Link: https://lore.kernel.org/r/20240705093155.871070-3-maz@kernel.org --- include/linux/irqchip/arm-gic-v4.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 2c63375bbd43..ecabed6d3307 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -25,6 +25,14 @@ struct its_vm { irq_hw_number_t db_lpi_base; unsigned long *db_bitmap; int nr_db_lpis; + /* + * Ensures mutual exclusion between updates to vlpi_count[] + * and map/unmap when using the ITSList mechanism. + * + * The lock order for any sequence involving the ITSList is + * vmapp_lock -> vpe_lock ->vmovp_lock. + */ + raw_spinlock_t vmapp_lock; u32 vlpi_count[GICv4_ITS_LIST_MAX]; }; -- cgit v1.2.3 From c37927a203fa283950f6045602b9f71328ad786c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 11 Jul 2024 12:20:04 +0200 Subject: genirq: Set IRQF_COND_ONESHOT in request_irq() The callers of request_irq() don't care about IRQF_ONESHOT because they don't provide threaded handlers, but if they happen to share the IRQ with the ACPI SCI, which has a threaded handler and sets IRQF_ONESHOT, request_irq() will fail for them due to a flags mismatch. Address this by making request_irq() add IRQF_COND_ONESHOT to the flags passed to request_threaded_irq() for all of its callers. Fixes: 7a36b901a6eb ("ACPI: OSL: Use a threaded interrupt handler for SCI") Reported-by: Stefan Seyfried Signed-off-by: Rafael J. Wysocki Signed-off-by: Thomas Gleixner Tested-by: Stefan Seyfried Cc: stable@vger.kerel.org Link: https://lore.kernel.org/r/5800834.DvuYhMxLoT@rjwysocki.net Closes: https://lore.kernel.org/lkml/205bd84a-fe8e-4963-968e-0763285f35ba@message-id.googlemail.com --- include/linux/interrupt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5c9bdd3ffccc..dac7466de5f3 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -168,7 +168,7 @@ static inline int __must_check request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev) { - return request_threaded_irq(irq, handler, NULL, flags, name, dev); + return request_threaded_irq(irq, handler, NULL, flags | IRQF_COND_ONESHOT, name, dev); } extern int __must_check -- cgit v1.2.3 From b7b377332b96a38bc98928d7ec2674c77a95fcb3 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Fri, 12 Jul 2024 08:41:48 +0200 Subject: irqdomain: Fix the kernel-doc and plug it into Documentation There were several undocumented fields in structs irq_domain_ops and irq_domain_info. Document them. irq_domain_ops::revmap_size contained "[]" in the description, which is not allowed in sphinx. Remove that. Finally, plug the whole header (irqdomain.h) into genericirq.rst, so that the docs is autogenerated and hyperlinks to these structure are created. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Tested-by: Randy Dunlap Reviewed-by: Randy Dunlap Link: https://lore.kernel.org/r/20240712064148.157040-1-jirislaby@kernel.org --- include/linux/irqdomain.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 02cd486ac354..de6105f68fec 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -74,11 +74,24 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, * struct irq_domain_ops - Methods for irq_domain objects * @match: Match an interrupt controller device node to a host, returns * 1 on a match + * @select: Match an interrupt controller fw specification. It is more generic + * than @match as it receives a complete struct irq_fwspec. Therefore, + * @select is preferred if provided. Returns 1 on a match. * @map: Create or update a mapping between a virtual irq number and a hw * irq number. This is called only once for a given mapping. * @unmap: Dispose of such a mapping * @xlate: Given a device tree node and interrupt specifier, decode * the hardware irq number and linux irq type value. + * @alloc: Allocate @nr_irqs interrupts starting from @virq. + * @free: Free @nr_irqs interrupts starting from @virq. + * @activate: Activate one interrupt in HW (@irqd). If @reserve is set, only + * reserve the vector. If unset, assign the vector (called from + * request_irq()). + * @deactivate: Disarm one interrupt (@irqd). + * @translate: Given @fwspec, decode the hardware irq number (@out_hwirq) and + * linux irq type value (@out_type). This is a generalised @xlate + * (over struct irq_fwspec) and is preferred if provided. + * @debug_show: For domains to show specific data for an interrupt in debugfs. * * Functions below are provided by the driver and called whenever a new mapping * is created or an old mapping is disposed. The driver can then proceed to @@ -131,6 +144,9 @@ struct irq_domain_chip_generic; * Optional elements: * @fwnode: Pointer to firmware node associated with the irq_domain. Pretty easy * to swap it for the of_node via the irq_domain_get_of_node accessor + * @bus_token: @fwnode's device_node might be used for several irq domains. But + * in connection with @bus_token, the pair shall be unique in a + * system. * @gc: Pointer to a list of generic chips. There is a helper function for * setting up one or more generic chips for interrupt controllers * drivers using the generic chip library which uses this pointer. @@ -144,7 +160,9 @@ struct irq_domain_chip_generic; * @exit: Function called when the domain is destroyed * * Revmap data, used internally by the irq domain code: - * @revmap_size: Size of the linear map table @revmap[] + * @hwirq_max: Top limit for the HW irq number. Especially to avoid + * conflicts/failures with reserved HW irqs. Can be ~0. + * @revmap_size: Size of the linear map table @revmap * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map * @revmap: Linear table of irq_data pointers */ -- cgit v1.2.3 From ce20fdd670ac375a4e3dff91c2888ad9ff9eef56 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Sat, 13 Jul 2024 23:16:15 +0200 Subject: net: dsa: Define max num of bridges in tag8021q implementation Max number of bridges in tag8021q implementation is strictly limited by VBID size: 3 bits. But zero is reserved and only 7 values can be used. This patch adds define which describe maximum possible value. Suggested-by: Vladimir Oltean Signed-off-by: Pawel Dembicki Reviewed-by: Florian Fainelli Reviewed-by: Linus Walleij Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20240713211620.1125910-10-paweldembicki@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index f3664ee12170..1dda2a13b832 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -8,6 +8,11 @@ #include #include +/* VBID is limited to three bits only and zero is reserved. + * Only 7 bridges can be enumerated. + */ +#define DSA_TAG_8021Q_MAX_NUM_BRIDGES 7 + int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); void dsa_tag_8021q_unregister(struct dsa_switch *ds); -- cgit v1.2.3 From 85aabd1fe9d6af4dc5d11a2d8be567ec45d1dc5e Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Sat, 13 Jul 2024 23:16:16 +0200 Subject: net: dsa: prepare 'dsa_tag_8021q_bridge_join' for standalone use The 'dsa_tag_8021q_bridge_join' could be used as a generic implementation of the 'ds->ops->port_bridge_join()' function. However, it is necessary to synchronize their arguments. This patch also moves the 'tx_fwd_offload' flag configuration line into 'dsa_tag_8021q_bridge_join' body. Currently, every (sja1105) driver sets it, and the future vsc73xx implementation will also need it for simplification. Suggested-by: Vladimir Oltean Signed-off-by: Pawel Dembicki Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20240713211620.1125910-11-paweldembicki@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 1dda2a13b832..d13aabdeb4b2 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -18,7 +18,8 @@ int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); void dsa_tag_8021q_unregister(struct dsa_switch *ds); int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, - struct dsa_bridge bridge); + struct dsa_bridge bridge, bool *tx_fwd_offload, + struct netlink_ext_ack *extack); void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge); -- cgit v1.2.3 From e50bfd6bb231a6e2b7221ad78dce294330238c76 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 9 Jul 2024 15:53:33 +0200 Subject: net_tstamp: Add TIMESTAMPING SOFTWARE and HARDWARE mask Timestamping software or hardware flags are often used as a group, therefore adding these masks will ease future use. I did not use SOF_TIMESTAMPING_SYS_HARDWARE flag as it is deprecated and not used at all. Reviewed-by: Willem de Bruijn Reviewed-by: Florian Fainelli Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20240709-feature_ptp_netnext-v17-1-b5317f50df2a@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/net_tstamp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index eb01c37e71e0..3799c79b6c83 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -5,6 +5,14 @@ #include +#define SOF_TIMESTAMPING_SOFTWARE_MASK (SOF_TIMESTAMPING_RX_SOFTWARE | \ + SOF_TIMESTAMPING_TX_SOFTWARE | \ + SOF_TIMESTAMPING_SOFTWARE) + +#define SOF_TIMESTAMPING_HARDWARE_MASK (SOF_TIMESTAMPING_RX_HARDWARE | \ + SOF_TIMESTAMPING_TX_HARDWARE | \ + SOF_TIMESTAMPING_RAW_HARDWARE) + enum hwtstamp_source { HWTSTAMP_SOURCE_NETDEV, HWTSTAMP_SOURCE_PHYLIB, -- cgit v1.2.3 From 2dd35600590148d843367c04975acad3c1a527c3 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 9 Jul 2024 15:53:36 +0200 Subject: net: Change the API of PHY default timestamp to MAC Change the API to select MAC default time stamping instead of the PHY. Indeed the PHY is closer to the wire therefore theoretically it has less delay than the MAC timestamping but the reality is different. Due to lower time stamping clock frequency, latency in the MDIO bus and no PHC hardware synchronization between different PHY, the PHY PTP is often less precise than the MAC. The exception is for PHY designed specially for PTP case but these devices are not very widespread. For not breaking the compatibility default_timestamp flag has been introduced in phy_device that is set by the phy driver to know we are using the old API behavior. Reviewed-by: Rahul Rameshbabu Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20240709-feature_ptp_netnext-v17-4-b5317f50df2a@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index bd68f9d8e74f..e7a38137211c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -616,6 +616,8 @@ struct macsec_ops; * handling shall be postponed until PHY has resumed * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume + * @default_timestamp: Flag indicating whether we are using the phy + * timestamp as the default one * @interface: enum phy_interface_t value * @possible_interfaces: bitmap if interface modes that the attached PHY * will switch between depending on media speed. @@ -681,6 +683,8 @@ struct phy_device { unsigned irq_suspended:1; unsigned irq_rerun:1; + unsigned default_timestamp:1; + int rate_matching; enum phy_state state; @@ -1625,6 +1629,21 @@ static inline void phy_txtstamp(struct phy_device *phydev, struct sk_buff *skb, phydev->mii_ts->txtstamp(phydev->mii_ts, skb, type); } +/** + * phy_is_default_hwtstamp - Is the PHY hwtstamp the default timestamp + * @phydev: Pointer to phy_device + * + * This is used to get default timestamping device taking into account + * the new API choice, which is selecting the timestamping from MAC by + * default if the phydev does not have default_timestamp flag enabled. + * + * Return: True if phy is the default hw timestamp, false otherwise. + */ +static inline bool phy_is_default_hwtstamp(struct phy_device *phydev) +{ + return phy_has_hwtstamp(phydev) && phydev->default_timestamp; +} + /** * phy_is_internal - Convenience function for testing if a PHY is internal * @phydev: the phy_device struct -- cgit v1.2.3 From bc5a07ed15a3d30df3e8af2ef6bb0472f1d337f9 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 9 Jul 2024 15:53:37 +0200 Subject: net: net_tstamp: Add unspec field to hwtstamp_source enumeration Prepare for future support of saving hwtstamp source in PTP xarray by introducing HWTSTAMP_SOURCE_UNSPEC to hwtstamp_source enum, setting it to 0 to match old behavior of no source defined. Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20240709-feature_ptp_netnext-v17-5-b5317f50df2a@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/net_tstamp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index 3799c79b6c83..662074b08c94 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -14,6 +14,7 @@ SOF_TIMESTAMPING_RAW_HARDWARE) enum hwtstamp_source { + HWTSTAMP_SOURCE_UNSPEC, HWTSTAMP_SOURCE_NETDEV, HWTSTAMP_SOURCE_PHYLIB, }; -- cgit v1.2.3 From 2111375b85ad173d58e7b8604246a3de60950ac8 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 9 Jul 2024 15:53:38 +0200 Subject: net: Add struct kernel_ethtool_ts_info In prevision to add new UAPI for hwtstamp we will be limited to the struct ethtool_ts_info that is currently passed in fixed binary format through the ETHTOOL_GET_TS_INFO ethtool ioctl. It would be good if new kernel code already started operating on an extensible kernel variant of that structure, similar in concept to struct kernel_hwtstamp_config vs struct hwtstamp_config. Since struct ethtool_ts_info is in include/uapi/linux/ethtool.h, here we introduce the kernel-only structure in include/linux/ethtool.h. The manual copy is then made in the function called by ETHTOOL_GET_TS_INFO. Acked-by: Shannon Nelson Acked-by: Alexandra Winter Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20240709-feature_ptp_netnext-v17-6-b5317f50df2a@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/can/dev.h | 2 +- include/linux/ethtool.h | 25 ++++++++++++++++++++++--- include/linux/mii_timestamper.h | 2 +- include/linux/phy.h | 2 +- 4 files changed, 25 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 1b92aed49363..23492213ea35 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -186,7 +186,7 @@ void close_candev(struct net_device *dev); int can_change_mtu(struct net_device *dev, int new_mtu); int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd); int can_ethtool_op_get_ts_info_hwts(struct net_device *dev, - struct ethtool_ts_info *info); + struct kernel_ethtool_ts_info *info); int register_candev(struct net_device *dev); void unregister_candev(struct net_device *dev); diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e213b5508da6..6b38bbda5790 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -18,6 +18,7 @@ #include #include #include +#include struct compat_ethtool_rx_flow_spec { u32 flow_type; @@ -713,6 +714,22 @@ struct ethtool_rxfh_param { u8 input_xfrm; }; +/** + * struct kernel_ethtool_ts_info - kernel copy of struct ethtool_ts_info + * @cmd: command number = %ETHTOOL_GET_TS_INFO + * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags + * @phc_index: device index of the associated PHC, or -1 if there is none + * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values + * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values + */ +struct kernel_ethtool_ts_info { + u32 cmd; + u32 so_timestamping; + int phc_index; + enum hwtstamp_tx_types tx_types; + enum hwtstamp_rx_filters rx_filters; +}; + /** * struct ethtool_ops - optional netdev operations * @cap_link_lanes_supported: indicates if the driver supports lanes @@ -1020,7 +1037,7 @@ struct ethtool_ops { int (*get_dump_data)(struct net_device *, struct ethtool_dump *, void *); int (*set_dump)(struct net_device *, struct ethtool_dump *); - int (*get_ts_info)(struct net_device *, struct ethtool_ts_info *); + int (*get_ts_info)(struct net_device *, struct kernel_ethtool_ts_info *); void (*get_ts_stats)(struct net_device *dev, struct ethtool_ts_stats *ts_stats); int (*get_module_info)(struct net_device *, @@ -1181,7 +1198,8 @@ int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index); /* Some generic methods drivers may use in their ethtool_ops */ u32 ethtool_op_get_link(struct net_device *dev); -int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *eti); +int ethtool_op_get_ts_info(struct net_device *dev, + struct kernel_ethtool_ts_info *eti); /** * ethtool_mm_frag_size_add_to_min - Translate (standard) additional fragment @@ -1230,7 +1248,8 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add, * @info: buffer to hold the result * Returns zero on success, non-zero otherwise. */ -int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info *info); +int ethtool_get_ts_info_by_layer(struct net_device *dev, + struct kernel_ethtool_ts_info *info); /** * ethtool_sprintf - Write formatted string to ethtool string data diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h index 26b04f73f214..995db62570f9 100644 --- a/include/linux/mii_timestamper.h +++ b/include/linux/mii_timestamper.h @@ -59,7 +59,7 @@ struct mii_timestamper { struct phy_device *phydev); int (*ts_info)(struct mii_timestamper *mii_ts, - struct ethtool_ts_info *ts_info); + struct kernel_ethtool_ts_info *ts_info); struct device *device; }; diff --git a/include/linux/phy.h b/include/linux/phy.h index e7a38137211c..04ae5c811cfb 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1618,7 +1618,7 @@ static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb, } static inline int phy_ts_info(struct phy_device *phydev, - struct ethtool_ts_info *tsinfo) + struct kernel_ethtool_ts_info *tsinfo) { return phydev->mii_ts->ts_info(phydev->mii_ts, tsinfo); } -- cgit v1.2.3 From f96eb1172ed8d74cfed7da92d2045ec64028cc38 Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Mon, 15 Jul 2024 14:30:50 +0200 Subject: dsa: lan9303: consistent naming for PHY address parameter Name it 'addr' instead of 'port' or 'phy'. Signed-off-by: Christian Eggers Link: https://patch.msgid.link/20240715123050.21202-1-ceggers@arri.de Signed-off-by: Jakub Kicinski --- include/linux/dsa/lan9303.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h index b4f22112ba75..3ce7cbcc37a3 100644 --- a/include/linux/dsa/lan9303.h +++ b/include/linux/dsa/lan9303.h @@ -5,8 +5,8 @@ struct lan9303; struct lan9303_phy_ops { /* PHY 1 and 2 access*/ - int (*phy_read)(struct lan9303 *chip, int port, int regnum); - int (*phy_write)(struct lan9303 *chip, int port, + int (*phy_read)(struct lan9303 *chip, int addr, int regnum); + int (*phy_write)(struct lan9303 *chip, int addr, int regnum, u16 val); }; -- cgit v1.2.3 From c5eaf1b3f824369b6dcb33a39232b871d2ca8e33 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 12 Jul 2024 08:49:12 -0700 Subject: PCI: Add Meta Platforms vendor ID Add Meta as a vendor ID for PCI devices so we can use the macro for future drivers. Signed-off-by: Alexander Duyck Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/172079935272.1778861.13619056509276833225.stgit@ahduyck-xeon-server.home.arpa Signed-off-by: Jakub Kicinski --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 677aea20d3e1..76a8f2d6bd64 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2601,6 +2601,8 @@ #define PCI_VENDOR_ID_HYGON 0x1d94 +#define PCI_VENDOR_ID_META 0x1d9b + #define PCI_VENDOR_ID_FUNGIBLE 0x1dad #define PCI_VENDOR_ID_HXT 0x1dbf -- cgit v1.2.3 From 415fb383ec2bbe4d4bb1a1b5593cd67eb058f1de Mon Sep 17 00:00:00 2001 From: Francis Pravin Date: Mon, 15 Jul 2024 06:31:57 +0530 Subject: nvme-core: choose PIF from QPIF if QPIFS supports and PIF is QTYPE As per TP4141a: "If the Qualified Protection Information Format Support(QPIFS) bit is set to 1 and the Protection Information Format(PIF) field is set to 11b (i.e., Qualified Type), then the pif is as defined in the Qualified Protection Information Format (QPIF) field." So, choose PIF from QPIF if QPIFS supports and PIF is QTYPE. Signed-off-by: Francis Pravin Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 57e27e48c913..5c2290f0d5af 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -483,6 +483,9 @@ enum { NVME_ID_NS_NVM_STS_MASK = 0x7f, NVME_ID_NS_NVM_GUARD_SHIFT = 7, NVME_ID_NS_NVM_GUARD_MASK = 0x3, + NVME_ID_NS_NVM_QPIF_SHIFT = 9, + NVME_ID_NS_NVM_QPIF_MASK = 0xf, + NVME_ID_NS_NVM_QPIFS = 1 << 3, }; static inline __u8 nvme_elbaf_sts(__u32 elbaf) @@ -495,6 +498,11 @@ static inline __u8 nvme_elbaf_guard_type(__u32 elbaf) return (elbaf >> NVME_ID_NS_NVM_GUARD_SHIFT) & NVME_ID_NS_NVM_GUARD_MASK; } +static inline __u8 nvme_elbaf_qualified_guard_type(__u32 elbaf) +{ + return (elbaf >> NVME_ID_NS_NVM_QPIF_SHIFT) & NVME_ID_NS_NVM_QPIF_MASK; +} + struct nvme_id_ctrl_nvm { __u8 vsl; __u8 wzsl; @@ -574,6 +582,7 @@ enum { NVME_NVM_NS_16B_GUARD = 0, NVME_NVM_NS_32B_GUARD = 1, NVME_NVM_NS_64B_GUARD = 2, + NVME_NVM_NS_QTYPE_GUARD = 3, }; static inline __u8 nvme_lbaf_index(__u8 flbas) -- cgit v1.2.3 From de1177e56005dccd9729d4ca331de64f5d463b90 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 8 Jul 2024 09:47:57 +0200 Subject: virtio: make virtio_find_vqs() call virtio_find_vqs_ctx() In order to prepare for conversion of virtio_find_vqs*() arguments, make virtio_find_vqs() to call virtio_find_vqs_ctx() instead of op directly. Signed-off-by: Jiri Pirko Message-Id: <20240708074814.1739223-3-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index da9b271b54db..d19eaf6bafbf 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -225,22 +225,23 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, } static inline -int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs, +int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], + const char * const names[], const bool *ctx, struct irq_affinity *desc) { - return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, desc); + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, + desc); } static inline -int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs, +int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], const bool *ctx, + const char * const names[], struct irq_affinity *desc) { - return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, - desc); + return virtio_find_vqs_ctx(vdev, nvqs, vqs, callbacks, + names, NULL, desc); } /** -- cgit v1.2.3 From 959538c11a88298672195a0eb3289c0f2f88b02d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 8 Jul 2024 09:47:58 +0200 Subject: virtio: make virtio_find_single_vq() call virtio_find_vqs() In order to prepare for conversion of virtio_find_vqs*() arguments, make virtio_find_single_vq() to call virtio_find_vqs() instead of op directly. Signed-off-by: Jiri Pirko Message-Id: <20240708074814.1739223-4-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index d19eaf6bafbf..82a1d798b2f1 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -210,20 +210,6 @@ static inline bool virtio_has_dma_quirk(const struct virtio_device *vdev) return !virtio_has_feature(vdev, VIRTIO_F_ACCESS_PLATFORM); } -static inline -struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, - vq_callback_t *c, const char *n) -{ - vq_callback_t *callbacks[] = { c }; - const char *names[] = { n }; - struct virtqueue *vq; - int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL, - NULL); - if (err < 0) - return ERR_PTR(err); - return vq; -} - static inline int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], @@ -244,6 +230,20 @@ int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs, names, NULL, desc); } +static inline +struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, + vq_callback_t *c, const char *n) +{ + vq_callback_t *callbacks[] = { c }; + const char *names[] = { n }; + struct virtqueue *vq; + int err = virtio_find_vqs(vdev, 1, &vq, callbacks, names, NULL); + + if (err < 0) + return ERR_PTR(err); + return vq; +} + /** * virtio_synchronize_cbs - synchronize with virtqueue callbacks * @dev: the virtio device -- cgit v1.2.3 From c502eb85c34e29bb185e3d15dd9b8fce8a74f303 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 8 Jul 2024 09:47:59 +0200 Subject: virtio: introduce virtio_queue_info struct and find_vqs_info() config op Introduce a structure virtio_queue_info to carry name, callback and ctx together. In order to allow config implementations to accept config op with array of virtio_queue_info structures, introduce a new find_vqs_info() op. Do the needed conversion in virtio_find_vqs_ctx(). Note that whole virtio_find_vqs_ctx() is going to be eventually removed at the and of this patchset. Suggested-by: Xuan Zhuo Signed-off-by: Jiri Pirko Message-Id: <20240708074814.1739223-5-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 55 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 82a1d798b2f1..fb7a1fd00ee5 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -18,6 +18,20 @@ struct virtio_shm_region { typedef void vq_callback_t(struct virtqueue *); +/** + * struct virtqueue_info - Info for a virtqueue passed to find_vqs(). + * @name: virtqueue description. Used mainly for debugging, NULL for + * a virtqueue unused by the driver. + * @callback: A callback to invoke on a used buffer notification. + * NULL for a virtqueue that does not need a callback. + * @ctx: A flag to indicate to maintain an extra context per virtqueue. + */ +struct virtqueue_info { + const char *name; + vq_callback_t *callback; + bool ctx; +}; + /** * struct virtio_config_ops - operations for configuring a virtio device * Note: Do not assume that a transport implements all of the operations @@ -58,6 +72,12 @@ typedef void vq_callback_t(struct virtqueue *); * names: array of virtqueue names (mainly for debugging) * include a NULL entry for vqs unused by driver * Returns 0 on success or error status + * @find_vqs_info: find virtqueues and instantiate them. + * vdev: the virtio_device + * nvqs: the number of virtqueues to find + * vqs: on success, includes new virtqueues + * vqs_info: array of virtqueue info structures + * Returns 0 on success or error status * @del_vqs: free virtqueues found by find_vqs(). * @synchronize_cbs: synchronize with the virtqueue callbacks (optional) * The function guarantees that all memory operations on the @@ -109,6 +129,10 @@ struct virtio_config_ops { struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], const bool *ctx, struct irq_affinity *desc); + int (*find_vqs_info)(struct virtio_device *vdev, unsigned int nvqs, + struct virtqueue *vqs[], + struct virtqueue_info vqs_info[], + struct irq_affinity *desc); void (*del_vqs)(struct virtio_device *); void (*synchronize_cbs)(struct virtio_device *); u64 (*get_features)(struct virtio_device *vdev); @@ -117,7 +141,7 @@ struct virtio_config_ops { int (*set_vq_affinity)(struct virtqueue *vq, const struct cpumask *cpu_mask); const struct cpumask *(*get_vq_affinity)(struct virtio_device *vdev, - int index); + int index); bool (*get_shm_region)(struct virtio_device *vdev, struct virtio_shm_region *region, u8 id); int (*disable_vq_and_reset)(struct virtqueue *vq); @@ -210,14 +234,39 @@ static inline bool virtio_has_dma_quirk(const struct virtio_device *vdev) return !virtio_has_feature(vdev, VIRTIO_F_ACCESS_PLATFORM); } +static inline +int virtio_find_vqs_info(struct virtio_device *vdev, unsigned int nvqs, + struct virtqueue *vqs[], + struct virtqueue_info vqs_info[], + struct irq_affinity *desc) +{ + return vdev->config->find_vqs_info(vdev, nvqs, vqs, vqs_info, desc); +} + static inline int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], const bool *ctx, struct irq_affinity *desc) { - return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, - desc); + struct virtqueue_info *vqs_info; + int err, i; + + if (!vdev->config->find_vqs_info) + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, + names, ctx, desc); + + vqs_info = kmalloc_array(nvqs, sizeof(*vqs_info), GFP_KERNEL); + if (!vqs_info) + return -ENOMEM; + for (i = 0; i < nvqs; i++) { + vqs_info[i].name = names[i]; + vqs_info[i].callback = callbacks[i]; + vqs_info[i].ctx = ctx ? ctx[i] : false; + } + err = virtio_find_vqs_info(vdev, nvqs, vqs, vqs_info, desc); + kfree(vqs_info); + return err; } static inline -- cgit v1.2.3 From 992648f5a61f1c9fab800fbc42dea31727cd8a55 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 8 Jul 2024 09:48:02 +0200 Subject: virtio: call virtio_find_vqs_info() from virtio_find_single_vq() directly Since there are no more implementations of find_vqs() op, call virtio_find_vqs_info() from virtio_find_single_vq() directly. Signed-off-by: Jiri Pirko Message-Id: <20240708074814.1739223-8-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index fb7a1fd00ee5..23ce1aec54e8 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -283,10 +283,11 @@ static inline struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, vq_callback_t *c, const char *n) { - vq_callback_t *callbacks[] = { c }; - const char *names[] = { n }; + struct virtqueue_info vqs_info[] = { + { n, c }, + }; struct virtqueue *vq; - int err = virtio_find_vqs(vdev, 1, &vq, callbacks, names, NULL); + int err = virtio_find_vqs_info(vdev, 1, &vq, vqs_info, NULL); if (err < 0) return ERR_PTR(err); -- cgit v1.2.3 From 18cd029fd7f7a85d522973f34c413a2b009255ac Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 8 Jul 2024 09:48:03 +0200 Subject: virtio: remove the original find_vqs() op As it is no longer used, remove it. Signed-off-by: Jiri Pirko Message-Id: <20240708074814.1739223-9-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 23ce1aec54e8..1ae346bd2dbd 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -63,15 +63,6 @@ struct virtqueue_info { * After this, status and feature negotiation must be done again * Device must not be reset from its vq/config callbacks, or in * parallel with being added/removed. - * @find_vqs: find virtqueues and instantiate them. - * vdev: the virtio_device - * nvqs: the number of virtqueues to find - * vqs: on success, includes new virtqueues - * callbacks: array of callbacks, for each virtqueue - * include a NULL entry for vqs that do not need a callback - * names: array of virtqueue names (mainly for debugging) - * include a NULL entry for vqs unused by driver - * Returns 0 on success or error status * @find_vqs_info: find virtqueues and instantiate them. * vdev: the virtio_device * nvqs: the number of virtqueues to find @@ -125,10 +116,6 @@ struct virtio_config_ops { u8 (*get_status)(struct virtio_device *vdev); void (*set_status)(struct virtio_device *vdev, u8 status); void (*reset)(struct virtio_device *vdev); - int (*find_vqs)(struct virtio_device *, unsigned nvqs, - struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], const bool *ctx, - struct irq_affinity *desc); int (*find_vqs_info)(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], @@ -252,10 +239,6 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs, struct virtqueue_info *vqs_info; int err, i; - if (!vdev->config->find_vqs_info) - return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, - names, ctx, desc); - vqs_info = kmalloc_array(nvqs, sizeof(*vqs_info), GFP_KERNEL); if (!vqs_info) return -ENOMEM; -- cgit v1.2.3 From b49503eaf9c74c3d0efd1e8331aba37ce3c1fd68 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 8 Jul 2024 09:48:04 +0200 Subject: virtio: rename find_vqs_info() op to find_vqs() Since the original find_vqs() is no longer present, rename find_vqs_info() back to find_vqs(). Signed-off-by: Jiri Pirko Message-Id: <20240708074814.1739223-10-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 1ae346bd2dbd..7848fba7f48a 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -63,7 +63,7 @@ struct virtqueue_info { * After this, status and feature negotiation must be done again * Device must not be reset from its vq/config callbacks, or in * parallel with being added/removed. - * @find_vqs_info: find virtqueues and instantiate them. + * @find_vqs: find virtqueues and instantiate them. * vdev: the virtio_device * nvqs: the number of virtqueues to find * vqs: on success, includes new virtqueues @@ -116,10 +116,10 @@ struct virtio_config_ops { u8 (*get_status)(struct virtio_device *vdev); void (*set_status)(struct virtio_device *vdev, u8 status); void (*reset)(struct virtio_device *vdev); - int (*find_vqs_info)(struct virtio_device *vdev, unsigned int nvqs, - struct virtqueue *vqs[], - struct virtqueue_info vqs_info[], - struct irq_affinity *desc); + int (*find_vqs)(struct virtio_device *vdev, unsigned int nvqs, + struct virtqueue *vqs[], + struct virtqueue_info vqs_info[], + struct irq_affinity *desc); void (*del_vqs)(struct virtio_device *); void (*synchronize_cbs)(struct virtio_device *); u64 (*get_features)(struct virtio_device *vdev); @@ -227,7 +227,7 @@ int virtio_find_vqs_info(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue_info vqs_info[], struct irq_affinity *desc) { - return vdev->config->find_vqs_info(vdev, nvqs, vqs, vqs_info, desc); + return vdev->config->find_vqs(vdev, nvqs, vqs, vqs_info, desc); } static inline -- cgit v1.2.3 From 3e8d51c7765dac2a70fe2ad222a8ad244c9f29a4 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 8 Jul 2024 09:48:13 +0200 Subject: virtio: remove unused virtio_find_vqs() and virtio_find_vqs_ctx() helpers All callers of virtio_find_vqs() and virtio_find_vqs_ctx() were converted to use virtio_find_vqs_info(). Remove no longer used helpers. Signed-off-by: Jiri Pirko Message-Id: <20240708074814.1739223-19-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 7848fba7f48a..b6b615c3e21d 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -230,38 +230,6 @@ int virtio_find_vqs_info(struct virtio_device *vdev, unsigned int nvqs, return vdev->config->find_vqs(vdev, nvqs, vqs, vqs_info, desc); } -static inline -int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], const bool *ctx, - struct irq_affinity *desc) -{ - struct virtqueue_info *vqs_info; - int err, i; - - vqs_info = kmalloc_array(nvqs, sizeof(*vqs_info), GFP_KERNEL); - if (!vqs_info) - return -ENOMEM; - for (i = 0; i < nvqs; i++) { - vqs_info[i].name = names[i]; - vqs_info[i].callback = callbacks[i]; - vqs_info[i].ctx = ctx ? ctx[i] : false; - } - err = virtio_find_vqs_info(vdev, nvqs, vqs, vqs_info, desc); - kfree(vqs_info); - return err; -} - -static inline -int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], - struct irq_affinity *desc) -{ - return virtio_find_vqs_ctx(vdev, nvqs, vqs, callbacks, - names, NULL, desc); -} - static inline struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, vq_callback_t *c, const char *n) -- cgit v1.2.3 From 6c85d6b653caeba2ef982925703cbb4f2b3b3163 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 8 Jul 2024 09:48:14 +0200 Subject: virtio: rename virtio_find_vqs_info() to virtio_find_vqs() Since the original virtio_find_vqs() is no longer present, rename virtio_find_vqs_info() back to virtio_find_vqs(). Signed-off-by: Jiri Pirko Message-Id: <20240708074814.1739223-20-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index b6b615c3e21d..ab4b9a3fef6b 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -222,10 +222,10 @@ static inline bool virtio_has_dma_quirk(const struct virtio_device *vdev) } static inline -int virtio_find_vqs_info(struct virtio_device *vdev, unsigned int nvqs, - struct virtqueue *vqs[], - struct virtqueue_info vqs_info[], - struct irq_affinity *desc) +int virtio_find_vqs(struct virtio_device *vdev, unsigned int nvqs, + struct virtqueue *vqs[], + struct virtqueue_info vqs_info[], + struct irq_affinity *desc) { return vdev->config->find_vqs(vdev, nvqs, vqs, vqs_info, desc); } @@ -238,7 +238,7 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, { n, c }, }; struct virtqueue *vq; - int err = virtio_find_vqs_info(vdev, 1, &vq, vqs_info, NULL); + int err = virtio_find_vqs(vdev, 1, &vq, vqs_info, NULL); if (err < 0) return ERR_PTR(err); -- cgit v1.2.3 From a7526fe8b94eced7d82aa00b2bcca44e39ae0769 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2024 18:35:30 +0200 Subject: mm, slab: put should_failslab() back behind CONFIG_SHOULD_FAILSLAB Patch series "revert unconditional slab and page allocator fault injection calls". These two patches largely revert commits that added function call overhead into slab and page allocation hotpaths and that cannot be currently disabled even though related CONFIG_ options do exist. A much more involved solution that can keep the callsites always existing but hidden behind a static key if unused, is possible [1] and can be pursued by anyone who believes it's necessary. Meanwhile the fact the should_failslab() error injection is already not functional on kernels built with current gcc without anyone noticing [2], and lukewarm response to [1] suggests the need is not there. I believe it will be more fair to have the state after this series as a baseline for possible further optimisation, instead of the unconditional overhead. For example a possible compromise for anyone who's fine with an empty function call overhead but not the full CONFIG_FAILSLAB / CONFIG_FAIL_PAGE_ALLOC overhead is to reuse patch 1 from [1] but insert a static key check only inside should_failslab() and should_fail_alloc_page() before performing the more expensive checks. [1] https://lore.kernel.org/all/20240620-fault-injection-statickeys-v2-0-e23947d3d84b@suse.cz/#t [2] https://github.com/bpftrace/bpftrace/issues/3258 This patch (of 2): This mostly reverts commit 4f6923fbb352 ("mm: make should_failslab always available for fault injection"). The commit made should_failslab() a noinline function that's always called from the slab allocation hotpath, even if it's empty because CONFIG_SHOULD_FAILSLAB is not enabled, and there is no option to disable that call. This is visible in profiles and the function call overhead can be noticeable especially with cpu mitigations. Meanwhile the bpftrace program example in the commit silently does not work without CONFIG_SHOULD_FAILSLAB anyway with a recent gcc, because the empty function gets a .constprop clone that is actually being called (uselessly) from the slab hotpath, while the error injection is hooked to the original function that's not being called at all [1]. Thus put the whole should_failslab() function back behind CONFIG_SHOULD_FAILSLAB. It's not a complete revert of 4f6923fbb352 - the int return type that returns -ENOMEM on failure is preserved, as well ALLOW_ERROR_INJECTION annotation. The BTF_ID() record that was meanwhile added is also guarded by CONFIG_SHOULD_FAILSLAB. [1] https://github.com/bpftrace/bpftrace/issues/3258 Link: https://lkml.kernel.org/r/20240711-b4-fault-injection-reverts-v1-0-9e2651945d68@suse.cz Link: https://lkml.kernel.org/r/20240711-b4-fault-injection-reverts-v1-1-9e2651945d68@suse.cz Signed-off-by: Vlastimil Babka Cc: Akinobu Mita Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Christoph Lameter Cc: Daniel Borkmann Cc: David Rientjes Cc: Eduard Zingerman Cc: Hao Luo Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Martin KaFai Lau Cc: Mateusz Guzik Cc: Roman Gushchin Cc: Song Liu Cc: Stanislav Fomichev Cc: Yonghong Song Signed-off-by: Andrew Morton --- include/linux/fault-inject.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 6d5edef09d45..be6d0bc111ad 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -102,11 +102,10 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) } #endif /* CONFIG_FAIL_PAGE_ALLOC */ -int should_failslab(struct kmem_cache *s, gfp_t gfpflags); #ifdef CONFIG_FAILSLAB -extern bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags); +int should_failslab(struct kmem_cache *s, gfp_t gfpflags); #else -static inline bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) +static inline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) { return false; } -- cgit v1.2.3 From 53dabce2652fb854eae84609ce9c37429d5d87ba Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 11 Jul 2024 18:35:31 +0200 Subject: mm, page_alloc: put should_fail_alloc_page() back behing CONFIG_FAIL_PAGE_ALLOC This mostly reverts commit af3b854492f3 ("mm/page_alloc.c: allow error injection"). The commit made should_fail_alloc_page() a noinline function that's always called from the page allocation hotpath, even if it's empty because CONFIG_FAIL_PAGE_ALLOC is not enabled, and there is no option to disable it and prevent the associated function call overhead. As with the preceding patch "mm, slab: put should_failslab back behind CONFIG_SHOULD_FAILSLAB" and for the same reasons, put the should_fail_alloc_page() back behind the config option. When enabled, the ALLOW_ERROR_INJECTION and BTF_ID records are preserved so it's not a complete revert. Link: https://lkml.kernel.org/r/20240711-b4-fault-injection-reverts-v1-2-9e2651945d68@suse.cz Signed-off-by: Vlastimil Babka Cc: Akinobu Mita Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Christoph Lameter Cc: Daniel Borkmann Cc: David Rientjes Cc: Eduard Zingerman Cc: Hao Luo Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Martin KaFai Lau Cc: Mateusz Guzik Cc: Roman Gushchin Cc: Song Liu Cc: Stanislav Fomichev Cc: Yonghong Song Signed-off-by: Andrew Morton --- include/linux/fault-inject.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index be6d0bc111ad..354413950d34 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -91,12 +91,10 @@ static inline void fault_config_init(struct fault_config *config, struct kmem_cache; -bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); - #ifdef CONFIG_FAIL_PAGE_ALLOC -bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); +bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); #else -static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return false; } -- cgit v1.2.3 From 4810a82c8a8ae06fe6496a23fcb89a4952603e60 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 11 Jul 2024 15:04:55 -0700 Subject: lib: add missing newline character in the warning message Link: https://lkml.kernel.org/r/20240711220457.1751071-1-surenb@google.com Fixes: 22d407b164ff ("lib: add allocation tagging support for memory allocation profiling") Signed-off-by: Suren Baghdasaryan Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index abd24016a900..8c61ccd161ba 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -122,7 +122,7 @@ static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag "alloc_tag was not cleared (got tag for %s:%u)\n", ref->ct->filename, ref->ct->lineno); - WARN_ONCE(!tag, "current->alloc_tag not set"); + WARN_ONCE(!tag, "current->alloc_tag not set\n"); } static inline void alloc_tag_sub_check(union codetag_ref *ref) -- cgit v1.2.3 From fd8acc0097b91fab3104fa8a66ce2fd9cf8b0c11 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 11 Jul 2024 15:04:56 -0700 Subject: lib: reuse page_ext_data() to obtain codetag_ref codetag_ref_from_page_ext() reimplements the same calculation as page_ext_data(). Reuse existing function instead. Link: https://lkml.kernel.org/r/20240711220457.1751071-2-surenb@google.com Fixes: dcfe378c81f7 ("lib: introduce support for page allocation tagging") Signed-off-by: Suren Baghdasaryan Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 9cacadbd61f8..acb1e9ce7981 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -15,7 +15,7 @@ extern struct page_ext_operations page_alloc_tagging_ops; static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext) { - return (void *)page_ext + page_alloc_tagging_ops.offset; + return (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); } static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref) -- cgit v1.2.3 From 6ab42fe21c84d72da752923b4bd7075344f4a362 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 11 Jul 2024 15:04:57 -0700 Subject: alloc_tag: fix page_ext_get/page_ext_put sequence during page splitting pgalloc_tag_sub() might call page_ext_put() using a page different from the one used in page_ext_get() call. This does not pose an issue since page_ext_put() ignores this parameter as long as it's non-NULL but technically this is wrong. Fix it by storing the original page used in page_ext_get() and passing it to page_ext_put(). Link: https://lkml.kernel.org/r/20240711220457.1751071-3-surenb@google.com Fixes: be25d1d4e822 ("mm: create new codetag references during page splitting") Signed-off-by: Suren Baghdasaryan Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index acb1e9ce7981..18cd0c0c73d9 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -71,6 +71,7 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) static inline void pgalloc_tag_split(struct page *page, unsigned int nr) { int i; + struct page_ext *first_page_ext; struct page_ext *page_ext; union codetag_ref *ref; struct alloc_tag *tag; @@ -78,7 +79,7 @@ static inline void pgalloc_tag_split(struct page *page, unsigned int nr) if (!mem_alloc_profiling_enabled()) return; - page_ext = page_ext_get(page); + first_page_ext = page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; @@ -94,7 +95,7 @@ static inline void pgalloc_tag_split(struct page *page, unsigned int nr) page_ext = page_ext_next(page_ext); } out: - page_ext_put(page_ext); + page_ext_put(first_page_ext); } static inline struct alloc_tag *pgalloc_tag_get(struct page *page) -- cgit v1.2.3 From 667574e873b5f77a220b2a93329689f36fb56d5d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 12 Jul 2024 11:13:14 +0800 Subject: mm/hugetlb: fix possible recursive locking detected warning When tries to demote 1G hugetlb folios, a lockdep warning is observed: ============================================ WARNING: possible recursive locking detected 6.10.0-rc6-00452-ga4d0275fa660-dirty #79 Not tainted -------------------------------------------- bash/710 is trying to acquire lock: ffffffff8f0a7850 (&h->resize_lock){+.+.}-{3:3}, at: demote_store+0x244/0x460 but task is already holding lock: ffffffff8f0a6f48 (&h->resize_lock){+.+.}-{3:3}, at: demote_store+0xae/0x460 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&h->resize_lock); lock(&h->resize_lock); *** DEADLOCK *** May be due to missing lock nesting notation 4 locks held by bash/710: #0: ffff8f118439c3f0 (sb_writers#5){.+.+}-{0:0}, at: ksys_write+0x64/0xe0 #1: ffff8f11893b9e88 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0xf8/0x1d0 #2: ffff8f1183dc4428 (kn->active#98){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x100/0x1d0 #3: ffffffff8f0a6f48 (&h->resize_lock){+.+.}-{3:3}, at: demote_store+0xae/0x460 stack backtrace: CPU: 3 PID: 710 Comm: bash Not tainted 6.10.0-rc6-00452-ga4d0275fa660-dirty #79 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x68/0xa0 __lock_acquire+0x10f2/0x1ca0 lock_acquire+0xbe/0x2d0 __mutex_lock+0x6d/0x400 demote_store+0x244/0x460 kernfs_fop_write_iter+0x12c/0x1d0 vfs_write+0x380/0x540 ksys_write+0x64/0xe0 do_syscall_64+0xb9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fa61db14887 RSP: 002b:00007ffc56c48358 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fa61db14887 RDX: 0000000000000002 RSI: 000055a030050220 RDI: 0000000000000001 RBP: 000055a030050220 R08: 00007fa61dbd1460 R09: 000000007fffffff R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002 R13: 00007fa61dc1b780 R14: 00007fa61dc17600 R15: 00007fa61dc16a00 Lockdep considers this an AA deadlock because the different resize_lock mutexes reside in the same lockdep class, but this is a false positive. Place them in distinct classes to avoid these warnings. Link: https://lkml.kernel.org/r/20240712031314.2570452-1-linmiaohe@huawei.com Fixes: 8531fc6f52f5 ("hugetlb: add hugetlb demote page support") Signed-off-by: Miaohe Lin Acked-by: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ecd0ceeea206..c9bf68c239a0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -663,6 +663,7 @@ HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) /* Defines one hugetlb page size */ struct hstate { struct mutex resize_lock; + struct lock_class_key resize_key; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; -- cgit v1.2.3 From 67856f44da381973caf4eb692ad2cca1de7b2d37 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 15 Jul 2024 08:25:33 +0300 Subject: ia64: scrub ia64 from poison.h Link: https://lkml.kernel.org/r/c72e5467-06a8-4739-ae6a-7c84c96cad77@p183 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- include/linux/poison.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index 1f0ee2459f2a..a265b499033b 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -52,12 +52,6 @@ /********** arch/$ARCH/mm/init.c **********/ #define POISON_FREE_INITMEM 0xcc -/********** arch/ia64/hp/common/sba_iommu.c **********/ -/* - * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a - * value of "SBAIOMMU POISON\0" for spill-over poisoning. - */ - /********** fs/jbd/journal.c **********/ #define JBD_POISON_FREE 0x5b #define JBD2_POISON_FREE 0x5c -- cgit v1.2.3 From c14112a5574ff5cf3de198ab6eeff53ac1234068 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 17 Jul 2024 10:29:16 -0700 Subject: driver core: auxiliary bus: Fix documentation of auxiliary_device Fix the documentation of the below field of struct auxiliary_device include/linux/auxiliary_bus.h:150: warning: Function parameter or struct member 'sysfs' not described in 'auxiliary_device' include/linux/auxiliary_bus.h:150: warning: Excess struct member 'irqs' description in 'auxiliary_device' include/linux/auxiliary_bus.h:150: warning: Excess struct member 'lock' description in 'auxiliary_device' include/linux/auxiliary_bus.h:150: warning: Excess struct member 'irq_dir_exists' description in 'auxiliary_device' Fixes: a808878308a8 ("driver core: auxiliary bus: show auxiliary device IRQs") Reported-by: Stephen Rothwell Signed-off-by: Shay Drory Signed-off-by: Saeed Mahameed Link: https://patch.msgid.link/20240717172916.595808-1-saeed@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/auxiliary_bus.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index 3ba4487c9cd9..1539bbd263d2 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -58,9 +58,10 @@ * in * @name: Match name found by the auxiliary device driver, * @id: unique identitier if multiple devices of the same name are exported, - * @irqs: irqs xarray contains irq indices which are used by the device, - * @lock: Synchronize irq sysfs creation, - * @irq_dir_exists: whether "irqs" directory exists, + * @sysfs: embedded struct which hold all sysfs related fields, + * @sysfs.irqs: irqs xarray contains irq indices which are used by the device, + * @sysfs.lock: Synchronize irq sysfs creation, + * @sysfs.irq_dir_exists: whether "irqs" directory exists, * * An auxiliary_device represents a part of its parent device's functionality. * It is given a name that, combined with the registering drivers -- cgit v1.2.3 From f2f6a8e8871725035959b90bac048cde555aa0e9 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 18 Jul 2024 13:06:47 +0100 Subject: init/Kconfig: remove CONFIG_GCC_ASM_GOTO_OUTPUT_WORKAROUND Several versions of GCC mis-compile asm goto with outputs. We try to workaround this, but our workaround is demonstrably incomplete and liable to result in subtle bugs, especially on arm64 where get_user() has recently been moved over to using asm goto with outputs. From discussion(s) with Linus at: https://lore.kernel.org/linux-arm-kernel/Zpfv2tnlQ-gOLGac@J2N7QTR9R3.cambridge.arm.com/ https://lore.kernel.org/linux-arm-kernel/ZpfxLrJAOF2YNqCk@J2N7QTR9R3.cambridge.arm.com/ ... it sounds like the best thing to do for now is to remove the workaround and make CC_HAS_ASM_GOTO_OUTPUT depend on working compiler versions. The issue was originally reported to GCC by Sean Christopherson: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 ... and Jakub Jelinek fixed this for GCC 14, with the fix backported to 13.3.0, 12.4.0, and 11.5.0. In the kernel, we tried to workaround broken compilers in commits: 4356e9f841f7 ("work around gcc bugs with 'asm goto' with outputs") 68fb3ca0e408 ("update workarounds for gcc "asm goto" issue") ... but the workaround of adding an empty asm("") after the asm volatile goto(...) demonstrably does not always avoid the problem, as can be seen in the following test case: | #define asm_goto_output(x...) \ | do { asm volatile goto(x); asm (""); } while (0) | | #define __good_or_bad(__val, __key) \ | do { \ | __label__ __failed; \ | unsigned long __tmp; \ | asm_goto_output( \ | " cbnz %[key], %l[__failed]\n" \ | " mov %[val], #0x900d\n" \ | : [val] "=r" (__tmp) \ | : [key] "r" (__key) \ | : \ | : __failed); \ | (__val) = __tmp; \ | break; \ | __failed: \ | (__val) = 0xbad; \ | } while (0) | | unsigned long get_val(unsigned long key); | unsigned long get_val(unsigned long key) | { | unsigned long val = 0xbad; | | __good_or_bad(val, key); | | return val; | } GCC 13.2.0 (at -O2) compiles this to: | cbnz x0, .Lfailed | mov x0, #0x900d | .Lfailed: | ret GCC 14.1.0 (at -O2) compiles this to: | cbnz x0, .Lfailed | mov x0, #0x900d | ret | .Lfailed: | mov x0, #0xbad | ret Note that GCC 13.2.0 erroneously omits the assignment to 'val' in the error path (even though this does not depend on an output of the asm goto). GCC 14.1.0 correctly retains the assignment. This problem can be seen within the kernel with the following test case: | #include | #include | | noinline unsigned long test_unsafe_get_user(unsigned long __user *ptr); | noinline unsigned long test_unsafe_get_user(unsigned long __user *ptr) | { | unsigned long val; | | unsafe_get_user(val, ptr, Efault); | return val; | | Efault: | val = 0x900d; | return val; | } GCC 13.2.0 (arm64 defconfig) compiles this to: | and x0, x0, #0xff7fffffffffffff | ldtr x0, [x0] | .Lextable_fixup: | ret GCC 13.2.0 (x86_64 defconfig + MITIGATION_RETPOLINE=n) compiles this to: | endbr64 | mov (%rdi),%rax | .Lextable_fixup: | ret ... omitting the assignment to 'val' in the error path, and leaving garbage in the result register returned by the function (which happens to contain the faulting address in the generated code). GCC 14.1.0 (arm64 defconfig) compiles this to: | and x0, x0, #0xff7fffffffffffff | ldtr x0, [x0] | ret | .Lextable_fixup: | mov x0, #0x900d // #36877 | ret GCC 14.1.0 (x86_64 defconfig + MITIGATION_RETPOLINE=n) compiles this to: | endbr64 | mov (%rdi),%rax | ret | .Lextable_fixup: | mov $0x900d,%eax | ret ... retaining the expected assignment to 'val' in the error path. We don't have a complete and reasonable workaround. While placing empty asm("") blocks after each goto label *might* be sufficient, we don't know for certain, this is tedious and error-prone, and there doesn't seem to be a neat way to wrap this up (which is especially painful for cases with multiple goto labels). Avoid this issue by disabling CONFIG_CC_HAS_ASM_GOTO_OUTPUT for known-broken compiler versions and removing the workaround (along with the CONFIG_GCC_ASM_GOTO_OUTPUT_WORKAROUND config option). For the moment I've left the default implementation of asm_goto_output() unchanged. This should now be redundant since any compiler with the fix for the clobbering issue whould also have a fix for the (earlier) volatile issue, but it's far less churny to leave it around, which makes it easier to backport this patch if necessary. Signed-off-by: Mark Rutland Cc: Alex Coplan Cc: Catalin Marinas Cc: Jakub Jelinek Cc: Peter Zijlstra Cc: Sean Christopherson Cc: Szabolcs Nagy Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index aff92b1d284f..f805adaa316e 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -64,26 +64,6 @@ __builtin_unreachable(); \ } while (0) -/* - * GCC 'asm goto' with outputs miscompiles certain code sequences: - * - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 - * - * Work around it via the same compiler barrier quirk that we used - * to use for the old 'asm goto' workaround. - * - * Also, always mark such 'asm goto' statements as volatile: all - * asm goto statements are supposed to be volatile as per the - * documentation, but some versions of gcc didn't actually do - * that for asms with outputs: - * - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619 - */ -#ifdef CONFIG_GCC_ASM_GOTO_OUTPUT_WORKAROUND -#define asm_goto_output(x...) \ - do { asm volatile goto(x); asm (""); } while (0) -#endif - #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ -- cgit v1.2.3 From 7d189c77106ed6df09829f7a419e35ada67b2bd0 Mon Sep 17 00:00:00 2001 From: Shivamurthy Shastri Date: Wed, 26 Jun 2024 21:05:12 +0200 Subject: PCI/MSI: Provide MSI_FLAG_PCI_MSI_MASK_PARENT Most ARM(64) PCI/MSI domains mask and unmask in the parent domain after or before the PCI mask/unmask operation takes place. So there are more than a dozen of the same wrapper implementation all over the place. Don't make the same mistake with the new per device PCI/MSI domains and provide a new MSI feature flag, which lets the domain implementation enable this sequence in the PCI/MSI code. Signed-off-by: Shivamurthy Shastri Signed-off-by: Thomas Gleixner Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/87ed8j34pj.ffs@tglx --- include/linux/msi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index dc27cf3903d5..04f33e7f6f8b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -556,6 +556,8 @@ enum { MSI_FLAG_USE_DEV_FWNODE = (1 << 7), /* Set parent->dev into domain->pm_dev on device domain creation */ MSI_FLAG_PARENT_PM_DEV = (1 << 8), + /* Support for parent mask/unmask */ + MSI_FLAG_PCI_MSI_MASK_PARENT = (1 << 9), /* Mask for the generic functionality */ MSI_GENERIC_FLAGS_MASK = GENMASK(15, 0), -- cgit v1.2.3 From f6a9886a9e55a1e6bd78c7e505205d05ef50a71e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Jun 2024 17:18:50 +0200 Subject: genirq/msi: Remove platform_msi_create_device_domain() No more users. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Shivamurthy Shastri Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240623142235.395577449@linutronix.de --- include/linux/msi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 04f33e7f6f8b..4ae036d0c7db 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -660,8 +660,6 @@ __platform_msi_create_device_domain(struct device *dev, const struct irq_domain_ops *ops, void *host_data); -#define platform_msi_create_device_domain(dev, nvec, write, ops, data) \ - __platform_msi_create_device_domain(dev, nvec, false, write, ops, data) #define platform_msi_create_device_tree_domain(dev, nvec, write, ops, data) \ __platform_msi_create_device_domain(dev, nvec, true, write, ops, data) -- cgit v1.2.3 From e9894248994ca8291838baf063f045eab28e5a0e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Jun 2024 17:19:05 +0200 Subject: genirq/msi: Remove platform MSI leftovers No more users! Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Shivamurthy Shastri Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240623142235.943295676@linutronix.de --- include/linux/msi.h | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 4ae036d0c7db..4c3462a6a97b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -81,7 +81,6 @@ extern int pci_msi_ignore_mask; /* Helper functions */ struct msi_desc; struct pci_dev; -struct platform_msi_priv_data; struct device_attribute; struct irq_domain; struct irq_affinity_desc; @@ -231,14 +230,12 @@ struct msi_dev_domain { /** * msi_device_data - MSI per device data * @properties: MSI properties which are interesting to drivers - * @platform_data: Platform-MSI specific data * @mutex: Mutex protecting the MSI descriptor store * @__domains: Internal data for per device MSI domains * @__iter_idx: Index to search the next entry for iterators */ struct msi_device_data { unsigned long properties; - struct platform_msi_priv_data *platform_data; struct mutex mutex; struct msi_dev_domain __domains[MSI_MAX_DEVICE_IRQDOMAINS]; unsigned long __iter_idx; @@ -641,33 +638,6 @@ void msi_domain_free_irqs_all(struct device *dev, unsigned int domid); struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain); -struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode, - struct msi_domain_info *info, - struct irq_domain *parent); - -/* When an MSI domain is used as an intermediate domain */ -int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, - int nvec, msi_alloc_info_t *args); -int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, - int virq, int nvec, msi_alloc_info_t *args); -void msi_domain_depopulate_descs(struct device *dev, int virq, int nvec); - -struct irq_domain * -__platform_msi_create_device_domain(struct device *dev, - unsigned int nvec, - bool is_tree, - irq_write_msi_msg_t write_msi_msg, - const struct irq_domain_ops *ops, - void *host_data); - -#define platform_msi_create_device_tree_domain(dev, nvec, write, ops, data) \ - __platform_msi_create_device_domain(dev, nvec, true, write, ops, data) - -int platform_msi_device_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs); -void platform_msi_device_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nvec); -void *platform_msi_get_host_data(struct irq_domain *domain); /* Per device platform MSI */ int platform_device_msi_init_and_alloc_irqs(struct device *dev, unsigned int nvec, irq_write_msi_msg_t write_msi_msg); -- cgit v1.2.3 From 2fdda02a8749fdaff5621c96aaf24a61d2f8c5a2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Jun 2024 17:19:07 +0200 Subject: genirq/msi: Move msi_device_data to core Now that the platform MSI hack is gone, nothing needs to know about struct msi_device_data outside of the core code. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Shivamurthy Shastri Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240623142236.003295177@linutronix.de --- include/linux/msi.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 4c3462a6a97b..369367ecae5e 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -21,11 +21,7 @@ #include #include #include -#include -#include -#include #include -#include #include @@ -227,20 +223,6 @@ struct msi_dev_domain { struct irq_domain *domain; }; -/** - * msi_device_data - MSI per device data - * @properties: MSI properties which are interesting to drivers - * @mutex: Mutex protecting the MSI descriptor store - * @__domains: Internal data for per device MSI domains - * @__iter_idx: Index to search the next entry for iterators - */ -struct msi_device_data { - unsigned long properties; - struct mutex mutex; - struct msi_dev_domain __domains[MSI_MAX_DEVICE_IRQDOMAINS]; - unsigned long __iter_idx; -}; - int msi_setup_device_data(struct device *dev); void msi_lock_descs(struct device *dev); -- cgit v1.2.3 From a97b43fac5b9b3ffca71b8a917a249789902fce9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 18 Jul 2024 17:17:10 -0400 Subject: lockdep: Add comments for lockdep_set_no{validate,track}_class() Cc: Waiman Long Signed-off-by: Kent Overstreet --- include/linux/lockdep.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b76f1bcd2f7f..0759e30df392 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -178,9 +178,24 @@ static inline void lockdep_init_map(struct lockdep_map *lock, const char *name, (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) +/** + * lockdep_set_novalidate_class: disable checking of lock ordering on a given + * lock + * @lock: Lock to mark + * + * Lockdep will still record that this lock has been taken, and print held + * instances when dumping locks + */ #define lockdep_set_novalidate_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock) +/** + * lockdep_set_notrack_class: disable lockdep tracking of a given lock entirely + * @lock: Lock to mark + * + * Bigger hammer than lockdep_set_novalidate_class: so far just for bcachefs, + * which takes more locks than lockdep is able to track (48). + */ #define lockdep_set_notrack_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_track__, #lock) -- cgit v1.2.3 From c8f51feee135f37f0d77b4616083c25524daa7b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Jul 2024 11:29:01 +0000 Subject: block: remove QUEUE_FLAG_STOPPED QUEUE_FLAG_STOPPED is entirely unused. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240719112912.3830443-5-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dce4a6bf7307..942ad4e0f231 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -588,7 +588,6 @@ struct request_queue { }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ -#define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ @@ -608,7 +607,6 @@ struct request_queue { void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); -#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) -- cgit v1.2.3 From 3dff6155733f25872530ad358c6f5559800f4ccb Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 19 Jul 2024 11:29:02 +0000 Subject: block: Relocate BLK_MQ_CPU_WORK_BATCH BLK_MQ_CPU_WORK_BATCH is defined in include/linux/blk-mq.h, but only used in blk-mq.c, so relocate to block/blk-mq.h Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240719112912.3830443-6-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 89ba6b16fe8b..df775a203a4d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -672,8 +672,6 @@ enum { BLK_MQ_S_INACTIVE = 3, BLK_MQ_MAX_DEPTH = 10240, - - BLK_MQ_CPU_WORK_BATCH = 8, }; #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ -- cgit v1.2.3 From 793356d23f8a817e164a917c792741a6d6d651ed Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 19 Jul 2024 11:29:03 +0000 Subject: block: Relocate BLK_MQ_MAX_DEPTH BLK_MQ_MAX_DEPTH is defined as an enumerated value, but has no real relation to the other members in its enum, so just use #define to provide the definition. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240719112912.3830443-7-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index df775a203a4d..8a84f49468d5 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -670,8 +670,6 @@ enum { /* hw queue is inactive after all its CPUs become offline */ BLK_MQ_S_INACTIVE = 3, - - BLK_MQ_MAX_DEPTH = 10240, }; #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ @@ -680,6 +678,7 @@ enum { ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) +#define BLK_MQ_MAX_DEPTH (10240) #define BLK_MQ_NO_HCTX_IDX (-1U) struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, -- cgit v1.2.3 From 55177adf1837bc56f878f7f6f7123947a2088148 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 19 Jul 2024 11:29:04 +0000 Subject: block: Make QUEUE_FLAG_x as an enum This will allow us better keep in sync with blk_queue_flag_name[]. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240719112912.3830443-8-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 942ad4e0f231..ded46fad67a0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -588,19 +588,22 @@ struct request_queue { }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ -#define QUEUE_FLAG_DYING 1 /* queue being torn down */ -#define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ -#define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ -#define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ -#define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ -#define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ -#define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ -#define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ -#define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ -#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ -#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ -#define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ +enum { + QUEUE_FLAG_DYING, /* queue being torn down */ + QUEUE_FLAG_NOMERGES, /* disable merge attempts */ + QUEUE_FLAG_SAME_COMP, /* complete on same CPU-group */ + QUEUE_FLAG_FAIL_IO, /* fake timeout */ + QUEUE_FLAG_NOXMERGES, /* No extended merges */ + QUEUE_FLAG_SAME_FORCE, /* force complete on same CPU */ + QUEUE_FLAG_INIT_DONE, /* queue is initialized */ + QUEUE_FLAG_STATS, /* track IO start and completion times */ + QUEUE_FLAG_REGISTERED, /* queue has been registered to a disk */ + QUEUE_FLAG_QUIESCED, /* queue has been quiesced */ + QUEUE_FLAG_RQ_ALLOC_TIME, /* record rq->alloc_time_ns */ + QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */ + QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */ + QUEUE_FLAG_MAX +}; #define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) -- cgit v1.2.3 From 23827310cce7eff3477aeaeb59ea3718f5c9c633 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 19 Jul 2024 11:29:06 +0000 Subject: block: Catch possible entries missing from hctx_state_name[] Add a build-time assert that we are not missing entries from hctx_state_name[]. For this, create a separate enum for state flags and add a "max" entry for BLK_MQ_S_x flags. The numbering for those enum values is as default, so don't explicitly number. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240719112912.3830443-10-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8a84f49468d5..4905a1d67551 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -663,13 +663,6 @@ enum { BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, - - BLK_MQ_S_STOPPED = 0, - BLK_MQ_S_TAG_ACTIVE = 1, - BLK_MQ_S_SCHED_RESTART = 2, - - /* hw queue is inactive after all its CPUs become offline */ - BLK_MQ_S_INACTIVE = 3, }; #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ @@ -681,6 +674,16 @@ enum { #define BLK_MQ_MAX_DEPTH (10240) #define BLK_MQ_NO_HCTX_IDX (-1U) +enum { + /* Keep hctx_state_name[] in sync with the definitions below */ + BLK_MQ_S_STOPPED, + BLK_MQ_S_TAG_ACTIVE, + BLK_MQ_S_SCHED_RESTART, + /* hw queue is inactive after all its CPUs become offline */ + BLK_MQ_S_INACTIVE, + BLK_MQ_S_MAX +}; + struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass); -- cgit v1.2.3 From 226f0f6afc3e5c8903c6e57e1f6073ad8ad189b5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 19 Jul 2024 11:29:07 +0000 Subject: block: Catch possible entries missing from hctx_flag_name[] Refresh values in BLK_MQ_F_x enum, and then re-arrange members in hctx_flag_name[] to match that enum. Renumber BLK_MQ_F_ALLOC_POLICY_START_BIT to match the value refresh. Add a BUILD_BUG_ON() call to ensure that we are not missing entries in hctx_flag_name[]. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240719112912.3830443-11-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4905a1d67551..27241009c8f9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -644,6 +644,7 @@ struct blk_mq_ops { #endif }; +/* Keep hctx_flag_name[] in sync with the definitions below */ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, @@ -653,15 +654,16 @@ enum { */ BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, - BLK_MQ_F_BLOCKING = 1 << 5, + BLK_MQ_F_BLOCKING = 1 << 4, /* Do not allow an I/O scheduler to be configured. */ - BLK_MQ_F_NO_SCHED = 1 << 6, + BLK_MQ_F_NO_SCHED = 1 << 5, + /* * Select 'none' during queue registration in case of a single hwq * or shared hwqs instead of 'mq-deadline'. */ - BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7, - BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, + BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6, + BLK_MQ_F_ALLOC_POLICY_START_BIT = 7, BLK_MQ_F_ALLOC_POLICY_BITS = 1, }; #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ -- cgit v1.2.3 From 26d3bdb57ec3fa56eaf8d2e74b5d488e55f43013 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 19 Jul 2024 11:29:08 +0000 Subject: block: Catch possible entries missing from alloc_policy_name[] Make BLK_TAG_ALLOC_x an enum and add a "max" entry. Add a BUILD_BUG_ON() call to ensure that we are not missing entries in hctx_flag_name[]. Reviewed-by: Bart Van Assche Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240719112912.3830443-12-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 27241009c8f9..a64a50a0edf7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -278,8 +278,12 @@ enum blk_eh_timer_return { BLK_EH_RESET_TIMER, }; -#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ -#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ +/* Keep alloc_policy_name[] in sync with the definitions below */ +enum { + BLK_TAG_ALLOC_FIFO, /* allocate starting from 0 */ + BLK_TAG_ALLOC_RR, /* allocate starting from last allocated tag */ + BLK_TAG_ALLOC_MAX +}; /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware -- cgit v1.2.3 From 6fa99325ec86bcd442363d77561a1babd8d9a427 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 19 Jul 2024 11:29:09 +0000 Subject: block: Catch possible entries missing from cmd_flag_name[] Add a BUILD_BUG_ON() call to ensure that we are not missing entries in cmd_flag_name[]. Reviewed-by: Bart Van Assche Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240719112912.3830443-13-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 632edd71f8c6..36ed96133217 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -354,6 +354,7 @@ enum req_op { REQ_OP_LAST = (__force blk_opf_t)36, }; +/* Keep cmd_flag_name[] in sync with the definitions below */ enum req_flag_bits { __REQ_FAILFAST_DEV = /* no driver retries of device errors */ REQ_OP_BITS, -- cgit v1.2.3 From 5f89154e8e9e3445f9b592e58a7045e06153b822 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 19 Jul 2024 11:29:10 +0000 Subject: block: Use enum to define RQF_x bit indexes Similar to what we do for enum req_flag_bits, divide the definition of RQF_x flags into an enum to declare the bits and an actual flag. Tweak some comments to not spill onto new lines. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240719112912.3830443-14-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 86 +++++++++++++++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a64a50a0edf7..af52ec6a1ed5 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -27,38 +27,60 @@ typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); * request flags */ typedef __u32 __bitwise req_flags_t; -/* drive already may have started this one */ -#define RQF_STARTED ((__force req_flags_t)(1 << 1)) -/* request for flush sequence */ -#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) -/* merge of different types, fail separately */ -#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) -/* don't call prep for this one */ -#define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) -/* use hctx->sched_tags */ -#define RQF_SCHED_TAGS ((__force req_flags_t)(1 << 8)) -/* use an I/O scheduler for this request */ -#define RQF_USE_SCHED ((__force req_flags_t)(1 << 9)) -/* vaguely specified driver internal error. Ignored by the block layer */ -#define RQF_FAILED ((__force req_flags_t)(1 << 10)) -/* don't warn about errors */ -#define RQF_QUIET ((__force req_flags_t)(1 << 11)) -/* account into disk and partition IO statistics */ -#define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) -/* runtime pm request */ -#define RQF_PM ((__force req_flags_t)(1 << 15)) -/* on IO scheduler merge hash */ -#define RQF_HASHED ((__force req_flags_t)(1 << 16)) -/* track IO completion time */ -#define RQF_STATS ((__force req_flags_t)(1 << 17)) -/* Look at ->special_vec for the actual data payload instead of the - bio chain. */ -#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) -/* The request completion needs to be signaled to zone write pluging. */ -#define RQF_ZONE_WRITE_PLUGGING ((__force req_flags_t)(1 << 20)) -/* ->timeout has been called, don't expire again */ -#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) -#define RQF_RESV ((__force req_flags_t)(1 << 23)) +enum { + /* drive already may have started this one */ + __RQF_STARTED, + /* request for flush sequence */ + __RQF_FLUSH_SEQ, + /* merge of different types, fail separately */ + __RQF_MIXED_MERGE, + /* don't call prep for this one */ + __RQF_DONTPREP, + /* use hctx->sched_tags */ + __RQF_SCHED_TAGS, + /* use an I/O scheduler for this request */ + __RQF_USE_SCHED, + /* vaguely specified driver internal error. Ignored by block layer */ + __RQF_FAILED, + /* don't warn about errors */ + __RQF_QUIET, + /* account into disk and partition IO statistics */ + __RQF_IO_STAT, + /* runtime pm request */ + __RQF_PM, + /* on IO scheduler merge hash */ + __RQF_HASHED, + /* track IO completion time */ + __RQF_STATS, + /* Look at ->special_vec for the actual data payload instead of the + bio chain. */ + __RQF_SPECIAL_PAYLOAD, + /* request completion needs to be signaled to zone write plugging. */ + __RQF_ZONE_WRITE_PLUGGING, + /* ->timeout has been called, don't expire again */ + __RQF_TIMED_OUT, + __RQF_RESV, + __RQF_BITS +}; + +#define RQF_STARTED ((__force req_flags_t)(1 << __RQF_STARTED)) +#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << __RQF_FLUSH_SEQ)) +#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << __RQF_MIXED_MERGE)) +#define RQF_DONTPREP ((__force req_flags_t)(1 << __RQF_DONTPREP)) +#define RQF_SCHED_TAGS ((__force req_flags_t)(1 << __RQF_SCHED_TAGS)) +#define RQF_USE_SCHED ((__force req_flags_t)(1 << __RQF_USE_SCHED)) +#define RQF_FAILED ((__force req_flags_t)(1 << __RQF_FAILED)) +#define RQF_QUIET ((__force req_flags_t)(1 << __RQF_QUIET)) +#define RQF_IO_STAT ((__force req_flags_t)(1 << __RQF_IO_STAT)) +#define RQF_PM ((__force req_flags_t)(1 << __RQF_PM)) +#define RQF_HASHED ((__force req_flags_t)(1 << __RQF_HASHED)) +#define RQF_STATS ((__force req_flags_t)(1 << __RQF_STATS)) +#define RQF_SPECIAL_PAYLOAD \ + ((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD)) +#define RQF_ZONE_WRITE_PLUGGING \ + ((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING)) +#define RQF_TIMED_OUT ((__force req_flags_t)(1 << __RQF_TIMED_OUT)) +#define RQF_RESV ((__force req_flags_t)(1 << __RQF_RESV)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ -- cgit v1.2.3 From 8a47e33f50dd779f94bc277c6d3de81672463c5e Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 19 Jul 2024 11:29:12 +0000 Subject: block: Catch possible entries missing from rqf_name[] Add a BUILD_BUG_ON() call to ensure that we are not missing entries in rqf_name[]. Reviewed-by: Bart Van Assche Signed-off-by: John Garry Link: https://lore.kernel.org/r/20240719112912.3830443-16-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index af52ec6a1ed5..8d304b1d16b1 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -27,6 +27,7 @@ typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); * request flags */ typedef __u32 __bitwise req_flags_t; +/* Keep rqf_name[] in sync with the definitions below */ enum { /* drive already may have started this one */ __RQF_STARTED, -- cgit v1.2.3 From 72d04bdcf3f7d7e07d82f9757946f68802a7270a Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 16 Jul 2024 16:26:27 +0800 Subject: sbitmap: fix io hung due to race on sbitmap_word::cleared Configuration for sbq: depth=64, wake_batch=6, shift=6, map_nr=1 1. There are 64 requests in progress: map->word = 0xFFFFFFFFFFFFFFFF 2. After all the 64 requests complete, and no more requests come: map->word = 0xFFFFFFFFFFFFFFFF, map->cleared = 0xFFFFFFFFFFFFFFFF 3. Now two tasks try to allocate requests: T1: T2: __blk_mq_get_tag . __sbitmap_queue_get . sbitmap_get . sbitmap_find_bit . sbitmap_find_bit_in_word . __sbitmap_get_word -> nr=-1 __blk_mq_get_tag sbitmap_deferred_clear __sbitmap_queue_get /* map->cleared=0xFFFFFFFFFFFFFFFF */ sbitmap_find_bit if (!READ_ONCE(map->cleared)) sbitmap_find_bit_in_word return false; __sbitmap_get_word -> nr=-1 mask = xchg(&map->cleared, 0) sbitmap_deferred_clear atomic_long_andnot() /* map->cleared=0 */ if (!(map->cleared)) return false; /* * map->cleared is cleared by T1 * T2 fail to acquire the tag */ 4. T2 is the sole tag waiter. When T1 puts the tag, T2 cannot be woken up due to the wake_batch being set at 6. If no more requests come, T1 will wait here indefinitely. This patch achieves two purposes: 1. Check on ->cleared and update on both ->cleared and ->word need to be done atomically, and using spinlock could be the simplest solution. 2. Add extra check in sbitmap_deferred_clear(), to identify whether ->word has free bits. Fixes: ea86ea2cdced ("sbitmap: ammortize cost of clearing bits") Signed-off-by: Yang Yang Reviewed-by: Ming Lei Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240716082644.659566-1-yang.yang@vivo.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index d662cf136021..c09cdcc99471 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -36,6 +36,11 @@ struct sbitmap_word { * @cleared: word holding cleared bits */ unsigned long cleared ____cacheline_aligned_in_smp; + + /** + * @swap_lock: serializes simultaneous updates of ->word and ->cleared + */ + spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** -- cgit v1.2.3 From 89ed6c9ac69ec398ccb648f5f675b43e8ca679ca Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 16 Jul 2024 13:30:58 +0000 Subject: blk-cgroup: move congestion_count to struct blkcg The congestion_count was introduced into the struct cgroup by commit d09d8df3a294 ("blkcg: add generic throttling mechanism"), but since it is closely related to the blkio subsys, it is not appropriate to put it in the struct cgroup, so let's move it to struct blkcg. There should be no functional changes because blkcg is per cgroup. Signed-off-by: Xiu Jianfeng Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240716133058.3491350-1-xiujianfeng@huawei.com Signed-off-by: Jens Axboe --- include/linux/cgroup-defs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ea48c861cd36..ec82c4b7ed4d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -534,9 +534,6 @@ struct cgroup { /* used to store eBPF programs */ struct cgroup_bpf bpf; - /* If there is block congestion on this cgroup. */ - atomic_t congestion_count; - /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; -- cgit v1.2.3 From 9651fcedf7b92d3f7f1ab179e8ab55b85ee10fc1 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 8 Dec 2022 17:55:04 +0100 Subject: mm: add MAP_DROPPABLE for designating always lazily freeable mappings The vDSO getrandom() implementation works with a buffer allocated with a new system call that has certain requirements: - It shouldn't be written to core dumps. * Easy: VM_DONTDUMP. - It should be zeroed on fork. * Easy: VM_WIPEONFORK. - It shouldn't be written to swap. * Uh-oh: mlock is rlimited. * Uh-oh: mlock isn't inherited by forks. - It shouldn't reserve actual memory, but it also shouldn't crash when page faulting in memory if none is available * Uh-oh: VM_NORESERVE means segfaults. It turns out that the vDSO getrandom() function has three really nice characteristics that we can exploit to solve this problem: 1) Due to being wiped during fork(), the vDSO code is already robust to having the contents of the pages it reads zeroed out midway through the function's execution. 2) In the absolute worst case of whatever contingency we're coding for, we have the option to fallback to the getrandom() syscall, and everything is fine. 3) The buffers the function uses are only ever useful for a maximum of 60 seconds -- a sort of cache, rather than a long term allocation. These characteristics mean that we can introduce VM_DROPPABLE, which has the following semantics: a) It never is written out to swap. b) Under memory pressure, mm can just drop the pages (so that they're zero when read back again). c) It is inherited by fork. d) It doesn't count against the mlock budget, since nothing is locked. e) If there's not enough memory to service a page fault, it's not fatal, and no signal is sent. This way, allocations used by vDSO getrandom() can use: VM_DROPPABLE | VM_DONTDUMP | VM_WIPEONFORK | VM_NORESERVE And there will be no problem with OOMing, crashing on overcommitment, using memory when not in use, not wiping on fork(), coredumps, or writing out to swap. In order to let vDSO getrandom() use this, expose these via mmap(2) as MAP_DROPPABLE. Note that this involves removing the MADV_FREE special case from sort_folio(), which according to Yu Zhao is unnecessary and will simply result in an extra call to shrink_folio_list() in the worst case. The chunk removed reenables the swapbacked flag, which we don't want for VM_DROPPABLE, and we can't conditionalize it here because there isn't a vma reference available. Finally, the provided self test ensures that this is working as desired. Cc: linux-mm@kvack.org Acked-by: David Hildenbrand Signed-off-by: Jason A. Donenfeld --- include/linux/mm.h | 7 +++++++ include/linux/userfaultfd_k.h | 3 +++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index eb7c96d24ac0..e078c2890bf8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -406,6 +406,13 @@ extern unsigned int kobjsize(const void *objp); #define VM_ALLOW_ANY_UNCACHED VM_NONE #endif +#ifdef CONFIG_64BIT +#define VM_DROPPABLE_BIT 40 +#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) +#else +#define VM_DROPPABLE VM_NONE +#endif + #ifdef CONFIG_64BIT /* VM is sealed, in vm_flags */ #define VM_SEALED _BITUL(63) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 05d59f74fc88..a12bcf042551 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -218,6 +218,9 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, { vm_flags &= __VM_UFFD_FLAGS; + if (vm_flags & VM_DROPPABLE) + return false; + if ((vm_flags & VM_UFFD_MINOR) && (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) return false; -- cgit v1.2.3 From 45b8ee7182d5ef8df6959297046f86dc128d6a06 Mon Sep 17 00:00:00 2001 From: Bastien Curutchet Date: Mon, 17 Jun 2024 14:08:18 +0200 Subject: i2c: mux: gpio: Add support for the 'settle-time-us' property Some hardware need some time to switch from a bus to another. This can cause the first transfers following the selection of a bus to fail. There is no way to configure this kind of waiting time in the driver. Add support for the 'settle-time-us' device-tree property. When set, the i2c_mux_gpio_select() applies a delay before returning, leaving enough time to the hardware to switch to the new bus. Signed-off-by: Bastien Curutchet Reviewed-by: Andi Shyti Acked-by: Peter Rosin Signed-off-by: Andi Shyti --- include/linux/platform_data/i2c-mux-gpio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/i2c-mux-gpio.h b/include/linux/platform_data/i2c-mux-gpio.h index 816a4cd3ccb5..96843aab4d1e 100644 --- a/include/linux/platform_data/i2c-mux-gpio.h +++ b/include/linux/platform_data/i2c-mux-gpio.h @@ -19,6 +19,7 @@ * position * @n_values: Number of multiplexer positions (busses to instantiate) * @idle: Bitmask to write to MUX when idle or GPIO_I2CMUX_NO_IDLE if not used + * @settle_time: Delay to wait when a new bus is selected */ struct i2c_mux_gpio_platform_data { int parent; @@ -26,6 +27,7 @@ struct i2c_mux_gpio_platform_data { const unsigned *values; int n_values; unsigned idle; + u32 settle_time; }; #endif /* _LINUX_I2C_MUX_GPIO_H */ -- cgit v1.2.3 From d83763e44944d4d6dee38099c5d0a0984ac66385 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 16 Jul 2024 10:36:24 +0200 Subject: i2c: header: remove unneeded stuff regarding i2c_algorithm The forward declaration is not needed anymore. The sentence about "following structs" became obsolete when struct i2c_algorithm became a kdoc. The paragraph about return values can go because we have this information in kdoc already. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index e9cc14b1f9a1..1e34b486f604 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -30,7 +30,6 @@ extern const struct device_type i2c_client_type; /* --- General options ------------------------------------------------ */ struct i2c_msg; -struct i2c_algorithm; struct i2c_adapter; struct i2c_client; struct i2c_driver; @@ -533,8 +532,6 @@ i2c_register_board_info(int busnum, struct i2c_board_info const *info, * @reg_slave: deprecated, use @reg_target * @unreg_slave: deprecated, use @unreg_target * - * - * The following structs are for those who like to implement new bus drivers: * i2c_algorithm is the interface to a class of hardware solutions which can * be addressed using the same bus algorithms - i.e. bit-banging or the PCF8584 * to name two of the most common. @@ -550,9 +547,6 @@ struct i2c_algorithm { * to NULL. If an adapter algorithm can do SMBus access, set * smbus_xfer. If set to NULL, the SMBus protocol is simulated * using common I2C messages. - * - * xfer should return the number of messages successfully - * processed, or a negative value on error */ union { int (*xfer)(struct i2c_adapter *adap, struct i2c_msg *msgs, -- cgit v1.2.3 From 385ac870bdd531348de123d6790626ccd7827f69 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 16 Jul 2024 10:36:25 +0200 Subject: i2c: header: improve kdoc for i2c_algorithm Reword the explanation of @xfer, the old one was confusing and mixing up terminology. Other than that, capitalize some words correctly and use full line length. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 1e34b486f604..8caaa13834bf 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -511,16 +511,15 @@ i2c_register_board_info(int busnum, struct i2c_board_info const *info, #endif /* I2C_BOARDINFO */ /** - * struct i2c_algorithm - represent I2C transfer method - * @xfer: Issue a set of i2c transactions to the given I2C adapter - * defined by the msgs array, with num messages available to transfer via - * the adapter specified by adap. - * @xfer_atomic: same as @xfer. Yet, only using atomic context - * so e.g. PMICs can be accessed very late before shutdown. Optional. - * @smbus_xfer: Issue smbus transactions to the given I2C adapter. If this + * struct i2c_algorithm - represent I2C transfer methods + * @xfer: Transfer a given number of messages defined by the msgs array via + * the specified adapter. + * @xfer_atomic: Same as @xfer. Yet, only using atomic context so e.g. PMICs + * can be accessed very late before shutdown. Optional. + * @smbus_xfer: Issue SMBus transactions to the given I2C adapter. If this * is not present, then the bus layer will try and convert the SMBus calls * into I2C transfers instead. - * @smbus_xfer_atomic: same as @smbus_xfer. Yet, only using atomic context + * @smbus_xfer_atomic: Same as @smbus_xfer. Yet, only using atomic context * so e.g. PMICs can be accessed very late before shutdown. Optional. * @functionality: Return the flags that this algorithm/adapter pair supports * from the ``I2C_FUNC_*`` flags. -- cgit v1.2.3 From eabd9db64ea8ba64d2a0b1d70da38e1a95dcd08b Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Thu, 13 Jun 2024 16:54:33 +0800 Subject: ACPI: RISCV: Add NUMA support based on SRAT and SLIT Add acpi_numa.c file to enable parse NUMA information from ACPI SRAT and SLIT tables. SRAT table provide CPUs(Hart) and memory nodes to proximity domain mapping, while SLIT table provide the distance metrics between proximity domains. Signed-off-by: Haibo Xu Reviewed-by: Sunil V L Reviewed-by: Hanjun Guo Link: https://lore.kernel.org/r/65dbad1fda08a32922c44886e4581e49b4a2fecc.1718268003.git.haibo1.xu@intel.com Signed-off-by: Palmer Dabbelt --- include/linux/acpi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 28c3fb2bef0d..5a2b620ccd58 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -264,6 +264,12 @@ static inline void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { } #endif +#ifdef CONFIG_RISCV +void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa); +#else +static inline void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa) { } +#endif + #ifndef PHYS_CPUID_INVALID typedef u32 phys_cpuid_t; #define PHYS_CPUID_INVALID (phys_cpuid_t)(-1) -- cgit v1.2.3 From 10a0e6f3d3db7dcfe36e578923e5f038f1d2b72a Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 17 Jul 2024 11:49:40 +0200 Subject: timers/migration: Move hierarchy setup into cpuhotplug prepare callback When a CPU comes online the first time, it is possible that a new top level group will be created. In general all propagation is done from the bottom to top. This minimizes complexity and prevents possible races. But when a new top level group is created, the formely top level group needs to be connected to the new level. This is the only time, when the direction to propagate changes is changed: the changes are propagated from top (new top level group) to bottom (formerly top level group). This introduces two races (see (A) and (B)) as reported by Frederic: (A) This race happens, when marking the formely top level group as active, but the last active CPU of the formerly top level group goes idle. Then it's likely that formerly group is no longer active, but marked nevertheless as active in new top level group: [GRP0:0] migrator = 0 active = 0 nextevt = KTIME_MAX / \ 0 1 .. 7 active idle 0) Hierarchy has for now only 8 CPUs and CPU 0 is the only active CPU. [GRP1:0] migrator = TMIGR_NONE active = NONE nextevt = KTIME_MAX \ [GRP0:0] [GRP0:1] migrator = 0 migrator = TMIGR_NONE active = 0 active = NONE nextevt = KTIME_MAX nextevt = KTIME_MAX / \ 0 1 .. 7 8 active idle !online 1) CPU 8 is booting and creates a new group in first level GRP0:1 and therefore also a new top group GRP1:0. For now the setup code proceeded only until the connected between GRP0:1 to the new top group. The connection between CPU8 and GRP0:1 is not yet established and CPU 8 is still !online. [GRP1:0] migrator = TMIGR_NONE active = NONE nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = 0 migrator = TMIGR_NONE active = 0 active = NONE nextevt = KTIME_MAX nextevt = KTIME_MAX / \ 0 1 .. 7 8 active idle !online 2) Setup code now connects GRP0:0 to GRP1:0 and observes while in tmigr_connect_child_parent() that GRP0:0 is not TMIGR_NONE. So it prepares to call tmigr_active_up() on it. It hasn't done it yet. [GRP1:0] migrator = TMIGR_NONE active = NONE nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = TMIGR_NONE migrator = TMIGR_NONE active = NONE active = NONE nextevt = KTIME_MAX nextevt = KTIME_MAX / \ 0 1 .. 7 8 idle idle !online 3) CPU 0 goes idle. Since GRP0:0->parent has been updated by CPU 8 with GRP0:0->lock held, CPU 0 observes GRP1:0 after calling tmigr_update_events() and it propagates the change to the top (no change there and no wakeup programmed since there is no timer). [GRP1:0] migrator = GRP0:0 active = GRP0:0 nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = TMIGR_NONE migrator = TMIGR_NONE active = NONE active = NONE nextevt = KTIME_MAX nextevt = KTIME_MAX / \ 0 1 .. 7 8 idle idle !online 4) Now the setup code finally calls tmigr_active_up() to and sets GRP0:0 active in GRP1:0 [GRP1:0] migrator = GRP0:0 active = GRP0:0, GRP0:1 nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = TMIGR_NONE migrator = 8 active = NONE active = 8 nextevt = KTIME_MAX nextevt = KTIME_MAX / \ | 0 1 .. 7 8 idle idle active 5) Now CPU 8 is connected with GRP0:1 and CPU 8 calls tmigr_active_up() out of tmigr_cpu_online(). [GRP1:0] migrator = GRP0:0 active = GRP0:0 nextevt = T8 / \ [GRP0:0] [GRP0:1] migrator = TMIGR_NONE migrator = TMIGR_NONE active = NONE active = NONE nextevt = KTIME_MAX nextevt = T8 / \ | 0 1 .. 7 8 idle idle idle 5) CPU 8 goes idle with a timer T8 and relies on GRP0:0 as the migrator. But it's not really active, so T8 gets ignored. --> The update which is done in third step is not noticed by setup code. So a wrong migrator is set to top level group and a timer could get ignored. (B) Reading group->parent and group->childmask when an hierarchy update is ongoing and reaches the formerly top level group is racy as those values could be inconsistent. (The notation of migrator and active now slightly changes in contrast to the above example, as now the childmasks are used.) [GRP1:0] migrator = TMIGR_NONE active = 0x00 nextevt = KTIME_MAX \ [GRP0:0] [GRP0:1] migrator = TMIGR_NONE migrator = TMIGR_NONE active = 0x00 active = 0x00 nextevt = KTIME_MAX nextevt = KTIME_MAX childmask= 0 childmask= 1 parent = NULL parent = GRP1:0 / \ 0 1 .. 7 8 idle idle !online childmask=1 1) Hierarchy has 8 CPUs. CPU 8 is at the moment in the process of onlining but did not yet connect GRP0:0 to GRP1:0. [GRP1:0] migrator = TMIGR_NONE active = 0x00 nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = TMIGR_NONE migrator = TMIGR_NONE active = 0x00 active = 0x00 nextevt = KTIME_MAX nextevt = KTIME_MAX childmask= 0 childmask= 1 parent = GRP1:0 parent = GRP1:0 / \ 0 1 .. 7 8 idle idle !online childmask=1 2) Setup code (running on CPU 8) now connects GRP0:0 to GRP1:0, updates parent pointer of GRP0:0 and ... [GRP1:0] migrator = TMIGR_NONE active = 0x00 nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = 0x01 migrator = TMIGR_NONE active = 0x01 active = 0x00 nextevt = KTIME_MAX nextevt = KTIME_MAX childmask= 0 childmask= 1 parent = GRP1:0 parent = GRP1:0 / \ 0 1 .. 7 8 active idle !online childmask=1 tmigr_walk.childmask = 0 3) ... CPU 0 comes active in the same time. As migrator in GRP0:0 was TMIGR_NONE, childmask of GRP0:0 is stored in update propagation data structure tmigr_walk (as update of childmask is not yet visible/updated). And now ... [GRP1:0] migrator = TMIGR_NONE active = 0x00 nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = 0x01 migrator = TMIGR_NONE active = 0x01 active = 0x00 nextevt = KTIME_MAX nextevt = KTIME_MAX childmask= 2 childmask= 1 parent = GRP1:0 parent = GRP1:0 / \ 0 1 .. 7 8 active idle !online childmask=1 tmigr_walk.childmask = 0 4) ... childmask of GRP0:0 is updated by CPU 8 (still part of setup code). [GRP1:0] migrator = 0x00 active = 0x00 nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = 0x01 migrator = TMIGR_NONE active = 0x01 active = 0x00 nextevt = KTIME_MAX nextevt = KTIME_MAX childmask= 2 childmask= 1 parent = GRP1:0 parent = GRP1:0 / \ 0 1 .. 7 8 active idle !online childmask=1 tmigr_walk.childmask = 0 5) CPU 0 sees the connection to GRP1:0 and now propagates active state to GRP1:0 but with childmask = 0 as stored in propagation data structure. --> Now GRP1:0 always has a migrator as 0x00 != TMIGR_NONE and for all CPUs it looks like GRP1:0 is always active. To prevent those races, the setup of the hierarchy is moved into the cpuhotplug prepare callback. The prepare callback is not executed by the CPU which will come online, it is executed by the CPU which prepares onlining of the other CPU. This CPU is active while it is connecting the formerly top level to the new one. This prevents from (A) to happen and it also prevents from any further walk above the formerly top level until that active CPU becomes inactive, releasing the new ->parent and ->childmask updates to be visible by any subsequent walk up above the formerly top level hierarchy. This prevents from (B) to happen. The direction for the updates is now forced to look like "from bottom to top". However if the active CPU prevents from tmigr_cpu_(in)active() to walk up with the update not-or-half visible, nothing prevents walking up to the new top with a 0 childmask in tmigr_handle_remote_up() or tmigr_requires_handle_remote_up() if the active CPU doing the prepare is not the migrator. But then it looks fine because: * tmigr_check_migrator() should just return false * The migrator is active and should eventually observe the new childmask at some point in a future tick. Split setup functionality of online callback into the cpuhotplug prepare callback and setup hotplug state. Change init call into early_initcall() to make sure an already active CPU prepares everything for newly upcoming CPUs. Reorder the code, that all prepare related functions are close to each other and online and offline callbacks are also close together. Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model") Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240717094940.18687-1-anna-maria@linutronix.de --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 7a5785f405b6..df59666a2a66 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -122,6 +122,7 @@ enum cpuhp_state { CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, CPUHP_TIMERS_PREPARE, + CPUHP_TMIGR_PREPARE, CPUHP_MIPS_SOC_PREPARE, CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, -- cgit v1.2.3 From a2b72b81fb3ba18717fc000949ca9d45a3351130 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:20 +0100 Subject: io_uring: kill REQ_F_CANCEL_SEQ We removed the reliance on the flag by the cancellation path and now it's unused. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e57afe566bbe4fefeb44daffb08900f2a4756577.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3bb6198d1523..e62aa9f0629f 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -461,7 +461,6 @@ enum { REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, REQ_F_POLL_NO_LAZY_BIT, - REQ_F_CANCEL_SEQ_BIT, REQ_F_CAN_POLL_BIT, REQ_F_BL_EMPTY_BIT, REQ_F_BL_NO_RECYCLE_BIT, @@ -536,8 +535,6 @@ enum { REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT), /* don't use lazy poll wake for this request */ REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), - /* cancel sequence is set and valid */ - REQ_F_CANCEL_SEQ = IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT), /* file is pollable */ REQ_F_CAN_POLL = IO_REQ_FLAG(REQ_F_CAN_POLL_BIT), /* buffer list was empty after selection of buffer */ -- cgit v1.2.3 From 78eb4ea25cd5fdbdae7eb9fdf87b99195ff67508 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Wed, 24 Jul 2024 20:59:29 +0200 Subject: sysctl: treewide: constify the ctl_table argument of proc_handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit const qualify the struct ctl_table argument in the proc_handler function signatures. This is a prerequisite to moving the static ctl_table structs into .rodata data which will ensure that proc_handler function pointers cannot be modified. This patch has been generated by the following coccinelle script: ``` virtual patch @r1@ identifier ctl, write, buffer, lenp, ppos; identifier func !~ "appldata_(timer|interval)_handler|sched_(rt|rr)_handler|rds_tcp_skbuf_handler|proc_sctp_do_(hmac_alg|rto_min|rto_max|udp_port|alpha_beta|auth|probe_interval)"; @@ int func( - struct ctl_table *ctl + const struct ctl_table *ctl ,int write, void *buffer, size_t *lenp, loff_t *ppos); @r2@ identifier func, ctl, write, buffer, lenp, ppos; @@ int func( - struct ctl_table *ctl + const struct ctl_table *ctl ,int write, void *buffer, size_t *lenp, loff_t *ppos) { ... } @r3@ identifier func; @@ int func( - struct ctl_table * + const struct ctl_table * ,int , void *, size_t *, loff_t *); @r4@ identifier func, ctl; @@ int func( - struct ctl_table *ctl + const struct ctl_table *ctl ,int , void *, size_t *, loff_t *); @r5@ identifier func, write, buffer, lenp, ppos; @@ int func( - struct ctl_table * + const struct ctl_table * ,int write, void *buffer, size_t *lenp, loff_t *ppos); ``` * Code formatting was adjusted in xfs_sysctl.c to comply with code conventions. The xfs_stats_clear_proc_handler, xfs_panic_mask_proc_handler and xfs_deprecated_dointvec_minmax where adjusted. * The ctl_table argument in proc_watchdog_common was const qualified. This is called from a proc_handler itself and is calling back into another proc_handler, making it necessary to change it as part of the proc_handler migration. Co-developed-by: Thomas Weißschuh Signed-off-by: Thomas Weißschuh Co-developed-by: Joel Granados Signed-off-by: Joel Granados --- include/linux/ftrace.h | 4 ++-- include/linux/mm.h | 8 ++++---- include/linux/perf_event.h | 6 +++--- include/linux/security.h | 2 +- include/linux/sysctl.h | 34 +++++++++++++++++----------------- include/linux/vmstat.h | 4 ++-- include/linux/writeback.h | 2 +- 7 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 51575b76818e..fd5e84d0ec47 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -471,7 +471,7 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, extern int stack_tracer_enabled; -int stack_trace_sysctl(struct ctl_table *table, int write, void *buffer, +int stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); /* DO NOT MODIFY THIS VARIABLE DIRECTLY! */ @@ -1175,7 +1175,7 @@ extern int tracepoint_printk; extern void disable_trace_on_warning(void); extern int __disable_trace_on_warning; -int tracepoint_printk_sysctl(struct ctl_table *table, int write, +int tracepoint_printk_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #else /* CONFIG_TRACING */ diff --git a/include/linux/mm.h b/include/linux/mm.h index aa4fccb2a693..4563b560ced7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -204,11 +204,11 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; -int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, +int overcommit_ratio_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); -int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, +int overcommit_kbytes_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); -int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, +int overcommit_policy_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -3854,7 +3854,7 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, +int drop_caches_sysctl_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); #endif diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 65ece0d5b4b6..1a8942277dda 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1582,11 +1582,11 @@ extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); -int perf_event_max_sample_rate_handler(struct ctl_table *table, int write, +int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, +int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int perf_event_max_stack_handler(struct ctl_table *table, int write, +int perf_event_max_stack_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. */ diff --git a/include/linux/security.h b/include/linux/security.h index de3af33e6ff5..1390f1efb4f0 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -228,7 +228,7 @@ struct request_sock; #define LSM_UNSAFE_NO_NEW_PRIVS 4 #ifdef CONFIG_MMU -extern int mmap_min_addr_handler(struct ctl_table *table, int write, +extern int mmap_min_addr_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 54fbec062772..aa4c6d44aaa0 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -61,31 +61,31 @@ extern const int sysctl_vals[]; extern const unsigned long sysctl_long_vals[]; -typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, +typedef int proc_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); -int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_dobool(struct ctl_table *table, int write, void *buffer, +int proc_dostring(const struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dobool(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer, +int proc_dointvec(const struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec(const struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int proc_dou8vec_minmax(struct ctl_table *table, int write, void *buffer, +int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, +int proc_dointvec_jiffies(const struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *, +int proc_dointvec_userhz_jiffies(const struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_dointvec_ms_jiffies(struct ctl_table *, int, void *, size_t *, +int proc_dointvec_ms_jiffies(const struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_doulongvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *, +int proc_doulongvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int, void *, size_t *, loff_t *); -int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_do_static_key(struct ctl_table *table, int write, void *buffer, +int proc_do_large_bitmap(const struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_do_static_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); /* @@ -287,7 +287,7 @@ static inline bool sysctl_is_alias(char *param) } #endif /* CONFIG_SYSCTL */ -int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, +int sysctl_max_threads(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif /* _LINUX_SYSCTL_H */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 16b0cfa80502..23cd17942036 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -17,7 +17,7 @@ extern int sysctl_stat_interval; #define DISABLE_NUMA_STAT 0 extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); -int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, +int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); #endif @@ -301,7 +301,7 @@ void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); struct ctl_table; -int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, +int vmstat_refresh(const struct ctl_table *, int write, void *buffer, size_t *lenp, loff_t *ppos); void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 112d806ddbe4..1a54676d843a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -350,7 +350,7 @@ extern unsigned int dirty_expire_interval; extern unsigned int dirtytime_expire_interval; extern int laptop_mode; -int dirtytime_interval_handler(struct ctl_table *table, int write, +int dirtytime_interval_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); -- cgit v1.2.3 From 9722c3b66e21ff08aec570d02a97d331087fd70f Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Wed, 24 Jul 2024 18:33:06 +0200 Subject: of: remove internal arguments from of_property_for_each_u32() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The of_property_for_each_u32() macro needs five parameters, two of which are primarily meant as internal variables for the macro itself (in the for() clause). Yet these two parameters are used by a few drivers, and this can be considered misuse or at least bad practice. Now that the kernel uses C11 to build, these two parameters can be avoided by declaring them internally, thus changing this pattern: struct property *prop; const __be32 *p; u32 val; of_property_for_each_u32(np, "xyz", prop, p, val) { ... } to this: u32 val; of_property_for_each_u32(np, "xyz", val) { ... } However two variables cannot be declared in the for clause even with C11, so declare one struct that contain the two variables we actually need. As the variables inside this struct are not meant to be used by users of this macro, give the struct instance the noticeable name "_it" so it is visible during code reviews, helping to avoid new code to use it directly. Most usages are trivially converted as they do not use those two parameters, as expected. The non-trivial cases are: - drivers/clk/clk.c, of_clk_get_parent_name(): easily doable anyway - drivers/clk/clk-si5351.c, si5351_dt_parse(): this is more complex as the checks had to be replicated in a different way, making code more verbose and somewhat uglier, but I refrained from a full rework to keep as much of the original code untouched having no hardware to test my changes All the changes have been build tested. The few for which I have the hardware have been runtime-tested too. Reviewed-by: Andre Przywara # drivers/clk/sunxi/clk-simple-gates.c, drivers/clk/sunxi/clk-sun8i-bus-gates.c Acked-by: Bartosz Golaszewski # drivers/gpio/gpio-brcmstb.c Acked-by: Nicolas Ferre # drivers/irqchip/irq-atmel-aic-common.c Acked-by: Jonathan Cameron # drivers/iio/adc/ti_am335x_adc.c Acked-by: Uwe Kleine-König # drivers/pwm/pwm-samsung.c Acked-by: Richard Leitner # drivers/usb/misc/usb251xb.c Acked-by: Mark Brown # sound/soc/codecs/arizona.c Reviewed-by: Richard Fitzgerald # sound/soc/codecs/arizona.c Acked-by: Michael Ellerman # arch/powerpc/sysdev/xive/spapr.c Acked-by: Stephen Boyd # clk Signed-off-by: Luca Ceresoli Acked-by: Lee Jones Link: https://lore.kernel.org/r/20240724-of_property_for_each_u32-v3-1-bea82ce429e2@bootlin.com Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 13cf7a43b473..85b60ac9eec5 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -430,11 +430,9 @@ extern int of_detach_node(struct device_node *); #define of_match_ptr(_ptr) (_ptr) /* - * struct property *prop; - * const __be32 *p; * u32 u; * - * of_property_for_each_u32(np, "propname", prop, p, u) + * of_property_for_each_u32(np, "propname", u) * printk("U32 value: %x\n", u); */ const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur, @@ -1431,11 +1429,12 @@ static inline int of_property_read_s32(const struct device_node *np, err == 0; \ err = of_phandle_iterator_next(it)) -#define of_property_for_each_u32(np, propname, prop, p, u) \ - for (prop = of_find_property(np, propname, NULL), \ - p = of_prop_next_u32(prop, NULL, &u); \ - p; \ - p = of_prop_next_u32(prop, p, &u)) +#define of_property_for_each_u32(np, propname, u) \ + for (struct {struct property *prop; const __be32 *item; } _it = \ + {of_find_property(np, propname, NULL), \ + of_prop_next_u32(_it.prop, NULL, &u)}; \ + _it.item; \ + _it.item = of_prop_next_u32(_it.prop, _it.item, &u)) #define of_property_for_each_string(np, propname, prop, s) \ for (prop = of_find_property(np, propname, NULL), \ -- cgit v1.2.3 From 63c33ca0969cf4d4574103b69fb46f58a19a182b Mon Sep 17 00:00:00 2001 From: Bhoomik Gupta Date: Mon, 8 Jul 2024 11:08:35 +0530 Subject: i3c: master: Enhance i3c_bus_type visibility for device searching & event monitoring Improve the visibility of i3c_bus_type to facilitate searching for i3c devices attached to the i3c bus. Enable other drivers to use bus_register_notifier to monitor i3c bus device events. Signed-off-by: Bhoomik Gupta Link: https://lore.kernel.org/r/20240708053835.3003986-1-bhoomik.gupta@nxp.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 0ca27dd86956..074f632868d9 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -33,6 +33,7 @@ enum { struct i3c_master_controller; struct i3c_bus; struct i3c_device; +extern const struct bus_type i3c_bus_type; /** * struct i3c_i2c_dev_desc - Common part of the I3C/I2C device descriptor -- cgit v1.2.3 From 342b2e395d5f34c9f111a818556e617939f83a8c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 26 Jul 2024 15:24:30 +0100 Subject: io_uring/napi: use ktime in busy polling It's more natural to use ktime/ns instead of keeping around usec, especially since we're comparing it against user provided timers, so convert napi busy poll internal handling to ktime. It's also nicer since the type (ktime_t vs unsigned long) now tells the unit of measure. Keep everything as ktime, which we convert to/from micro seconds for IORING_[UN]REGISTER_NAPI. The net/ busy polling works seems to work with usec, however it's not real usec as shift by 10 is used to get it from nsecs, see busy_loop_current_time(), so it's easy to get truncated nsec back and we get back better precision. Note, we can further improve it later by removing the truncation and maybe convincing net/ to use ktime/ns instead. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/95e7ec8d095069a3ed5d40a4bc6f8b586698bc7e.1722003776.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e62aa9f0629f..3315005df117 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -404,7 +404,7 @@ struct io_ring_ctx { spinlock_t napi_lock; /* napi_list lock */ /* napi busy poll default timeout */ - unsigned int napi_busy_poll_to; + ktime_t napi_busy_poll_dt; bool napi_prefer_busy_poll; bool napi_enabled; -- cgit v1.2.3 From d659b715e94ac039803d7601505d3473393fc0be Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 15 Jul 2024 10:04:23 +1000 Subject: mm/huge_memory: avoid PMD-size page cache if needed xarray can't support arbitrary page cache size. the largest and supported page cache size is defined as MAX_PAGECACHE_ORDER by commit 099d90642a71 ("mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray"). However, it's possible to have 512MB page cache in the huge memory's collapsing path on ARM64 system whose base page size is 64KB. 512MB page cache is breaking the limitation and a warning is raised when the xarray entry is split as shown in the following example. [root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize KernelPageSize: 64 kB [root@dhcp-10-26-1-207 ~]# cat /tmp/test.c : int main(int argc, char **argv) { const char *filename = TEST_XFS_FILENAME; int fd = 0; void *buf = (void *)-1, *p; int pgsize = getpagesize(); int ret = 0; if (pgsize != 0x10000) { fprintf(stdout, "System with 64KB base page size is required!\n"); return -EPERM; } system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb"); system("echo 1 > /proc/sys/vm/drop_caches"); /* Open the xfs file */ fd = open(filename, O_RDONLY); assert(fd > 0); /* Create VMA */ buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0); assert(buf != (void *)-1); fprintf(stdout, "mapped buffer at 0x%p\n", buf); /* Populate VMA */ ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE); assert(ret == 0); ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ); assert(ret == 0); /* Collapse VMA */ ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE); assert(ret == 0); ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE); if (ret) { fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno); goto out; } /* Split xarray entry. Write permission is needed */ munmap(buf, TEST_MEM_SIZE); buf = (void *)-1; close(fd); fd = open(filename, O_RDWR); assert(fd > 0); fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, TEST_MEM_SIZE - pgsize, pgsize); out: if (buf != (void *)-1) munmap(buf, TEST_MEM_SIZE); if (fd > 0) close(fd); return ret; } [root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test [root@dhcp-10-26-1-207 ~]# /tmp/test ------------[ cut here ]------------ WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128 Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \ nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \ nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \ ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse \ xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net \ sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9 Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024 pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) pc : xas_split_alloc+0xf8/0x128 lr : split_huge_page_to_list_to_order+0x1c4/0x780 sp : ffff8000ac32f660 x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0 x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000 x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000 x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8 x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40 x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000 Call trace: xas_split_alloc+0xf8/0x128 split_huge_page_to_list_to_order+0x1c4/0x780 truncate_inode_partial_folio+0xdc/0x160 truncate_inode_pages_range+0x1b4/0x4a8 truncate_pagecache_range+0x84/0xa0 xfs_flush_unmap_range+0x70/0x90 [xfs] xfs_file_fallocate+0xfc/0x4d8 [xfs] vfs_fallocate+0x124/0x2f0 ksys_fallocate+0x4c/0xa0 __arm64_sys_fallocate+0x24/0x38 invoke_syscall.constprop.0+0x7c/0xd8 do_el0_svc+0xb4/0xd0 el0_svc+0x44/0x1d8 el0t_64_sync_handler+0x134/0x150 el0t_64_sync+0x17c/0x180 Fix it by correcting the supported page cache orders, different sets for DAX and other files. With it corrected, 512MB page cache becomes disallowed on all non-DAX files on ARM64 system where the base page size is 64KB. After this patch is applied, the test program fails with error -EINVAL returned from __thp_vma_allowable_orders() and the madvise() system call to collapse the page caches. Link: https://lkml.kernel.org/r/20240715000423.316491-1-gshan@redhat.com Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache") Signed-off-by: Gavin Shan Acked-by: David Hildenbrand Reviewed-by: Ryan Roberts Acked-by: Zi Yan Cc: Baolin Wang Cc: Barry Song Cc: Don Dutile Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Ryan Roberts Cc: William Kucharski Cc: [5.17+] Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cff002be83eb..e25d9ebfdf89 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -74,14 +74,20 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define THP_ORDERS_ALL_ANON ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1))) /* - * Mask of all large folio orders supported for file THP. + * Mask of all large folio orders supported for file THP. Folios in a DAX + * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to + * it. */ -#define THP_ORDERS_ALL_FILE (BIT(PMD_ORDER) | BIT(PUD_ORDER)) +#define THP_ORDERS_ALL_FILE_DAX \ + (BIT(PMD_ORDER) | BIT(PUD_ORDER)) +#define THP_ORDERS_ALL_FILE_DEFAULT \ + ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) /* * Mask of all large folio orders supported for THP. */ -#define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE) +#define THP_ORDERS_ALL \ + (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT) #define TVA_SMAPS (1 << 0) /* Will be used for procfs */ #define TVA_IN_PF (1 << 1) /* Page fault handler */ -- cgit v1.2.3 From b3bebe44306e23827397d0d774d206e3fa374041 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 17 Jul 2024 14:28:44 -0700 Subject: alloc_tag: outline and export free_reserved_page() Outline and export free_reserved_page() because modules use it and it in turn uses page_ext_{get|put} which should not be exported. The same result could be obtained by outlining {get|put}_page_tag_ref() but that would have higher performance impact as these functions are used in more performance critical paths. Link: https://lkml.kernel.org/r/20240717212844.2749975-1-surenb@google.com Fixes: dcfe378c81f7 ("lib: introduce support for page allocation tagging") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407080044.DWMC9N9I-lkp@intel.com/ Suggested-by: Christoph Hellwig Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: [6.10] Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4563b560ced7..c4b238a20b76 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3137,21 +3137,7 @@ extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid); /* Free the reserved page into the buddy system, so it gets managed. */ -static inline void free_reserved_page(struct page *page) -{ - if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); - - if (ref) { - set_codetag_empty(ref); - put_page_tag_ref(ref); - } - } - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - adjust_managed_page_count(page, 1); -} +void free_reserved_page(struct page *page); #define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) -- cgit v1.2.3 From f59adcf5933271ab7247c4c8938c67be8905b725 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 23 Jul 2024 17:12:44 +0000 Subject: mm: memcg: add cacheline padding after lruvec in mem_cgroup_per_node Oliver Sand reported a performance regression caused by commit 98c9daf5ae6b ("mm: memcg: guard memcg1-specific members of struct mem_cgroup_per_node"), which puts some fields of the mem_cgroup_per_node structure under the CONFIG_MEMCG_V1 config option. Apparently it causes a false cache sharing between lruvec and lru_zone_size members of the structure. Fix it by adding an explicit padding after the lruvec member. Even though the padding is not required with CONFIG_MEMCG_V1 set, it seems like the introduced memory overhead is not significant enough to warrant another divergence in the mem_cgroup_per_node layout, so the padding is added unconditionally. Link: https://lkml.kernel.org/r/20240723171244.747521-1-roman.gushchin@linux.dev Fixes: 98c9daf5ae6b ("mm: memcg: guard memcg1-specific members of struct mem_cgroup_per_node") Signed-off-by: Roman Gushchin Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202407121335.31a10cb6-oliver.sang@intel.com Tested-by: Oliver Sang Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7e2eb091049a..0e5bf25d324f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -109,6 +109,7 @@ struct mem_cgroup_per_node { /* Fields which get updated often at the end. */ struct lruvec lruvec; + CACHELINE_PADDING(_pad2_); unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; }; -- cgit v1.2.3 From 3a7e02c040b130b5545e4b115aada7bacd80a2b6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 26 Jul 2024 15:32:27 -0700 Subject: minmax: avoid overly complicated constant expressions in VM code The minmax infrastructure is overkill for simple constants, and can cause huge expansions because those simple constants are then used by other things. For example, 'pageblock_order' is a core VM constant, but because it was implemented using 'min_t()' and all the type-checking that involves, it actually expanded to something like 2.5kB of preprocessor noise. And when that simple constant was then used inside other expansions: #define pageblock_nr_pages (1UL << pageblock_order) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) and we then use that inside a 'max()' macro: case ISOLATE_SUCCESS: update_cached = false; last_migrated_pfn = max(cc->zone->zone_start_pfn, pageblock_start_pfn(cc->migrate_pfn - 1)); the end result was that one statement expanding to 253kB in size. There are probably other cases of this, but this one case certainly stood out. I've added 'MIN_T()' and 'MAX_T()' macros for this kind of "core simple constant with specific type" use. These macros skip the type checking, and as such need to be very sparingly used only for obvious cases that have active issues like this. Reported-by: Lorenzo Stoakes Link: https://lore.kernel.org/all/36aa2cad-1db1-4abf-8dd2-fb20484aabc3@lucifer.local/ Cc: David Laight Signed-off-by: Linus Torvalds --- include/linux/minmax.h | 7 +++++++ include/linux/pageblock-flags.h | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 2ec559284a9f..a7ef65f78933 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -270,4 +270,11 @@ static inline bool in_range32(u32 val, u32 start, u32 len) #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) +/* + * Use these carefully: no type checking, and uses the arguments + * multiple times. Use for obvious constants only. + */ +#define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b)) +#define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b)) + #endif /* _LINUX_MINMAX_H */ diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 547e82cdc89a..fc6b9c87cb0a 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -41,13 +41,13 @@ extern unsigned int pageblock_order; * Huge pages are a constant size, but don't exceed the maximum allocation * granularity. */ -#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER) +#define pageblock_order MIN_T(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ #elif defined(CONFIG_TRANSPARENT_HUGEPAGE) -#define pageblock_order min_t(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER) +#define pageblock_order MIN_T(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER) #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit v1.2.3 From 00e3913b0416fe69d28745c0a2a340e2f76c219c Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 26 Jul 2024 01:16:48 +0900 Subject: Revert "firewire: Annotate struct fw_iso_packet with __counted_by()" This reverts commit d3155742db89df3b3c96da383c400e6ff4d23c25. The header_length field is byte unit, thus it can not express the number of elements in header field. It seems that the argument for counted_by attribute can have no arithmetic expression, therefore this commit just reverts the issued commit. Suggested-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20240725161648.130404-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 00abe0e5d602..1cca14cf5652 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -462,9 +462,8 @@ struct fw_iso_packet { /* rx: Sync bit, wait for matching sy */ u32 tag:2; /* tx: Tag in packet header */ u32 sy:4; /* tx: Sy in packet header */ - u32 header_length:8; /* Length of immediate header */ - /* tx: Top of 1394 isoch. data_block */ - u32 header[] __counted_by(header_length); + u32 header_length:8; /* Size of immediate header */ + u32 header[]; /* tx: Top of 1394 isoch. data_block */ }; #define FW_ISO_CONTEXT_TRANSMIT 0 -- cgit v1.2.3 From 017fa3e89187848fd056af757769c9e66ac3e93d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 13:50:01 -0700 Subject: minmax: simplify and clarify min_t()/max_t() implementation This simplifies the min_t() and max_t() macros by no longer making them work in the context of a C constant expression. That means that you can no longer use them for static initializers or for array sizes in type definitions, but there were only a couple of such uses, and all of them were converted (famous last words) to use MIN_T/MAX_T instead. Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- include/linux/minmax.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index a7ef65f78933..9c2848abc804 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -45,17 +45,20 @@ #define __cmp(op, x, y) ((x) __cmp_op_##op (y) ? (x) : (y)) -#define __cmp_once(op, x, y, unique_x, unique_y) ({ \ - typeof(x) unique_x = (x); \ - typeof(y) unique_y = (y); \ +#define __cmp_once_unique(op, type, x, y, ux, uy) \ + ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); }) + +#define __cmp_once(op, type, x, y) \ + __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) + +#define __careful_cmp_once(op, x, y) ({ \ static_assert(__types_ok(x, y), \ #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ - __cmp(op, unique_x, unique_y); }) + __cmp_once(op, __auto_type, x, y); }) #define __careful_cmp(op, x, y) \ __builtin_choose_expr(__is_constexpr((x) - (y)), \ - __cmp(op, x, y), \ - __cmp_once(op, x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y))) + __cmp(op, x, y), __careful_cmp_once(op, x, y)) #define __clamp(val, lo, hi) \ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) @@ -158,7 +161,7 @@ * @x: first value * @y: second value */ -#define min_t(type, x, y) __careful_cmp(min, (type)(x), (type)(y)) +#define min_t(type, x, y) __cmp_once(min, type, x, y) /** * max_t - return maximum of two values, using the specified type @@ -166,7 +169,7 @@ * @x: first value * @y: second value */ -#define max_t(type, x, y) __careful_cmp(max, (type)(x), (type)(y)) +#define max_t(type, x, y) __cmp_once(max, type, x, y) /* * Do not check the array parameter using __must_be_array(). -- cgit v1.2.3