From c25951eb7518844fcb7fc9ec58e888731e8c46d0 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 15 Nov 2024 15:20:55 +0000 Subject: bus: fsl-mc: Remove deadcode fsl_mc_allocator_driver_exit() was added explicitly by commit 1e8ac83b6caf ("bus: fsl-mc: add fsl_mc_allocator cleanup function") but was never used. Remove it. fsl_mc_portal_reset() was added in 2015 by commit 197f4d6a4a00 ("staging: fsl-mc: fsl-mc object allocator driver") but was never used. Remove it. fsl_mc_portal_reset() was the only caller of dpmcp_reset(). Remove it. Signed-off-by: Dr. David Alan Gilbert Acked-by: Ioana Ciornei Acked-by: Christophe Leroy Link: https://lore.kernel.org/r/20241115152055.279732-1-linux@treblig.org Signed-off-by: Christophe Leroy --- include/linux/fsl/mc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 99f30c7d6208..897d6211c163 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -417,8 +417,6 @@ int __must_check fsl_mc_portal_allocate(struct fsl_mc_device *mc_dev, void fsl_mc_portal_free(struct fsl_mc_io *mc_io); -int fsl_mc_portal_reset(struct fsl_mc_io *mc_io); - int __must_check fsl_mc_object_allocate(struct fsl_mc_device *mc_dev, enum fsl_mc_pool_type pool_type, struct fsl_mc_device **new_mc_adev); -- cgit v1.2.3 From a22b3d54de94f82ca057cc2ebf9496fa91ebf698 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 30 Mar 2025 17:52:39 -0400 Subject: cgroup/cpuset: Fix race between newly created partition and dying one There is a possible race between removing a cgroup diectory that is a partition root and the creation of a new partition. The partition to be removed can be dying but still online, it doesn't not currently participate in checking for exclusive CPUs conflict, but the exclusive CPUs are still there in subpartitions_cpus and isolated_cpus. These two cpumasks are global states that affect the operation of cpuset partitions. The exclusive CPUs in dying cpusets will only be removed when cpuset_css_offline() function is called after an RCU delay. As a result, it is possible that a new partition can be created with exclusive CPUs that overlap with those of a dying one. When that dying partition is finally offlined, it removes those overlapping exclusive CPUs from subpartitions_cpus and maybe isolated_cpus resulting in an incorrect CPU configuration. This bug was found when a warning was triggered in remote_partition_disable() during testing because the subpartitions_cpus mask was empty. One possible way to fix this is to iterate the dying cpusets as well and avoid using the exclusive CPUs in those dying cpusets. However, this can still cause random partition creation failures or other anomalies due to racing. A better way to fix this race is to reset the partition state at the moment when a cpuset is being killed. Introduce a new css_killed() CSS function pointer and call it, if defined, before setting CSS_DYING flag in kill_css(). Also update the css_is_dying() helper to use the CSS_DYING flag introduced by commit 33c35aa48178 ("cgroup: Prevent kill_css() from being called more than once") for proper synchronization. Add a new cpuset_css_killed() function to reset the partition state of a valid partition root if it is being killed. Fixes: ee8dde0cd2ce ("cpuset: Add new v2 cpuset.sched.partition flag") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + include/linux/cgroup.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 485b651869d9..5bc8f55c8cca 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -710,6 +710,7 @@ struct cgroup_subsys { void (*css_released)(struct cgroup_subsys_state *css); void (*css_free)(struct cgroup_subsys_state *css); void (*css_reset)(struct cgroup_subsys_state *css); + void (*css_killed)(struct cgroup_subsys_state *css); void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu); int (*css_extra_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 28e999f2c642..e7da3c3b098b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -344,7 +344,7 @@ static inline u64 cgroup_id(const struct cgroup *cgrp) */ static inline bool css_is_dying(struct cgroup_subsys_state *css) { - return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt); + return css->flags & CSS_DYING; } static inline void cgroup_get(struct cgroup *cgrp) -- cgit v1.2.3 From 4c975fd700022c90e61a46326e3444e08317876e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:43 -0700 Subject: net: hold instance lock during NETDEV_REGISTER/UP Callers of inetdev_init can come from several places with inconsistent expectation about netdev instance lock. Grab instance lock during REGISTER (plus UP). Also solve the inconsistency with UNREGISTER where it was locked only during move netns path. WARNING: CPU: 10 PID: 1479 at ./include/net/netdev_lock.h:54 __netdev_update_features+0x65f/0xca0 __warn+0x81/0x180 __netdev_update_features+0x65f/0xca0 report_bug+0x156/0x180 handle_bug+0x4f/0x90 exc_invalid_op+0x13/0x60 asm_exc_invalid_op+0x16/0x20 __netdev_update_features+0x65f/0xca0 netif_disable_lro+0x30/0x1d0 inetdev_init+0x12f/0x1f0 inetdev_event+0x48b/0x870 notifier_call_chain+0x38/0xf0 register_netdevice+0x741/0x8b0 register_netdev+0x1f/0x40 mlx5e_probe+0x4e3/0x8e0 [mlx5_core] auxiliary_bus_probe+0x3f/0x90 really_probe+0xc3/0x3a0 __driver_probe_device+0x80/0x150 driver_probe_device+0x1f/0x90 __device_attach_driver+0x7d/0x100 bus_for_each_drv+0x80/0xd0 __device_attach+0xb4/0x1c0 bus_probe_device+0x91/0xa0 device_add+0x657/0x870 Reviewed-by: Jakub Kicinski Reported-by: Cosmin Ratiu Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations") Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-3-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fa79145518d1..cf3b6445817b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4192,7 +4192,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, int netif_set_alias(struct net_device *dev, const char *alias, size_t len); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); -int netif_change_net_namespace(struct net_device *dev, struct net *net, +int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex, struct netlink_ext_ack *extack); int dev_change_net_namespace(struct net_device *dev, struct net *net, -- cgit v1.2.3 From 459a35111b0a890172a78d51c01b204e13a34a18 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 14 Mar 2025 19:46:23 -0700 Subject: KVM: Allow building irqbypass.ko as as module when kvm.ko is a module Convert HAVE_KVM_IRQ_BYPASS into a tristate so that selecting IRQ_BYPASS_MANAGER follows KVM={m,y}, i.e. doesn't force irqbypass.ko to be built-in. Note, PPC allows building KVM as a module, but selects HAVE_KVM_IRQ_BYPASS from a boolean Kconfig, i.e. KVM PPC unnecessarily forces irqbpass.ko to be built-in. But that flaw is a longstanding PPC specific issue. Fixes: 61df71ee992d ("kvm: move "select IRQ_BYPASS_MANAGER" to common code") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20250315024623.2363994-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5438a1b446a6..291d49b9bf05 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2382,7 +2382,7 @@ static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot) struct kvm_vcpu *kvm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); -#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); -- cgit v1.2.3 From 825dfab23bca520629a9e5a21ba5b03aaccc75f2 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:28:55 +0100 Subject: irqdomain: Rename irq_set_default_host() to irq_set_default_domain() Naming interrupt domains host is confusing at best and the irqdomain code uses both domain and host inconsistently. Therefore rename irq_set_default_host() to irq_set_default_domain(). Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-3-jirislaby@kernel.org --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 33ff41eef8f7..4b5c495b5710 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -352,7 +352,7 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, void *host_data); struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); -void irq_set_default_host(struct irq_domain *host); +void irq_set_default_domain(struct irq_domain *domain); struct irq_domain *irq_get_default_host(void); int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node, -- cgit v1.2.3 From 0a27ea384c82e70d16e40adbaebeb3725f7e6342 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:28:56 +0100 Subject: irqdomain: Rename irq_get_default_host() to irq_get_default_domain() Naming interrupt domains host is confusing at best and the irqdomain code uses both domain and host inconsistently. Therefore rename irq_get_default_host() to irq_get_default_domain(). Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-4-jirislaby@kernel.org --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 4b5c495b5710..e9ab95fbc5a9 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -353,7 +353,7 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); void irq_set_default_domain(struct irq_domain *domain); -struct irq_domain *irq_get_default_host(void); +struct irq_domain *irq_get_default_domain(void); int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity); -- cgit v1.2.3 From d2705d33885e3a19f727dff2521fb7d5b1fc5cda Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:28:57 +0100 Subject: irqdomain: Stop using 'host' for domain It is confusing to see 'host' and 'domain' to be used as 'domain'. Given this header is all about domains, switch the remaining 'host' uses to 'domain'. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-5-jirislaby@kernel.org --- include/linux/irqdomain.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e9ab95fbc5a9..bb7111105296 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -72,7 +72,7 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, /** * struct irq_domain_ops - Methods for irq_domain objects - * @match: Match an interrupt controller device node to a host, returns + * @match: Match an interrupt controller device node to a domain, returns * 1 on a match * @select: Match an interrupt controller fw specification. It is more generic * than @match as it receives a complete struct irq_fwspec. Therefore, @@ -454,7 +454,7 @@ static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod return IS_ERR(d) ? NULL : d; } -unsigned int irq_create_direct_mapping(struct irq_domain *host); +unsigned int irq_create_direct_mapping(struct irq_domain *domain); #endif static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, @@ -507,7 +507,7 @@ static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fw return IS_ERR(d) ? NULL : d; } -void irq_domain_remove(struct irq_domain *host); +void irq_domain_remove(struct irq_domain *domain); int irq_domain_associate(struct irq_domain *domain, unsigned int irq, irq_hw_number_t hwirq); @@ -515,16 +515,16 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count); -unsigned int irq_create_mapping_affinity(struct irq_domain *host, +unsigned int irq_create_mapping_affinity(struct irq_domain *domain, irq_hw_number_t hwirq, const struct irq_affinity_desc *affinity); unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); void irq_dispose_mapping(unsigned int virq); -static inline unsigned int irq_create_mapping(struct irq_domain *host, +static inline unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - return irq_create_mapping_affinity(host, hwirq, NULL); + return irq_create_mapping_affinity(domain, hwirq, NULL); } struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, -- cgit v1.2.3 From 8fa7292fee5c5240402371ea89ab285ec856c916 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 Apr 2025 10:17:26 +0200 Subject: treewide: Switch/rename to timer_delete[_sync]() timer_delete[_sync]() replaces del_timer[_sync](). Convert the whole tree over and remove the historical wrapper inlines. Conversion was done with coccinelle plus manual fixups where necessary. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/timer.h | 36 +----------------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index e67ecd1cbc97..10596d7c3a34 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -30,7 +30,7 @@ * * @TIMER_IRQSAFE: An irqsafe timer is executed with IRQ disabled and * it's safe to wait for the completion of the running instance from - * IRQ handlers, for example, by calling del_timer_sync(). + * IRQ handlers, for example, by calling timer_delete_sync(). * * Note: The irq disabled callback execution is a special case for * workqueue locking issues. It's not meant for executing random crap @@ -168,40 +168,6 @@ extern int timer_delete(struct timer_list *timer); extern int timer_shutdown_sync(struct timer_list *timer); extern int timer_shutdown(struct timer_list *timer); -/** - * del_timer_sync - Delete a pending timer and wait for a running callback - * @timer: The timer to be deleted - * - * See timer_delete_sync() for detailed explanation. - * - * Do not use in new code. Use timer_delete_sync() instead. - * - * Returns: - * * %0 - The timer was not pending - * * %1 - The timer was pending and deactivated - */ -static inline int del_timer_sync(struct timer_list *timer) -{ - return timer_delete_sync(timer); -} - -/** - * del_timer - Delete a pending timer - * @timer: The timer to be deleted - * - * See timer_delete() for detailed explanation. - * - * Do not use in new code. Use timer_delete() instead. - * - * Returns: - * * %0 - The timer was not pending - * * %1 - The timer was pending and deactivated - */ -static inline int del_timer(struct timer_list *timer) -{ - return timer_delete(timer); -} - extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); -- cgit v1.2.3 From 9779489a31d77a7b9cb6f20d2d2caced4e29dbe6 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:10 +0100 Subject: hrtimers: Delete hrtimer_init() hrtimer_init() is now unused. Delete it. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/003722f60c7a2a4f8d4ed24fb741aa313b7e5136.1738746927.git.namcao@linutronix.de --- include/linux/hrtimer.h | 2 -- include/linux/hrtimer_types.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 88e078871158..1adcba3ddd76 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -231,8 +231,6 @@ static inline enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) /* Exported timer functions: */ /* Initialize timers: */ -extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, - enum hrtimer_mode mode); extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_setup_on_stack(struct hrtimer *timer, diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index ad66a3081735..7c5b27daa89d 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -34,7 +34,7 @@ enum hrtimer_restart { * @is_hard: Set if hrtimer will be expired in hard interrupt context * even on RT. * - * The hrtimer structure must be initialized by hrtimer_init() + * The hrtimer structure must be initialized by hrtimer_setup() */ struct hrtimer { struct timerqueue_node node; -- cgit v1.2.3 From 04257da0c99c9d4ff7c5bb93046482e1f7d34938 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:16 +0100 Subject: hrtimers: Make callback function pointer private Make the struct hrtimer::function field private, to prevent users from changing this field in an unsafe way. hrtimer_update_function() should be used if the callback function needs to be changed. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/7d0e6e0c5c59a64a9bea940051aac05d750bc0c2.1738746927.git.namcao@linutronix.de --- include/linux/hrtimer_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 7c5b27daa89d..8fbbb6bdf7a1 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -39,7 +39,7 @@ enum hrtimer_restart { struct hrtimer { struct timerqueue_node node; ktime_t _softexpires; - enum hrtimer_restart (*function)(struct hrtimer *); + enum hrtimer_restart (*__private function)(struct hrtimer *); struct hrtimer_clock_base *base; u8 state; u8 is_rel; -- cgit v1.2.3 From 1c1fd374a2fe72b8a6dde62d3c3a9fd153e7581c Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 1 Apr 2025 15:36:37 +0200 Subject: mtd: spinand: Fix build with gcc < 7.5 __VA_OPT__ is a macro that is useful when some arguments can be present or not to entirely skip some part of a definition. Unfortunately, it is a too recent addition that some of the still supported old GCC versions do not know about, and is anyway not part of C11 that is the version used in the kernel. Find a trick to remove this macro, typically '__VA_ARGS__ + 0' is a workaround used in netlink.h which works very well here, as we either expect: - 0 - A positive value - No value, which means the field should be 0. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202503181330.YcDXGy7F-lkp@intel.com/ Fixes: 7ce0d16d5802 ("mtd: spinand: Add an optional frequency to read from cache macros") Cc: stable@vger.kernel.org Tested-by: Jean Delvare Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 1e748958dad4..311f145eb4e8 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -67,7 +67,7 @@ SPI_MEM_OP_ADDR(2, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 1), \ - __VA_OPT__(SPI_MEM_OP_MAX_FREQ(__VA_ARGS__))) + SPI_MEM_OP_MAX_FREQ(__VA_ARGS__ + 0)) #define SPINAND_PAGE_READ_FROM_CACHE_FAST_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1), \ -- cgit v1.2.3 From 75f8c87555e6ddeff2c49bd47460a71a940edc48 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 6 Mar 2025 09:45:52 +0100 Subject: irqchip/davinci: Remove leftover header Commit fa8dede4d0a0 ("irqchip: remove davinci aintc driver") removed the davinci aintc driver but left behind the associated header. Remove it now. Fixes: fa8dede4d0a0 ("irqchip: remove davinci aintc driver") Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Acked-by: Arnd Bergmann Link: https://lore.kernel.org/all/20250306084552.15894-1-brgl@bgdev.pl --- include/linux/irqchip/irq-davinci-aintc.h | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 include/linux/irqchip/irq-davinci-aintc.h (limited to 'include/linux') diff --git a/include/linux/irqchip/irq-davinci-aintc.h b/include/linux/irqchip/irq-davinci-aintc.h deleted file mode 100644 index ea4e087fac98..000000000000 --- a/include/linux/irqchip/irq-davinci-aintc.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2019 Texas Instruments - */ - -#ifndef _LINUX_IRQ_DAVINCI_AINTC_ -#define _LINUX_IRQ_DAVINCI_AINTC_ - -#include - -/** - * struct davinci_aintc_config - configuration data for davinci-aintc driver. - * - * @reg: register range to map - * @num_irqs: number of HW interrupts supported by the controller - * @prios: an array of size num_irqs containing priority settings for - * each interrupt - */ -struct davinci_aintc_config { - struct resource reg; - unsigned int num_irqs; - u8 *prios; -}; - -void davinci_aintc_init(const struct davinci_aintc_config *config); - -#endif /* _LINUX_IRQ_DAVINCI_AINTC_ */ -- cgit v1.2.3 From d8455a63f731b4f585acc4d49fd7ad78db63b3d0 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 13 Mar 2025 16:55:26 +0800 Subject: platform/x86: intel_pmc_ipc: add option to build without ACPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a configuration option that allows users to build the intel_pmc_ipc driver without ACPI support. This is useful for systems where ACPI is not available or desired. Based on the discussion from the patch [1], it was necessary to provide this option to accommodate specific use cases. Link: https://patchwork.kernel.org/project/netdevbpf/patch/20250227121522.1802832-6-yong.liang.choong@linux.intel.com/#26280764 [1] Signed-off-by: David E. Box Co-developed-by: Choong Yong Liang Signed-off-by: Choong Yong Liang Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250313085526.1439092-1-yong.liang.choong@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/intel_pmc_ipc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/intel_pmc_ipc.h b/include/linux/platform_data/x86/intel_pmc_ipc.h index 6e603a8c075f..1d34435b7001 100644 --- a/include/linux/platform_data/x86/intel_pmc_ipc.h +++ b/include/linux/platform_data/x86/intel_pmc_ipc.h @@ -36,6 +36,7 @@ struct pmc_ipc_rbuf { */ static inline int intel_pmc_ipc(struct pmc_ipc_cmd *ipc_cmd, struct pmc_ipc_rbuf *rbuf) { +#ifdef CONFIG_ACPI struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object params[PMC_IPCS_PARAM_COUNT] = { {.type = ACPI_TYPE_INTEGER,}, @@ -89,6 +90,9 @@ static inline int intel_pmc_ipc(struct pmc_ipc_cmd *ipc_cmd, struct pmc_ipc_rbuf } return 0; +#else + return -ENODEV; +#endif /* CONFIG_ACPI */ } #endif /* INTEL_PMC_IPC_H */ -- cgit v1.2.3 From 04efcee6ef8d0f01eef495db047e7216d6e6e38f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 4 Apr 2025 09:11:22 -0700 Subject: net: hold instance lock during NETDEV_CHANGE Cosmin reports an issue with ipv6_add_dev being called from NETDEV_CHANGE notifier: [ 3455.008776] ? ipv6_add_dev+0x370/0x620 [ 3455.010097] ipv6_find_idev+0x96/0xe0 [ 3455.010725] addrconf_add_dev+0x1e/0xa0 [ 3455.011382] addrconf_init_auto_addrs+0xb0/0x720 [ 3455.013537] addrconf_notify+0x35f/0x8d0 [ 3455.014214] notifier_call_chain+0x38/0xf0 [ 3455.014903] netdev_state_change+0x65/0x90 [ 3455.015586] linkwatch_do_dev+0x5a/0x70 [ 3455.016238] rtnl_getlink+0x241/0x3e0 [ 3455.019046] rtnetlink_rcv_msg+0x177/0x5e0 Similarly, linkwatch might get to ipv6_add_dev without ops lock: [ 3456.656261] ? ipv6_add_dev+0x370/0x620 [ 3456.660039] ipv6_find_idev+0x96/0xe0 [ 3456.660445] addrconf_add_dev+0x1e/0xa0 [ 3456.660861] addrconf_init_auto_addrs+0xb0/0x720 [ 3456.661803] addrconf_notify+0x35f/0x8d0 [ 3456.662236] notifier_call_chain+0x38/0xf0 [ 3456.662676] netdev_state_change+0x65/0x90 [ 3456.663112] linkwatch_do_dev+0x5a/0x70 [ 3456.663529] __linkwatch_run_queue+0xeb/0x200 [ 3456.663990] linkwatch_event+0x21/0x30 [ 3456.664399] process_one_work+0x211/0x610 [ 3456.664828] worker_thread+0x1cc/0x380 [ 3456.665691] kthread+0xf4/0x210 Reclassify NETDEV_CHANGE as a notifier that consistently runs under the instance lock. Link: https://lore.kernel.org/netdev/aac073de8beec3e531c86c101b274d434741c28e.camel@nvidia.com/ Reported-by: Cosmin Ratiu Tested-by: Cosmin Ratiu Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations") Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250404161122.3907628-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ include/linux/rtnetlink.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf3b6445817b..2d11d013cabe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4429,6 +4429,7 @@ void linkwatch_fire_event(struct net_device *dev); * pending work list (if queued). */ void linkwatch_sync_dev(struct net_device *dev); +void __linkwatch_sync_dev(struct net_device *dev); /** * netif_carrier_ok - test if carrier present @@ -4974,6 +4975,7 @@ void dev_set_rx_mode(struct net_device *dev); int dev_set_promiscuity(struct net_device *dev, int inc); int netif_set_allmulti(struct net_device *dev, int inc, bool notify); int dev_set_allmulti(struct net_device *dev, int inc); +void netif_state_change(struct net_device *dev); void netdev_state_change(struct net_device *dev); void __netdev_notify_peers(struct net_device *dev); void netdev_notify_peers(struct net_device *dev); diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index ccaaf4c7d5f6..ea39dd23a197 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -240,6 +240,6 @@ rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group) return (nlflags & NLM_F_ECHO) || rtnl_has_listeners(net, group); } -void netdev_set_operstate(struct net_device *dev, int newstate); +void netif_set_operstate(struct net_device *dev, int newstate); #endif /* __LINUX_RTNETLINK_H */ -- cgit v1.2.3 From 6deb8435f6bfcc9b6c7efe3b8a941ae2fb731495 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 1 Apr 2025 14:46:42 +0200 Subject: gpio: deprecate the GPIOD_FLAGS_BIT_NONEXCLUSIVE flag The non-exclusive GPIO request flag looks like a functional feature but is in fact a workaround for a corner-case that got out of hand. It should be removed so deprecate it officially so that nobody uses it anymore. Acked-by: Linus Walleij Link: https://lore.kernel.org/r/20250401-gpio-todo-remove-nonexclusive-v2-1-7c1380797b0d@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 45b651c05b9c..8adc8e9cb4a7 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -31,6 +31,7 @@ struct gpio_descs { #define GPIOD_FLAGS_BIT_DIR_OUT BIT(1) #define GPIOD_FLAGS_BIT_DIR_VAL BIT(2) #define GPIOD_FLAGS_BIT_OPEN_DRAIN BIT(3) +/* GPIOD_FLAGS_BIT_NONEXCLUSIVE is DEPRECATED, don't use in new code. */ #define GPIOD_FLAGS_BIT_NONEXCLUSIVE BIT(4) /** -- cgit v1.2.3 From 56799bc035658738f362acec3e7647bb84e68933 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 4 Mar 2025 14:54:46 +0100 Subject: perf: Fix hang while freeing sigtrap event Perf can hang while freeing a sigtrap event if a related deferred signal hadn't managed to be sent before the file got closed: perf_event_overflow() task_work_add(perf_pending_task) fput() task_work_add(____fput()) task_work_run() ____fput() perf_release() perf_event_release_kernel() _free_event() perf_pending_task_sync() task_work_cancel() -> FAILED rcuwait_wait_event() Once task_work_run() is running, the list of pending callbacks is removed from the task_struct and from this point on task_work_cancel() can't remove any pending and not yet started work items, hence the task_work_cancel() failure and the hang on rcuwait_wait_event(). Task work could be changed to remove one work at a time, so a work running on the current task can always cancel a pending one, however the wait / wake design is still subject to inverted dependencies when remote targets are involved, as pictured by Oleg: T1 T2 fd = perf_event_open(pid => T2->pid); fd = perf_event_open(pid => T1->pid); close(fd) close(fd) perf_event_overflow() perf_event_overflow() task_work_add(perf_pending_task) task_work_add(perf_pending_task) fput() fput() task_work_add(____fput()) task_work_add(____fput()) task_work_run() task_work_run() ____fput() ____fput() perf_release() perf_release() perf_event_release_kernel() perf_event_release_kernel() _free_event() _free_event() perf_pending_task_sync() perf_pending_task_sync() rcuwait_wait_event() rcuwait_wait_event() Therefore the only option left is to acquire the event reference count upon queueing the perf task work and release it from the task work, just like it was done before 3a5465418f5f ("perf: Fix event leak upon exec and file release") but without the leaks it fixed. Some adjustments are necessary to make it work: * A child event might dereference its parent upon freeing. Care must be taken to release the parent last. * Some places assuming the event doesn't have any reference held and therefore can be freed right away must instead put the reference and let the reference counting to its job. Reported-by: "Yi Lai" Closes: https://lore.kernel.org/all/Zx9Losv4YcJowaP%2F@ly-workstation/ Reported-by: syzbot+3c4321e10eea460eb606@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/673adf75.050a0220.87769.0024.GAE@google.com/ Fixes: 3a5465418f5f ("perf: Fix event leak upon exec and file release") Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250304135446.18905-1-frederic@kernel.org --- include/linux/perf_event.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5a9bf15d4461..0069ba6866a4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -823,7 +823,6 @@ struct perf_event { struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; - struct rcuwait pending_work_wait; atomic_t event_limit; -- cgit v1.2.3 From 2424e146bee00ddb4d4f79d3224f54634ca8d2bc Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Tue, 8 Apr 2025 12:38:54 +0200 Subject: hrtimer: Add missing ACCESS_PRIVATE() for hrtimer::function The "function" field of struct hrtimer has been changed to private, but two instances have not been converted to use ACCESS_PRIVATE(). Convert them to use ACCESS_PRIVATE(). Fixes: 04257da0c99c ("hrtimers: Make callback function pointer private") Reported-by: kernel test robot Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250408103854.1851093-1-namcao@linutronix.de Closes: https://lore.kernel.org/oe-kbuild-all/202504071931.vOVl13tt-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202504072155.5UAZjYGU-lkp@intel.com/ --- include/linux/hrtimer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1adcba3ddd76..1ef867bb8c44 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -345,7 +345,7 @@ static inline void hrtimer_update_function(struct hrtimer *timer, if (WARN_ON_ONCE(!function)) return; #endif - timer->function = function; + ACCESS_PRIVATE(timer, function) = function; } /* Forward a hrtimer so it expires after now: */ -- cgit v1.2.3 From 285b2c74cf9982e873ef82a2cb1328d9e9406f65 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 10 Apr 2025 14:21:29 +0100 Subject: firmware: cs_dsp: test_bin_error: Fix uninitialized data used as fw version Call cs_dsp_mock_xm_header_get_fw_version() to get the firmware version from the dummy XM header data in cs_dsp_bin_err_test_common_init(). Make the same change to cs_dsp_bin_test_common_init() and remove the cs_dsp_mock_xm_header_get_fw_version_from_regmap() function. The code in cs_dsp_test_bin.c was correctly calling cs_dsp_mock_xm_header_get_fw_version_from_regmap() to fetch the fw version from a dummy header it wrote to XM registers. However in cs_dsp_test_bin_error.c the test doesn't stuff a dummy header into XM, it populates it the normal way using a wmfw file. It should have called cs_dsp_mock_xm_header_get_fw_version() to get the data from its blob buffer, but was calling cs_dsp_mock_xm_header_get_fw_version_from_regmap(). As nothing had been written to the registers this returned the value of uninitialized data. The only other use of cs_dsp_mock_xm_header_get_fw_version_from_regmap() was cs_dsp_test_bin.c, but it doesn't need to use it. It already has a blob buffer containing the dummy XM header so it can use cs_dsp_mock_xm_header_get_fw_version() to read from that. Fixes: cd8c058499b6 ("firmware: cs_dsp: Add KUnit testing of bin error cases") Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20250410132129.1312541-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp_test_utils.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp_test_utils.h b/include/linux/firmware/cirrus/cs_dsp_test_utils.h index 4f87a908ab4f..ecd821ed8064 100644 --- a/include/linux/firmware/cirrus/cs_dsp_test_utils.h +++ b/include/linux/firmware/cirrus/cs_dsp_test_utils.h @@ -104,7 +104,6 @@ unsigned int cs_dsp_mock_num_dsp_words_to_num_packed_regs(unsigned int num_dsp_w unsigned int cs_dsp_mock_xm_header_get_alg_base_in_words(struct cs_dsp_test *priv, unsigned int alg_id, int mem_type); -unsigned int cs_dsp_mock_xm_header_get_fw_version_from_regmap(struct cs_dsp_test *priv); unsigned int cs_dsp_mock_xm_header_get_fw_version(struct cs_dsp_mock_xm_header *header); void cs_dsp_mock_xm_header_drop_from_regmap_cache(struct cs_dsp_test *priv); int cs_dsp_mock_xm_header_write_to_regmap(struct cs_dsp_mock_xm_header *header); -- cgit v1.2.3 From b2b4483b5d05026218127fc8f38c69adf69c235b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 8 Apr 2025 13:00:53 -0700 Subject: dcache: convert dentry flag macros to enum Commit 9748cb2dc393 ("VFS: repack DENTRY_ flags.") changed the value of DCACHE_MOUNTED, which broke drgn's path_lookup() helper. drgn is forced to hard-code it because it's a macro, and macros aren't preserved in debugging information by default. Enums, on the other hand, are included in debugging information. Convert the DCACHE_* flag macros to an enum so that debugging tools like drgn and bpftrace can make use of them. Link: https://github.com/osandov/drgn/blob/2027d0fea84d74b835e77392f7040c2a333180c6/drgn/helpers/linux/fs.py#L43-L46 Signed-off-by: Omar Sandoval Link: https://lore.kernel.org/177665a082f048cf536b9cd6af467b3be6b6e6ed.1744141838.git.osandov@fb.com Signed-off-by: Christian Brauner --- include/linux/dcache.h | 106 +++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8d1395f945bf..e9f07e37dd6f 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -173,65 +173,59 @@ struct dentry_operations { */ /* d_flags entries */ -#define DCACHE_OP_HASH BIT(0) -#define DCACHE_OP_COMPARE BIT(1) -#define DCACHE_OP_REVALIDATE BIT(2) -#define DCACHE_OP_DELETE BIT(3) -#define DCACHE_OP_PRUNE BIT(4) - -#define DCACHE_DISCONNECTED BIT(5) - /* This dentry is possibly not currently connected to the dcache tree, in - * which case its parent will either be itself, or will have this flag as - * well. nfsd will not use a dentry with this bit set, but will first - * endeavour to clear the bit either by discovering that it is connected, - * or by performing lookup operations. Any filesystem which supports - * nfsd_operations MUST have a lookup function which, if it finds a - * directory inode with a DCACHE_DISCONNECTED dentry, will d_move that - * dentry into place and return that dentry rather than the passed one, - * typically using d_splice_alias. */ - -#define DCACHE_REFERENCED BIT(6) /* Recently used, don't discard. */ - -#define DCACHE_DONTCACHE BIT(7) /* Purge from memory on final dput() */ - -#define DCACHE_CANT_MOUNT BIT(8) -#define DCACHE_GENOCIDE BIT(9) -#define DCACHE_SHRINK_LIST BIT(10) - -#define DCACHE_OP_WEAK_REVALIDATE BIT(11) - -#define DCACHE_NFSFS_RENAMED BIT(12) - /* this dentry has been "silly renamed" and has to be deleted on the last - * dput() */ -#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(13) - /* Parent inode is watched by some fsnotify listener */ - -#define DCACHE_DENTRY_KILLED BIT(14) - -#define DCACHE_MOUNTED BIT(15) /* is a mountpoint */ -#define DCACHE_NEED_AUTOMOUNT BIT(16) /* handle automount on this dir */ -#define DCACHE_MANAGE_TRANSIT BIT(17) /* manage transit from this dirent */ +enum dentry_flags { + DCACHE_OP_HASH = BIT(0), + DCACHE_OP_COMPARE = BIT(1), + DCACHE_OP_REVALIDATE = BIT(2), + DCACHE_OP_DELETE = BIT(3), + DCACHE_OP_PRUNE = BIT(4), + /* + * This dentry is possibly not currently connected to the dcache tree, + * in which case its parent will either be itself, or will have this + * flag as well. nfsd will not use a dentry with this bit set, but will + * first endeavour to clear the bit either by discovering that it is + * connected, or by performing lookup operations. Any filesystem which + * supports nfsd_operations MUST have a lookup function which, if it + * finds a directory inode with a DCACHE_DISCONNECTED dentry, will + * d_move that dentry into place and return that dentry rather than the + * passed one, typically using d_splice_alias. + */ + DCACHE_DISCONNECTED = BIT(5), + DCACHE_REFERENCED = BIT(6), /* Recently used, don't discard. */ + DCACHE_DONTCACHE = BIT(7), /* Purge from memory on final dput() */ + DCACHE_CANT_MOUNT = BIT(8), + DCACHE_GENOCIDE = BIT(9), + DCACHE_SHRINK_LIST = BIT(10), + DCACHE_OP_WEAK_REVALIDATE = BIT(11), + /* + * this dentry has been "silly renamed" and has to be deleted on the + * last dput() + */ + DCACHE_NFSFS_RENAMED = BIT(12), + DCACHE_FSNOTIFY_PARENT_WATCHED = BIT(13), /* Parent inode is watched by some fsnotify listener */ + DCACHE_DENTRY_KILLED = BIT(14), + DCACHE_MOUNTED = BIT(15), /* is a mountpoint */ + DCACHE_NEED_AUTOMOUNT = BIT(16), /* handle automount on this dir */ + DCACHE_MANAGE_TRANSIT = BIT(17), /* manage transit from this dirent */ + DCACHE_LRU_LIST = BIT(18), + DCACHE_ENTRY_TYPE = (7 << 19), /* bits 19..21 are for storing type: */ + DCACHE_MISS_TYPE = (0 << 19), /* Negative dentry */ + DCACHE_WHITEOUT_TYPE = (1 << 19), /* Whiteout dentry (stop pathwalk) */ + DCACHE_DIRECTORY_TYPE = (2 << 19), /* Normal directory */ + DCACHE_AUTODIR_TYPE = (3 << 19), /* Lookupless directory (presumed automount) */ + DCACHE_REGULAR_TYPE = (4 << 19), /* Regular file type */ + DCACHE_SPECIAL_TYPE = (5 << 19), /* Other file type */ + DCACHE_SYMLINK_TYPE = (6 << 19), /* Symlink */ + DCACHE_NOKEY_NAME = BIT(22), /* Encrypted name encoded without key */ + DCACHE_OP_REAL = BIT(23), + DCACHE_PAR_LOOKUP = BIT(24), /* being looked up (with parent locked shared) */ + DCACHE_DENTRY_CURSOR = BIT(25), + DCACHE_NORCU = BIT(26), /* No RCU delay for freeing */ +}; + #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) -#define DCACHE_LRU_LIST BIT(18) - -#define DCACHE_ENTRY_TYPE (7 << 19) /* bits 19..21 are for storing type: */ -#define DCACHE_MISS_TYPE (0 << 19) /* Negative dentry */ -#define DCACHE_WHITEOUT_TYPE (1 << 19) /* Whiteout dentry (stop pathwalk) */ -#define DCACHE_DIRECTORY_TYPE (2 << 19) /* Normal directory */ -#define DCACHE_AUTODIR_TYPE (3 << 19) /* Lookupless directory (presumed automount) */ -#define DCACHE_REGULAR_TYPE (4 << 19) /* Regular file type */ -#define DCACHE_SPECIAL_TYPE (5 << 19) /* Other file type */ -#define DCACHE_SYMLINK_TYPE (6 << 19) /* Symlink */ - -#define DCACHE_NOKEY_NAME BIT(22) /* Encrypted name encoded without key */ -#define DCACHE_OP_REAL BIT(23) - -#define DCACHE_PAR_LOOKUP BIT(24) /* being looked up (with parent locked shared) */ -#define DCACHE_DENTRY_CURSOR BIT(25) -#define DCACHE_NORCU BIT(26) /* No RCU delay for freeing */ - extern seqlock_t rename_lock; /* -- cgit v1.2.3 From 51339d99c0131bc0d16d378e9b05bc498d2967e2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 2 Apr 2025 19:55:14 -0700 Subject: locking/local_lock, mm: replace localtry_ helpers with local_trylock_t type Partially revert commit 0aaddfb06882 ("locking/local_lock: Introduce localtry_lock_t"). Remove localtry_*() helpers, since localtry_lock() name might be misinterpreted as "try lock". Introduce local_trylock[_irqsave]() helpers that only work with newly introduced local_trylock_t type. Note that attempt to use local_trylock[_irqsave]() with local_lock_t will cause compilation failure. Usage and behavior in !PREEMPT_RT: local_lock_t lock; // sizeof(lock) == 0 local_lock(&lock); // preempt disable local_lock_irqsave(&lock, ...); // irq save if (local_trylock_irqsave(&lock, ...)) // compilation error local_trylock_t lock; // sizeof(lock) == 4 local_lock(&lock); // preempt disable, acquired = 1 local_lock_irqsave(&lock, ...); // irq save, acquired = 1 if (local_trylock(&lock)) // if (!acquired) preempt disable, acquired = 1 if (local_trylock_irqsave(&lock, ...)) // if (!acquired) irq save, acquired = 1 The existing local_lock_*() macros can be used either with local_lock_t or local_trylock_t. With local_trylock_t they set acquired = 1 while local_unlock_*() clears it. In !PREEMPT_RT local_lock_irqsave(local_lock_t *) disables interrupts to protect critical section, but it doesn't prevent NMI, so the fully reentrant code cannot use local_lock_irqsave(local_lock_t *) for exclusive access. The local_lock_irqsave(local_trylock_t *) helper disables interrupts and sets acquired=1, so local_trylock_irqsave(local_trylock_t *) from NMI attempting to acquire the same lock will return false. In PREEMPT_RT local_lock_irqsave() maps to preemptible spin_lock(). Map local_trylock_irqsave() to preemptible spin_trylock(). When in hard IRQ or NMI return false right away, since spin_trylock() is not safe due to explicit locking in the underneath rt_spin_trylock() implementation. Removing this explicit locking and attempting only "trylock" is undesired due to PI implications. The local_trylock() without _irqsave can be used to avoid the cost of disabling/enabling interrupts by only disabling preemption, so local_trylock() in an interrupt attempting to acquire the same lock will return false. Note there is no need to use local_inc for acquired variable, since it's a percpu variable with strict nesting scopes. Note that guard(local_lock)(&lock) works only for "local_lock_t lock". The patch also makes sure that local_lock_release(l) is called before WRITE_ONCE(l->acquired, 0). Though IRQs are disabled at this point the local_trylock() from NMI will succeed and local_lock_acquire(l) will warn. Link: https://lkml.kernel.org/r/20250403025514.41186-1-alexei.starovoitov@gmail.com Fixes: 0aaddfb06882 ("locking/local_lock: Introduce localtry_lock_t") Signed-off-by: Alexei Starovoitov Acked-by: Vlastimil Babka Acked-by: Sebastian Andrzej Siewior Reviewed-by: Shakeel Butt Cc: Daniel Borkman Cc: Linus Torvalds Cc: Martin KaFai Lau Cc: Michal Hocko Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/local_lock.h | 58 ++-------- include/linux/local_lock_internal.h | 207 +++++++++++++++--------------------- 2 files changed, 95 insertions(+), 170 deletions(-) (limited to 'include/linux') diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 1a0bc35839e3..16a2ee4f8310 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -52,44 +52,23 @@ __local_unlock_irqrestore(lock, flags) /** - * localtry_lock_init - Runtime initialize a lock instance - */ -#define localtry_lock_init(lock) __localtry_lock_init(lock) - -/** - * localtry_lock - Acquire a per CPU local lock - * @lock: The lock variable - */ -#define localtry_lock(lock) __localtry_lock(lock) - -/** - * localtry_lock_irq - Acquire a per CPU local lock and disable interrupts - * @lock: The lock variable - */ -#define localtry_lock_irq(lock) __localtry_lock_irq(lock) - -/** - * localtry_lock_irqsave - Acquire a per CPU local lock, save and disable - * interrupts - * @lock: The lock variable - * @flags: Storage for interrupt flags + * local_lock_init - Runtime initialize a lock instance */ -#define localtry_lock_irqsave(lock, flags) \ - __localtry_lock_irqsave(lock, flags) +#define local_trylock_init(lock) __local_trylock_init(lock) /** - * localtry_trylock - Try to acquire a per CPU local lock. + * local_trylock - Try to acquire a per CPU local lock * @lock: The lock variable * * The function can be used in any context such as NMI or HARDIRQ. Due to * locking constrains it will _always_ fail to acquire the lock in NMI or * HARDIRQ context on PREEMPT_RT. */ -#define localtry_trylock(lock) __localtry_trylock(lock) +#define local_trylock(lock) __local_trylock(lock) /** - * localtry_trylock_irqsave - Try to acquire a per CPU local lock, save and disable - * interrupts if acquired + * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable + * interrupts if acquired * @lock: The lock variable * @flags: Storage for interrupt flags * @@ -97,29 +76,8 @@ * locking constrains it will _always_ fail to acquire the lock in NMI or * HARDIRQ context on PREEMPT_RT. */ -#define localtry_trylock_irqsave(lock, flags) \ - __localtry_trylock_irqsave(lock, flags) - -/** - * local_unlock - Release a per CPU local lock - * @lock: The lock variable - */ -#define localtry_unlock(lock) __localtry_unlock(lock) - -/** - * local_unlock_irq - Release a per CPU local lock and enable interrupts - * @lock: The lock variable - */ -#define localtry_unlock_irq(lock) __localtry_unlock_irq(lock) - -/** - * localtry_unlock_irqrestore - Release a per CPU local lock and restore - * interrupt flags - * @lock: The lock variable - * @flags: Interrupt flags to restore - */ -#define localtry_unlock_irqrestore(lock, flags) \ - __localtry_unlock_irqrestore(lock, flags) +#define local_trylock_irqsave(lock, flags) \ + __local_trylock_irqsave(lock, flags) DEFINE_GUARD(local_lock, local_lock_t __percpu*, local_lock(_T), diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 67bd13d142fa..bf2bf40d7b18 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -15,10 +15,11 @@ typedef struct { #endif } local_lock_t; +/* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */ typedef struct { local_lock_t llock; - unsigned int acquired; -} localtry_lock_t; + u8 acquired; +} local_trylock_t; #ifdef CONFIG_DEBUG_LOCK_ALLOC # define LOCAL_LOCK_DEBUG_INIT(lockname) \ @@ -29,6 +30,9 @@ typedef struct { }, \ .owner = NULL, +# define LOCAL_TRYLOCK_DEBUG_INIT(lockname) \ + .llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) }, + static inline void local_lock_acquire(local_lock_t *l) { lock_map_acquire(&l->dep_map); @@ -56,6 +60,7 @@ static inline void local_lock_debug_init(local_lock_t *l) } #else /* CONFIG_DEBUG_LOCK_ALLOC */ # define LOCAL_LOCK_DEBUG_INIT(lockname) +# define LOCAL_TRYLOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { } static inline void local_trylock_acquire(local_lock_t *l) { } static inline void local_lock_release(local_lock_t *l) { } @@ -63,7 +68,7 @@ static inline void local_lock_debug_init(local_lock_t *l) { } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } -#define INIT_LOCALTRY_LOCK(lockname) { .llock = { LOCAL_LOCK_DEBUG_INIT(lockname.llock) }} +#define INIT_LOCAL_TRYLOCK(lockname) { LOCAL_TRYLOCK_DEBUG_INIT(lockname) } #define __local_lock_init(lock) \ do { \ @@ -76,6 +81,8 @@ do { \ local_lock_debug_init(lock); \ } while (0) +#define __local_trylock_init(lock) __local_lock_init(lock.llock) + #define __spinlock_nested_bh_init(lock) \ do { \ static struct lock_class_key __key; \ @@ -87,149 +94,117 @@ do { \ local_lock_debug_init(lock); \ } while (0) +#define __local_lock_acquire(lock) \ + do { \ + local_trylock_t *tl; \ + local_lock_t *l; \ + \ + l = (local_lock_t *)this_cpu_ptr(lock); \ + tl = (local_trylock_t *)l; \ + _Generic((lock), \ + local_trylock_t *: ({ \ + lockdep_assert(tl->acquired == 0); \ + WRITE_ONCE(tl->acquired, 1); \ + }), \ + default:(void)0); \ + local_lock_acquire(l); \ + } while (0) + #define __local_lock(lock) \ do { \ preempt_disable(); \ - local_lock_acquire(this_cpu_ptr(lock)); \ + __local_lock_acquire(lock); \ } while (0) #define __local_lock_irq(lock) \ do { \ local_irq_disable(); \ - local_lock_acquire(this_cpu_ptr(lock)); \ + __local_lock_acquire(lock); \ } while (0) #define __local_lock_irqsave(lock, flags) \ do { \ local_irq_save(flags); \ - local_lock_acquire(this_cpu_ptr(lock)); \ - } while (0) - -#define __local_unlock(lock) \ - do { \ - local_lock_release(this_cpu_ptr(lock)); \ - preempt_enable(); \ + __local_lock_acquire(lock); \ } while (0) -#define __local_unlock_irq(lock) \ - do { \ - local_lock_release(this_cpu_ptr(lock)); \ - local_irq_enable(); \ - } while (0) - -#define __local_unlock_irqrestore(lock, flags) \ - do { \ - local_lock_release(this_cpu_ptr(lock)); \ - local_irq_restore(flags); \ - } while (0) - -#define __local_lock_nested_bh(lock) \ - do { \ - lockdep_assert_in_softirq(); \ - local_lock_acquire(this_cpu_ptr(lock)); \ - } while (0) - -#define __local_unlock_nested_bh(lock) \ - local_lock_release(this_cpu_ptr(lock)) - -/* localtry_lock_t variants */ - -#define __localtry_lock_init(lock) \ -do { \ - __local_lock_init(&(lock)->llock); \ - WRITE_ONCE((lock)->acquired, 0); \ -} while (0) - -#define __localtry_lock(lock) \ - do { \ - localtry_lock_t *lt; \ - preempt_disable(); \ - lt = this_cpu_ptr(lock); \ - local_lock_acquire(<->llock); \ - WRITE_ONCE(lt->acquired, 1); \ - } while (0) - -#define __localtry_lock_irq(lock) \ - do { \ - localtry_lock_t *lt; \ - local_irq_disable(); \ - lt = this_cpu_ptr(lock); \ - local_lock_acquire(<->llock); \ - WRITE_ONCE(lt->acquired, 1); \ - } while (0) - -#define __localtry_lock_irqsave(lock, flags) \ - do { \ - localtry_lock_t *lt; \ - local_irq_save(flags); \ - lt = this_cpu_ptr(lock); \ - local_lock_acquire(<->llock); \ - WRITE_ONCE(lt->acquired, 1); \ - } while (0) - -#define __localtry_trylock(lock) \ +#define __local_trylock(lock) \ ({ \ - localtry_lock_t *lt; \ - bool _ret; \ + local_trylock_t *tl; \ \ preempt_disable(); \ - lt = this_cpu_ptr(lock); \ - if (!READ_ONCE(lt->acquired)) { \ - WRITE_ONCE(lt->acquired, 1); \ - local_trylock_acquire(<->llock); \ - _ret = true; \ - } else { \ - _ret = false; \ + tl = this_cpu_ptr(lock); \ + if (READ_ONCE(tl->acquired)) { \ preempt_enable(); \ + tl = NULL; \ + } else { \ + WRITE_ONCE(tl->acquired, 1); \ + local_trylock_acquire( \ + (local_lock_t *)tl); \ } \ - _ret; \ + !!tl; \ }) -#define __localtry_trylock_irqsave(lock, flags) \ +#define __local_trylock_irqsave(lock, flags) \ ({ \ - localtry_lock_t *lt; \ - bool _ret; \ + local_trylock_t *tl; \ \ local_irq_save(flags); \ - lt = this_cpu_ptr(lock); \ - if (!READ_ONCE(lt->acquired)) { \ - WRITE_ONCE(lt->acquired, 1); \ - local_trylock_acquire(<->llock); \ - _ret = true; \ - } else { \ - _ret = false; \ + tl = this_cpu_ptr(lock); \ + if (READ_ONCE(tl->acquired)) { \ local_irq_restore(flags); \ + tl = NULL; \ + } else { \ + WRITE_ONCE(tl->acquired, 1); \ + local_trylock_acquire( \ + (local_lock_t *)tl); \ } \ - _ret; \ + !!tl; \ }) -#define __localtry_unlock(lock) \ +#define __local_lock_release(lock) \ + do { \ + local_trylock_t *tl; \ + local_lock_t *l; \ + \ + l = (local_lock_t *)this_cpu_ptr(lock); \ + tl = (local_trylock_t *)l; \ + local_lock_release(l); \ + _Generic((lock), \ + local_trylock_t *: ({ \ + lockdep_assert(tl->acquired == 1); \ + WRITE_ONCE(tl->acquired, 0); \ + }), \ + default:(void)0); \ + } while (0) + +#define __local_unlock(lock) \ do { \ - localtry_lock_t *lt; \ - lt = this_cpu_ptr(lock); \ - WRITE_ONCE(lt->acquired, 0); \ - local_lock_release(<->llock); \ + __local_lock_release(lock); \ preempt_enable(); \ } while (0) -#define __localtry_unlock_irq(lock) \ +#define __local_unlock_irq(lock) \ do { \ - localtry_lock_t *lt; \ - lt = this_cpu_ptr(lock); \ - WRITE_ONCE(lt->acquired, 0); \ - local_lock_release(<->llock); \ + __local_lock_release(lock); \ local_irq_enable(); \ } while (0) -#define __localtry_unlock_irqrestore(lock, flags) \ +#define __local_unlock_irqrestore(lock, flags) \ do { \ - localtry_lock_t *lt; \ - lt = this_cpu_ptr(lock); \ - WRITE_ONCE(lt->acquired, 0); \ - local_lock_release(<->llock); \ + __local_lock_release(lock); \ local_irq_restore(flags); \ } while (0) +#define __local_lock_nested_bh(lock) \ + do { \ + lockdep_assert_in_softirq(); \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_unlock_nested_bh(lock) \ + local_lock_release(this_cpu_ptr(lock)) + #else /* !CONFIG_PREEMPT_RT */ /* @@ -237,16 +212,18 @@ do { \ * critical section while staying preemptible. */ typedef spinlock_t local_lock_t; -typedef spinlock_t localtry_lock_t; +typedef spinlock_t local_trylock_t; #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) -#define INIT_LOCALTRY_LOCK(lockname) INIT_LOCAL_LOCK(lockname) +#define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) #define __local_lock_init(l) \ do { \ local_spin_lock_init((l)); \ } while (0) +#define __local_trylock_init(l) __local_lock_init(l) + #define __local_lock(__lock) \ do { \ migrate_disable(); \ @@ -283,17 +260,7 @@ do { \ spin_unlock(this_cpu_ptr((lock))); \ } while (0) -/* localtry_lock_t variants */ - -#define __localtry_lock_init(lock) __local_lock_init(lock) -#define __localtry_lock(lock) __local_lock(lock) -#define __localtry_lock_irq(lock) __local_lock(lock) -#define __localtry_lock_irqsave(lock, flags) __local_lock_irqsave(lock, flags) -#define __localtry_unlock(lock) __local_unlock(lock) -#define __localtry_unlock_irq(lock) __local_unlock(lock) -#define __localtry_unlock_irqrestore(lock, flags) __local_unlock_irqrestore(lock, flags) - -#define __localtry_trylock(lock) \ +#define __local_trylock(lock) \ ({ \ int __locked; \ \ @@ -308,11 +275,11 @@ do { \ __locked; \ }) -#define __localtry_trylock_irqsave(lock, flags) \ +#define __local_trylock_irqsave(lock, flags) \ ({ \ typecheck(unsigned long, flags); \ flags = 0; \ - __localtry_trylock(lock); \ + __local_trylock(lock); \ }) #endif /* CONFIG_PREEMPT_RT */ -- cgit v1.2.3 From 8c56c5dbcf52220cc9be7a36e7f21ebd5939e0b9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Apr 2025 10:59:50 +0200 Subject: mm: (un)track_pfn_copy() fix + doc improvements We got a late smatch warning and some additional review feedback. smatch warnings: mm/memory.c:1428 copy_page_range() error: uninitialized symbol 'pfn'. We actually use the pfn only when it is properly initialized; however, we may pass an uninitialized value to a function -- although it will not use it that likely still is UB in C. So let's just fix it by always initializing pfn in the caller of track_pfn_copy(), and improving the documentation of track_pfn_copy(). While at it, clarify the doc of untrack_pfn_copy(), that internal checks make sure if we actually have to untrack anything. Link: https://lkml.kernel.org/r/20250408085950.976103-1-david@redhat.com Fixes: dc84bc2aba85 ("x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range()") Signed-off-by: David Hildenbrand Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202503270941.IFILyNCX-lkp@intel.com/ Reviewed-by: Lorenzo Stoakes Acked-by: Ingo Molnar Cc: Andrew Morton Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: Rik van Riel Cc: "H. Peter Anvin" Cc: Linus Torvalds Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e2b705c14945..b50447ef1c92 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1511,8 +1511,9 @@ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, /* * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page - * tables copied during copy_page_range(). On success, stores the pfn to be - * passed to untrack_pfn_copy(). + * tables copied during copy_page_range(). Will store the pfn to be + * passed to untrack_pfn_copy() only if there is something to be untracked. + * Callers should initialize the pfn to 0. */ static inline int track_pfn_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long *pfn) @@ -1522,7 +1523,9 @@ static inline int track_pfn_copy(struct vm_area_struct *dst_vma, /* * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during - * copy_page_range(), but after track_pfn_copy() was already called. + * copy_page_range(), but after track_pfn_copy() was already called. Can + * be called even if track_pfn_copy() did not actually track anything: + * handled internally. */ static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn) -- cgit v1.2.3 From cd35b6cb46649750b7dbd0df0e2d767415d8917b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 1 Apr 2025 15:02:21 -0700 Subject: nfs: add missing selections of CONFIG_CRC32 nfs.ko, nfsd.ko, and lockd.ko all use crc32_le(), which is available only when CONFIG_CRC32 is enabled. But the only NFS kconfig option that selected CONFIG_CRC32 was CONFIG_NFS_DEBUG, which is client-specific and did not actually guard the use of crc32_le() even on the client. The code worked around this bug by only actually calling crc32_le() when CONFIG_CRC32 is built-in, instead hard-coding '0' in other cases. This avoided randconfig build errors, and in real kernels the fallback code was unlikely to be reached since CONFIG_CRC32 is 'default y'. But, this really needs to just be done properly, especially now that I'm planning to update CONFIG_CRC32 to not be 'default y'. Therefore, make CONFIG_NFS_FS, CONFIG_NFSD, and CONFIG_LOCKD select CONFIG_CRC32. Then remove the fallback code that becomes unnecessary, as well as the selection of CONFIG_CRC32 from CONFIG_NFS_DEBUG. Fixes: 1264a2f053a3 ("NFS: refactor code for calculating the crc32 hash of a filehandle") Signed-off-by: Eric Biggers Acked-by: Anna Schumaker Signed-off-by: Chuck Lever --- include/linux/nfs.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs.h b/include/linux/nfs.h index 9ad727ddfedb..0906a0b40c6a 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -55,7 +55,6 @@ enum nfs3_stable_how { NFS_INVALID_STABLE_HOW = -1 }; -#ifdef CONFIG_CRC32 /** * nfs_fhandle_hash - calculate the crc32 hash for the filehandle * @fh - pointer to filehandle @@ -67,10 +66,4 @@ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) { return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); } -#else /* CONFIG_CRC32 */ -static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) -{ - return 0; -} -#endif /* CONFIG_CRC32 */ #endif /* _LINUX_NFS_H */ -- cgit v1.2.3 From cf761e3dacc6ad5f65a4886d00da1f9681e6805a Mon Sep 17 00:00:00 2001 From: Jonathan Currier Date: Sun, 17 Nov 2024 17:48:42 -0600 Subject: PCI/MSI: Add an option to write MSIX ENTRY_DATA before any reads Commit 7d5ec3d36123 ("PCI/MSI: Mask all unused MSI-X entries") introduced a readl() from ENTRY_VECTOR_CTRL before the writel() to ENTRY_DATA. This is correct, however some hardware, like the Sun Neptune chips, the NIU module, will cause an error and/or fatal trap if any MSIX table entry is read before the corresponding ENTRY_DATA field is written to. Add an optional early writel() in msix_prepare_msi_desc(). Fixes: 7d5ec3d36123 ("PCI/MSI: Mask all unused MSI-X entries") Signed-off-by: Jonathan Currier Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20241117234843.19236-2-dullfire@yahoo.com --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0e8e3fd77e96..51e2bd6405cd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -245,6 +245,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_NO_RELAXED_ORDERING = (__force pci_dev_flags_t) (1 << 11), /* Device does honor MSI masking despite saying otherwise */ PCI_DEV_FLAGS_HAS_MSI_MASKING = (__force pci_dev_flags_t) (1 << 12), + /* Device requires write to PCI_MSIX_ENTRY_DATA before any MSIX reads */ + PCI_DEV_FLAGS_MSIX_TOUCH_ENTRY_DATA_FIRST = (__force pci_dev_flags_t) (1 << 13), }; enum pci_irq_reroute_variant { -- cgit v1.2.3 From c86b300b1ea35959a6e2a63a6497226a6ea90b67 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Apr 2025 22:13:33 +0200 Subject: fs: add kern_path_locked_negative() The audit code relies on the fact that kern_path_locked() returned a path even for a negative dentry. If it doesn't find a valid dentry it immediately calls: audit_find_parent(d_backing_inode(parent_path.dentry)); which assumes that parent_path.dentry is still valid. But it isn't since kern_path_locked() has been changed to path_put() also for a negative dentry. Fix this by adding a helper that implements the required audit semantics and allows us to fix the immediate bleeding. We can find a unified solution for this afterwards. Link: https://lore.kernel.org/20250414-rennt-wimmeln-f186c3a780f1@brauner Fixes: 1c3cb50b58c3 ("VFS: change kern_path_locked() and user_path_locked_at() to never return negative dentry") Reported-and-tested-by: Vlastimil Babka Signed-off-by: Christian Brauner --- include/linux/namei.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index e3042176cdf4..bbaf55fb3101 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -62,6 +62,7 @@ extern struct dentry *kern_path_create(int, const char *, struct path *, unsigne extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); +extern struct dentry *kern_path_locked_negative(const char *, struct path *); extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, -- cgit v1.2.3 From 777d0961ff95b26d5887fdae69900374364976f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Apr 2025 08:40:42 +0200 Subject: fs: move the bdex_statx call to vfs_getattr_nosec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently bdex_statx is only called from the very high-level vfs_statx_path function, and thus bypassing it for in-kernel calls to vfs_getattr or vfs_getattr_nosec. This breaks querying the block ѕize of the underlying device in the loop driver and also is a pitfall for any other new kernel caller. Move the call into the lowest level helper to ensure all callers get the right results. Fixes: 2d985f8c6b91 ("vfs: support STATX_DIOALIGN on block devices") Fixes: f4774e92aab8 ("loop: take the file system minimum dio alignment into account") Reported-by: "Darrick J. Wong" Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250417064042.712140-1-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e39c45bc0a97..678dc38442bf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1685,7 +1685,7 @@ int sync_blockdev(struct block_device *bdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); -void bdev_statx(struct path *, struct kstat *, u32); +void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask); void printk_all_partitions(void); int __init early_lookup_bdev(const char *pathname, dev_t *dev); #else @@ -1703,8 +1703,8 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) static inline void sync_bdevs(bool wait) { } -static inline void bdev_statx(struct path *path, struct kstat *stat, - u32 request_mask) +static inline void bdev_statx(const struct path *path, struct kstat *stat, + u32 request_mask) { } static inline void printk_all_partitions(void) -- cgit v1.2.3 From 4067196a52278156d18d8d6fa7f43970611b1b49 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sat, 29 Mar 2025 19:10:29 +0200 Subject: mm/page_alloc: fix deadlock on cpu_hotplug_lock in __accept_page() When the last page in the zone is accepted, __accept_page() calls static_branch_dec(). This function takes cpu_hotplug_lock, which can lead to a deadlock if the allocation occurs during CPU bringup path as _cpu_up() also takes the lock. To prevent this deadlock, defer static_branch_dec() to a workqueue. Call static_branch_dec() only when the workqueue is not yet initialized. Workqueues are initialized before CPU bring up, so this will not conflict with the first scenario. Link: https://lkml.kernel.org/r/20250329171030.3942298-1-kirill.shutemov@linux.intel.com Fixes: 55ad43e8ba0f ("mm: add a helper to accept page") Signed-off-by: Kirill A. Shutemov Reported-by: Srikanth Aithal Tested-by: Srikanth Aithal Cc: Dave Hansen Cc: Ashish Kalra Cc: David Hildenbrand Cc: "Edgecombe, Rick P" Cc: Mel Gorman Cc: "Mike Rapoport (IBM)" Cc: Thomas Lendacky Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 25e80b2ca7f4..4c95fcc9e9df 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -967,6 +967,9 @@ struct zone { #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */ struct list_head unaccepted_pages; + + /* To be called once the last page in the zone is accepted */ + struct work_struct unaccepted_cleanup; #endif /* zone flags, see below */ -- cgit v1.2.3 From 98b1917cdef92c29fc9a14060d5606c619050c2c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 10 Apr 2025 11:10:20 +0200 Subject: fs/dax: fix folio splitting issue by resetting old folio order + _nr_pages Alison reports an issue with fsdax when large extends end up using large ZONE_DEVICE folios: [ 417.796271] BUG: kernel NULL pointer dereference, address: 0000000000000b00 [ 417.796982] #PF: supervisor read access in kernel mode [ 417.797540] #PF: error_code(0x0000) - not-present page [ 417.798123] PGD 2a5c5067 P4D 2a5c5067 PUD 2a5c6067 PMD 0 [ 417.798690] Oops: Oops: 0000 [#1] SMP NOPTI [ 417.799178] CPU: 5 UID: 0 PID: 1515 Comm: mmap Tainted: ... [ 417.800150] Tainted: [O]=OOT_MODULE [ 417.800583] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 [ 417.801358] RIP: 0010:__lruvec_stat_mod_folio+0x7e/0x250 [ 417.801948] Code: ... [ 417.803662] RSP: 0000:ffffc90002be3a08 EFLAGS: 00010206 [ 417.804234] RAX: 0000000000000000 RBX: 0000000000000200 RCX: 0000000000000002 [ 417.804984] RDX: ffffffff815652d7 RSI: 0000000000000000 RDI: ffffffff82a2beae [ 417.805689] RBP: ffffc90002be3a28 R08: 0000000000000000 R09: 0000000000000000 [ 417.806384] R10: ffffea0007000040 R11: ffff888376ffe000 R12: 0000000000000001 [ 417.807099] R13: 0000000000000012 R14: ffff88807fe4ab40 R15: ffff888029210580 [ 417.807801] FS: 00007f339fa7a740(0000) GS:ffff8881fa9b9000(0000) knlGS:0000000000000000 [ 417.808570] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 417.809193] CR2: 0000000000000b00 CR3: 000000002a4f0004 CR4: 0000000000370ef0 [ 417.809925] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 417.810622] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 417.811353] Call Trace: [ 417.811709] [ 417.812038] folio_add_file_rmap_ptes+0x143/0x230 [ 417.812566] insert_page_into_pte_locked+0x1ee/0x3c0 [ 417.813132] insert_page+0x78/0xf0 [ 417.813558] vmf_insert_page_mkwrite+0x55/0xa0 [ 417.814088] dax_fault_iter+0x484/0x7b0 [ 417.814542] dax_iomap_pte_fault+0x1ca/0x620 [ 417.815055] dax_iomap_fault+0x39/0x40 [ 417.815499] __xfs_write_fault+0x139/0x380 [ 417.815995] ? __handle_mm_fault+0x5e5/0x1a60 [ 417.816483] xfs_write_fault+0x41/0x50 [ 417.816966] xfs_filemap_fault+0x3b/0xe0 [ 417.817424] __do_fault+0x31/0x180 [ 417.817859] __handle_mm_fault+0xee1/0x1a60 [ 417.818325] ? debug_smp_processor_id+0x17/0x20 [ 417.818844] handle_mm_fault+0xe1/0x2b0 [...] The issue is that when we split a large ZONE_DEVICE folio to order-0 ones, we don't reset the order/_nr_pages. As folio->_nr_pages overlays page[1]->memcg_data, once page[1] is a folio, it suddenly looks like it has folio->memcg_data set. And we never manually initialize folio->memcg_data in fsdax code, because we never expect it to be set at all. When __lruvec_stat_mod_folio() then stumbles over such a folio, it tries to use folio->memcg_data (because it's non-NULL) but it does not actually point at a memcg, resulting in the problem. Alison also observed that these folios sometimes have "locked" set, which is rather concerning (folios locked from the beginning ...). The reason is that the order for large folios is stored in page[1]->flags, which become the folio->flags of a new small folio. Let's fix it by adding a folio helper to clear order/_nr_pages for splitting purposes. Maybe we should reinitialize other large folio flags / folio members as well when splitting, because they might similarly cause harm once page[1] becomes a folio? At least other flags in PAGE_FLAGS_SECOND should not be set for fsdax, so at least page[1]->flags might be as expected with this fix. From a quick glimpse, initializing ->mapping, ->pgmap and ->share should re-initialize most things from a previous page[1] used by large folios that fsdax cares about. For example folio->private might not get reinitialized, but maybe that's not relevant -- no traces of it's use in fsdax code. Needs a closer look. Another thing that should be considered in the future is performing similar checks as we perform in free_tail_page_prepare() -- checking pincount etc. -- when freeing a large fsdax folio. Link: https://lkml.kernel.org/r/20250410091020.119116-1-david@redhat.com Fixes: 4996fc547f5b ("mm: let _folio_nr_pages overlay memcg_data in first tail page") Fixes: 38607c62b34b ("fs/dax: properly refcount fs dax pages") Signed-off-by: David Hildenbrand Reported-by: Alison Schofield Closes: https://lkml.kernel.org/r/Z_W9Oeg-D9FhImf3@aschofie-mobl2.lan Tested-by: Alison Schofield Reviewed-by: Dan Williams Tested-by: "Darrick J. Wong" Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Matthew Wilcox Cc: Alistair Popple Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b7f13f087954..bf55206935c4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1218,6 +1218,23 @@ static inline unsigned int folio_order(const struct folio *folio) return folio_large_order(folio); } +/** + * folio_reset_order - Reset the folio order and derived _nr_pages + * @folio: The folio. + * + * Reset the order and derived _nr_pages to 0. Must only be used in the + * process of splitting large folios. + */ +static inline void folio_reset_order(struct folio *folio) +{ + if (WARN_ON_ONCE(!folio_test_large(folio))) + return; + folio->_flags_1 &= ~0xffUL; +#ifdef NR_PAGES_IN_LARGE_FOLIO + folio->_nr_pages = 0; +#endif +} + #include /* -- cgit v1.2.3 From 9e888998ea4d22257b07ce911576509486fa0667 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 12 Apr 2025 18:39:12 +0200 Subject: writeback: fix false warning in inode_to_wb() inode_to_wb() is used also for filesystems that don't support cgroup writeback. For these filesystems inode->i_wb is stable during the lifetime of the inode (it points to bdi->wb) and there's no need to hold locks protecting the inode->i_wb dereference. Improve the warning in inode_to_wb() to not trigger for these filesystems. Link: https://lkml.kernel.org/r/20250412163914.3773459-3-agruenba@redhat.com Fixes: aaa2cacf8184 ("writeback: add lockdep annotation to inode_to_wb()") Signed-off-by: Jan Kara Signed-off-by: Andreas Gruenbacher Reviewed-by: Andreas Gruenbacher Cc: Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 8e7af9a03b41..e721148c95d0 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -249,6 +249,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(debug_locks && + (inode->i_sb->s_iflags & SB_I_CGROUPWB) && (!lockdep_is_held(&inode->i_lock) && !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) && !lockdep_is_held(&inode->i_wb->list_lock))); -- cgit v1.2.3 From 38448181459e24257b40d5258afdbaa3565e8cfc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 16 Apr 2025 09:45:39 -0400 Subject: mm: vmscan: restore high-cpu watermark safety in kswapd Vlastimil points out that commit a211c6550efc ("mm: page_alloc: defrag_mode kswapd/kcompactd watermarks") switched kswapd from zone_watermark_ok_safe() to the standard, percpu-cached version of reading free pages, thus dropping the watermark safety precautions for systems with high CPU counts (e.g. >212 cpus on 64G). Restore them. Since zone_watermark_ok_safe() is no longer the right interface, and this was the last caller of the function anyway, open-code the zone_page_state_snapshot() conditional and delete the function. Link: https://lkml.kernel.org/r/20250416135142.778933-2-hannes@cmpxchg.org Fixes: a211c6550efc ("mm: page_alloc: defrag_mode kswapd/kcompactd watermarks") Signed-off-by: Johannes Weiner Reported-by: Vlastimil Babka Reviewed-by: Vlastimil Babka Cc: Brendan Jackman Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4c95fcc9e9df..6ccec1bf2896 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1502,8 +1502,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); -bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int highest_zoneidx); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. -- cgit v1.2.3