From 234249d88b091d006b82f8d570343aae5f383736 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Fri, 25 Aug 2023 03:00:55 -0400 Subject: wifi: cfg80211/mac80211: hold link BSSes when assoc fails for MLO connection When connect to MLO AP with more than one link, and the assoc response of AP is not success, then cfg80211_unhold_bss() is not called for all the links' cfg80211_bss except the primary link which means the link used by the latest successful association request. Thus the hold value of the cfg80211_bss is not reset to 0 after the assoc fail, and then the __cfg80211_unlink_bss() will not be called for the cfg80211_bss by __cfg80211_bss_expire(). Then the AP always looks exist even the AP is shutdown or reconfigured to another type, then it will lead error while connecting it again. The detail info are as below. When connect with muti-links AP, cfg80211_hold_bss() is called by cfg80211_mlme_assoc() for each cfg80211_bss of all the links. When assoc response from AP is not success(such as status_code==1), the ieee80211_link_data of non-primary link(sdata->link[link_id]) is NULL because ieee80211_assoc_success()->ieee80211_vif_update_links() is not called for the links. Then struct cfg80211_rx_assoc_resp resp in cfg80211_rx_assoc_resp() and struct cfg80211_connect_resp_params cr in __cfg80211_connect_result() will only have the data of the primary link, and finally function cfg80211_connect_result_release_bsses() only call cfg80211_unhold_bss() for the primary link. Then cfg80211_bss of the other links will never free because its hold is always > 0 now. Hence assign value for the bss and status from assoc_data since it is valid for this case. Also assign value of addr from assoc_data when the link is NULL because the addrs of assoc_data and link both represent the local link addr and they are same value for success connection. Fixes: 81151ce462e5 ("wifi: mac80211: support MLO authentication/association with one link") Signed-off-by: Wen Gong Link: https://lore.kernel.org/r/20230825070055.28164-1-quic_wgong@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 3a4b684f89bf..ed3bc2a78d82 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7231,7 +7231,7 @@ struct cfg80211_rx_assoc_resp { int uapsd_queues; const u8 *ap_mld_addr; struct { - const u8 *addr; + u8 addr[ETH_ALEN] __aligned(2); struct cfg80211_bss *bss; u16 status; } links[IEEE80211_MLD_MAX_NUM_LINKS]; -- cgit v1.2.3 From 37c20b2effe987b806c8de6d12978e4ffeff026f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 16 Aug 2023 15:38:04 +0200 Subject: wifi: cfg80211: fix cqm_config access race Max Schulze reports crashes with brcmfmac. The reason seems to be a race between userspace removing the CQM config and the driver calling cfg80211_cqm_rssi_notify(), where if the data is freed while cfg80211_cqm_rssi_notify() runs it will crash since it assumes wdev->cqm_config is set. This can't be fixed with a simple non-NULL check since there's nothing we can do for locking easily, so use RCU instead to protect the pointer, but that requires pulling the updates out into an asynchronous worker so they can sleep and call back into the driver. Since we need to change the free anyway, also change it to go back to the old settings if changing the settings fails. Reported-and-tested-by: Max Schulze Closes: https://lore.kernel.org/r/ac96309a-8d8d-4435-36e6-6d152eb31876@online.de Fixes: 4a4b8169501b ("cfg80211: Accept multiple RSSI thresholds for CQM") Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ed3bc2a78d82..aebfa54d547a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6013,7 +6013,8 @@ struct wireless_dev { } wext; #endif - struct cfg80211_cqm_config *cqm_config; + struct wiphy_work cqm_rssi_work; + struct cfg80211_cqm_config __rcu *cqm_config; struct list_head pmsr_list; spinlock_t pmsr_lock; -- cgit v1.2.3 From ef7d9593390a050c50eba5fc02d2cb65a1104434 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 11 Sep 2023 08:39:04 -0700 Subject: xfs: remove CPU hotplug infrastructure There are no users of the cpu hotplug hooks in xfs now, so remove it. This reverts f1653c2e2831e ("xfs: introduce CPU hotplug infrastructure"). Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- include/linux/cpuhotplug.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 06dda85f0424..068f7738be22 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -90,7 +90,6 @@ enum cpuhp_state { CPUHP_FS_BUFF_DEAD, CPUHP_PRINTK_DEAD, CPUHP_MM_MEMCQ_DEAD, - CPUHP_XFS_DEAD, CPUHP_PERCPU_CNT_DEAD, CPUHP_RADIX_DEAD, CPUHP_PAGE_ALLOC, -- cgit v1.2.3 From 49f776724e64c27dd861e7ac8da9d42f01d9d172 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Wed, 23 Aug 2023 23:43:04 +0000 Subject: PCI/AER: Export pcie_aer_is_native() Export and move the declaration of pcie_aer_is_native() to a common header file to be reused by cxl/pci module. Signed-off-by: Smita Koralahalli Acked-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Robert Richter Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20230823234305.27333-3-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dan Williams --- include/linux/aer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/aer.h b/include/linux/aer.h index 2dd175f5debd..29cc10220952 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -42,11 +42,13 @@ struct aer_capability_regs { #if defined(CONFIG_PCIEAER) int pci_aer_clear_nonfatal_status(struct pci_dev *dev); +int pcie_aer_is_native(struct pci_dev *dev); #else static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev) { return -EINVAL; } +static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; } #endif void cper_print_aer(struct pci_dev *dev, int aer_severity, -- cgit v1.2.3 From 373beef00f7d781a000b12c31fb17a5a9c25969c Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 11 Sep 2023 15:52:57 +0100 Subject: KVM: arm64: nvhe: Ignore SVE hint in SMCCC function ID When SVE is enabled, the host may set bit 16 in SMCCC function IDs, a hint that indicates an unused SVE state. At the moment NVHE doesn't account for this bit when inspecting the function ID, and rejects most calls. Clear the hint bit before comparing function IDs. About version compatibility: the host's PSCI driver initially probes the firmware for a SMCCC version number. If the firmware implements a protocol recent enough (1.3), subsequent SMCCC calls have the hint bit set. Since the hint bit was reserved in earlier versions of the protocol, clearing it is fine regardless of the version in use. When a new hint is added to the protocol in the future, it will be added to ARM_SMCCC_CALL_HINTS and NVHE will handle it straight away. This patch only clears known hints and leaves reserved bits as is, because future SMCCC versions could use reserved bits as modifiers for the function ID, rather than hints. Fixes: cfa7ff959a78 ("arm64: smccc: Support SMCCC v1.3 SVE register saving hint") Reported-by: Ben Horgan Signed-off-by: Jean-Philippe Brucker Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230911145254.934414-4-jean-philippe@linaro.org --- include/linux/arm-smccc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 7c67c17321d4..083f85653716 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -67,6 +67,8 @@ #define ARM_SMCCC_VERSION_1_3 0x10003 #define ARM_SMCCC_1_3_SVE_HINT 0x10000 +#define ARM_SMCCC_CALL_HINTS ARM_SMCCC_1_3_SVE_HINT + #define ARM_SMCCC_VERSION_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ -- cgit v1.2.3 From d1383077c225ceb87ac7a3b56b2c505193f77ed7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 13 Sep 2023 09:36:57 +0200 Subject: wifi: cfg80211: add missing kernel-doc for cqm_rssi_work As reported by Stephen, I neglected to add the kernel-doc for the new struct member. Fix that. Reported-by: Stephen Rothwell Fixes: 37c20b2effe9 ("wifi: cfg80211: fix cqm_config access race") Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index aebfa54d547a..7192346e4a22 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5941,6 +5941,7 @@ void wiphy_delayed_work_cancel(struct wiphy *wiphy, * @event_lock: (private) lock for event list * @owner_nlportid: (private) owner socket port ID * @nl_owner_dead: (private) owner socket went away + * @cqm_rssi_work: (private) CQM RSSI reporting work * @cqm_config: (private) nl80211 RSSI monitor state * @pmsr_list: (private) peer measurement requests * @pmsr_lock: (private) peer measurements requests/results lock -- cgit v1.2.3 From 531108ec5b5cd45ec6272a6115e73275baef7d22 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 12 Sep 2023 19:23:21 +0300 Subject: uapi: stddef.h: Fix header guard location The #endif for the header guard wasn't at the end of the header. This was harmless since the define that escaped was already testing for its own redefinition. Regardless, move the #endif to the correct place. Signed-off-by: Alexey Dobriyan Fixes: c8248faf3ca2 ("Compiler Attributes: counted_by: Adjust name and identifier expansion") Link: https://lore.kernel.org/r/b1f5081e-339d-421d-81b2-cbb94e1f6f5f@p183 Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- include/uapi/linux/stddef.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index 7c3fc3980881..c027b2070d79 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -44,8 +44,9 @@ struct { } __empty_ ## NAME; \ TYPE NAME[]; \ } -#endif #ifndef __counted_by #define __counted_by(m) #endif + +#endif /* _UAPI_LINUX_STDDEF_H */ -- cgit v1.2.3 From 32a4ec211d4164e667d9d0b807fadf02053cd2e9 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 12 Sep 2023 19:22:24 +0300 Subject: uapi: stddef.h: Fix __DECLARE_FLEX_ARRAY for C++ __DECLARE_FLEX_ARRAY(T, member) macro expands to struct { struct {} __empty_member; T member[]; }; which is subtly wrong in C++ because sizeof(struct{}) is 1 not 0, changing UAPI structures layouts. This can be fixed by expanding to T member[]; Now g++ doesn't like "T member[]" either, throwing errors on the following code: struct S { union { T1 member1[]; T2 member2[]; }; }; or struct S { T member[]; }; Use "T member[0];" which seems to work and does the right thing wrt structure layout. Signed-off-by: Alexey Dobriyan Fixes: 3080ea5553cc ("stddef: Introduce DECLARE_FLEX_ARRAY() helper") Link: https://lore.kernel.org/r/97242381-f1ec-4a4a-9472-1a464f575657@p183 Signed-off-by: Kees Cook --- include/uapi/linux/stddef.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index c027b2070d79..5c6c4269f7ef 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -29,6 +29,11 @@ struct TAG { MEMBERS } ATTRS NAME; \ } +#ifdef __cplusplus +/* sizeof(struct{}) is 1 in C++, not 0, can't use C version of the macro. */ +#define __DECLARE_FLEX_ARRAY(T, member) \ + T member[0] +#else /** * __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union * @@ -44,6 +49,7 @@ struct { } __empty_ ## NAME; \ TYPE NAME[]; \ } +#endif #ifndef __counted_by #define __counted_by(m) -- cgit v1.2.3 From d57125b55a292a8e74a1fb17182576a3b2b2e795 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 18 Sep 2023 10:44:08 +0200 Subject: Revert "ceph: make members in struct ceph_mds_request_args_ext a union" This reverts commit 3af5ae22030cb59fab4fba35f5a2b62f47e14df9. ceph_mds_request_args_ext was already (and remains to be) a union. An additional anonymous union inside is bogus: union ceph_mds_request_args_ext { union { union ceph_mds_request_args old; struct { ... } __attribute__ ((packed)) setattr_ext; }; } Signed-off-by: Ilya Dryomov Reviewed-by: Xiubo Li --- include/linux/ceph/ceph_fs.h | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 5f2301ee88bc..f3b3593254b9 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -467,19 +467,17 @@ union ceph_mds_request_args { } __attribute__ ((packed)); union ceph_mds_request_args_ext { - union { - union ceph_mds_request_args old; - struct { - __le32 mode; - __le32 uid; - __le32 gid; - struct ceph_timespec mtime; - struct ceph_timespec atime; - __le64 size, old_size; /* old_size needed by truncate */ - __le32 mask; /* CEPH_SETATTR_* */ - struct ceph_timespec btime; - } __attribute__ ((packed)) setattr_ext; - }; + union ceph_mds_request_args old; + struct { + __le32 mode; + __le32 uid; + __le32 gid; + struct ceph_timespec mtime; + struct ceph_timespec atime; + __le64 size, old_size; /* old_size needed by truncate */ + __le32 mask; /* CEPH_SETATTR_* */ + struct ceph_timespec btime; + } __attribute__ ((packed)) setattr_ext; }; #define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ -- cgit v1.2.3 From 9ea9cb00a82b53ec39630eac718776d37e41b35a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Sep 2023 11:21:39 -0400 Subject: mm: memcontrol: fix GFP_NOFS recursion in memory.high enforcement Breno and Josef report a deadlock scenario from cgroup reclaim re-entering the filesystem: [ 361.546690] ====================================================== [ 361.559210] WARNING: possible circular locking dependency detected [ 361.571703] 6.5.0-0_fbk700_debug_rc0_kbuilder_13159_gbf787a128001 #1 Tainted: G S E [ 361.589704] ------------------------------------------------------ [ 361.602277] find/9315 is trying to acquire lock: [ 361.611625] ffff88837ba140c0 (&delayed_node->mutex){+.+.}-{4:4}, at: __btrfs_release_delayed_node+0x68/0x4f0 [ 361.631437] [ 361.631437] but task is already holding lock: [ 361.643243] ffff8881765b8678 (btrfs-tree-01){++++}-{4:4}, at: btrfs_tree_read_lock+0x1e/0x40 [ 362.904457] mutex_lock_nested+0x1c/0x30 [ 362.912414] __btrfs_release_delayed_node+0x68/0x4f0 [ 362.922460] btrfs_evict_inode+0x301/0x770 [ 362.982726] evict+0x17c/0x380 [ 362.988944] prune_icache_sb+0x100/0x1d0 [ 363.005559] super_cache_scan+0x1f8/0x260 [ 363.013695] do_shrink_slab+0x2a2/0x540 [ 363.021489] shrink_slab_memcg+0x237/0x3d0 [ 363.050606] shrink_slab+0xa7/0x240 [ 363.083382] shrink_node_memcgs+0x262/0x3b0 [ 363.091870] shrink_node+0x1a4/0x720 [ 363.099150] shrink_zones+0x1f6/0x5d0 [ 363.148798] do_try_to_free_pages+0x19b/0x5e0 [ 363.157633] try_to_free_mem_cgroup_pages+0x266/0x370 [ 363.190575] reclaim_high+0x16f/0x1f0 [ 363.208409] mem_cgroup_handle_over_high+0x10b/0x270 [ 363.246678] try_charge_memcg+0xaf2/0xc70 [ 363.304151] charge_memcg+0xf0/0x350 [ 363.320070] __mem_cgroup_charge+0x28/0x40 [ 363.328371] __filemap_add_folio+0x870/0xd50 [ 363.371303] filemap_add_folio+0xdd/0x310 [ 363.399696] __filemap_get_folio+0x2fc/0x7d0 [ 363.419086] pagecache_get_page+0xe/0x30 [ 363.427048] alloc_extent_buffer+0x1cd/0x6a0 [ 363.435704] read_tree_block+0x43/0xc0 [ 363.443316] read_block_for_search+0x361/0x510 [ 363.466690] btrfs_search_slot+0xc8c/0x1520 This is caused by the mem_cgroup_handle_over_high() not respecting the gfp_mask of the allocation context. We used to only call this function on resume to userspace, where no locks were held. But c9afe31ec443 ("memcg: synchronously enforce memory.high for large overcharges") added a call from the allocation context without considering the gfp. Link: https://lkml.kernel.org/r/20230914152139.100822-1-hannes@cmpxchg.org Fixes: c9afe31ec443 ("memcg: synchronously enforce memory.high for large overcharges") Signed-off-by: Johannes Weiner Reported-by: Breno Leitao Reported-by: Josef Bacik Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Cc: [5.17+] Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- include/linux/resume_user_mode.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ab94ad4597d0..e4e24da16d2c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -920,7 +920,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } -void mem_cgroup_handle_over_high(void); +void mem_cgroup_handle_over_high(gfp_t gfp_mask); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); @@ -1458,7 +1458,7 @@ static inline void mem_cgroup_unlock_pages(void) rcu_read_unlock(); } -static inline void mem_cgroup_handle_over_high(void) +static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index 285189454449..f8f3e958e9cf 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -55,7 +55,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs) } #endif - mem_cgroup_handle_over_high(); + mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); rseq_handle_notify_resume(NULL, regs); -- cgit v1.2.3 From b724a6418f1f853bcb39c8923bf14a50c7bdbd07 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 17 Sep 2023 23:38:46 +0800 Subject: bpf: Fix tr dereferencing Fix 'tr' dereferencing bug when CONFIG_BPF_JIT is turned off. When CONFIG_BPF_JIT is turned off, 'bpf_trampoline_get()' returns NULL, which is same as the cases when CONFIG_BPF_JIT is turned on. Closes: https://lore.kernel.org/r/202309131936.5Nc8eUD0-lkp@intel.com/ Fixes: f7b12b6fea00 ("bpf: verifier: refactor check_attach_btf_id()") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Leon Hwang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230917153846.88732-1-hffilwlqm@gmail.com --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 024e8b28c34b..49f8b691496c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1307,7 +1307,7 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, static inline struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { - return ERR_PTR(-EOPNOTSUPP); + return NULL; } static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #define DEFINE_BPF_DISPATCHER(name) -- cgit v1.2.3 From 2a86f1b56a30e242caf7ee1268af68f4f49ce847 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 20 Sep 2023 14:26:29 +0800 Subject: kasan: Cleanup the __HAVE_ARCH_SHADOW_MAP usage As Linus suggested, __HAVE_ARCH_XYZ is "stupid" and "having historical uses of it doesn't make it good". So migrate __HAVE_ARCH_SHADOW_MAP to separate macros named after the respective functions. Suggested-by: Linus Torvalds Reviewed-by: WANG Xuerui Reviewed-by: Andrey Konovalov Signed-off-by: Huacai Chen --- include/linux/kasan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 3df5499f7936..842623d708c2 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -54,7 +54,7 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; int kasan_populate_early_shadow(const void *shadow_start, const void *shadow_end); -#ifndef __HAVE_ARCH_SHADOW_MAP +#ifndef kasan_mem_to_shadow static inline void *kasan_mem_to_shadow(const void *addr) { return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) -- cgit v1.2.3 From 6d2779ecaeb56f92d7105c56772346c71c88c278 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 19 Sep 2023 18:14:29 +0100 Subject: locking/atomic: scripts: fix fallback ifdeffery Since commit: 9257959a6e5b4fca ("locking/atomic: scripts: restructure fallback ifdeffery") The ordering fallbacks for atomic*_read_acquire() and atomic*_set_release() erroneously fall back to the implictly relaxed atomic*_read() and atomic*_set() variants respectively, without any additional barriers. This loses the ACQUIRE and RELEASE ordering semantics, which can result in a wide variety of problems, even on strongly-ordered architectures where the implementation of atomic*_read() and/or atomic*_set() allows the compiler to reorder those relative to other accesses. In practice this has been observed to break bit spinlocks on arm64, resulting in dentry cache corruption. The fallback logic was intended to allow ACQUIRE/RELEASE/RELAXED ops to be defined in terms of FULL ops, but where an op had RELAXED ordering by default, this unintentionally permitted the ACQUIRE/RELEASE ops to be defined in terms of the implicitly RELAXED default. This patch corrects the logic to avoid falling back to implicitly RELAXED ops, resulting in the same behaviour as prior to commit 9257959a6e5b4fca. I've verified the resulting assembly on arm64 by generating outlined wrappers of the atomics. Prior to this patch the compiler generates sequences using relaxed load (LDR) and store (STR) instructions, e.g. | : | ldr x0, [x0] | ret | | : | str x1, [x0] | ret With this patch applied the compiler generates sequences using the intended load-acquire (LDAR) and store-release (STLR) instructions, e.g. | : | ldar x0, [x0] | ret | | : | stlr x1, [x0] | ret To make sure that there were no other victims of the ifdeffery rewrite, I generated outlined copies of all of the {atomic,atomic64,atomic_long} atomic operations before and after commit 9257959a6e5b4fca. A diff of the generated assembly on arm64 shows that only the read_acquire() and set_release() operations were changed, and only lost their intended ordering: | [mark@lakrids:~/src/linux]% diff -u \ | <(aarch64-linux-gnu-objdump -d before-9257959a6e5b4fca.o) | <(aarch64-linux-gnu-objdump -d after-9257959a6e5b4fca.o) | --- /proc/self/fd/11 2023-09-19 16:51:51.114779415 +0100 | +++ /proc/self/fd/16 2023-09-19 16:51:51.114779415 +0100 | @@ -1,5 +1,5 @@ | | -before-9257959a6e5b4fca.o: file format elf64-littleaarch64 | +after-9257959a6e5b4fca.o: file format elf64-littleaarch64 | | | Disassembly of section .text: | @@ -9,7 +9,7 @@ | 4: d65f03c0 ret | | 0000000000000008 : | - 8: 88dffc00 ldar w0, [x0] | + 8: b9400000 ldr w0, [x0] | c: d65f03c0 ret | | 0000000000000010 : | @@ -17,7 +17,7 @@ | 14: d65f03c0 ret | | 0000000000000018 : | - 18: 889ffc01 stlr w1, [x0] | + 18: b9000001 str w1, [x0] | 1c: d65f03c0 ret | | 0000000000000020 : | @@ -1230,7 +1230,7 @@ | 1070: d65f03c0 ret | | 0000000000001074 : | - 1074: c8dffc00 ldar x0, [x0] | + 1074: f9400000 ldr x0, [x0] | 1078: d65f03c0 ret | | 000000000000107c : | @@ -1238,7 +1238,7 @@ | 1080: d65f03c0 ret | | 0000000000001084 : | - 1084: c89ffc01 stlr x1, [x0] | + 1084: f9000001 str x1, [x0] | 1088: d65f03c0 ret | | 000000000000108c : | @@ -2427,7 +2427,7 @@ | 207c: d65f03c0 ret | | 0000000000002080 : | - 2080: c8dffc00 ldar x0, [x0] | + 2080: f9400000 ldr x0, [x0] | 2084: d65f03c0 ret | | 0000000000002088 : | @@ -2435,7 +2435,7 @@ | 208c: d65f03c0 ret | | 0000000000002090 : | - 2090: c89ffc01 stlr x1, [x0] | + 2090: f9000001 str x1, [x0] | 2094: d65f03c0 ret | | 0000000000002098 : I've build tested this with a variety of configs for alpha, arm, arm64, csky, i386, m68k, microblaze, mips, nios2, openrisc, powerpc, riscv, s390, sh, sparc, x86_64, and xtensa, for which I've seen no issues. I was unable to build test for ia64 and parisc due to existing build breakage in v6.6-rc2. Fixes: 9257959a6e5b4fca ("locking/atomic: scripts: restructure fallback ifdeffery") Reported-by: Ming Lei Reported-by: Darrick J. Wong Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Tested-by: Baokun Li Link: https://lkml.kernel.org/r/20230919171430.2697727-1-mark.rutland@arm.com --- include/linux/atomic/atomic-arch-fallback.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h index 18f5744dfb5d..b83ef19da13d 100644 --- a/include/linux/atomic/atomic-arch-fallback.h +++ b/include/linux/atomic/atomic-arch-fallback.h @@ -459,8 +459,6 @@ raw_atomic_read_acquire(const atomic_t *v) { #if defined(arch_atomic_read_acquire) return arch_atomic_read_acquire(v); -#elif defined(arch_atomic_read) - return arch_atomic_read(v); #else int ret; @@ -508,8 +506,6 @@ raw_atomic_set_release(atomic_t *v, int i) { #if defined(arch_atomic_set_release) arch_atomic_set_release(v, i); -#elif defined(arch_atomic_set) - arch_atomic_set(v, i); #else if (__native_word(atomic_t)) { smp_store_release(&(v)->counter, i); @@ -2575,8 +2571,6 @@ raw_atomic64_read_acquire(const atomic64_t *v) { #if defined(arch_atomic64_read_acquire) return arch_atomic64_read_acquire(v); -#elif defined(arch_atomic64_read) - return arch_atomic64_read(v); #else s64 ret; @@ -2624,8 +2618,6 @@ raw_atomic64_set_release(atomic64_t *v, s64 i) { #if defined(arch_atomic64_set_release) arch_atomic64_set_release(v, i); -#elif defined(arch_atomic64_set) - arch_atomic64_set(v, i); #else if (__native_word(atomic64_t)) { smp_store_release(&(v)->counter, i); @@ -4657,4 +4649,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v) } #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 202b45c7db600ce36198eb1f1fc2c2d5268ace2d +// 2fdd6702823fa842f9cea57a002e6e4476ae780c -- cgit v1.2.3 From dcda165706b9fbfd685898d46a6749d7d397e0c0 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 15 Sep 2023 14:42:27 -0700 Subject: Bluetooth: hci_core: Fix build warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following warnings: net/bluetooth/hci_core.c: In function ‘hci_register_dev’: net/bluetooth/hci_core.c:2620:54: warning: ‘%d’ directive output may be truncated writing between 1 and 10 bytes into a region of size 5 [-Wformat-truncation=] 2620 | snprintf(hdev->name, sizeof(hdev->name), "hci%d", id); | ^~ net/bluetooth/hci_core.c:2620:50: note: directive argument in the range [0, 2147483647] 2620 | snprintf(hdev->name, sizeof(hdev->name), "hci%d", id); | ^~~~~~~ net/bluetooth/hci_core.c:2620:9: note: ‘snprintf’ output between 5 and 14 bytes into a destination of size 8 2620 | snprintf(hdev->name, sizeof(hdev->name), "hci%d", id); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index e6359f7346f1..c33348ba1657 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -350,7 +350,7 @@ struct hci_dev { struct list_head list; struct mutex lock; - char name[8]; + const char *name; unsigned long flags; __u16 id; __u8 bus; -- cgit v1.2.3 From 41b43b6c6e30a832c790b010a06772e793bca193 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 Sep 2023 12:46:27 +0200 Subject: locking/seqlock: Do the lockdep annotation before locking in do_write_seqcount_begin_nested() It was brought up by Tetsuo that the following sequence: write_seqlock_irqsave() printk_deferred_enter() could lead to a deadlock if the lockdep annotation within write_seqlock_irqsave() triggers. The problem is that the sequence counter is incremented before the lockdep annotation is performed. The lockdep splat would then attempt to invoke printk() but the reader side, of the same seqcount, could have a tty_port::lock acquired waiting for the sequence number to become even again. The other lockdep annotations come before the actual locking because "we want to see the locking error before it happens". There is no reason why seqcount should be different here. Do the lockdep annotation first then perform the locking operation (the sequence increment). Fixes: 1ca7d67cf5d5a ("seqcount: Add lockdep functionality to seqcount/seqlock structures") Reported-by: Tetsuo Handa Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230920104627._DTHgPyA@linutronix.de Closes: https://lore.kernel.org/20230621130641.-5iueY1I@linutronix.de --- include/linux/seqlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 987a59d977c5..e9bd2f65d7f4 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -512,8 +512,8 @@ do { \ static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass) { - do_raw_write_seqcount_begin(s); seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); + do_raw_write_seqcount_begin(s); } /** -- cgit v1.2.3 From 2132df16f53b4f01ab25f5d404f36a22244ae342 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 15 Sep 2023 11:20:34 +0900 Subject: scsi: core: ata: Do no try to probe for CDL on old drives Some old drives (e.g. an Ultra320 SCSI disk as reported by John) do not seem to execute MAINTENANCE_IN / MI_REPORT_SUPPORTED_OPERATION_CODES commands correctly and hang when a non-zero service action is specified (one command format with service action case in scsi_report_opcode()). Currently, CDL probing with scsi_cdl_check_cmd() is the only caller using a non zero service action for scsi_report_opcode(). To avoid issues with these old drives, do not attempt CDL probe if the device reports support for an SPC version lower than 5 (CDL was introduced in SPC-5). To keep things working with ATA devices which probe for the CDL T2A and T2B pages introduced with SPC-6, modify ata_scsiop_inq_std() to claim SPC-6 version compatibility for ATA drives supporting CDL. SPC-6 standard version number is defined as Dh (= 13) in SPC-6 r09. Fix scsi_probe_lun() to correctly capture this value by changing the bit mask for the second byte of the INQUIRY response from 0x7 to 0xf. include/scsi/scsi.h is modified to add the definition SCSI_SPC_6 with the value 14 (Dh + 1). The missing definitions for the SCSI_SPC_4 and SCSI_SPC_5 versions are also added. Reported-by: John David Anglin Fixes: 624885209f31 ("scsi: core: Detect support for command duration limits") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20230915022034.678121-1-dlemoal@kernel.org Tested-by: David Gow Reviewed-by: Bart Van Assche Reviewed-by: Niklas Cassel Signed-off-by: Martin K. Petersen --- include/scsi/scsi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h index ec093594ba53..4498f845b112 100644 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@ -157,6 +157,9 @@ enum scsi_disposition { #define SCSI_3 4 /* SPC */ #define SCSI_SPC_2 5 #define SCSI_SPC_3 6 +#define SCSI_SPC_4 7 +#define SCSI_SPC_5 8 +#define SCSI_SPC_6 14 /* * INQ PERIPHERAL QUALIFIERS -- cgit v1.2.3 From 753a4d531bc518633ea88ac0ed02b25a16823d51 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Fri, 22 Sep 2023 22:55:16 +0200 Subject: ata: libata-sata: increase PMP SRST timeout to 10s On certain SATA controllers, softreset fails after wakeup from S2RAM with the message "softreset failed (1st FIS failed)", sometimes resulting in drives not being detected again. With the increased timeout, this issue is avoided. Instead, "softreset failed (device not ready)" is now logged 1-2 times; this later failure seems to cause fewer problems however, and the drives are detected reliably once they've spun up and the probe is retried. The issue was observed with the primary SATA controller of the QNAP TS-453B, which is an "Intel Corporation Celeron/Pentium Silver Processor SATA Controller [8086:31e3] (rev 06)" integrated in the Celeron J4125 CPU, and the following drives: - Seagate IronWolf ST12000VN0008 - Seagate IronWolf ST8000NE0004 The SATA controller seems to be more relevant to this issue than the drives, as the same drives are always detected reliably on the secondary SATA controller on the same board (an ASMedia 106x) without any "softreset failed" errors even without the increased timeout. Fixes: e7d3ef13d52a ("libata: change drive ready wait after hard reset to 5s") Cc: stable@vger.kernel.org Signed-off-by: Matthias Schiffer Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index bf4913f4d7ac..84aca8c44fa3 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -259,7 +259,7 @@ enum { * advised to wait only for the following duration before * doing SRST. */ - ATA_TMOUT_PMP_SRST_WAIT = 5000, + ATA_TMOUT_PMP_SRST_WAIT = 10000, /* When the LPM policy is set to ATA_LPM_MAX_POWER, there might * be a spurious PHY event, so ignore the first PHY event that -- cgit v1.2.3 From 2d5780bbef8dbe6375d481cbea212606a80e4453 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Tue, 26 Sep 2023 20:55:56 +0200 Subject: swiotlb: fix the check whether a device has used software IO TLB When CONFIG_SWIOTLB_DYNAMIC=y, devices which do not use the software IO TLB can avoid swiotlb lookup. A flag is added by commit 1395706a1490 ("swiotlb: search the software IO TLB only if the device makes use of it"), the flag is correctly set, but it is then never checked. Add the actual check here. Note that this code is an alternative to the default pool check, not an additional check, because: 1. swiotlb_find_pool() also searches the default pool; 2. if dma_uses_io_tlb is false, the default swiotlb pool is not used. Tested in a KVM guest against a QEMU RAM-backed SATA disk over virtio and *not* using software IO TLB, this patch increases IOPS by approx 2% for 4-way parallel I/O. The write memory barrier in swiotlb_dyn_alloc() is not needed, because a newly allocated pool must always be observed by swiotlb_find_slots() before an address from that pool is passed to is_swiotlb_buffer(). Correctness was verified using the following litmus test: C swiotlb-new-pool (* * Result: Never * * Check that a newly allocated pool is always visible when the * corresponding swiotlb buffer is visible. *) { mem_pools = default; } P0(int **mem_pools, int *pool) { /* add_mem_pool() */ WRITE_ONCE(*pool, 999); rcu_assign_pointer(*mem_pools, pool); } P1(int **mem_pools, int *flag, int *buf) { /* swiotlb_find_slots() */ int *r0; int r1; rcu_read_lock(); r0 = READ_ONCE(*mem_pools); r1 = READ_ONCE(*r0); rcu_read_unlock(); if (r1) { WRITE_ONCE(*flag, 1); smp_mb(); } /* device driver (presumed) */ WRITE_ONCE(*buf, r1); } P2(int **mem_pools, int *flag, int *buf) { /* device driver (presumed) */ int r0 = READ_ONCE(*buf); /* is_swiotlb_buffer() */ int r1; int *r2; int r3; smp_rmb(); r1 = READ_ONCE(*flag); if (r1) { /* swiotlb_find_pool() */ rcu_read_lock(); r2 = READ_ONCE(*mem_pools); r3 = READ_ONCE(*r2); rcu_read_unlock(); } } exists (2:r0<>0 /\ 2:r3=0) (* Not found. *) Fixes: 1395706a1490 ("swiotlb: search the software IO TLB only if the device makes use of it") Reported-by: Jonathan Corbet Closes: https://lore.kernel.org/linux-iommu/87a5uz3ob8.fsf@meer.lwn.net/ Signed-off-by: Petr Tesarik Reviewed-by: Catalin Marinas Signed-off-by: Christoph Hellwig --- include/linux/swiotlb.h | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index b4536626f8ff..ecde0312dd52 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -172,14 +172,23 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) if (!mem) return false; - if (IS_ENABLED(CONFIG_SWIOTLB_DYNAMIC)) { - /* Pairs with smp_wmb() in swiotlb_find_slots() and - * swiotlb_dyn_alloc(), which modify the RCU lists. - */ - smp_rmb(); - return swiotlb_find_pool(dev, paddr); - } +#ifdef CONFIG_SWIOTLB_DYNAMIC + /* + * All SWIOTLB buffer addresses must have been returned by + * swiotlb_tbl_map_single() and passed to a device driver. + * If a SWIOTLB address is checked on another CPU, then it was + * presumably loaded by the device driver from an unspecified private + * data structure. Make sure that this load is ordered before reading + * dev->dma_uses_io_tlb here and mem->pools in swiotlb_find_pool(). + * + * This barrier pairs with smp_mb() in swiotlb_find_slots(). + */ + smp_rmb(); + return READ_ONCE(dev->dma_uses_io_tlb) && + swiotlb_find_pool(dev, paddr); +#else return paddr >= mem->defpool.start && paddr < mem->defpool.end; +#endif } static inline bool is_swiotlb_force_bounce(struct device *dev) -- cgit v1.2.3 From 1a6a464774947920dcedcf7409be62495c7cedd0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 Sep 2023 12:44:06 +0200 Subject: timers: Tag (hr)timer softirq as hotplug safe Specific stress involving frequent CPU-hotplug operations, such as running rcutorture for example, may trigger the following message: NOHZ tick-stop error: local softirq work is pending, handler #02!!!" This happens in the CPU-down hotplug process, after CPUHP_AP_SMPBOOT_THREADS whose teardown callback parks ksoftirqd, and before the target CPU shuts down through CPUHP_AP_IDLE_DEAD. In this fragile intermediate state, softirqs waiting for threaded handling may be forever ignored and eventually reported by the idle task as in the above example. However some vectors are known to be safe as long as the corresponding subsystems have teardown callbacks handling the migration of their events. The above error message reports pending timers softirq although this vector can be considered as hotplug safe because the CPUHP_TIMERS_PREPARE teardown callback performs the necessary migration of timers after the death of the CPU. Hrtimers also have a similar hotplug handling. Therefore this error message, as far as (hr-)timers are concerned, can be considered spurious and the relevant softirq vectors can be marked as hotplug safe. Fixes: 0345691b24c0 ("tick/rcu: Stop allowing RCU_SOFTIRQ in idle") Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Reviewed-by: Joel Fernandes (Google) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230912104406.312185-6-frederic@kernel.org --- include/linux/interrupt.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a92bce40b04b..4a1dc88ddbff 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -569,8 +569,12 @@ enum * 2) rcu_report_dead() reports the final quiescent states. * * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue + * + * _ (HR)TIMER_SOFTIRQ: (hr)timers_dead_cpu() migrates the queue */ -#define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(RCU_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ)) +#define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(TIMER_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ) |\ + BIT(HRTIMER_SOFTIRQ) | BIT(RCU_SOFTIRQ)) + /* map softirq index to softirq name. update 'softirq_to_name' in * kernel/softirq.c when adding a new softirq. -- cgit v1.2.3 From fb99ef17865035a6657786d4b2af11a27ba23f9b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 25 Aug 2023 15:41:14 +0900 Subject: ata: libata-scsi: link ata port and scsi device There is no direct device ancestry defined between an ata_device and its scsi device which prevents the power management code from correctly ordering suspend and resume operations. Create such ancestry with the ata device as the parent to ensure that the scsi device (child) is suspended before the ata device and that resume handles the ata device before the scsi device. The parent-child (supplier-consumer) relationship is established between the ata_port (parent) and the scsi device (child) with the function device_add_link(). The parent used is not the ata_device as the PM operations are defined per port and the status of all devices connected through that port is controlled from the port operations. The device link is established with the new function ata_scsi_slave_alloc(), and this function is used to define the ->slave_alloc callback of the scsi host template of all ata drivers. Fixes: a19a93e4c6a9 ("scsi: core: pm: Rely on the device driver core for async power management") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Niklas Cassel Tested-by: Geert Uytterhoeven Reviewed-by: Martin K. Petersen Reviewed-by: John Garry --- include/linux/libata.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index 84aca8c44fa3..3ce1ab408114 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1148,6 +1148,7 @@ extern int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]); extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev); +extern int ata_scsi_slave_alloc(struct scsi_device *sdev); extern int ata_scsi_slave_config(struct scsi_device *sdev); extern void ata_scsi_slave_destroy(struct scsi_device *sdev); extern int ata_scsi_change_queue_depth(struct scsi_device *sdev, @@ -1396,6 +1397,7 @@ extern const struct attribute_group *ata_common_sdev_groups[]; .this_id = ATA_SHT_THIS_ID, \ .emulated = ATA_SHT_EMULATED, \ .proc_name = drv_name, \ + .slave_alloc = ata_scsi_slave_alloc, \ .slave_destroy = ata_scsi_slave_destroy, \ .bios_param = ata_std_bios_param, \ .unlock_native_capacity = ata_scsi_unlock_native_capacity,\ -- cgit v1.2.3 From 3cc2ffe5c16dc65dfac354bc5b5bc98d3b397567 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 15 Sep 2023 10:02:41 +0900 Subject: scsi: sd: Differentiate system and runtime start/stop management The underlying device and driver of a SCSI disk may have different system and runtime power mode control requirements. This is because runtime power management affects only the SCSI disk, while system level power management affects all devices, including the controller for the SCSI disk. For instance, issuing a START STOP UNIT command when a SCSI disk is runtime suspended and resumed is fine: the command is translated to a STANDBY IMMEDIATE command to spin down the ATA disk and to a VERIFY command to wake it up. The SCSI disk runtime operations have no effect on the ata port device used to connect the ATA disk. However, for system suspend/resume operations, the ATA port used to connect the device will also be suspended and resumed, with the resume operation requiring re-validating the device link and the device itself. In this case, issuing a VERIFY command to spinup the disk must be done before starting to revalidate the device, when the ata port is being resumed. In such case, we must not allow the SCSI disk driver to issue START STOP UNIT commands. Allow a low level driver to refine the SCSI disk start/stop management by differentiating system and runtime cases with two new SCSI device flags: manage_system_start_stop and manage_runtime_start_stop. These new flags replace the current manage_start_stop flag. Drivers setting the manage_start_stop are modifed to set both new flags, thus preserving the existing start/stop management behavior. For backward compatibility, the old manage_start_stop sysfs device attribute is kept as a read-only attribute showing a value of 1 for devices enabling both new flags and 0 otherwise. Fixes: 0a8589055936 ("ata,scsi: do not issue START STOP UNIT on resume") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Tested-by: Geert Uytterhoeven Reviewed-by: Martin K. Petersen --- include/scsi/scsi_device.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index b9230b6add04..fd41fdac0a8e 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -161,6 +161,10 @@ struct scsi_device { * pass settings from slave_alloc to scsi * core. */ unsigned int eh_timeout; /* Error handling timeout */ + + bool manage_system_start_stop; /* Let HLD (sd) manage system start/stop */ + bool manage_runtime_start_stop; /* Let HLD (sd) manage runtime start/stop */ + unsigned removable:1; unsigned changed:1; /* Data invalid due to media change */ unsigned busy:1; /* Used to prevent races */ @@ -193,7 +197,6 @@ struct scsi_device { unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */ unsigned no_start_on_add:1; /* do not issue start on add */ unsigned allow_restart:1; /* issue START_UNIT in error handler */ - unsigned manage_start_stop:1; /* Let HLD (sd) manage start/stop */ unsigned no_start_on_resume:1; /* Do not issue START_STOP_UNIT on resume */ unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */ unsigned no_uld_attach:1; /* disable connecting to upper level drivers */ -- cgit v1.2.3 From aa3998dbeb3abce63653b7f6d4542e7dcd022590 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 26 Aug 2023 09:43:39 +0900 Subject: ata: libata-scsi: Disable scsi device manage_system_start_stop The introduction of a device link to create a consumer/supplier relationship between the scsi device of an ATA device and the ATA port of that ATA device fixes the ordering of system suspend and resume operations. For suspend, the scsi device is suspended first and the ata port after it. This is fine as this allows the synchronize cache and START STOP UNIT commands issued by the scsi disk driver to be executed before the ata port is disabled. For resume operations, the ata port is resumed first, followed by the scsi device. This allows having the request queue of the scsi device to be unfrozen after the ata port resume is scheduled in EH, thus avoiding to see new requests prematurely issued to the ATA device. Since libata sets manage_system_start_stop to 1, the scsi disk resume operation also results in issuing a START STOP UNIT command to the device being resumed so that the device exits standby power mode. However, restoring the ATA device to the active power mode must be synchronized with libata EH processing of the port resume operation to avoid either 1) seeing the start stop unit command being received too early when the port is not yet resumed and ready to accept commands, or after the port resume process issues commands such as IDENTIFY to revalidate the device. In this last case, the risk is that the device revalidation fails with timeout errors as the drive is still spun down. Commit 0a8589055936 ("ata,scsi: do not issue START STOP UNIT on resume") disabled issuing the START STOP UNIT command to avoid issues with it. But this is incorrect as transitioning a device to the active power mode from the standby power mode set on suspend requires a media access command. The IDENTIFY, READ LOG and SET FEATURES commands executed in libata EH context triggered by the ata port resume operation may thus fail. Fix these synchronization issues is by handling a device power mode transitions for system suspend and resume directly in libata EH context, without relying on the scsi disk driver management triggered with the manage_system_start_stop flag. To do this, the following libata helper functions are introduced: 1) ata_dev_power_set_standby(): This function issues a STANDBY IMMEDIATE command to transitiom a device to the standby power mode. For HDDs, this spins down the disks. This function applies only to ATA and ZAC devices and does nothing otherwise. This function also does nothing for devices that have the ATA_FLAG_NO_POWEROFF_SPINDOWN or ATA_FLAG_NO_HIBERNATE_SPINDOWN flag set. For suspend, call ata_dev_power_set_standby() in ata_eh_handle_port_suspend() before the port is disabled and frozen. ata_eh_unload() is also modified to transition all enabled devices to the standby power mode when the system is shutdown or devices removed. 2) ata_dev_power_set_active() and This function applies to ATA or ZAC devices and issues a VERIFY command for 1 sector at LBA 0 to transition the device to the active power mode. For HDDs, since this function will complete only once the disk spin up. Its execution uses the same timeouts as for reset, to give the drive enough time to complete spinup without triggering a command timeout. For resume, call ata_dev_power_set_active() in ata_eh_revalidate_and_attach() after the port has been enabled and before any other command is issued to the device. With these changes, the manage_system_start_stop and no_start_on_resume scsi device flags do not need to be set in ata_scsi_dev_config(). The flag manage_runtime_start_stop is still set to allow the sd driver to spinup/spindown a disk through the sd runtime operations. Fixes: 0a8589055936 ("ata,scsi: do not issue START STOP UNIT on resume") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Tested-by: Geert Uytterhoeven Reviewed-by: Martin K. Petersen --- include/linux/libata.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index 3ce1ab408114..2a7d2af0ed80 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -192,6 +192,7 @@ enum { ATA_PFLAG_UNLOADING = (1 << 9), /* driver is being unloaded */ ATA_PFLAG_UNLOADED = (1 << 10), /* driver is unloaded */ + ATA_PFLAG_RESUMING = (1 << 16), /* port is being resumed */ ATA_PFLAG_SUSPENDED = (1 << 17), /* port is suspended (power) */ ATA_PFLAG_PM_PENDING = (1 << 18), /* PM operation pending */ ATA_PFLAG_INIT_GTM_VALID = (1 << 19), /* initial gtm data valid */ @@ -318,9 +319,10 @@ enum { ATA_EH_ENABLE_LINK = (1 << 3), ATA_EH_PARK = (1 << 5), /* unload heads and stop I/O */ ATA_EH_GET_SUCCESS_SENSE = (1 << 6), /* Get sense data for successful cmd */ + ATA_EH_SET_ACTIVE = (1 << 7), /* Set a device to active power mode */ ATA_EH_PERDEV_MASK = ATA_EH_REVALIDATE | ATA_EH_PARK | - ATA_EH_GET_SUCCESS_SENSE, + ATA_EH_GET_SUCCESS_SENSE | ATA_EH_SET_ACTIVE, ATA_EH_ALL_ACTIONS = ATA_EH_REVALIDATE | ATA_EH_RESET | ATA_EH_ENABLE_LINK, @@ -357,7 +359,7 @@ enum { /* This should match the actual table size of * ata_eh_cmd_timeout_table in libata-eh.c. */ - ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 7, + ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 8, /* Horkage types. May be set by libata or controller on drives (some horkage may be drive/controller pair dependent */ -- cgit v1.2.3 From ff48b37802e5c134e2dfc4d091f10b2eb5065a72 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 15 Sep 2023 15:00:13 +0900 Subject: scsi: Do not attempt to rescan suspended devices scsi_rescan_device() takes a scsi device lock before executing a device handler and device driver rescan methods. Waiting for the completion of any command issued to the device by these methods will thus be done with the device lock held. As a result, there is a risk of deadlocking within the power management code if scsi_rescan_device() is called to handle a device resume with the associated scsi device not yet resumed. Avoid such situation by checking that the target scsi device is in the running state, that is, fully capable of executing commands, before proceeding with the rescan and bailout returning -EWOULDBLOCK otherwise. With this error return, the caller can retry rescaning the device after a delay. The state check is done with the device lock held and is thus safe against incoming suspend power management operations. Fixes: 6aa0365a3c85 ("ata: libata-scsi: Avoid deadlock on rescan after device resume") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Niklas Cassel Tested-by: Geert Uytterhoeven Reviewed-by: Martin K. Petersen Reviewed-by: Bart Van Assche --- include/scsi/scsi_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 49f768d0ff37..4c2dc8150c6d 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -764,7 +764,7 @@ scsi_template_proc_dir(const struct scsi_host_template *sht); #define scsi_template_proc_dir(sht) NULL #endif extern void scsi_scan_host(struct Scsi_Host *); -extern void scsi_rescan_device(struct scsi_device *); +extern int scsi_rescan_device(struct scsi_device *sdev); extern void scsi_remove_host(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_get(struct Scsi_Host *); extern int scsi_host_busy(struct Scsi_Host *shost); -- cgit v1.2.3 From ce60f27bb62dfeb1bf827350520f34abc84e0933 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Sep 2023 05:09:58 +0100 Subject: mm: abstract moving to the next PFN In order to fix the L1TF vulnerability, x86 can invert the PTE bits for PROT_NONE VMAs, which means we cannot move from one PTE to the next by adding 1 to the PFN field of the PTE. This results in the BUG reported at [1]. Abstract advancing the PTE to the next PFN through a pte_next_pfn() function/macro. Link: https://lkml.kernel.org/r/20230920040958.866520-1-willy@infradead.org Fixes: bcc6cc832573 ("mm: add default definition of set_ptes()") Signed-off-by: Matthew Wilcox (Oracle) Reported-by: syzbot+55cc72f8cc3a549119df@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/000000000000d099fa0604f03351@google.com [1] Reviewed-by: Yin Fengwei Cc: Dave Hansen Cc: David Hildenbrand Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 1fba072b3dac..af7639c3b0a3 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -206,6 +206,14 @@ static inline int pmd_young(pmd_t pmd) #endif #ifndef set_ptes + +#ifndef pte_next_pfn +static inline pte_t pte_next_pfn(pte_t pte) +{ + return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); +} +#endif + /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. @@ -231,7 +239,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, if (--nr == 0) break; ptep++; - pte = __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + pte = pte_next_pfn(pte); } arch_leave_lazy_mmu_mode(); } -- cgit v1.2.3 From 5c590804b6b0ff933ed4e5cee5d76de3a5048d9f Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 21 Sep 2023 14:12:35 -0400 Subject: maple_tree: add mas_is_active() to detect in-tree walks Patch series "maple_tree: Fix mas_prev() state regression". Pedro Falcato retported an mprotect regression [1] which was bisected back to the iterator changes for maple tree. Root cause analysis showed the mas_prev() running off the end of the VMA space (previous from 0) followed by mas_find(), would skip the first value. This patchset introduces maple state underflow/overflow so the sequence of calls on the maple state will return what the user expects. Users who encounter this bug may see mprotect(), userfaultfd_register(), and mlock() fail on VMAs mapped with address 0. This patch (of 2): Instead of constantly checking each possibility of the maple state, create a fast path that will skip over checking unlikely states. Link: https://lkml.kernel.org/r/20230921181236.509072-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230921181236.509072-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Pedro Falcato Cc: Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e41c70ac7744..f66f5f78f8cf 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -511,6 +511,15 @@ static inline bool mas_is_paused(const struct ma_state *mas) return mas->node == MAS_PAUSE; } +/* Check if the mas is pointing to a node or not */ +static inline bool mas_is_active(struct ma_state *mas) +{ + if ((unsigned long)mas->node >= MAPLE_RESERVED_RANGE) + return true; + + return false; +} + /** * mas_reset() - Reset a Maple Tree operation state. * @mas: Maple Tree operation state. -- cgit v1.2.3 From a8091f039c1ebf5cb0d5261e3613f18eb2a5d8b7 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 21 Sep 2023 14:12:36 -0400 Subject: maple_tree: add MAS_UNDERFLOW and MAS_OVERFLOW states When updating the maple tree iterator to avoid rewalks, an issue was introduced when shifting beyond the limits. This can be seen by trying to go to the previous address of 0, which would set the maple node to MAS_NONE and keep the range as the last entry. Subsequent calls to mas_find() would then search upwards from mas->last and skip the value at mas->index/mas->last. This showed up as a bug in mprotect which skips the actual VMA at the current range after attempting to go to the previous VMA from 0. Since MAS_NONE may already be set when searching for a value that isn't contained within a node, changing the handling of MAS_NONE in mas_find() would make the code more complicated and error prone. Furthermore, there was no way to tell which limit was hit, and thus which action to take (next or the entry at the current range). This solution is to add two states to track what happened with the previous iterator action. This allows for the expected behaviour of the next command to return the correct item (either the item at the range requested, or the next/previous). Tests are also added and updated accordingly. Link: https://lkml.kernel.org/r/20230921181236.509072-3-Liam.Howlett@oracle.com Link: https://gist.github.com/heatd/85d2971fae1501b55b6ea401fbbe485b Link: https://lore.kernel.org/linux-mm/20230921181236.509072-1-Liam.Howlett@oracle.com/ Fixes: 39193685d585 ("maple_tree: try harder to keep active node with mas_prev()") Signed-off-by: Liam R. Howlett Reported-by: Pedro Falcato Closes: https://gist.github.com/heatd/85d2971fae1501b55b6ea401fbbe485b Closes: https://bugs.archlinux.org/task/79656 Cc: Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index f66f5f78f8cf..d01e850b570f 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -428,6 +428,8 @@ struct ma_wr_state { #define MAS_ROOT ((struct maple_enode *)5UL) #define MAS_NONE ((struct maple_enode *)9UL) #define MAS_PAUSE ((struct maple_enode *)17UL) +#define MAS_OVERFLOW ((struct maple_enode *)33UL) +#define MAS_UNDERFLOW ((struct maple_enode *)65UL) #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) -- cgit v1.2.3 From 935d4f0c6dc8b3533e6e39346de7389a84490178 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 22 Sep 2023 12:58:03 +0100 Subject: mm: hugetlb: add huge page size param to set_huge_pte_at() Patch series "Fix set_huge_pte_at() panic on arm64", v2. This series fixes a bug in arm64's implementation of set_huge_pte_at(), which can result in an unprivileged user causing a kernel panic. The problem was triggered when running the new uffd poison mm selftest for HUGETLB memory. This test (and the uffd poison feature) was merged for v6.5-rc7. Ideally, I'd like to get this fix in for v6.6 and I've cc'ed stable (correctly this time) to get it backported to v6.5, where the issue first showed up. Description of Bug ================== arm64's huge pte implementation supports multiple huge page sizes, some of which are implemented in the page table with multiple contiguous entries. So set_huge_pte_at() needs to work out how big the logical pte is, so that it can also work out how many physical ptes (or pmds) need to be written. It previously did this by grabbing the folio out of the pte and querying its size. However, there are cases when the pte being set is actually a swap entry. But this also used to work fine, because for huge ptes, we only ever saw migration entries and hwpoison entries. And both of these types of swap entries have a PFN embedded, so the code would grab that and everything still worked out. But over time, more calls to set_huge_pte_at() have been added that set swap entry types that do not embed a PFN. And this causes the code to go bang. The triggering case is for the uffd poison test, commit 99aa77215ad0 ("selftests/mm: add uffd unit test for UFFDIO_POISON"), which causes a PTE_MARKER_POISONED swap entry to be set, coutesey of commit 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs") - added in v6.5-rc7. Although review shows that there are other call sites that set PTE_MARKER_UFFD_WP (which also has no PFN), these don't trigger on arm64 because arm64 doesn't support UFFD WP. If CONFIG_DEBUG_VM is enabled, we do at least get a BUG(), but otherwise, it will dereference a bad pointer in page_folio(): static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry) { VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry)); return page_folio(pfn_to_page(swp_offset_pfn(entry))); } Fix === The simplest fix would have been to revert the dodgy cleanup commit 18f3962953e4 ("mm: hugetlb: kill set_huge_swap_pte_at()"), but since things have moved on, this would have required an audit of all the new set_huge_pte_at() call sites to see if they should be converted to set_huge_swap_pte_at(). As per the original intent of the change, it would also leave us open to future bugs when people invariably get it wrong and call the wrong helper. So instead, I've added a huge page size parameter to set_huge_pte_at(). This means that the arm64 code has the size in all cases. It's a bigger change, due to needing to touch the arches that implement the function, but it is entirely mechanical, so in my view, low risk. I've compile-tested all touched arches; arm64, parisc, powerpc, riscv, s390, sparc (and additionally x86_64). I've additionally booted and run mm selftests against arm64, where I observe the uffd poison test is fixed, and there are no other regressions. This patch (of 2): In order to fix a bug, arm64 needs to be told the size of the huge page for which the pte is being set in set_huge_pte_at(). Provide for this by adding an `unsigned long sz` parameter to the function. This follows the same pattern as huge_pte_clear(). This commit makes the required interface modifications to the core mm as well as all arches that implement this function (arm64, parisc, powerpc, riscv, s390, sparc). The actual arm64 bug will be fixed in a separate commit. No behavioral changes intended. Link: https://lkml.kernel.org/r/20230922115804.2043771-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20230922115804.2043771-2-ryan.roberts@arm.com Fixes: 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs") Signed-off-by: Ryan Roberts Reviewed-by: Christophe Leroy [powerpc 8xx] Reviewed-by: Lorenzo Stoakes [vmalloc change] Cc: Alexandre Ghiti Cc: Albert Ou Cc: Alexander Gordeev Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Qi Zheng Cc: Ryan Roberts Cc: SeongJae Park Cc: Sven Schnelle Cc: Uladzislau Rezki (Sony) Cc: Vasily Gorbik Cc: Will Deacon Cc: [6.5+] Signed-off-by: Andrew Morton --- include/asm-generic/hugetlb.h | 2 +- include/linux/hugetlb.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 4da02798a00b..6dcf4d576970 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -76,7 +76,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, #ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t pte, unsigned long sz) { set_pte_at(mm, addr, ptep, pte); } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5b2626063f4f..a30686e649f7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -984,7 +984,9 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + unsigned long psize = huge_page_size(hstate_vma(vma)); + + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } #endif @@ -1173,7 +1175,7 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, } static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t pte, unsigned long sz) { } -- cgit v1.2.3 From 15e86643d5b6df08ebd65c7b5aa607fd2ed2d9d1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 30 Sep 2023 16:13:35 +0900 Subject: vmlinux.lds.h: remove unused CPU_KEEP and CPU_DISCARD macros Remove the left-over of commit e24f6628811e ("modpost: remove all traces of cpuinit/cpuexit sections"). Signed-off-by: Masahiro Yamada Acked-by: Paul Gortmaker --- include/asm-generic/vmlinux.lds.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 9c59409104f6..67d8dd2f1bde 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -138,13 +138,6 @@ * are handled as text/data or they can be discarded (which * often happens at runtime) */ -#ifdef CONFIG_HOTPLUG_CPU -#define CPU_KEEP(sec) *(.cpu##sec) -#define CPU_DISCARD(sec) -#else -#define CPU_KEEP(sec) -#define CPU_DISCARD(sec) *(.cpu##sec) -#endif #if defined(CONFIG_MEMORY_HOTPLUG) #define MEM_KEEP(sec) *(.mem##sec) -- cgit v1.2.3 From 5baa0433a15eadd729625004c37463acb982eca7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2023 09:27:13 +0000 Subject: neighbour: fix data-races around n->output n->output field can be read locklessly, while a writer might change the pointer concurrently. Add missing annotations to prevent load-store tearing. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/neighbour.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 6da68886fabb..07022bb0d44d 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -539,7 +539,7 @@ static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, READ_ONCE(hh->hh_len)) return neigh_hh_output(hh, skb); - return n->output(n, skb); + return READ_ONCE(n->output)(n, skb); } static inline struct neighbour * -- cgit v1.2.3 From 4b2b606075e50cdae62ab2356b0a1e206947c354 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 22 Sep 2023 15:55:08 +0800 Subject: ipv4/fib: send notify when delete source address routes After deleting an interface address in fib_del_ifaddr(), the function scans the fib_info list for stray entries and calls fib_flush() and fib_table_flush(). Then the stray entries will be deleted silently and no RTM_DELROUTE notification will be sent. This lack of notification can make routing daemons, or monitor like `ip monitor route` miss the routing changes. e.g. + ip link add dummy1 type dummy + ip link add dummy2 type dummy + ip link set dummy1 up + ip link set dummy2 up + ip addr add 192.168.5.5/24 dev dummy1 + ip route add 7.7.7.0/24 dev dummy2 src 192.168.5.5 + ip -4 route 7.7.7.0/24 dev dummy2 scope link src 192.168.5.5 192.168.5.0/24 dev dummy1 proto kernel scope link src 192.168.5.5 + ip monitor route + ip addr del 192.168.5.5/24 dev dummy1 Deleted 192.168.5.0/24 dev dummy1 proto kernel scope link src 192.168.5.5 Deleted broadcast 192.168.5.255 dev dummy1 table local proto kernel scope link src 192.168.5.5 Deleted local 192.168.5.5 dev dummy1 table local proto kernel scope host src 192.168.5.5 As Ido reminded, fib_table_flush() isn't only called when an address is deleted, but also when an interface is deleted or put down. The lack of notification in these cases is deliberate. And commit 7c6bb7d2faaf ("net/ipv6: Add knob to skip DELROUTE message on device down") introduced a sysctl to make IPv6 behave like IPv4 in this regard. So we can't send the route delete notify blindly in fib_table_flush(). To fix this issue, let's add a new flag in "struct fib_info" to track the deleted prefer source address routes, and only send notify for them. After update: + ip monitor route + ip addr del 192.168.5.5/24 dev dummy1 Deleted 192.168.5.0/24 dev dummy1 proto kernel scope link src 192.168.5.5 Deleted broadcast 192.168.5.255 dev dummy1 table local proto kernel scope link src 192.168.5.5 Deleted local 192.168.5.5 dev dummy1 table local proto kernel scope host src 192.168.5.5 Deleted 7.7.7.0/24 dev dummy2 scope link src 192.168.5.5 Suggested-by: Thomas Haller Signed-off-by: Hangbin Liu Acked-by: Nicolas Dichtel Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230922075508.848925-1-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- include/net/ip_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f0c13864180e..15de07d36540 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -154,6 +154,7 @@ struct fib_info { int fib_nhs; bool fib_nh_is_v6; bool nh_updated; + bool pfsrc_removed; struct nexthop *nh; struct rcu_head rcu; struct fib_nh fib_nh[]; -- cgit v1.2.3 From 8e56b063c86569e51eed1c5681ce6361fa97fc7a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 3 Oct 2023 13:17:53 -0400 Subject: netfilter: handle the connecting collision properly in nf_conntrack_proto_sctp In Scenario A and B below, as the delayed INIT_ACK always changes the peer vtag, SCTP ct with the incorrect vtag may cause packet loss. Scenario A: INIT_ACK is delayed until the peer receives its own INIT_ACK 192.168.1.2 > 192.168.1.1: [INIT] [init tag: 1328086772] 192.168.1.1 > 192.168.1.2: [INIT] [init tag: 1414468151] 192.168.1.2 > 192.168.1.1: [INIT ACK] [init tag: 1328086772] 192.168.1.1 > 192.168.1.2: [INIT ACK] [init tag: 1650211246] * 192.168.1.2 > 192.168.1.1: [COOKIE ECHO] 192.168.1.1 > 192.168.1.2: [COOKIE ECHO] 192.168.1.2 > 192.168.1.1: [COOKIE ACK] Scenario B: INIT_ACK is delayed until the peer completes its own handshake 192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT] [init tag: 144230885] 192.168.1.2 > 192.168.1.1: sctp (1) [INIT ACK] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [COOKIE ECHO] 192.168.1.2 > 192.168.1.1: sctp (1) [COOKIE ACK] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT ACK] [init tag: 3914796021] * This patch fixes it as below: In SCTP_CID_INIT processing: - clear ct->proto.sctp.init[!dir] if ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]. (Scenario E) - set ct->proto.sctp.init[dir]. In SCTP_CID_INIT_ACK processing: - drop it if !ct->proto.sctp.init[!dir] && ct->proto.sctp.vtag[!dir] && ct->proto.sctp.vtag[!dir] != ih->init_tag. (Scenario B, Scenario C) - drop it if ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && ct->proto.sctp.vtag[!dir] != ih->init_tag. (Scenario A) In SCTP_CID_COOKIE_ACK processing: - clear ct->proto.sctp.init[dir] and ct->proto.sctp.init[!dir]. (Scenario D) Also, it's important to allow the ct state to move forward with cookie_echo and cookie_ack from the opposite dir for the collision scenarios. There are also other Scenarios where it should allow the packet through, addressed by the processing above: Scenario C: new CT is created by INIT_ACK. Scenario D: start INIT on the existing ESTABLISHED ct. Scenario E: start INIT after the old collision on the existing ESTABLISHED ct. 192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT] [init tag: 144230885] (both side are stopped, then start new connection again in hours) 192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 242308742] Fixes: 9fb9cbb1082d ("[NETFILTER]: Add nf_conntrack subsystem.") Signed-off-by: Xin Long Signed-off-by: Florian Westphal --- include/linux/netfilter/nf_conntrack_sctp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_sctp.h b/include/linux/netfilter/nf_conntrack_sctp.h index 625f491b95de..fb31312825ae 100644 --- a/include/linux/netfilter/nf_conntrack_sctp.h +++ b/include/linux/netfilter/nf_conntrack_sctp.h @@ -9,6 +9,7 @@ struct ip_ct_sctp { enum sctp_conntrack state; __be32 vtag[IP_CT_DIR_MAX]; + u8 init[IP_CT_DIR_MAX]; u8 last_dir; u8 flags; }; -- cgit v1.2.3 From 513dbc10cfc1da6754e004ea651d6bc480c23eb9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Sep 2023 17:38:45 -0700 Subject: page_pool: fix documentation typos Correct grammar for better readability. Signed-off-by: Randy Dunlap Cc: Jesper Dangaard Brouer Reviewed-by: Simon Horman Acked-by: Ilias Apalodimas Link: https://lore.kernel.org/r/20231001003846.29541-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 94231533a369..8e7751464ff5 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -16,13 +16,13 @@ * page_pool_alloc_pages() call. Drivers should use * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). * - * API keeps track of in-flight pages, in order to let API user know + * The API keeps track of in-flight pages, in order to let API users know * when it is safe to free a page_pool object. Thus, API users * must call page_pool_put_page() to free the page, or attach - * the page to a page_pool-aware objects like skbs marked with + * the page to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * - * API user must call page_pool_put_page() once on a page, as it + * API users must call page_pool_put_page() once on a page, as it * will either recycle the page, or in case of refcnt > 1, it will * release the DMA mapping and in-flight state accounting. */ -- cgit v1.2.3 From 059217c18be6757b95bfd77ba53fb50b48b8a816 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 1 Oct 2023 11:12:38 -0400 Subject: tcp: fix quick-ack counting to count actual ACKs of new data This commit fixes quick-ack counting so that it only considers that a quick-ack has been provided if we are sending an ACK that newly acknowledges data. The code was erroneously using the number of data segments in outgoing skbs when deciding how many quick-ack credits to remove. This logic does not make sense, and could cause poor performance in request-response workloads, like RPC traffic, where requests or responses can be multi-segment skbs. When a TCP connection decides to send N quick-acks, that is to accelerate the cwnd growth of the congestion control module controlling the remote endpoint of the TCP connection. That quick-ack decision is purely about the incoming data and outgoing ACKs. It has nothing to do with the outgoing data or the size of outgoing data. And in particular, an ACK only serves the intended purpose of allowing the remote congestion control to grow the congestion window quickly if the ACK is ACKing or SACKing new data. The fix is simple: only count packets as serving the goal of the quickack mechanism if they are ACKing/SACKing new data. We can tell whether this is the case by checking inet_csk_ack_scheduled(), since we schedule an ACK exactly when we are ACKing/SACKing new data. Fixes: fc6415bcb0f5 ("[TCP]: Fix quick-ack decrementing with TSO.") Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20231001151239.1866845-1-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 91688d0dadcd..7b1a720691ae 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -348,12 +348,14 @@ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule); -static inline void tcp_dec_quickack_mode(struct sock *sk, - const unsigned int pkts) +static inline void tcp_dec_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ack.quick) { + /* How many ACKs S/ACKing new data have we sent? */ + const unsigned int pkts = inet_csk_ack_scheduled(sk) ? 1 : 0; + if (pkts >= icsk->icsk_ack.quick) { icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. */ -- cgit v1.2.3 From a43e8e9ffa0d1de058964edf1a0622cbb7e27cfe Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Fri, 29 Sep 2023 13:42:27 -0700 Subject: net: mana: Fix oversized sge0 for GSO packets Handle the case when GSO SKB linear length is too large. MANA NIC requires GSO packets to put only the header part to SGE0, otherwise the TX queue may stop at the HW level. So, use 2 SGEs for the skb linear part which contains more than the packet header. Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Signed-off-by: Haiyang Zhang Reviewed-by: Simon Horman Reviewed-by: Shradha Gupta Signed-off-by: Paolo Abeni --- include/net/mana/mana.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 9f70b4332238..4d43adf18606 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -103,9 +103,10 @@ struct mana_txq { /* skb data and frags dma mappings */ struct mana_skb_head { - dma_addr_t dma_handle[MAX_SKB_FRAGS + 1]; + /* GSO pkts may have 2 SGEs for the linear part*/ + dma_addr_t dma_handle[MAX_SKB_FRAGS + 2]; - u32 size[MAX_SKB_FRAGS + 1]; + u32 size[MAX_SKB_FRAGS + 2]; }; #define MANA_HEADROOM sizeof(struct mana_skb_head) -- cgit v1.2.3