From 4fe6a946823a9bc8619fd16b7ea7d15914a30f22 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Sep 2021 21:51:01 +0100 Subject: afs: Try to avoid taking RCU read lock when checking vnode validity Try to avoid taking the RCU read lock when checking the validity of a vnode's callback state. The only thing it's needed for is to pin the parent volume's server list whilst we search it to find the record of the server we're currently using to see if it has been reinitialised (ie. it sent us a CB.InitCallBackState* RPC). Do this by the following means: (1) Keep an additional per-cell counter (fs_s_break) that's incremented each time any of the fileservers in the cell reinitialises. Since the new counter can be accessed without RCU from the vnode, we can check that first - and only if it differs, get the RCU read lock and check the volume's server list. (2) Replace afs_get_s_break_rcu() with afs_check_server_good() which now indicates whether the callback promise is still expected to be present on the server. This does the checks as described in (1). (3) Restructure afs_check_validity() to take account of the change in (2). We can also get rid of the valid variable and just use the need_clear variable with the addition of the afs_cb_break_no_promise reason. (4) afs_check_validity() probably shouldn't be altering vnode->cb_v_break and vnode->cb_s_break when it doesn't have cb_lock exclusively locked. Move the change to vnode->cb_v_break to __afs_break_callback(). Delegate the change to vnode->cb_s_break to afs_select_fileserver() and set vnode->cb_fs_s_break there also. (5) afs_validate() no longer needs to get the RCU read lock around its call to afs_check_validity() - and can skip the call entirely if we don't have a promise. Signed-off-by: David Howells Tested-by: Markus Suvanto cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/163111669583.283156.1397603105683094563.stgit@warthog.procyon.org.uk/ --- include/trace/events/afs.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 9f73ed2cf061..bca73e8c8cde 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -306,11 +306,13 @@ enum afs_flock_operation { enum afs_cb_break_reason { afs_cb_break_no_break, + afs_cb_break_no_promise, afs_cb_break_for_callback, afs_cb_break_for_deleted, afs_cb_break_for_lapsed, + afs_cb_break_for_s_reinit, afs_cb_break_for_unlink, - afs_cb_break_for_vsbreak, + afs_cb_break_for_v_break, afs_cb_break_for_volume_callback, afs_cb_break_for_zap, }; @@ -602,11 +604,13 @@ enum afs_cb_break_reason { #define afs_cb_break_reasons \ EM(afs_cb_break_no_break, "no-break") \ + EM(afs_cb_break_no_promise, "no-promise") \ EM(afs_cb_break_for_callback, "break-cb") \ EM(afs_cb_break_for_deleted, "break-del") \ EM(afs_cb_break_for_lapsed, "break-lapsed") \ + EM(afs_cb_break_for_s_reinit, "s-reinit") \ EM(afs_cb_break_for_unlink, "break-unlink") \ - EM(afs_cb_break_for_vsbreak, "break-vs") \ + EM(afs_cb_break_for_v_break, "break-v") \ EM(afs_cb_break_for_volume_callback, "break-v-cb") \ E_(afs_cb_break_for_zap, "break-zap") -- cgit v1.2.3 From 099dd788e31b4f426ef49c2785069804925a84e1 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 13 Sep 2021 14:51:10 -0500 Subject: cifs: remove pathname for file from SPDX header checkpatch complains about source files with filenames (e.g. in these cases just below the SPDX header in comments at the top of various files in fs/cifs). It also is helpful to change this now so will be less confusing when the parent directory is renamed e.g. from fs/cifs to fs/smb_client (or fs/smbfs) Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- include/uapi/linux/cifs/cifs_mount.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/cifs/cifs_mount.h b/include/uapi/linux/cifs/cifs_mount.h index 69829205fdb5..8e87d27b0951 100644 --- a/include/uapi/linux/cifs/cifs_mount.h +++ b/include/uapi/linux/cifs/cifs_mount.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1+ WITH Linux-syscall-note */ /* - * include/uapi/linux/cifs/cifs_mount.h * * Author(s): Scott Lovenberg (scott.lovenberg@gmail.com) * -- cgit v1.2.3 From b564171ade70570b7f335fa8ed17adb28409e3ac Mon Sep 17 00:00:00 2001 From: Li Li Date: Fri, 10 Sep 2021 09:42:10 -0700 Subject: binder: fix freeze race Currently cgroup freezer is used to freeze the application threads, and BINDER_FREEZE is used to freeze the corresponding binder interface. There's already a mechanism in ioctl(BINDER_FREEZE) to wait for any existing transactions to drain out before actually freezing the binder interface. But freezing an app requires 2 steps, freezing the binder interface with ioctl(BINDER_FREEZE) and then freezing the application main threads with cgroupfs. This is not an atomic operation. The following race issue might happen. 1) Binder interface is frozen by ioctl(BINDER_FREEZE); 2) Main thread A initiates a new sync binder transaction to process B; 3) Main thread A is frozen by "echo 1 > cgroup.freeze"; 4) The response from process B reaches the frozen thread, which will unexpectedly fail. This patch provides a mechanism to check if there's any new pending transaction happening between ioctl(BINDER_FREEZE) and freezing the main thread. If there's any, the main thread freezing operation can be rolled back to finish the pending transaction. Furthermore, the response might reach the binder driver before the rollback actually happens. That will still cause failed transaction. As the other process doesn't wait for another response of the response, the response transaction failure can be fixed by treating the response transaction like an oneway/async one, allowing it to reach the frozen thread. And it will be consumed when the thread gets unfrozen later. NOTE: This patch reuses the existing definition of struct binder_frozen_status_info but expands the bit assignments of __u32 member sync_recv. To ensure backward compatibility, bit 0 of sync_recv still indicates there's an outstanding sync binder transaction. This patch adds new information to bit 1 of sync_recv, indicating the binder transaction happens exactly when there's a race. If an existing userspace app runs on a new kernel, a sync binder call will set bit 0 of sync_recv so ioctl(BINDER_GET_FROZEN_INFO) still return the expected value (true). The app just doesn't check bit 1 intentionally so it doesn't have the ability to tell if there's a race. This behavior is aligned with what happens on an old kernel which doesn't set bit 1 at all. A new userspace app can 1) check bit 0 to know if there's a sync binder transaction happened when being frozen - same as before; and 2) check bit 1 to know if that sync binder transaction happened exactly when there's a race - a new information for rollback decision. the same time, confirmed the pending transactions succeeded. Fixes: 432ff1e91694 ("binder: BINDER_FREEZE ioctl") Acked-by: Todd Kjos Cc: stable Signed-off-by: Li Li Test: stress test with apps being frozen and initiating binder calls at Link: https://lore.kernel.org/r/20210910164210.2282716-2-dualli@chromium.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/android/binder.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 20e435fe657a..3246f2c74696 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -225,7 +225,14 @@ struct binder_freeze_info { struct binder_frozen_status_info { __u32 pid; + + /* process received sync transactions since last frozen + * bit 0: received sync transaction after being frozen + * bit 1: new pending sync transaction during freezing + */ __u32 sync_recv; + + /* process received async transactions since last frozen */ __u32 async_recv; }; -- cgit v1.2.3 From 7a8aa39d44564703620d937bb54cdea2d003657f Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 13 Sep 2021 17:05:51 +0100 Subject: nvmem: core: Add stubs for nvmem_cell_read_variable_le_u32/64 if !CONFIG_NVMEM When I added nvmem_cell_read_variable_le_u32() and nvmem_cell_read_variable_le_u64() I forgot to add the "static inline" stub functions for when CONFIG_NVMEM wasn't defined. Add them now. This was causing problems with randconfig builds that compiled `drivers/soc/qcom/cpr.c`. Fixes: 6feba6a62c57 ("PM: AVS: qcom-cpr: Use nvmem_cell_read_variable_le_u32()") Fixes: a28e824fb827 ("nvmem: core: Add functions to make number reading easy") Reported-by: kernel test robot Reviewed-by: Bjorn Andersson Signed-off-by: Douglas Anderson Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20210913160551.12907-1-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-consumer.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h index 923dada24eb4..c0c0cefc3b92 100644 --- a/include/linux/nvmem-consumer.h +++ b/include/linux/nvmem-consumer.h @@ -150,6 +150,20 @@ static inline int nvmem_cell_read_u64(struct device *dev, return -EOPNOTSUPP; } +static inline int nvmem_cell_read_variable_le_u32(struct device *dev, + const char *cell_id, + u32 *val) +{ + return -EOPNOTSUPP; +} + +static inline int nvmem_cell_read_variable_le_u64(struct device *dev, + const char *cell_id, + u64 *val) +{ + return -EOPNOTSUPP; +} + static inline struct nvmem_device *nvmem_device_get(struct device *dev, const char *name) { -- cgit v1.2.3 From 58877b0824da15698bd85a0a9dbfa8c354e6ecb7 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 9 Sep 2021 12:11:58 +0530 Subject: usb: core: hcd: Add support for deferring roothub registration It has been observed with certain PCIe USB cards (like Inateck connected to AM64 EVM or J7200 EVM) that as soon as the primary roothub is registered, port status change is handled even before xHC is running leading to cold plug USB devices not detected. For such cases, registering both the root hubs along with the second HCD is required. Add support for deferring roothub registration in usb_add_hcd(), so that both primary and secondary roothubs are registered along with the second HCD. CC: stable@vger.kernel.org # 5.4+ Suggested-by: Mathias Nyman Tested-by: Chris Chiu Acked-by: Alan Stern Signed-off-by: Kishon Vijay Abraham I Link: https://lore.kernel.org/r/20210909064200.16216-2-kishon@ti.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 548a028f2dab..2c1fc9212cf2 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -124,6 +124,7 @@ struct usb_hcd { #define HCD_FLAG_RH_RUNNING 5 /* root hub is running? */ #define HCD_FLAG_DEAD 6 /* controller has died? */ #define HCD_FLAG_INTF_AUTHORIZED 7 /* authorize interfaces? */ +#define HCD_FLAG_DEFER_RH_REGISTER 8 /* Defer roothub registration */ /* The flags can be tested using these macros; they are likely to * be slightly faster than test_bit(). @@ -134,6 +135,7 @@ struct usb_hcd { #define HCD_WAKEUP_PENDING(hcd) ((hcd)->flags & (1U << HCD_FLAG_WAKEUP_PENDING)) #define HCD_RH_RUNNING(hcd) ((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING)) #define HCD_DEAD(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEAD)) +#define HCD_DEFER_RH_REGISTER(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEFER_RH_REGISTER)) /* * Specifies if interfaces are authorized by default -- cgit v1.2.3 From 3c9cfb5269f76d447dbadb67835368f3111a91d7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Sep 2021 14:17:35 +0300 Subject: net: update NXP copyright text NXP Legal insists that the following are not fine: - Saying "NXP Semiconductors" instead of "NXP", since the company's registered name is "NXP" - Putting a "(c)" sign in the copyright string - Putting a comma in the copyright string The only accepted copyright string format is "Copyright NXP". This patch changes the copyright headers in the networking files that were sent by me, or derived from code sent by me. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/ocelot.h | 2 +- include/linux/packing.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index c6bc45ae5e03..435777a0073c 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 - * Copyright 2019-2021 NXP Semiconductors + * Copyright 2019-2021 NXP */ #ifndef _NET_DSA_TAG_OCELOT_H diff --git a/include/linux/packing.h b/include/linux/packing.h index 54667735cc67..8d6571feb95d 100644 --- a/include/linux/packing.h +++ b/include/linux/packing.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright (c) 2016-2018, NXP Semiconductors + * Copyright 2016-2018 NXP * Copyright (c) 2018-2019, Vladimir Oltean */ #ifndef _LINUX_PACKING_H -- cgit v1.2.3 From cf9579976f724ad517cc15b7caadea728c7e245c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Sep 2021 16:34:32 +0300 Subject: net: mdio: introduce a shutdown method to mdio device drivers MDIO-attached devices might have interrupts and other things that might need quiesced when we kexec into a new kernel. Things are even more creepy when those interrupt lines are shared, and in that case it is absolutely mandatory to disable all interrupt sources. Moreover, MDIO devices might be DSA switches, and DSA needs its own shutdown method to unlink from the DSA master, which is a new requirement that appeared after commit 2f1e8ea726e9 ("net: dsa: link interfaces with the DSA master to get rid of lockdep warnings"). So introduce a ->shutdown method in the MDIO device driver structure. Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index ffb787d5ebde..5e6dc38f418e 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -80,6 +80,9 @@ struct mdio_driver { /* Clears up any memory if needed */ void (*remove)(struct mdio_device *mdiodev); + + /* Quiesces the device on system shutdown, turns off interrupts etc */ + void (*shutdown)(struct mdio_device *mdiodev); }; static inline struct mdio_driver * -- cgit v1.2.3 From 0650bf52b31ff35dc6430fc2e37969c36baba724 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Sep 2021 16:34:33 +0300 Subject: net: dsa: be compatible with masters which unregister on shutdown Lino reports that on his system with bcmgenet as DSA master and KSZ9897 as a switch, rebooting or shutting down never works properly. What does the bcmgenet driver have special to trigger this, that other DSA masters do not? It has an implementation of ->shutdown which simply calls its ->remove implementation. Otherwise said, it unregisters its network interface on shutdown. This message can be seen in a loop, and it hangs the reboot process there: unregister_netdevice: waiting for eth0 to become free. Usage count = 3 So why 3? A usage count of 1 is normal for a registered network interface, and any virtual interface which links itself as an upper of that will increment it via dev_hold. In the case of DSA, this is the call path: dsa_slave_create -> netdev_upper_dev_link -> __netdev_upper_dev_link -> __netdev_adjacent_dev_insert -> dev_hold So a DSA switch with 3 interfaces will result in a usage count elevated by two, and netdev_wait_allrefs will wait until they have gone away. Other stacked interfaces, like VLAN, watch NETDEV_UNREGISTER events and delete themselves, but DSA cannot just vanish and go poof, at most it can unbind itself from the switch devices, but that must happen strictly earlier compared to when the DSA master unregisters its net_device, so reacting on the NETDEV_UNREGISTER event is way too late. It seems that it is a pretty established pattern to have a driver's ->shutdown hook redirect to its ->remove hook, so the same code is executed regardless of whether the driver is unbound from the device, or the system is just shutting down. As Florian puts it, it is quite a big hammer for bcmgenet to unregister its net_device during shutdown, but having a common code path with the driver unbind helps ensure it is well tested. So DSA, for better or for worse, has to live with that and engage in an arms race of implementing the ->shutdown hook too, from all individual drivers, and do something sane when paired with masters that unregister their net_device there. The only sane thing to do, of course, is to unlink from the master. However, complications arise really quickly. The pattern of redirecting ->shutdown to ->remove is not unique to bcmgenet or even to net_device drivers. In fact, SPI controllers do it too (see dspi_shutdown -> dspi_remove), and presumably, I2C controllers and MDIO controllers do it too (this is something I have not researched too deeply, but even if this is not the case today, it is certainly plausible to happen in the future, and must be taken into consideration). Since DSA switches might be SPI devices, I2C devices, MDIO devices, the insane implication is that for the exact same DSA switch device, we might have both ->shutdown and ->remove getting called. So we need to do something with that insane environment. The pattern I've come up with is "if this, then not that", so if either ->shutdown or ->remove gets called, we set the device's drvdata to NULL, and in the other hook, we check whether the drvdata is NULL and just do nothing. This is probably not necessary for platform devices, just for devices on buses, but I would really insist for consistency among drivers, because when code is copy-pasted, it is not always copy-pasted from the best sources. So depending on whether the DSA switch's ->remove or ->shutdown will get called first, we cannot really guarantee even for the same driver if rebooting will result in the same code path on all platforms. But nonetheless, we need to do something minimally reasonable on ->shutdown too to fix the bug. Of course, the ->remove will do more (a full teardown of the tree, with all data structures freed, and this is why the bug was not caught for so long). The new ->shutdown method is kept separate from dsa_unregister_switch not because we couldn't have unregistered the switch, but simply in the interest of doing something quick and to the point. The big question is: does the DSA switch's ->shutdown get called earlier than the DSA master's ->shutdown? If not, there is still a risk that we might still trigger the WARN_ON in unregister_netdevice that says we are attempting to unregister a net_device which has uppers. That's no good. Although the reference to the master net_device won't physically go away even if DSA's ->shutdown comes afterwards, remember we have a dev_hold on it. The answer to that question lies in this comment above device_link_add: * A side effect of the link creation is re-ordering of dpm_list and the * devices_kset list by moving the consumer device and all devices depending * on it to the ends of these lists (that does not happen to devices that have * not been registered when this function is called). so the fact that DSA uses device_link_add towards its master is not exactly for nothing. device_shutdown() walks devices_kset from the back, so this is our guarantee that DSA's shutdown happens before the master's shutdown. Fixes: 2f1e8ea726e9 ("net: dsa: link interfaces with the DSA master to get rid of lockdep warnings") Link: https://lore.kernel.org/netdev/20210909095324.12978-1-LinoSanfilippo@gmx.de/ Reported-by: Lino Sanfilippo Signed-off-by: Vladimir Oltean Tested-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 258867eff230..6e29c0e080f6 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1046,6 +1046,7 @@ static inline int dsa_ndo_eth_ioctl(struct net_device *dev, struct ifreq *ifr, void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); +void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); -- cgit v1.2.3 From 2dcb96bacce36021c2f3eaae0cef607b5bb71ede Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 18 Sep 2021 14:42:35 +0200 Subject: net: core: Correct the sock::sk_lock.owned lockdep annotations lock_sock_fast() and lock_sock_nested() contain lockdep annotations for the sock::sk_lock.owned 'mutex'. sock::sk_lock.owned is not a regular mutex. It is just lockdep wise equivalent. In fact it's an open coded trivial mutex implementation with some interesting features. sock::sk_lock.slock is a regular spinlock protecting the 'mutex' representation sock::sk_lock.owned which is a plain boolean. If 'owned' is true, then some other task holds the 'mutex', otherwise it is uncontended. As this locking construct is obviously endangered by lock ordering issues as any other locking primitive it got lockdep annotated via a dedicated dependency map sock::sk_lock.dep_map which has to be updated at the lock and unlock sites. lock_sock_nested() is a straight forward 'mutex' lock operation: might_sleep(); spin_lock_bh(sock::sk_lock.slock) while (!try_lock(sock::sk_lock.owned)) { spin_unlock_bh(sock::sk_lock.slock); wait_for_release(); spin_lock_bh(sock::sk_lock.slock); } The lockdep annotation for sock::sk_lock.owned is for unknown reasons _after_ the lock has been acquired, i.e. after the code block above and after releasing sock::sk_lock.slock, but inside the bottom halves disabled region: spin_unlock(sock::sk_lock.slock); mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); local_bh_enable(); The placement after the unlock is obvious because otherwise the mutex_acquire() would nest into the spin lock held region. But that's from the lockdep perspective still the wrong place: 1) The mutex_acquire() is issued _after_ the successful acquisition which is pointless because in a dead lock scenario this point is never reached which means that if the deadlock is the first instance of exposing the wrong lock order lockdep does not have a chance to detect it. 2) It only works because lockdep is rather lax on the context from which the mutex_acquire() is issued. Acquiring a mutex inside a bottom halves and therefore non-preemptible region is obviously invalid, except for a trylock which is clearly not the case here. This 'works' stops working on RT enabled kernels where the bottom halves serialization is done via a local lock, which exposes this misplacement because the 'mutex' and the local lock nest the wrong way around and lockdep complains rightfully about a lock inversion. The placement is wrong since the initial commit a5b5bb9a053a ("[PATCH] lockdep: annotate sk_locks") which introduced this. Fix it by moving the mutex_acquire() in front of the actual lock acquisition, which is what the regular mutex_lock() operation does as well. lock_sock_fast() is not that straight forward. It looks at the first glance like a convoluted trylock operation: spin_lock_bh(sock::sk_lock.slock) if (!sock::sk_lock.owned) return false; while (!try_lock(sock::sk_lock.owned)) { spin_unlock_bh(sock::sk_lock.slock); wait_for_release(); spin_lock_bh(sock::sk_lock.slock); } spin_unlock(sock::sk_lock.slock); mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); local_bh_enable(); return true; But that's not the case: lock_sock_fast() is an interesting optimization for short critical sections which can run with bottom halves disabled and sock::sk_lock.slock held. This allows to shortcut the 'mutex' operation in the non contended case by preventing other lockers to acquire sock::sk_lock.owned because they are blocked on sock::sk_lock.slock, which in turn avoids the overhead of doing the heavy processing in release_sock() including waking up wait queue waiters. In the contended case, i.e. when sock::sk_lock.owned == true the behavior is the same as lock_sock_nested(). Semantically this shortcut means, that the task acquired the 'mutex' even if it does not touch the sock::sk_lock.owned field in the non-contended case. Not telling lockdep about this shortcut acquisition is hiding potential lock ordering violations in the fast path. As a consequence the same reasoning as for the above lock_sock_nested() case vs. the placement of the lockdep annotation applies. The current placement of the lockdep annotation was just copied from the original lock_sock(), now renamed to lock_sock_nested(), implementation. Fix this by moving the mutex_acquire() in front of the actual lock acquisition and adding the corresponding mutex_release() into unlock_sock_fast(). Also document the fast path return case with a comment. Reported-by: Sebastian Siewior Signed-off-by: Thomas Gleixner Cc: netdev@vger.kernel.org Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 66a9a90f9558..c005c3c750e8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1640,6 +1640,7 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow) release_sock(sk); __release(&sk->sk_lock.slock); } else { + mutex_release(&sk->sk_lock.dep_map, _RET_IP_); spin_unlock_bh(&sk->sk_lock.slock); } } -- cgit v1.2.3 From fd292c189a979838622d5e03e15fa688c81dd50b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Sep 2021 17:29:16 +0300 Subject: net: dsa: tear down devlink port regions when tearing down the devlink port on error Commit 86f8b1c01a0a ("net: dsa: Do not make user port errors fatal") decided it was fine to ignore errors on certain ports that fail to probe, and go on with the ports that do probe fine. Commit fb6ec87f7229 ("net: dsa: Fix type was not set for devlink port") noticed that devlink_port_type_eth_set(dlp, dp->slave); does not get called, and devlink notices after a timeout of 3600 seconds and prints a WARN_ON. So it went ahead to unregister the devlink port. And because there exists an UNUSED port flavour, we actually re-register the devlink port as UNUSED. Commit 08156ba430b4 ("net: dsa: Add devlink port regions support to DSA") added devlink port regions, which are set up by the driver and not by DSA. When we trigger the devlink port deregistration and reregistration as unused, devlink now prints another WARN_ON, from here: devlink_port_unregister: WARN_ON(!list_empty(&devlink_port->region_list)); So the port still has regions, which makes sense, because they were set up by the driver, and the driver doesn't know we're unregistering the devlink port. Somebody needs to tear them down, and optionally (actually it would be nice, to be consistent) set them up again for the new devlink port. But DSA's layering stays in our way quite badly here. The options I've considered are: 1. Introduce a function in devlink to just change a port's type and flavour. No dice, devlink keeps a lot of state, it really wants the port to not be registered when you set its parameters, so changing anything can only be done by destroying what we currently have and recreating it. 2. Make DSA cache the parameters passed to dsa_devlink_port_region_create, and the region returned, keep those in a list, then when the devlink port unregister needs to take place, the existing devlink regions are destroyed by DSA, and we replay the creation of new regions using the cached parameters. Problem: mv88e6xxx keeps the region pointers in chip->ports[port].region, and these will remain stale after DSA frees them. There are many things DSA can do, but updating mv88e6xxx's private pointers is not one of them. 3. Just let the driver do it (i.e. introduce a very specific method called ds->ops->port_reinit_as_unused, which unregisters its devlink port devlink regions, then the old devlink port, then registers the new one, then the devlink port regions for it). While it does work, as opposed to the others, it's pretty horrible from an API perspective and we can do better. 4. Introduce a new pair of methods, ->port_setup and ->port_teardown, which in the case of mv88e6xxx must register and unregister the devlink port regions. Call these 2 methods when the port must be reinitialized as unused. Naturally, I went for the 4th approach. Fixes: 08156ba430b4 ("net: dsa: Add devlink port regions support to DSA") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6e29c0e080f6..d784e76113b8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -585,8 +585,16 @@ struct dsa_switch_ops { int (*change_tag_protocol)(struct dsa_switch *ds, int port, enum dsa_tag_protocol proto); + /* Optional switch-wide initialization and destruction methods */ int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); + + /* Per-port initialization and destruction methods. Mandatory if the + * driver registers devlink port regions, optional otherwise. + */ + int (*port_setup)(struct dsa_switch *ds, int port); + void (*port_teardown)(struct dsa_switch *ds, int port); + u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* -- cgit v1.2.3 From 794d5b8a497ff053f56856472e2fae038fa761aa Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 17 Sep 2021 12:50:38 +0200 Subject: swiotlb-xen: this is PV-only on x86 The code is unreachable for HVM or PVH, and it also makes little sense in auto-translated environments. On Arm, with xen_{create,destroy}_contiguous_region() both being stubs, I have a hard time seeing what good the Xen specific variant does - the generic one ought to be fine for all purposes there. Still Arm code explicitly references symbols here, so the code will continue to be included there. Instead of making PCI_XEN's "select" conditional, simply drop it - SWIOTLB_XEN will be available unconditionally in the PV case anyway, and is - as explained above - dead code in non-PV environments. This in turn allows dropping the stubs for xen_{create,destroy}_contiguous_region(), the former of which was broken anyway - it failed to set the DMA handle output. Signed-off-by: Jan Beulich Reviewed-by: Christoph Hellwig Reviewed-by: Stefano Stabellini Link: https://lore.kernel.org/r/5947b8ae-fdc7-225c-4838-84712265fc1e@suse.com Signed-off-by: Juergen Gross --- include/xen/xen-ops.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 39a5580f8feb..db28e79b77ee 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -46,19 +46,7 @@ extern unsigned long *xen_contiguous_bitmap; int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, unsigned int address_bits, dma_addr_t *dma_handle); - void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order); -#else -static inline int xen_create_contiguous_region(phys_addr_t pstart, - unsigned int order, - unsigned int address_bits, - dma_addr_t *dma_handle) -{ - return 0; -} - -static inline void xen_destroy_contiguous_region(phys_addr_t pstart, - unsigned int order) { } #endif #if defined(CONFIG_XEN_PV) -- cgit v1.2.3 From d4ffd5df9d18031b6a53f934388726775b4452d3 Mon Sep 17 00:00:00 2001 From: Jiashuo Liang Date: Fri, 30 Jul 2021 11:01:52 +0800 Subject: x86/fault: Fix wrong signal when vsyscall fails with pkey The function __bad_area_nosemaphore() calls kernelmode_fixup_or_oops() with the parameter @signal being actually @pkey, which will send a signal numbered with the argument in @pkey. This bug can be triggered when the kernel fails to access user-given memory pages that are protected by a pkey, so it can go down the do_user_addr_fault() path and pass the !user_mode() check in __bad_area_nosemaphore(). Most cases will simply run the kernel fixup code to make an -EFAULT. But when another condition current->thread.sig_on_uaccess_err is met, which is only used to emulate vsyscall, the kernel will generate the wrong signal. Add a new parameter @pkey to kernelmode_fixup_or_oops() to fix this. [ bp: Massage commit message, fix build error as reported by the 0day bot: https://lkml.kernel.org/r/202109202245.APvuT8BX-lkp@intel.com ] Fixes: 5042d40a264c ("x86/fault: Bypass no_context() for implicit kernel faults from usermode") Reported-by: kernel test robot Signed-off-by: Jiashuo Liang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/20210730030152.249106-1-liangjs@pku.edu.cn --- include/linux/pkeys.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 6beb26b7151d..86be8bf27b41 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -4,6 +4,8 @@ #include +#define ARCH_DEFAULT_PKEY 0 + #ifdef CONFIG_ARCH_HAS_PKEYS #include #else /* ! CONFIG_ARCH_HAS_PKEYS */ -- cgit v1.2.3 From a4869faf9642518145a8aa4b52e0d5ab0e7ee896 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 15 Sep 2021 17:07:15 +0800 Subject: scsi: core: Remove 'current_tag' The 'current_tag' field in struct scsi_device is unused now; remove it. Link: https://lore.kernel.org/r/1631696835-136198-4-git-send-email-john.garry@huawei.com Reviewed-by: Bart Van Assche Signed-off-by: Hannes Reinecke Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- include/scsi/scsi_device.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 09a17f6e93a7..b97e142a7ca9 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -146,7 +146,6 @@ struct scsi_device { struct scsi_vpd __rcu *vpd_pg83; struct scsi_vpd __rcu *vpd_pg80; struct scsi_vpd __rcu *vpd_pg89; - unsigned char current_tag; /* current tag */ struct scsi_target *sdev_target; blist_flags_t sdev_bflags; /* black/white flags as also found in -- cgit v1.2.3 From 20c36ce2164f1774b487d443ece99b754bc6ad43 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Thu, 16 Sep 2021 10:52:03 +0800 Subject: irqdomain: Change the type of 'size' in __irq_domain_add() to be consistent The 'size' is used in struct_size(domain, revmap, size) and its input parameter type is 'size_t'(unsigned int). Changing the size to 'unsigned int' to make the type consistent. Signed-off-by: Bixuan Cui Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210916025203.44841-1-cuibixuan@huawei.com --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 23e4ee523576..9ee238ad29ce 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -251,7 +251,7 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) } void irq_domain_free_fwnode(struct fwnode_handle *fwnode); -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data); -- cgit v1.2.3 From a68de80f61f6af397bc06fb391ff2e571c9c4d80 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Sep 2021 13:30:27 -0700 Subject: entry: rseq: Call rseq_handle_notify_resume() in tracehook_notify_resume() Invoke rseq_handle_notify_resume() from tracehook_notify_resume() now that the two function are always called back-to-back by architectures that have rseq. The rseq helper is stubbed out for architectures that don't support rseq, i.e. this is a nop across the board. Note, tracehook_notify_resume() is horribly named and arguably does not belong in tracehook.h as literally every line of code in it has nothing to do with tracing. But, that's been true since commit a42c6ded827d ("move key_repace_session_keyring() into tracehook_notify_resume()") first usurped tracehook_notify_resume() back in 2012. Punt cleaning that mess up to future patches. No functional change intended. Acked-by: Mathieu Desnoyers Signed-off-by: Sean Christopherson Message-Id: <20210901203030.1292304-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/tracehook.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 3e80c4bc66f7..2564b7434b4d 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -197,6 +197,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) mem_cgroup_handle_over_high(); blkcg_maybe_throttle_current(); + + rseq_handle_notify_resume(NULL, regs); } /* -- cgit v1.2.3 From 93368aab0efc87288cac65e99c9ed2e0ffc9e7d0 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 21 Sep 2021 22:35:30 +0800 Subject: erofs: fix up erofs_lookup tracepoint Fix up a misuse that the filename pointer isn't always valid in the ring buffer, and we should copy the content instead. Link: https://lore.kernel.org/r/20210921143531.81356-1-hsiangkao@linux.alibaba.com Fixes: 13f06f48f7bf ("staging: erofs: support tracepoint") Cc: stable@vger.kernel.org # 4.19+ Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- include/trace/events/erofs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index bf9806fd1306..db4f2cec8360 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -35,20 +35,20 @@ TRACE_EVENT(erofs_lookup, TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) - __field(const char *, name ) + __string(name, dentry->d_name.name ) __field(unsigned int, flags ) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->nid = EROFS_I(dir)->nid; - __entry->name = dentry->d_name.name; + __assign_str(name, dentry->d_name.name); __entry->flags = flags; ), TP_printk("dev = (%d,%d), pnid = %llu, name:%s, flags:%x", show_dev_nid(__entry), - __entry->name, + __get_str(name), __entry->flags) ); -- cgit v1.2.3 From 12064c1768439fa0882547010afae6b52aafa7af Mon Sep 17 00:00:00 2001 From: Jia He Date: Thu, 23 Sep 2021 11:35:57 +0800 Subject: Revert "ACPI: Add memory semantics to acpi_os_map_memory()" This reverts commit 437b38c51162f8b87beb28a833c4d5dc85fa864e. The memory semantics added in commit 437b38c51162 causes SystemMemory Operation region, whose address range is not described in the EFI memory map to be mapped as NormalNC memory on arm64 platforms (through acpi_os_map_memory() in acpi_ex_system_memory_space_handler()). This triggers the following abort on an ARM64 Ampere eMAG machine, because presumably the physical address range area backing the Opregion does not support NormalNC memory attributes driven on the bus. Internal error: synchronous external abort: 96000410 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.14.0+ #462 Hardware name: MiTAC RAPTOR EV-883832-X3-0001/RAPTOR, BIOS 0.14 02/22/2019 pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [...snip...] Call trace: acpi_ex_system_memory_space_handler+0x26c/0x2c8 acpi_ev_address_space_dispatch+0x228/0x2c4 acpi_ex_access_region+0x114/0x268 acpi_ex_field_datum_io+0x128/0x1b8 acpi_ex_extract_from_field+0x14c/0x2ac acpi_ex_read_data_from_field+0x190/0x1b8 acpi_ex_resolve_node_to_value+0x1ec/0x288 acpi_ex_resolve_to_value+0x250/0x274 acpi_ds_evaluate_name_path+0xac/0x124 acpi_ds_exec_end_op+0x90/0x410 acpi_ps_parse_loop+0x4ac/0x5d8 acpi_ps_parse_aml+0xe0/0x2c8 acpi_ps_execute_method+0x19c/0x1ac acpi_ns_evaluate+0x1f8/0x26c acpi_ns_init_one_device+0x104/0x140 acpi_ns_walk_namespace+0x158/0x1d0 acpi_ns_initialize_devices+0x194/0x218 acpi_initialize_objects+0x48/0x50 acpi_init+0xe0/0x498 If the Opregion address range is not present in the EFI memory map there is no way for us to determine the memory attributes to use to map it - defaulting to NormalNC does not work (and it is not correct on a memory region that may have read side-effects) and therefore commit 437b38c51162 should be reverted, which means reverting back to the original behavior whereby address ranges that are mapped using acpi_os_map_memory() default to the safe devicenGnRnE attributes on ARM64 if the mapped address range is not defined in the EFI memory map. Fixes: 437b38c51162 ("ACPI: Add memory semantics to acpi_os_map_memory()") Signed-off-by: Jia He Acked-by: Lorenzo Pieralisi Acked-by: Catalin Marinas Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_io.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/acpi/acpi_io.h b/include/acpi/acpi_io.h index a0212e67d6f4..027faa8883aa 100644 --- a/include/acpi/acpi_io.h +++ b/include/acpi/acpi_io.h @@ -14,14 +14,6 @@ static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys, } #endif -#ifndef acpi_os_memmap -static inline void __iomem *acpi_os_memmap(acpi_physical_address phys, - acpi_size size) -{ - return ioremap_cache(phys, size); -} -#endif - extern bool acpi_permanent_mmap; void __iomem __ref -- cgit v1.2.3 From 243418e3925d5b5b0657ae54c322d43035e97eed Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 24 Sep 2021 15:43:47 -0700 Subject: mm: fs: invalidate bh_lrus for only cold path The kernel test robot reported the regression of fio.write_iops[1] with commit 8cc621d2f45d ("mm: fs: invalidate BH LRU during page migration"). Since lru_add_drain is called frequently, invalidate bh_lrus there could increase bh_lrus cache miss ratio, which needs more IO in the end. This patch moves the bh_lrus invalidation from the hot path( e.g., zap_page_range, pagevec_release) to cold path(i.e., lru_add_drain_all, lru_cache_disable). Zhengjun Xing confirmed "I test the patch, the regression reduced to -2.9%" [1] https://lore.kernel.org/lkml/20210520083144.GD14190@xsang-OptiPlex-9020/ [2] 8cc621d2f45d, mm: fs: invalidate BH LRU during page migration Link: https://lkml.kernel.org/r/20210907212347.1977686-1-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: kernel test robot Reviewed-by: Chris Goldsworthy Tested-by: "Xing, Zhengjun" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6486d3c19463..36f33685c8c0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -194,7 +194,7 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size, struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); -void invalidate_bh_lrus_cpu(int cpu); +void invalidate_bh_lrus_cpu(void); bool has_bh_in_lru(int cpu, void *dummy); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); @@ -408,7 +408,7 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } -static inline void invalidate_bh_lrus_cpu(int cpu) {} +static inline void invalidate_bh_lrus_cpu(void) {} static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 -- cgit v1.2.3 From 57ed7b4303a1c4d1885019fef03e6a5af2e8468a Mon Sep 17 00:00:00 2001 From: Weizhao Ouyang Date: Fri, 24 Sep 2021 15:43:53 -0700 Subject: mm/debug: sync up latest migrate_reason to migrate_reason_names Sync up MR_DEMOTION to migrate_reason_names and add a synch prompt. Link: https://lkml.kernel.org/r/20210921064553.293905-3-o451686892@gmail.com Fixes: 26aa2d199d6f ("mm/migrate: demote pages during reclaim") Signed-off-by: Weizhao Ouyang Reviewed-by: "Huang, Ying" Reviewed-by: John Hubbard Cc: Anshuman Khandual Cc: Michal Hocko Cc: Pavel Tatashin Cc: Yang Shi Cc: Zi Yan Cc: Dave Hansen Cc: Minchan Kim Cc: Mina Almasry Cc: "Matthew Wilcox (Oracle)" Cc: Oscar Salvador Cc: Wei Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 326250996b4e..c8077e936691 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -19,6 +19,11 @@ struct migration_target_control; */ #define MIGRATEPAGE_SUCCESS 0 +/* + * Keep sync with: + * - macro MIGRATE_REASON in include/trace/events/migrate.h + * - migrate_reason_names[MR_TYPES] in mm/debug.c + */ enum migrate_reason { MR_COMPACTION, MR_MEMORY_FAILURE, @@ -32,7 +37,6 @@ enum migrate_reason { MR_TYPES }; -/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION -- cgit v1.2.3