From 0cce06ba859a515bd06224085d3addb870608b6d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Apr 2023 17:03:13 +0200 Subject: debugobjects,locking: Annotate debug_object_fill_pool() wait type violation There is an explicit wait-type violation in debug_object_fill_pool() for PREEMPT_RT=n kernels which allows them to more easily fill the object pool and reduce the chance of allocation failures. Lockdep's wait-type checks are designed to check the PREEMPT_RT locking rules even for PREEMPT_RT=n kernels and object to this, so create a lockdep annotation to allow this to stand. Specifically, create a 'lock' type that overrides the inner wait-type while it is held -- allowing one to temporarily raise it, such that the violation is hidden. Reported-by: Vlastimil Babka Reported-by: Qi Zheng Signed-off-by: Peter Zijlstra (Intel) Tested-by: Qi Zheng Link: https://lkml.kernel.org/r/20230429100614.GA1489784@hirez.programming.kicks-ass.net --- include/linux/lockdep.h | 14 ++++++++++++++ include/linux/lockdep_types.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 1023f349af71..a3329fb49b33 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -339,6 +339,16 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) +/* + * Must use lock_map_aquire_try() with override maps to avoid + * lockdep thinking they participate in the block chain. + */ +#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ + struct lockdep_map _name = { \ + .name = #_name "-wait-type-override", \ + .wait_type_inner = _wait_type, \ + .lock_type = LD_LOCK_WAIT_OVERRIDE, } + #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) @@ -427,6 +437,9 @@ extern int lockdep_is_held(const void *); #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) +#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ + struct lockdep_map __maybe_unused _name = {} + #endif /* !LOCKDEP */ enum xhlock_context_t { @@ -551,6 +564,7 @@ do { \ #define rwsem_release(l, i) lock_release(l, i) #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) +#define lock_map_acquire_try(l) lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, _THIS_IP_) diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index d22430840b53..59f4fb1626ea 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -33,6 +33,7 @@ enum lockdep_wait_type { enum lockdep_lock_type { LD_LOCK_NORMAL = 0, /* normal, catch all */ LD_LOCK_PERCPU, /* percpu */ + LD_LOCK_WAIT_OVERRIDE, /* annotation */ LD_LOCK_MAX, }; -- cgit v1.2.3 From 23a5b8bb022c1e071ca91b1a9c10f0ad6a0966e9 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 27 Apr 2023 00:33:36 -0500 Subject: x86/amd_nb: Add PCI ID for family 19h model 78h Commit 310e782a99c7 ("platform/x86/amd: pmc: Utilize SMN index 0 for driver probe") switched to using amd_smn_read() which relies upon the misc PCI ID used by DF function 3 being included in a table. The ID for model 78h is missing in that table, so amd_smn_read() doesn't work. Add the missing ID into amd_nb, restoring s2idle on this system. [ bp: Simplify commit message. ] Fixes: 310e782a99c7 ("platform/x86/amd: pmc: Utilize SMN index 0 for driver probe") Signed-off-by: Mario Limonciello Signed-off-by: Borislav Petkov (AMD) Acked-by: Bjorn Helgaas # pci_ids.h Acked-by: Guenter Roeck Link: https://lore.kernel.org/r/20230427053338.16653-2-mario.limonciello@amd.com --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 45c3d62e616d..95f33dadb2be 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -567,6 +567,7 @@ #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F3 0x166d #define PCI_DEVICE_ID_AMD_19H_M60H_DF_F3 0x14e3 #define PCI_DEVICE_ID_AMD_19H_M70H_DF_F3 0x14f3 +#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F3 0x12fb #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 -- cgit v1.2.3 From c00bc80462afc7963f449d7f21d896d2f629cacc Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 15 Apr 2023 20:23:34 +0200 Subject: power: supply: bq27xxx: Fix poll_interval handling and races on remove Before this patch bq27xxx_battery_teardown() was setting poll_interval = 0 to avoid bq27xxx_battery_update() requeuing the delayed_work item. There are 2 problems with this: 1. If the driver is unbound through sysfs, rather then the module being rmmod-ed, this changes poll_interval unexpectedly 2. This is racy, after it being set poll_interval could be changed before bq27xxx_battery_update() checks it through /sys/module/bq27xxx_battery/parameters/poll_interval Fix this by added a removed attribute to struct bq27xxx_device_info and using that instead of setting poll_interval to 0. There also is another poll_interval related race on remove(), writing /sys/module/bq27xxx_battery/parameters/poll_interval will requeue the delayed_work item for all devices on the bq27xxx_battery_devices list and the device being removed was only removed from that list after cancelling the delayed_work item. Fix this by moving the removal from the bq27xxx_battery_devices list to before cancelling the delayed_work item. Fixes: 8cfaaa811894 ("bq27x00_battery: Fix OOPS caused by unregistring bq27x00 driver") Signed-off-by: Hans de Goede Signed-off-by: Sebastian Reichel --- include/linux/power/bq27xxx_battery.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index a1aa68141d0b..e3322dad9c85 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -68,6 +68,7 @@ struct bq27xxx_device_info { struct bq27xxx_access_methods bus; struct bq27xxx_reg_cache cache; int charge_design_full; + bool removed; unsigned long last_update; struct delayed_work work; struct power_supply *bat; -- cgit v1.2.3 From 939a116142012926e25de0ea6b7e2f8d86a5f1b6 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 15 Apr 2023 20:23:37 +0200 Subject: power: supply: bq27xxx: Ensure power_supply_changed() is called on current sign changes On gauges where the current register is signed, there is no charging flag in the flags register. So only checking flags will not result in power_supply_changed() getting called when e.g. a charger is plugged in and the current sign changes from negative (discharging) to positive (charging). This causes userspace's notion of the status to lag until userspace does a poll. And when a power_supply_leds.c LED trigger is used to indicate charging status with a LED, this LED will lag until the capacity percentage changes, which may take many minutes (because the LED trigger only is updated on power_supply_changed() calls). Fix this by calling bq27xxx_battery_current_and_status() on gauges with a signed current register and checking if the status has changed. Fixes: 297a533b3e62 ("bq27x00: Cache battery registers") Signed-off-by: Hans de Goede Signed-off-by: Sebastian Reichel --- include/linux/power/bq27xxx_battery.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index e3322dad9c85..7c8d65414a70 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -2,6 +2,8 @@ #ifndef __LINUX_BQ27X00_BATTERY_H__ #define __LINUX_BQ27X00_BATTERY_H__ +#include + enum bq27xxx_chip { BQ27000 = 1, /* bq27000, bq27200 */ BQ27010, /* bq27010, bq27210 */ @@ -70,6 +72,7 @@ struct bq27xxx_device_info { int charge_design_full; bool removed; unsigned long last_update; + union power_supply_propval last_status; struct delayed_work work; struct power_supply *bat; struct list_head list; -- cgit v1.2.3 From 19b8766459c41c6f318f8a548cc1c66dffd18363 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 20 Apr 2023 16:06:03 +0100 Subject: firmware: arm_ffa: Fix FFA device names for logical partitions Each physical partition can provide multiple services each with UUID. Each such service can be presented as logical partition with a unique combination of VM ID and UUID. The number of distinct UUID in a system will be less than or equal to the number of logical partitions. However, currently it fails to register more than one logical partition or service within a physical partition as the device name contains only VM ID while both VM ID and UUID are maintained in the partition information. The kernel complains with the below message: | sysfs: cannot create duplicate filename '/devices/arm-ffa-8001' | CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc7 #8 | Hardware name: FVP Base RevC (DT) | Call trace: | dump_backtrace+0xf8/0x118 | show_stack+0x18/0x24 | dump_stack_lvl+0x50/0x68 | dump_stack+0x18/0x24 | sysfs_create_dir_ns+0xe0/0x13c | kobject_add_internal+0x220/0x3d4 | kobject_add+0x94/0x100 | device_add+0x144/0x5d8 | device_register+0x20/0x30 | ffa_device_register+0x88/0xd8 | ffa_setup_partitions+0x108/0x1b8 | ffa_init+0x2ec/0x3a4 | do_one_initcall+0xcc/0x240 | do_initcall_level+0x8c/0xac | do_initcalls+0x54/0x94 | do_basic_setup+0x1c/0x28 | kernel_init_freeable+0x100/0x16c | kernel_init+0x20/0x1a0 | ret_from_fork+0x10/0x20 | kobject_add_internal failed for arm-ffa-8001 with -EEXIST, don't try to | register things with the same name in the same directory. | arm_ffa arm-ffa: unable to register device arm-ffa-8001 err=-17 | ARM FF-A: ffa_setup_partitions: failed to register partition ID 0x8001 By virtue of being random enough to avoid collisions when generated in a distributed system, there is no way to compress UUID keys to the number of bits required to identify each. We can eliminate '-' in the name but it is not worth eliminating 4 bytes and add unnecessary logic for doing that. Also v1.0 doesn't provide the UUID of the partitions which makes it hard to use the same for the device name. So to keep it simple, let us alloc an ID using ida_alloc() and append the same to "arm-ffa" to make up a unique device name. Also stash the id value in ffa_dev to help freeing the ID later when the device is destroyed. Fixes: e781858488b9 ("firmware: arm_ffa: Add initial FFA bus support for device enumeration") Reported-by: Lucian Paul-Trifu Link: https://lore.kernel.org/r/20230419-ffa_fixes_6-4-v2-3-d9108e43a176@arm.com Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index c87aeecaa9b2..583fe3b49a49 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -96,6 +96,7 @@ /* FFA Bus/Device/Driver related */ struct ffa_device { + u32 id; int vm_id; bool mode_32bit; uuid_t uuid; -- cgit v1.2.3 From 162bd18eb55adf464a0fa2b4144b8d61c75ff7c2 Mon Sep 17 00:00:00 2001 From: Roy Novich Date: Sun, 7 May 2023 16:57:43 +0300 Subject: linux/dim: Do nothing if no time delta between samples Add return value for dim_calc_stats. This is an indication for the caller if curr_stats was assigned by the function. Avoid using curr_stats uninitialized over {rdma/net}_dim, when no time delta between samples. Coverity reported this potential use of an uninitialized variable. Fixes: 4c4dbb4a7363 ("net/mlx5e: Move dynamic interrupt coalescing code to include/linux") Fixes: cb3c7fd4f839 ("net/mlx5e: Support adaptive RX coalescing") Signed-off-by: Roy Novich Reviewed-by: Aya Levin Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Reviewed-by: Leon Romanovsky Reviewed-by: Michal Kubiak Link: https://lore.kernel.org/r/20230507135743.138993-1-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- include/linux/dim.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dim.h b/include/linux/dim.h index 6c5733981563..f343bc9aa2ec 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -236,8 +236,9 @@ void dim_park_tired(struct dim *dim); * * Calculate the delta between two samples (in data rates). * Takes into consideration counter wrap-around. + * Returned boolean indicates whether curr_stats are reliable. */ -void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, +bool dim_calc_stats(struct dim_sample *start, struct dim_sample *end, struct dim_stats *curr_stats); /** -- cgit v1.2.3 From 293007b033418c8c9d1b35d68dec49a500750fde Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 May 2023 12:13:27 -0600 Subject: io_uring: make io_uring_sqe_cmd() unconditionally available If CONFIG_IO_URING isn't set, then io_uring_sqe_cmd() is not defined. As the nvme driver uses this helper, it causes a compilation issue: drivers/nvme/host/ioctl.c: In function 'nvme_uring_cmd_io': drivers/nvme/host/ioctl.c:555:44: error: implicit declaration of function 'io_uring_sqe_cmd'; did you mean 'io_uring_free'? [-Werror=implicit-function-declaration] 555 | const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe); | ^~~~~~~~~~~~~~~~ | io_uring_free Fix it by just making io_uring_sqe_cmd() generally available - the types are known, and there's no reason to hide it under CONFIG_IO_URING. Fixes: fd9b8547bc5c ("io_uring: Pass whole sqe to commands") Reported-by: kernel test robot Reported-by: Chen-Yu Tsai Tested-by: Chen-Yu Tsai Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 3399d979ee1c..7fe31b2cd02f 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -36,6 +36,11 @@ struct io_uring_cmd { u8 pdu[32]; /* available inline for free use */ }; +static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) +{ + return sqe->cmd; +} + #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); @@ -66,11 +71,6 @@ static inline void io_uring_free(struct task_struct *tsk) if (tsk->io_uring) __io_uring_free(tsk); } - -static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) -{ - return sqe->cmd; -} #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) -- cgit v1.2.3 From 4063384ef762cc5946fc7a3f89879e76c6ec51e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 13:18:57 +0000 Subject: net: add vlan_get_protocol_and_depth() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before blamed commit, pskb_may_pull() was used instead of skb_header_pointer() in __vlan_get_protocol() and friends. Few callers depended on skb->head being populated with MAC header, syzbot caught one of them (skb_mac_gso_segment()) Add vlan_get_protocol_and_depth() to make the intent clearer and use it where sensible. This is a more generic fix than commit e9d3f80935b6 ("net/af_packet: make sure to pull mac header") which was dealing with a similar issue. kernel BUG at include/linux/skbuff.h:2655 ! invalid opcode: 0000 [#1] SMP KASAN CPU: 0 PID: 1441 Comm: syz-executor199 Not tainted 6.1.24-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/14/2023 RIP: 0010:__skb_pull include/linux/skbuff.h:2655 [inline] RIP: 0010:skb_mac_gso_segment+0x68f/0x6a0 net/core/gro.c:136 Code: fd 48 8b 5c 24 10 44 89 6b 70 48 c7 c7 c0 ae 0d 86 44 89 e6 e8 a1 91 d0 00 48 c7 c7 00 af 0d 86 48 89 de 31 d2 e8 d1 4a e9 ff <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 RSP: 0018:ffffc90001bd7520 EFLAGS: 00010286 RAX: ffffffff8469736a RBX: ffff88810f31dac0 RCX: ffff888115a18b00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffc90001bd75e8 R08: ffffffff84697183 R09: fffff5200037adf9 R10: 0000000000000000 R11: dffffc0000000001 R12: 0000000000000012 R13: 000000000000fee5 R14: 0000000000005865 R15: 000000000000fed7 FS: 000055555633f300(0000) GS:ffff8881f6a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000000 CR3: 0000000116fea000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [] __skb_gso_segment+0x32d/0x4c0 net/core/dev.c:3419 [] skb_gso_segment include/linux/netdevice.h:4819 [inline] [] validate_xmit_skb+0x3aa/0xee0 net/core/dev.c:3725 [] __dev_queue_xmit+0x1332/0x3300 net/core/dev.c:4313 [] dev_queue_xmit+0x17/0x20 include/linux/netdevice.h:3029 [] packet_snd net/packet/af_packet.c:3111 [inline] [] packet_sendmsg+0x49d2/0x6470 net/packet/af_packet.c:3142 [] sock_sendmsg_nosec net/socket.c:716 [inline] [] sock_sendmsg net/socket.c:736 [inline] [] __sys_sendto+0x472/0x5f0 net/socket.c:2139 [] __do_sys_sendto net/socket.c:2151 [inline] [] __se_sys_sendto net/socket.c:2147 [inline] [] __x64_sys_sendto+0xe5/0x100 net/socket.c:2147 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x2f/0x50 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 469aceddfa3e ("vlan: consolidate VLAN parsing code and limit max parsing depth") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Toke Høiland-Jørgensen Cc: Willem de Bruijn Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 0f40f379d75c..6ba71957851e 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -637,6 +637,23 @@ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) return __vlan_get_protocol(skb, skb->protocol, NULL); } +/* This version of __vlan_get_protocol() also pulls mac header in skb->head */ +static inline __be16 vlan_get_protocol_and_depth(struct sk_buff *skb, + __be16 type, int *depth) +{ + int maclen; + + type = __vlan_get_protocol(skb, type, &maclen); + + if (type) { + if (!pskb_may_pull(skb, maclen)) + type = 0; + else if (depth) + *depth = maclen; + } + return type; +} + /* A getter for the SKB protocol field which will handle VLAN tags consistently * whether VLAN acceleration is enabled or not. */ -- cgit v1.2.3 From 8432a15cd810c99db464b7e7858d559f3e1920e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B3=20=C3=81gila=20Bitsch?= Date: Sat, 22 Apr 2023 18:05:32 +0200 Subject: usb: gadget: drop superfluous ':' in doc string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There was one superfluous ':' that kernel-doc complained about. Reported-by: Randy Dunlap Closes: https://lore.kernel.org/all/c718a490-028d-2682-9ad7-8256d16504bf@infradead.org/ Fixes: fb6211f1584a ("usb: gadget: add doc to struct usb_composite_dev") Signed-off-by: Jó Ágila Bitsch Reviewed-by: Randy Dunlap Link: https://lore.kernel.org/r/ZEQFzMntIrwvZl4+@jo-einhundert Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/composite.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index a2448e98854f..07531c4f4350 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -443,7 +443,7 @@ static inline struct usb_composite_driver *to_cdriver( * @bcd_webusb_version: 0x0100 by default, WebUSB specification version * @b_webusb_vendor_code: 0x0 by default, vendor code for WebUSB * @landing_page: empty by default, landing page to announce in WebUSB - * @use_webusb:: false by default, interested gadgets set it + * @use_webusb: false by default, interested gadgets set it * @os_desc_config: the configuration to be used with OS descriptors * @setup_pending: true when setup request is queued but not completed * @os_desc_pending: true when os_desc request is queued but not completed -- cgit v1.2.3 From 948f072ada23e0a504c5e4d7d71d4c83bd0785ec Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 May 2023 09:42:47 +1000 Subject: SUNRPC: always free ctxt when freeing deferred request Since the ->xprt_ctxt pointer was added to svc_deferred_req, it has not been sufficient to use kfree() to free a deferred request. We may need to free the ctxt as well. As freeing the ctxt is all that ->xpo_release_rqst() does, we repurpose it to explicit do that even when the ctxt is not stored in an rqst. So we now have ->xpo_release_ctxt() which is given an xprt and a ctxt, which may have been taken either from an rqst or from a dreq. The caller is now responsible for clearing that pointer after the call to ->xpo_release_ctxt. We also clear dr->xprt_ctxt when the ctxt is moved into a new rqst when revisiting a deferred request. This ensures there is only one pointer to the ctxt, so the risk of double freeing in future is reduced. The new code in svc_xprt_release which releases both the ctxt and any rq_deferred depends on this. Fixes: 773f91b2cf3f ("SUNRPC: Fix NFSD's request deferral on RDMA transports") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 +- include/linux/sunrpc/svc_xprt.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 24aa159d29a7..fbc4bd423b35 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -176,7 +176,7 @@ extern struct svc_rdma_recv_ctxt * extern void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt); extern void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma); -extern void svc_rdma_release_rqst(struct svc_rqst *rqstp); +extern void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *ctxt); extern int svc_rdma_recvfrom(struct svc_rqst *); /* svc_rdma_rw.c */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 867479204840..a6b12631db21 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -23,7 +23,7 @@ struct svc_xprt_ops { int (*xpo_sendto)(struct svc_rqst *); int (*xpo_result_payload)(struct svc_rqst *, unsigned int, unsigned int); - void (*xpo_release_rqst)(struct svc_rqst *); + void (*xpo_release_ctxt)(struct svc_xprt *xprt, void *ctxt); void (*xpo_detach)(struct svc_xprt *); void (*xpo_free)(struct svc_xprt *); void (*xpo_kill_temp_xprt)(struct svc_xprt *); -- cgit v1.2.3 From 99d46450625590d410f86fe4660a5eff7d3b8343 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Wed, 26 Apr 2023 20:29:28 +0300 Subject: tpm: Prevent hwrng from activating during resume Set TPM_CHIP_FLAG_SUSPENDED in tpm_pm_suspend() and reset in tpm_pm_resume(). While the flag is set, tpm_hwrng() gives back zero bytes. This prevents hwrng from racing during resume. Cc: stable@vger.kernel.org Fixes: 6e592a065d51 ("tpm: Move Linux RNG connection to hwrng") Reviewed-by: Jerry Snitselaar Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 77693389c3f9..6a1e8f157255 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -282,6 +282,7 @@ enum tpm_chip_flags { TPM_CHIP_FLAG_ALWAYS_POWERED = BIT(5), TPM_CHIP_FLAG_FIRMWARE_POWER_MANAGED = BIT(6), TPM_CHIP_FLAG_FIRMWARE_UPGRADE = BIT(7), + TPM_CHIP_FLAG_SUSPENDED = BIT(8), }; #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev) -- cgit v1.2.3 From f15afbd34d8fadbd375f1212e97837e32bc170cc Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Mon, 24 Apr 2023 13:18:35 +0800 Subject: fs: fix undefined behavior in bit shift for SB_NOUSER Shifting signed 32-bit value by 31 bits is undefined, so changing significant bit to unsigned. It was spotted by UBSAN. So let's just fix this by using the BIT() helper for all SB_* flags. Fixes: e462ec50cb5f ("VFS: Differentiate mount flags (MS_*) from internal superblock flags") Signed-off-by: Hao Ge Message-Id: <20230424051835.374204-1-gehao@kylinos.cn> [brauner@kernel.org: use BIT() for all SB_* flags] Signed-off-by: Christian Brauner --- include/linux/fs.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 21a981680856..133f0640fb24 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1076,29 +1076,29 @@ extern int send_sigurg(struct fown_struct *fown); * sb->s_flags. Note that these mirror the equivalent MS_* flags where * represented in both. */ -#define SB_RDONLY 1 /* Mount read-only */ -#define SB_NOSUID 2 /* Ignore suid and sgid bits */ -#define SB_NODEV 4 /* Disallow access to device special files */ -#define SB_NOEXEC 8 /* Disallow program execution */ -#define SB_SYNCHRONOUS 16 /* Writes are synced at once */ -#define SB_MANDLOCK 64 /* Allow mandatory locks on an FS */ -#define SB_DIRSYNC 128 /* Directory modifications are synchronous */ -#define SB_NOATIME 1024 /* Do not update access times. */ -#define SB_NODIRATIME 2048 /* Do not update directory access times */ -#define SB_SILENT 32768 -#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */ -#define SB_INLINECRYPT (1<<17) /* Use blk-crypto for encrypted files */ -#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */ -#define SB_I_VERSION (1<<23) /* Update inode I_version field */ -#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ +#define SB_RDONLY BIT(0) /* Mount read-only */ +#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ +#define SB_NODEV BIT(2) /* Disallow access to device special files */ +#define SB_NOEXEC BIT(3) /* Disallow program execution */ +#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ +#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ +#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ +#define SB_NOATIME BIT(10) /* Do not update access times. */ +#define SB_NODIRATIME BIT(11) /* Do not update directory access times */ +#define SB_SILENT BIT(15) +#define SB_POSIXACL BIT(16) /* VFS does not apply the umask */ +#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ +#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ +#define SB_I_VERSION BIT(23) /* Update inode I_version field */ +#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ -#define SB_SUBMOUNT (1<<26) -#define SB_FORCE (1<<27) -#define SB_NOSEC (1<<28) -#define SB_BORN (1<<29) -#define SB_ACTIVE (1<<30) -#define SB_NOUSER (1<<31) +#define SB_SUBMOUNT BIT(26) +#define SB_FORCE BIT(27) +#define SB_NOSEC BIT(28) +#define SB_BORN BIT(29) +#define SB_ACTIVE BIT(30) +#define SB_NOUSER BIT(31) /* These flags relate to encoding and casefolding */ #define SB_ENC_STRICT_MODE_FL (1 << 0) -- cgit v1.2.3 From a18ef64fe1e4558b14a6e0ca9fbe8264475b7013 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 14:47:12 +0200 Subject: tracing: make ftrace_likely_update() declaration visible This function is only used when CONFIG_TRACE_BRANCH_PROFILING is set and DISABLE_BRANCH_PROFILING is not set, and the declaration is hidden behind this combination of tests. But that causes a warning when building with CONFIG_TRACING_BRANCHES, since that sets DISABLE_BRANCH_PROFILING for the tracing code, and the declaration is thus hidden: kernel/trace/trace_branch.c:205:6: error: no previous prototype for 'ftrace_likely_update' [-Werror=missing-prototypes] Move the declaration out of the #ifdef to avoid the warning. Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 947a60b801db..d7779a18b24f 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -12,11 +12,10 @@ * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code * to disable branch tracing on a per file basis. */ -#if defined(CONFIG_TRACE_BRANCH_PROFILING) \ - && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect, int is_constant); - +#if defined(CONFIG_TRACE_BRANCH_PROFILING) \ + && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) #define likely_notrace(x) __builtin_expect(!!(x), 1) #define unlikely_notrace(x) __builtin_expect(!!(x), 0) -- cgit v1.2.3 From 26e239b37ebdfd189a2e6f94d3407e70313348bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joan=20Bruguera=20Mic=C3=B3?= Date: Wed, 3 May 2023 01:32:32 +0000 Subject: mm: shrinkers: fix race condition on debugfs cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When something registers and unregisters many shrinkers, such as: for x in $(seq 10000); do unshare -Ui true; done Sometimes the following error is printed to the kernel log: debugfs: Directory '...' with parent 'shrinker' already present! This occurs since commit badc28d4924b ("mm: shrinkers: fix deadlock in shrinker debugfs") / v6.2: Since the call to `debugfs_remove_recursive` was moved outside the `shrinker_rwsem`/`shrinker_mutex` lock, but the call to `ida_free` stayed inside, a newly registered shrinker can be re-assigned that ID and attempt to create the debugfs directory before the directory from the previous shrinker has been removed. The locking changes in commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless") made the race condition more likely, though it existed before then. Commit badc28d4924b ("mm: shrinkers: fix deadlock in shrinker debugfs") could be reverted since the issue is addressed should no longer occur since the count and scan operations are lockless since commit 20cd1892fcc3 ("mm: shrinkers: make count and scan in shrinker debugfs lockless"). However, since this is a contended lock, prefer instead moving `ida_free` outside the lock to avoid the race. Link: https://lkml.kernel.org/r/20230503013232.299211-1-joanbrugueram@gmail.com Fixes: badc28d4924b ("mm: shrinkers: fix deadlock in shrinker debugfs") Signed-off-by: Joan Bruguera Micó Cc: Qi Zheng Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 7bde8e1c228a..224293b2dd06 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -107,7 +107,10 @@ extern void synchronize_shrinkers(void); #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); -extern struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker); +extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, + int *debugfs_id); +extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, + int debugfs_id); extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); #else /* CONFIG_SHRINKER_DEBUG */ @@ -115,10 +118,16 @@ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } -static inline struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker) +static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, + int *debugfs_id) { + *debugfs_id = -1; return NULL; } +static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, + int debugfs_id) +{ +} static inline __printf(2, 3) int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) { -- cgit v1.2.3 From 2e9f8ab68f42b059e80db71266c1675c07c664bd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 May 2023 21:45:36 +0200 Subject: mdio_bus: unhide mdio_bus_init prototype mdio_bus_init() is either used as a local module_init() entry, or it gets called in phy_device.c. In the former case, there is no declaration, which causes a warning: drivers/net/phy/mdio_bus.c:1371:12: error: no previous prototype for 'mdio_bus_init' [-Werror=missing-prototypes] Remove the #ifdef around the declaration to avoid the warning.. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230516194625.549249-4-arnd@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index c5a0dc829714..6478838405a0 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1900,10 +1900,8 @@ void phy_package_leave(struct phy_device *phydev); int devm_phy_package_join(struct device *dev, struct phy_device *phydev, int addr, size_t priv_size); -#if IS_ENABLED(CONFIG_PHYLIB) int __init mdio_bus_init(void); void mdio_bus_exit(void); -#endif int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); int phy_ethtool_get_sset_count(struct phy_device *phydev); -- cgit v1.2.3 From 14c4be92ebb3e36e392aa9dd8f314038a9f96f3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 May 2023 18:50:38 -0700 Subject: tls: rx: strp: force mixed decrypted records into copy mode If a record is partially decrypted we'll have to CoW it, anyway, so go into copy mode and allocate a writable skb right away. This will make subsequent fix simpler because we won't have to teach tls_strp_msg_make_copy() how to copy skbs while preserving decrypt status. Tested-by: Shai Amiram Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/skbuff.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 738776ab8838..0b40417457cd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1587,6 +1587,16 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) to->l4_hash = from->l4_hash; }; +static inline int skb_cmp_decrypted(const struct sk_buff *skb1, + const struct sk_buff *skb2) +{ +#ifdef CONFIG_TLS_DEVICE + return skb2->decrypted - skb1->decrypted; +#else + return 0; +#endif +} + static inline void skb_copy_decrypted(struct sk_buff *to, const struct sk_buff *from) { -- cgit v1.2.3 From ddaf098ea779b3c1302c7843f6ff01e89b1fd380 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 May 2023 21:20:14 +0200 Subject: driver core: class: properly reference count class_dev_iter() When class_dev_iter is initialized, the reference count for the subsys private structure is incremented, but never decremented, causing a memory leak over time. To resolve this, save off a pointer to the internal structure into the class_dev_iter structure and then when the iterator is finished, drop the reference count. Reported-and-tested-by: syzbot+e7afd76ad060fa0d2605@syzkaller.appspotmail.com Fixes: 7b884b7f24b4 ("driver core: class.c: convert to only use class_to_subsys") Reported-by: Mirsad Goran Todorovac Cc: Alan Stern Acked-by: Rafael J. Wysocki Tested-by: Mirsad Goran Todorovac Link: https://lore.kernel.org/r/2023051610-stove-condense-9a77@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 9deeaeb457bb..abf3d3bfb6fe 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -74,6 +74,7 @@ struct class { struct class_dev_iter { struct klist_iter ki; const struct device_type *type; + struct subsys_private *sp; }; int __must_check class_register(const struct class *class); -- cgit v1.2.3 From ae9b15fbe63447bc1d3bba3769f409d17ca6fdf6 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 17 May 2023 14:30:10 +0000 Subject: net: fix stack overflow when LRO is disabled for virtual interfaces When the virtual interface's feature is updated, it synchronizes the updated feature for its own lower interface. This propagation logic should be worked as the iteration, not recursively. But it works recursively due to the netdev notification unexpectedly. This problem occurs when it disables LRO only for the team and bonding interface type. team0 | +------+------+-----+-----+ | | | | | team1 team2 team3 ... team200 If team0's LRO feature is updated, it generates the NETDEV_FEAT_CHANGE event to its own lower interfaces(team1 ~ team200). It is worked by netdev_sync_lower_features(). So, the NETDEV_FEAT_CHANGE notification logic of each lower interface work iteratively. But generated NETDEV_FEAT_CHANGE event is also sent to the upper interface too. upper interface(team0) generates the NETDEV_FEAT_CHANGE event for its own lower interfaces again. lower and upper interfaces receive this event and generate this event again and again. So, the stack overflow occurs. But it is not the infinite loop issue. Because the netdev_sync_lower_features() updates features before generating the NETDEV_FEAT_CHANGE event. Already synchronized lower interfaces skip notification logic. So, it is just the problem that iteration logic is changed to the recursive unexpectedly due to the notification mechanism. Reproducer: ip link add team0 type team ethtool -K team0 lro on for i in {1..200} do ip link add team$i master team0 type team ethtool -K team$i lro on done ethtool -K team0 lro off In order to fix it, the notifier_ctx member of bonding/team is introduced. Reported-by: syzbot+60748c96cf5c6df8e581@syzkaller.appspotmail.com Fixes: fd867d51f889 ("net/core: generic support for disabling netdev features down stack") Signed-off-by: Taehee Yoo Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230517143010.3596250-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/if_team.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index fc985e5c739d..8de6b6e67829 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -208,6 +208,7 @@ struct team { bool queue_override_enabled; struct list_head *qom_lists; /* array of queue override mapping lists */ bool port_mtu_change_allowed; + bool notifier_ctx; struct { unsigned int count; unsigned int interval; /* in ms */ -- cgit v1.2.3 From e3afec91aad23c52dfe426c7d7128e4839c3eed8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 May 2023 11:00:10 +0200 Subject: block: remove NFL4_UFLG_MASK The NFL4_UFLG_MASK define slipped in in commit 9208d4149758 ("block: add a ->get_unique_id method") and should never have been added, as NFSD as the only user of it already has it's copy. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230520090010.527046-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b441e633f4dd..c0ffe203a602 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1376,8 +1376,6 @@ enum blk_unique_id { BLK_UID_NAA = 3, }; -#define NFL4_UFLG_MASK 0x0000003F - struct block_device_operations { void (*submit_bio)(struct bio *bio); int (*poll_bio)(struct bio *bio, struct io_comp_batch *iob, -- cgit v1.2.3 From c7dd225bc224726c22db08e680bf787f60ebdee3 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 2 Apr 2023 17:14:10 +0300 Subject: net/mlx5: DR, Check force-loopback RC QP capability independently from RoCE SW Steering uses RC QP for writing STEs to ICM. This writingis done in LB (loopback), and FL (force-loopback) QP is preferred for performance. FL is available when RoCE is enabled or disabled based on RoCE caps. This patch adds reading of FL capability from HCA caps in addition to the existing reading from RoCE caps, thus fixing the case where we didn't have loopback enabled when RoCE was disabled. Fixes: 7304d603a57a ("net/mlx5: DR, Add support for force-loopback QP") Signed-off-by: Itamar Gozlan Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index dc5e2cb302a5..b89778d0d326 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1705,7 +1705,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 rc[0x1]; u8 uar_4k[0x1]; - u8 reserved_at_241[0x9]; + u8 reserved_at_241[0x7]; + u8 fl_rc_qp_when_roce_disabled[0x1]; + u8 regexp_params[0x1]; u8 uar_sz[0x6]; u8 port_selection_cap[0x1]; u8 reserved_at_248[0x1]; -- cgit v1.2.3 From 29173d07f79883ac94f5570294f98af3d4287382 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:06 -0700 Subject: bpf, sockmap: Convert schedule_work into delayed_work Sk_buffs are fed into sockmap verdict programs either from a strparser (when the user might want to decide how framing of skb is done by attaching another parser program) or directly through tcp_read_sock. The tcp_read_sock is the preferred method for performance when the BPF logic is a stream parser. The flow for Cilium's common use case with a stream parser is, tcp_read_sock() sk_psock_verdict_recv ret = bpf_prog_run_pin_on_cpu() sk_psock_verdict_apply(sock, skb, ret) // if system is under memory pressure or app is slow we may // need to queue skb. Do this queuing through ingress_skb and // then kick timer to wake up handler skb_queue_tail(ingress_skb, skb) schedule_work(work); The work queue is wired up to sk_psock_backlog(). This will then walk the ingress_skb skb list that holds our sk_buffs that could not be handled, but should be OK to run at some later point. However, its possible that the workqueue doing this work still hits an error when sending the skb. When this happens the skbuff is requeued on a temporary 'state' struct kept with the workqueue. This is necessary because its possible to partially send an skbuff before hitting an error and we need to know how and where to restart when the workqueue runs next. Now for the trouble, we don't rekick the workqueue. This can cause a stall where the skbuff we just cached on the state variable might never be sent. This happens when its the last packet in a flow and no further packets come along that would cause the system to kick the workqueue from that side. To fix we could do simple schedule_work(), but while under memory pressure it makes sense to back off some instead of continue to retry repeatedly. So instead to fix convert schedule_work to schedule_delayed_work and add backoff logic to reschedule from backlog queue on errors. Its not obvious though what a good backoff is so use '1'. To test we observed some flakes whil running NGINX compliance test with sockmap we attributed these failed test to this bug and subsequent issue. >From on list discussion. This commit bec217197b41("skmsg: Schedule psock work if the cached skb exists on the psock") was intended to address similar race, but had a couple cases it missed. Most obvious it only accounted for receiving traffic on the local socket so if redirecting into another socket we could still get an sk_buff stuck here. Next it missed the case where copied=0 in the recv() handler and then we wouldn't kick the scheduler. Also its sub-optimal to require userspace to kick the internal mechanisms of sockmap to wake it up and copy data to user. It results in an extra syscall and requires the app to actual handle the EAGAIN correctly. Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: William Findlay Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-3-john.fastabend@gmail.com --- include/linux/skmsg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 84f787416a54..904ff9a32ad6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -105,7 +105,7 @@ struct sk_psock { struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; - struct work_struct work; + struct delayed_work work; struct rcu_work rwork; }; -- cgit v1.2.3 From 405df89dd52cbcd69a3cd7d9a10d64de38f854b2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:08 -0700 Subject: bpf, sockmap: Improved check for empty queue We noticed some rare sk_buffs were stepping past the queue when system was under memory pressure. The general theory is to skip enqueueing sk_buffs when its not necessary which is the normal case with a system that is properly provisioned for the task, no memory pressure and enough cpu assigned. But, if we can't allocate memory due to an ENOMEM error when enqueueing the sk_buff into the sockmap receive queue we push it onto a delayed workqueue to retry later. When a new sk_buff is received we then check if that queue is empty. However, there is a problem with simply checking the queue length. When a sk_buff is being processed from the ingress queue but not yet on the sockmap msg receive queue its possible to also recv a sk_buff through normal path. It will check the ingress queue which is zero and then skip ahead of the pkt being processed. Previously we used sock lock from both contexts which made the problem harder to hit, but not impossible. To fix instead of popping the skb from the queue entirely we peek the skb from the queue and do the copy there. This ensures checks to the queue length are non-zero while skb is being processed. Then finally when the entire skb has been copied to user space queue or another socket we pop it off the queue. This way the queue length check allows bypassing the queue only after the list has been completely processed. To reproduce issue we run NGINX compliance test with sockmap running and observe some flakes in our testing that we attributed to this issue. Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Suggested-by: Jakub Sitnicki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: William Findlay Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-5-john.fastabend@gmail.com --- include/linux/skmsg.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 904ff9a32ad6..054d7911bfc9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -71,7 +71,6 @@ struct sk_psock_link { }; struct sk_psock_work_state { - struct sk_buff *skb; u32 len; u32 off; }; -- cgit v1.2.3 From 335b4223466dd75f9f3ea4918187afbadd22e5c8 Mon Sep 17 00:00:00 2001 From: Maximilian Heyne Date: Wed, 3 May 2023 13:16:53 +0000 Subject: x86/pci/xen: populate MSI sysfs entries Commit bf5e758f02fc ("genirq/msi: Simplify sysfs handling") reworked the creation of sysfs entries for MSI IRQs. The creation used to be in msi_domain_alloc_irqs_descs_locked after calling ops->domain_alloc_irqs. Then it moved into __msi_domain_alloc_irqs which is an implementation of domain_alloc_irqs. However, Xen comes with the only other implementation of domain_alloc_irqs and hence doesn't run the sysfs population code anymore. Commit 6c796996ee70 ("x86/pci/xen: Fixup fallout from the PCI/MSI overhaul") set the flag MSI_FLAG_DEV_SYSFS for the xen msi_domain_info but that doesn't actually have an effect because Xen uses it's own domain_alloc_irqs implementation. Fix this by making use of the fallback functions for sysfs population. Fixes: bf5e758f02fc ("genirq/msi: Simplify sysfs handling") Signed-off-by: Maximilian Heyne Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20230503131656.15928-1-mheyne@amazon.de Signed-off-by: Juergen Gross --- include/linux/msi.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index cdb14a1ef268..a50ea79522f8 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -383,6 +383,13 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); void arch_teardown_msi_irq(unsigned int irq); int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void arch_teardown_msi_irqs(struct pci_dev *dev); +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ + +/* + * Xen uses non-default msi_domain_ops and hence needs a way to populate sysfs + * entries of MSI IRQs. + */ +#if defined(CONFIG_PCI_XEN) || defined(CONFIG_PCI_MSI_ARCH_FALLBACKS) #ifdef CONFIG_SYSFS int msi_device_populate_sysfs(struct device *dev); void msi_device_destroy_sysfs(struct device *dev); @@ -390,7 +397,7 @@ void msi_device_destroy_sysfs(struct device *dev); static inline int msi_device_populate_sysfs(struct device *dev) { return 0; } static inline void msi_device_destroy_sysfs(struct device *dev) { } #endif /* !CONFIG_SYSFS */ -#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ +#endif /* CONFIG_PCI_XEN || CONFIG_PCI_MSI_ARCH_FALLBACKS */ /* * The restore hook is still available even for fully irq domain based -- cgit v1.2.3 From 9828ed3f695a138f7add89fa2a186ababceb8006 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 25 May 2023 09:32:25 -0700 Subject: module: error out early on concurrent load of the same module file It turns out that udev under certain circumstances will concurrently try to load the same modules over-and-over excessively. This isn't a kernel bug, but it ends up affecting the kernel, to the point that under certain circumstances we can fail to boot, because the kernel uses a lot of memory to read all the module data all at once. Note that it isn't a memory leak, it's just basically a thundering herd problem happening at bootup with a lot of CPUs, with the worst cases then being pretty bad. Admittedly the worst situations are somewhat contrived: lots and lots of CPUs, not a lot of memory, and KASAN enabled to make it all slower and as such (unintentionally) exacerbate the problem. Luis explains: [1] "My best assessment of the situation is that each CPU in udev ends up triggering a load of duplicate set of modules, not just one, but *a lot*. Not sure what heuristics udev uses to load a set of modules per CPU." Petr Pavlu chimes in: [2] "My understanding is that udev workers are forked. An initial kmod context is created by the main udevd process but no sharing happens after the fork. It means that the mentioned memory pool logic doesn't really kick in. Multiple parallel load requests come from multiple udev workers, for instance, each handling an udev event for one CPU device and making the exactly same requests as all others are doing at the same time. The optimization idea would be to recognize these duplicate requests at the udevd/kmod level and converge them" Note that module loading has tried to mitigate this issue before, see for example commit 064f4536d139 ("module: avoid allocation if module is already present and ready"), which has a few ASCII graphs on memory use due to this same issue. However, while that noticed that the module was already loaded, and exited with an error early before spending any more time on setting up the module, it didn't handle the case of multiple concurrent module loads all being active - but not complete - at the same time. Yes, one of them will eventually win the race and finalize its copy, and the others will then notice that the module already exists and error out, but while this all happens, we have tons of unnecessary concurrent work being done. Again, the real fix is for udev to not do that (maybe it should use threads instead of fork, and have actual shared data structures and not cause duplicate work). That real fix is apparently not trivial. But it turns out that the kernel already has a pretty good model for dealing with concurrent access to the same file: the i_writecount of the inode. In fact, the module loading already indirectly uses 'i_writecount' , because 'kernel_file_read()' will in fact do ret = deny_write_access(file); if (ret) return ret; ... allow_write_access(file); around the read of the file data. We do not allow concurrent writes to the file, and return -ETXTBUSY if the file was open for writing at the same time as the module data is loaded from it. And the solution to the reader concurrency problem is to simply extend this "no concurrent writers" logic to simply be "exclusive access". Note that "exclusive" in this context isn't really some absolute thing: it's only exclusion from writers and from other "special readers" that do this writer denial. So we simply introduce a variation of that "deny_write_access()" logic that not only denies write access, but also requires that this is the _only_ such access that denies write access. Which means that you can't start loading a module that is already being loaded as a module by somebody else, or you will get the same -ETXTBSY error that you would get if there were writers around. [ It also means that you can't try to load a currently executing executable as a module, for the same reason: executables do that same "deny_write_access()" thing, and that's obviously where the whole ETXTBSY logic traditionally came from. This is not a problem for kernel modules, since the set of normal executable files and kernel module files is entirely disjoint. ] This new function is called "exclusive_deny_write_access()", and the implementation is trivial, in that it's just an atomic decrement of i_writecount if it was 0 before. To use that new exclusivity check, all we then do is wrap the module loading with that exclusive_deny_write_access()() / allow_write_access() pair. The actual patch is a bit bigger than that, because we want to surround not just the "load file data" part, but the whole module setup, to get maximum exclusion. So this ends up splitting up "finit_module()" into a few helper functions to make it all very clear and legible. In Luis' test-case (bringing up 255 vcpu's in a virtual machine [3]), the "wasted vmalloc" space (ie module data read into a vmalloc'ed area in order to be loaded as a module, but then discarded because somebody else loaded the same module instead) dropped from 1.8GiB to 474kB. Yes, that's gigabytes to kilobytes. It doesn't drop completely to zero, because even with this change, you can still end up having completely serial pointless module loads, where one udev process has loaded a module fully (and thus the kernel has released that exclusive lock on the module file), and then another udev process tries to load the same module again. So while we cannot fully get rid of the fundamental bug in user space, we _can_ get rid of the excessive concurrent thundering herd effect. A couple of final side notes on this all: - This tweak only affects the "finit_module()" system call, which gives the kernel a file descriptor with the module data. You can also just feed the module data as raw data from user space with "init_module()" (note the lack of 'f' at the beginning), and obviously for that case we do _not_ have any "exclusive read" logic. So if you absolutely want to do things wrong in user space, and try to load the same module multiple times, and error out only later when the kernel ends up saying "you can't load the same module name twice", you can still do that. And in fact, some distros will do exactly that, because they will uncompress the kernel module data in user space before feeding it to the kernel (mainly because they haven't started using the new kernel side decompression yet). So this is not some absolute "you can't do concurrent loads of the same module". It's literally just a very simple heuristic that will catch it early in case you try to load the exact same module file at the same time, and in that case avoid a potentially nasty situation. - There is another user of "deny_write_access()": the verity code that enables fs-verity on a file (the FS_IOC_ENABLE_VERITY ioctl). If you use fs-verity and you care about verifying the kernel modules (which does make sense), you should do it *before* loading said kernel module. That may sound obvious, but now the implementation basically requires it. Because if you try to do it concurrently, the kernel may refuse to load the module file that is being set up by the fs-verity code. - This all will obviously mean that if you insist on loading the same module in parallel, only one module load will succeed, and the others will return with an error. That was true before too, but what is different is that the -ETXTBSY error can be returned *before* the success case of another process fully loading and instantiating the module. Again, that might sound obvious, and it is indeed the whole point of the whole change: we are much quicker to notice the whole "you're already in the process of loading this module". So it's very much intentional, but it does mean that if you just spray the kernel with "finit_module()", and expect that the module is immediately loaded afterwards without checking the return value, you are doing something horribly horribly wrong. I'd like to say that that would never happen, but the whole _reason_ for this commit is that udev is currently doing something horribly horribly wrong, so ... Link: https://lore.kernel.org/all/ZEGopJ8VAYnE7LQ2@bombadil.infradead.org/ [1] Link: https://lore.kernel.org/all/23bd0ce6-ef78-1cd8-1f21-0e706a00424a@suse.com/ [2] Link: https://lore.kernel.org/lkml/ZG%2Fa+nrt4%2FAAUi5z@bombadil.infradead.org/ [3] Cc: Greg Kroah-Hartman Cc: Lucas De Marchi Cc: Petr Pavlu Tested-by: Luis Chamberlain Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 133f0640fb24..86b50271b4f7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2566,6 +2566,12 @@ static inline int deny_write_access(struct file *file) struct inode *inode = file_inode(file); return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; } +static inline int exclusive_deny_write_access(struct file *file) +{ + int old = 0; + struct inode *inode = file_inode(file); + return atomic_try_cmpxchg(&inode->i_writecount, &old, -1) ? 0 : -ETXTBSY; +} static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); -- cgit v1.2.3