From 1f1dc442c57ec61c08d21d47e4c5b4f16446fe00 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 22 Mar 2024 14:10:26 -0700 Subject: mshyperv: Introduce hv_numa_node_to_pxm_info() Factor out logic for converting numa node to hv_proximity_domain_info into a helper function. Change hv_proximity_domain_info to a struct to improve readability. While at it, rename hv_add_logical_processor_* structs to the correct hv_input_/hv_output_ prefix, and remove the flags field which is not present in the ABI. Signed-off-by: Nuno Das Neves Reviewed-by: Wei Liu Link: https://lore.kernel.org/r/1711141826-9458-1-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1711141826-9458-1-git-send-email-nunodasneves@linux.microsoft.com> --- include/asm-generic/hyperv-tlfs.h | 19 +++++++------------ include/asm-generic/mshyperv.h | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index fdac4a1714ec..69f3f68dc249 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -512,13 +512,9 @@ struct hv_proximity_domain_flags { u32 proximity_info_valid : 1; } __packed; -/* Not a union in windows but useful for zeroing */ -union hv_proximity_domain_info { - struct { - u32 domain_id; - struct hv_proximity_domain_flags flags; - }; - u64 as_uint64; +struct hv_proximity_domain_info { + u32 domain_id; + struct hv_proximity_domain_flags flags; } __packed; struct hv_lp_startup_status { @@ -532,14 +528,13 @@ struct hv_lp_startup_status { } __packed; /* HvAddLogicalProcessor hypercall */ -struct hv_add_logical_processor_in { +struct hv_input_add_logical_processor { u32 lp_index; u32 apic_id; - union hv_proximity_domain_info proximity_domain_info; - u64 flags; + struct hv_proximity_domain_info proximity_domain_info; } __packed; -struct hv_add_logical_processor_out { +struct hv_output_add_logical_processor { struct hv_lp_startup_status startup_status; } __packed; @@ -560,7 +555,7 @@ struct hv_create_vp { u8 padding[3]; u8 subnode_type; u64 subnode_id; - union hv_proximity_domain_info proximity_domain_info; + struct hv_proximity_domain_info proximity_domain_info; u64 flags; } __packed; diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 430f0ae0dde2..cfb0b51a0998 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +68,19 @@ extern u64 hv_do_fast_hypercall8(u16 control, u64 input8); bool hv_isolation_type_snp(void); bool hv_isolation_type_tdx(void); +static inline struct hv_proximity_domain_info hv_numa_node_to_pxm_info(int node) +{ + struct hv_proximity_domain_info pxm_info = {}; + + if (node != NUMA_NO_NODE) { + pxm_info.domain_id = node_to_pxm(node); + pxm_info.flags.proximity_info_valid = 1; + pxm_info.flags.proximity_preferred = 1; + } + + return pxm_info; +} + /* Helper functions that provide a consistent pattern for checking Hyper-V hypercall status. */ static inline int hv_result(u64 status) { -- cgit v1.2.3 From 7f1dd39aedfccf60772328c5b88d56dbd39954c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 27 Mar 2024 08:33:10 +0100 Subject: clk: Provide !COMMON_CLK dummy for devm_clk_rate_exclusive_get() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be able to compile drivers using devm_clk_rate_exclusive_get() also on platforms without the common clk framework, add a dummy implementation that does the same as clk_rate_exclusive_get() in that case (i.e. nothing). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403270305.ydvX9xq1-lkp@intel.com/ Fixes: b0cde62e4c54 ("clk: Add a devm variant of clk_rate_exclusive_get()") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240327073310.520950-2-u.kleine-koenig@pengutronix.de Signed-off-by: Stephen Boyd --- include/linux/clk.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/clk.h b/include/linux/clk.h index 00623f4de5e1..0fa56d672532 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -286,6 +286,11 @@ static inline int clk_rate_exclusive_get(struct clk *clk) return 0; } +static inline int devm_clk_rate_exclusive_get(struct device *dev, struct clk *clk) +{ + return 0; +} + static inline void clk_rate_exclusive_put(struct clk *clk) {} #endif -- cgit v1.2.3 From 9c573cd313433f6c1f7236fe64b9b743500c1628 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 9 Mar 2024 12:24:48 -0800 Subject: randomize_kstack: Improve entropy diffusion The kstack_offset variable was really only ever using the low bits for kernel stack offset entropy. Add a ror32() to increase bit diffusion. Suggested-by: Arnd Bergmann Fixes: 39218ff4c625 ("stack: Optionally randomize kernel stack offset each syscall") Link: https://lore.kernel.org/r/20240309202445.work.165-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/randomize_kstack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index 5d868505a94e..6d92b68efbf6 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -80,7 +80,7 @@ DECLARE_PER_CPU(u32, kstack_offset); if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ &randomize_kstack_offset)) { \ u32 offset = raw_cpu_read(kstack_offset); \ - offset ^= (rand); \ + offset = ror32(offset, 5) ^ (rand); \ raw_cpu_write(kstack_offset, offset); \ } \ } while (0) -- cgit v1.2.3 From 592447f6cb3c20d606d6c5d8e6af68e99707b786 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 2 Apr 2024 13:27:01 +0000 Subject: memblock tests: fix undefined reference to `BIT' commit 772dd0342727 ("mm: enumerate all gfp flags") define gfp flags with the help of BIT, while gfp_types.h doesn't include header file for the definition. This through an error on building memblock tests. Let's include linux/bits.h to fix it. Signed-off-by: Wei Yang CC: Suren Baghdasaryan CC: Michal Hocko Link: https://lore.kernel.org/r/20240402132701.29744-4-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- include/linux/gfp_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 868c8fb1bbc1..13becafe41df 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -2,6 +2,8 @@ #ifndef __LINUX_GFP_TYPES_H #define __LINUX_GFP_TYPES_H +#include + /* The typedef is in types.h but we want the documentation here */ #if 0 /** -- cgit v1.2.3 From 07ed11afb68d94eadd4ffc082b97c2331307c5ea Mon Sep 17 00:00:00 2001 From: Alex Constantino Date: Thu, 4 Apr 2024 19:14:48 +0100 Subject: Revert "drm/qxl: simplify qxl_fence_wait" This reverts commit 5a838e5d5825c85556011478abde708251cc0776. Changes from commit 5a838e5d5825 ("drm/qxl: simplify qxl_fence_wait") would result in a '[TTM] Buffer eviction failed' exception whenever it reached a timeout. Due to a dependency to DMA_FENCE_WARN this also restores some code deleted by commit d72277b6c37d ("dma-buf: nuke DMA_FENCE_TRACE macros v2"). Fixes: 5a838e5d5825 ("drm/qxl: simplify qxl_fence_wait") Link: https://lore.kernel.org/regressions/ZTgydqRlK6WX_b29@eldamar.lan/ Reported-by: Timo Lindfors Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1054514 Signed-off-by: Alex Constantino Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20240404181448.1643-2-dreaming.about.electric.sheep@gmail.com --- include/linux/dma-fence.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index e06bad467f55..c3f9bb6602ba 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -682,4 +682,11 @@ static inline bool dma_fence_is_container(struct dma_fence *fence) return dma_fence_is_array(fence) || dma_fence_is_chain(fence); } +#define DMA_FENCE_WARN(f, fmt, args...) \ + do { \ + struct dma_fence *__ff = (f); \ + pr_warn("f %llu#%llu: " fmt, __ff->context, __ff->seqno,\ + ##args); \ + } while (0) + #endif /* __LINUX_DMA_FENCE_H */ -- cgit v1.2.3 From 38a15d0a50e0a43778561a5861403851f0b0194c Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Thu, 4 Apr 2024 09:57:40 +0200 Subject: u64_stats: fix u64_stats_init() for lockdep when used repeatedly in one file Fix bogus lockdep warnings if multiple u64_stats_sync variables are initialized in the same file. With CONFIG_LOCKDEP, seqcount_init() is a macro which declares: static struct lock_class_key __key; Since u64_stats_init() is a function (albeit an inline one), all calls within the same file end up using the same instance, effectively treating them all as a single lock-class. Fixes: 9464ca650008 ("net: make u64_stats_init() a function") Closes: https://lore.kernel.org/netdev/ea1567d9-ce66-45e6-8168-ac40a47d1821@roeck-us.net/ Signed-off-by: Petr Tesarik Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240404075740.30682-1-petr@tesarici.cz Signed-off-by: Jakub Kicinski --- include/linux/u64_stats_sync.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index ffe48e69b3f3..457879938fc1 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -135,10 +135,11 @@ static inline void u64_stats_inc(u64_stats_t *p) p->v++; } -static inline void u64_stats_init(struct u64_stats_sync *syncp) -{ - seqcount_init(&syncp->seq); -} +#define u64_stats_init(syncp) \ + do { \ + struct u64_stats_sync *__s = (syncp); \ + seqcount_init(&__s->seq); \ + } while (0) static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { -- cgit v1.2.3 From 6648e613226e18897231ab5e42ffc29e63fa3365 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 4 Apr 2024 10:10:01 +0800 Subject: bpf, skmsg: Fix NULL pointer dereference in sk_psock_skb_ingress_enqueue Fix NULL pointer data-races in sk_psock_skb_ingress_enqueue() which syzbot reported [1]. [1] BUG: KCSAN: data-race in sk_psock_drop / sk_psock_skb_ingress_enqueue write to 0xffff88814b3278b8 of 8 bytes by task 10724 on cpu 1: sk_psock_stop_verdict net/core/skmsg.c:1257 [inline] sk_psock_drop+0x13e/0x1f0 net/core/skmsg.c:843 sk_psock_put include/linux/skmsg.h:459 [inline] sock_map_close+0x1a7/0x260 net/core/sock_map.c:1648 unix_release+0x4b/0x80 net/unix/af_unix.c:1048 __sock_release net/socket.c:659 [inline] sock_close+0x68/0x150 net/socket.c:1421 __fput+0x2c1/0x660 fs/file_table.c:422 __fput_sync+0x44/0x60 fs/file_table.c:507 __do_sys_close fs/open.c:1556 [inline] __se_sys_close+0x101/0x1b0 fs/open.c:1541 __x64_sys_close+0x1f/0x30 fs/open.c:1541 do_syscall_64+0xd3/0x1d0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 read to 0xffff88814b3278b8 of 8 bytes by task 10713 on cpu 0: sk_psock_data_ready include/linux/skmsg.h:464 [inline] sk_psock_skb_ingress_enqueue+0x32d/0x390 net/core/skmsg.c:555 sk_psock_skb_ingress_self+0x185/0x1e0 net/core/skmsg.c:606 sk_psock_verdict_apply net/core/skmsg.c:1008 [inline] sk_psock_verdict_recv+0x3e4/0x4a0 net/core/skmsg.c:1202 unix_read_skb net/unix/af_unix.c:2546 [inline] unix_stream_read_skb+0x9e/0xf0 net/unix/af_unix.c:2682 sk_psock_verdict_data_ready+0x77/0x220 net/core/skmsg.c:1223 unix_stream_sendmsg+0x527/0x860 net/unix/af_unix.c:2339 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x140/0x180 net/socket.c:745 ____sys_sendmsg+0x312/0x410 net/socket.c:2584 ___sys_sendmsg net/socket.c:2638 [inline] __sys_sendmsg+0x1e9/0x280 net/socket.c:2667 __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x46/0x50 net/socket.c:2674 do_syscall_64+0xd3/0x1d0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 value changed: 0xffffffff83d7feb0 -> 0x0000000000000000 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 10713 Comm: syz-executor.4 Tainted: G W 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/29/2024 Prior to this, commit 4cd12c6065df ("bpf, sockmap: Fix NULL pointer dereference in sk_psock_verdict_data_ready()") fixed one NULL pointer similarly due to no protection of saved_data_ready. Here is another different caller causing the same issue because of the same reason. So we should protect it with sk_callback_lock read lock because the writer side in the sk_psock_drop() uses "write_lock_bh(&sk->sk_callback_lock);". To avoid errors that could happen in future, I move those two pairs of lock into the sk_psock_data_ready(), which is suggested by John Fastabend. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Reported-by: syzbot+aa8c8ec2538929f18f2d@syzkaller.appspotmail.com Signed-off-by: Jason Xing Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Closes: https://syzkaller.appspot.com/bug?extid=aa8c8ec2538929f18f2d Link: https://lore.kernel.org/all/20240329134037.92124-1-kerneljasonxing@gmail.com Link: https://lore.kernel.org/bpf/20240404021001.94815-1-kerneljasonxing@gmail.com --- include/linux/skmsg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e65ec3fd2799..a509caf823d6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -461,10 +461,12 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { + read_lock_bh(&sk->sk_callback_lock); if (psock->saved_data_ready) psock->saved_data_ready(sk); else sk->sk_data_ready(sk); + read_unlock_bh(&sk->sk_callback_lock); } static inline void psock_set_prog(struct bpf_prog **pprog, -- cgit v1.2.3 From ffe6176b7f53ca0c99355f13e14a33a40cf49406 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 31 Mar 2024 10:43:48 +0200 Subject: virtio: store owner from modules with register_virtio_driver() Modules registering driver with register_virtio_driver() might forget to set .owner field. i2c-virtio.c for example has it missing. The field is used by some other kernel parts for reference counting (try_module_get()), so it is expected that drivers will set it. Solve the problem by moving this task away from the drivers to the core virtio code, just like we did for platform_driver in commit 9447057eaff8 ("platform_device: use a macro instead of platform_driver_register"). Fixes: 3cfc88380413 ("i2c: virtio: add a virtio i2c frontend driver") Cc: "Jie Deng" Signed-off-by: Krzysztof Kozlowski Message-Id: <20240331-module-owner-virtio-v2-1-98f04bfaf46a@linaro.org> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index b0201747a263..26c4325aa373 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -170,7 +170,7 @@ size_t virtio_max_dma_size(const struct virtio_device *vdev); /** * struct virtio_driver - operations for a virtio I/O driver - * @driver: underlying device driver (populate name and owner). + * @driver: underlying device driver (populate name). * @id_table: the ids serviced by this driver. * @feature_table: an array of feature numbers supported by this driver. * @feature_table_size: number of entries in the feature table array. @@ -208,7 +208,10 @@ static inline struct virtio_driver *drv_to_virtio(struct device_driver *drv) return container_of(drv, struct virtio_driver, driver); } -int register_virtio_driver(struct virtio_driver *drv); +/* use a macro to avoid include chaining to get THIS_MODULE */ +#define register_virtio_driver(drv) \ + __register_virtio_driver(drv, THIS_MODULE) +int __register_virtio_driver(struct virtio_driver *drv, struct module *owner); void unregister_virtio_driver(struct virtio_driver *drv); /* module_virtio_driver() - Helper macro for drivers that don't do -- cgit v1.2.3 From 2855c2a7820bc8198ae937a9a67dbdc3990e9d2c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 2 Apr 2024 17:21:43 -0400 Subject: vhost-vdpa: change ioctl # for VDPA_GET_VRING_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VDPA_GET_VRING_SIZE by mistake uses the already occupied ioctl # 0x80 and we never noticed - it happens to work because the direction and size are different, but confuses tools such as perf which like to look at just the number, and breaks the extra robustness of the ioctl numbering macros. To fix, sort the entries and renumber the ioctl - not too late since it wasn't in any released kernels yet. Cc: Arnaldo Carvalho de Melo Reported-by: Namhyung Kim Fixes: 1496c47065f9 ("vhost-vdpa: uapi to support reporting per vq size") Cc: "Zhu Lingshan" Signed-off-by: Michael S. Tsirkin Message-Id: <41c1c5489688abe5bfef9f7cf15584e3fb872ac5.1712092759.git.mst@redhat.com> Reviewed-by: Eugenio Pérez Reviewed-by: Zhu Lingshan Reviewed-by: Stefano Garzarella Acked-by: Jason Wang --- include/uapi/linux/vhost.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index bea697390613..b95dd84eef2d 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -179,12 +179,6 @@ /* Get the config size */ #define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) -/* Get the count of all virtqueues */ -#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) - -/* Get the number of virtqueue groups. */ -#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) - /* Get the number of address spaces. */ #define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) @@ -228,10 +222,17 @@ #define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ struct vhost_vring_state) + +/* Get the count of all virtqueues */ +#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) + +/* Get the number of virtqueue groups. */ +#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) + /* Get the queue size of a specific virtqueue. * userspace set the vring index in vhost_vring_state.index * kernel set the queue size in vhost_vring_state.num */ -#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x80, \ +#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) #endif -- cgit v1.2.3 From d8a6213d70accb403b82924a1c229e733433a5ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Apr 2024 10:30:34 +0000 Subject: geneve: fix header validation in geneve[6]_xmit_skb syzbot is able to trigger an uninit-value in geneve_xmit() [1] Problem : While most ip tunnel helpers (like ip_tunnel_get_dsfield()) uses skb_protocol(skb, true), pskb_inet_may_pull() is only using skb->protocol. If anything else than ETH_P_IPV6 or ETH_P_IP is found in skb->protocol, pskb_inet_may_pull() does nothing at all. If a vlan tag was provided by the caller (af_packet in the syzbot case), the network header might not point to the correct location, and skb linear part could be smaller than expected. Add skb_vlan_inet_prepare() to perform a complete mac validation. Use this in geneve for the moment, I suspect we need to adopt this more broadly. v4 - Jakub reported v3 broke l2_tos_ttl_inherit.sh selftest - Only call __vlan_get_protocol() for vlan types. Link: https://lore.kernel.org/netdev/20240404100035.3270a7d5@kernel.org/ v2,v3 - Addressed Sabrina comments on v1 and v2 Link: https://lore.kernel.org/netdev/Zg1l9L2BNoZWZDZG@hog/ [1] BUG: KMSAN: uninit-value in geneve_xmit_skb drivers/net/geneve.c:910 [inline] BUG: KMSAN: uninit-value in geneve_xmit+0x302d/0x5420 drivers/net/geneve.c:1030 geneve_xmit_skb drivers/net/geneve.c:910 [inline] geneve_xmit+0x302d/0x5420 drivers/net/geneve.c:1030 __netdev_start_xmit include/linux/netdevice.h:4903 [inline] netdev_start_xmit include/linux/netdevice.h:4917 [inline] xmit_one net/core/dev.c:3531 [inline] dev_hard_start_xmit+0x247/0xa20 net/core/dev.c:3547 __dev_queue_xmit+0x348d/0x52c0 net/core/dev.c:4335 dev_queue_xmit include/linux/netdevice.h:3091 [inline] packet_xmit+0x9c/0x6c0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3081 [inline] packet_sendmsg+0x8bb0/0x9ef0 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 __sys_sendto+0x685/0x830 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2199 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was created at: slab_post_alloc_hook mm/slub.c:3804 [inline] slab_alloc_node mm/slub.c:3845 [inline] kmem_cache_alloc_node+0x613/0xc50 mm/slub.c:3888 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:577 __alloc_skb+0x35b/0x7a0 net/core/skbuff.c:668 alloc_skb include/linux/skbuff.h:1318 [inline] alloc_skb_with_frags+0xc8/0xbf0 net/core/skbuff.c:6504 sock_alloc_send_pskb+0xa81/0xbf0 net/core/sock.c:2795 packet_alloc_skb net/packet/af_packet.c:2930 [inline] packet_snd net/packet/af_packet.c:3024 [inline] packet_sendmsg+0x722d/0x9ef0 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 __sys_sendto+0x685/0x830 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2199 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 CPU: 0 PID: 5033 Comm: syz-executor346 Not tainted 6.9.0-rc1-syzkaller-00005-g928a87efa423 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/29/2024 Fixes: d13f048dd40e ("net: geneve: modify IP header check in geneve6_xmit_skb and geneve_xmit_skb") Reported-by: syzbot+9ee20ec1de7b3168db09@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/000000000000d19c3a06152f9ee4@google.com/ Signed-off-by: Eric Dumazet Cc: Phillip Potter Cc: Sabrina Dubroca Reviewed-by: Sabrina Dubroca Reviewed-by: Phillip Potter Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5cd64bb2104d..c286cc2e766e 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -361,6 +361,39 @@ static inline bool pskb_inet_may_pull(struct sk_buff *skb) return pskb_network_may_pull(skb, nhlen); } +/* Variant of pskb_inet_may_pull(). + */ +static inline bool skb_vlan_inet_prepare(struct sk_buff *skb) +{ + int nhlen = 0, maclen = ETH_HLEN; + __be16 type = skb->protocol; + + /* Essentially this is skb_protocol(skb, true) + * And we get MAC len. + */ + if (eth_type_vlan(type)) + type = __vlan_get_protocol(skb, type, &maclen); + + switch (type) { +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + nhlen = sizeof(struct ipv6hdr); + break; +#endif + case htons(ETH_P_IP): + nhlen = sizeof(struct iphdr); + break; + } + /* For ETH_P_IPV6/ETH_P_IP we make sure to pull + * a base network header in skb->head. + */ + if (!pskb_may_pull(skb, maclen + nhlen)) + return false; + + skb_set_network_header(skb, maclen); + return true; +} + static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; -- cgit v1.2.3 From 70ee853eec5693fefd8348a2b049d9cb83362e58 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 8 Apr 2024 11:18:00 +0100 Subject: regmap: Add regmap_read_bypassed() Add a regmap_read_bypassed() to allow reads from the hardware registers while the regmap is in cache-only mode. A typical use for this is to keep the cache in cache-only mode until the hardware has reached a valid state, but one or more status registers must be polled to determine when this state is reached. For example, firmware download on the cs35l56 can take several seconds if there are multiple amps sharing limited bus bandwidth. This is too long to block in probe() so it is done as a background task. The device must be soft-reset to reboot the firmware and during this time the registers are not accessible, so the cache should be in cache-only. But the driver must poll a register to detect when reboot has completed. Signed-off-by: Richard Fitzgerald Fixes: 8a731fd37f8b ("ASoC: cs35l56: Move utility functions to shared file") Link: https://msgid.link/r/20240408101803.43183-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index b743241cfb7c..d470303b1bbb 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1230,6 +1230,7 @@ int regmap_multi_reg_write_bypassed(struct regmap *map, int regmap_raw_write_async(struct regmap *map, unsigned int reg, const void *val, size_t val_len); int regmap_read(struct regmap *map, unsigned int reg, unsigned int *val); +int regmap_read_bypassed(struct regmap *map, unsigned int reg, unsigned int *val); int regmap_raw_read(struct regmap *map, unsigned int reg, void *val, size_t val_len); int regmap_noinc_read(struct regmap *map, unsigned int reg, @@ -1739,6 +1740,13 @@ static inline int regmap_read(struct regmap *map, unsigned int reg, return -EINVAL; } +static inline int regmap_read_bypassed(struct regmap *map, unsigned int reg, + unsigned int *val) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + static inline int regmap_raw_read(struct regmap *map, unsigned int reg, void *val, size_t val_len) { -- cgit v1.2.3 From dfd2ffb373999630a14d7ff614440f1c2fcc704c Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 8 Apr 2024 11:18:03 +0100 Subject: ASoC: cs35l56: Prevent overwriting firmware ASP config Only populate the ASP1 config registers in the regmap cache if the ASP DAI is used. This prevents regcache_sync() from overwriting these registers with their defaults when the firmware owns control of these registers. On a SoundWire system the ASP could be owned by the firmware to share reference audio with the firmware on other cs35l56. Or it can be used as a normal codec-codec interface owned by the driver. The driver must not overwrite the registers if the firmware has control of them. The original implementation for this in commit 07f7d6e7a124 ("ASoC: cs35l56: Fix for initializing ASP1 mixer registers") was to still provide defaults for these registers, assuming that if they were never reconfigured from defaults then regcache_sync() would not write them out because they are not dirty. Unfortunately regcache_sync() is not that smart. If the chip has not reset (so the driver has not called regcache_mark_dirty()) a regcache_sync() could write out registers that are not dirty. To avoid accidental overwriting of the ASP registers, they are removed from the table of defaults and instead are populated with defaults only if one of the ASP DAI configuration functions is called. So if the DAI has never been configured, the firmware is assumed to have ownership of these registers, and the regmap cache will not contain any entries for them. Signed-off-by: Richard Fitzgerald Fixes: 07f7d6e7a124 ("ASoC: cs35l56: Fix for initializing ASP1 mixer registers") Link: https://msgid.link/r/20240408101803.43183-5-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs35l56.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index e0629699b563..1a3c6f66f620 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -267,6 +267,7 @@ struct cs35l56_base { bool fw_patched; bool secured; bool can_hibernate; + bool fw_owns_asp1; bool cal_data_valid; s8 cal_index; struct cirrus_amp_cal_data cal_data; @@ -283,6 +284,7 @@ extern const char * const cs35l56_tx_input_texts[CS35L56_NUM_INPUT_SRC]; extern const unsigned int cs35l56_tx_input_values[CS35L56_NUM_INPUT_SRC]; int cs35l56_set_patch(struct cs35l56_base *cs35l56_base); +int cs35l56_init_asp1_regs_for_driver_control(struct cs35l56_base *cs35l56_base); int cs35l56_force_sync_asp1_registers_from_cache(struct cs35l56_base *cs35l56_base); int cs35l56_mbox_send(struct cs35l56_base *cs35l56_base, unsigned int command); int cs35l56_firmware_shutdown(struct cs35l56_base *cs35l56_base); -- cgit v1.2.3 From c1d11fc2c8320871b40730991071dd0a0b405bc8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2024 09:46:01 +0200 Subject: irqflags: Explicitly ignore lockdep_hrtimer_exit() argument When building with 'make W=1' but CONFIG_TRACE_IRQFLAGS=n, the unused argument to lockdep_hrtimer_exit() causes a warning: kernel/time/hrtimer.c:1655:14: error: variable 'expires_in_hardirq' set but not used [-Werror=unused-but-set-variable] This is intentional behavior, so add a cast to void to shut up the warning. Fixes: 73d20564e0dc ("hrtimer: Don't dereference the hrtimer pointer after the callback") Reported-by: kernel test robot Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240408074609.3170807-1-arnd@kernel.org Closes: https://lore.kernel.org/oe-kbuild-all/202311191229.55QXHVc6-lkp@intel.com/ --- include/linux/irqflags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 147feebd508c..3f003d5fde53 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -114,7 +114,7 @@ do { \ # define lockdep_softirq_enter() do { } while (0) # define lockdep_softirq_exit() do { } while (0) # define lockdep_hrtimer_enter(__hrtimer) false -# define lockdep_hrtimer_exit(__context) do { } while (0) +# define lockdep_hrtimer_exit(__context) do { (void)(__context); } while (0) # define lockdep_posixtimer_enter() do { } while (0) # define lockdep_posixtimer_exit() do { } while (0) # define lockdep_irq_work_enter(__work) do { } while (0) -- cgit v1.2.3 From fa1f51162338b3e2f520d4bfedc42b3b2e00da6d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 19 Mar 2024 19:20:50 +0100 Subject: locking: Make rwsem_assert_held_write_nolockdep() build with PREEMPT_RT=y The commit cited below broke the build for PREEMPT_RT because rwsem_assert_held_write_nolockdep() passes a struct rw_semaphore but rw_base_assert_held_write() expects struct rwbase_rt. Fixing the type alone leads to the problem that WARN_ON() is not found because bug.h is missing. In order to resolve this: - Keep the assert (WARN_ON()) in rwsem.h (not rwbase_rt.h) - Make rwsem_assert_held_write_nolockdep() do the implementation specific (rw_base) writer check. - Replace the "inline" with __always_inline which was used before. Fixes: f70405afc99b1 ("locking: Add rwsem_assert_held() and rwsem_assert_held_write()") Reported-by: Clark Williams Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Waiman Long Link: https://lore.kernel.org/r/20240319182050.U4AzUF3I@linutronix.de --- include/linux/rwbase_rt.h | 4 ++-- include/linux/rwsem.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/rwbase_rt.h b/include/linux/rwbase_rt.h index 29c4e4f243e4..f2394a409c9d 100644 --- a/include/linux/rwbase_rt.h +++ b/include/linux/rwbase_rt.h @@ -31,9 +31,9 @@ static __always_inline bool rw_base_is_locked(const struct rwbase_rt *rwb) return atomic_read(&rwb->readers) != READER_BIAS; } -static inline void rw_base_assert_held_write(const struct rwbase_rt *rwb) +static __always_inline bool rw_base_is_write_locked(const struct rwbase_rt *rwb) { - WARN_ON(atomic_read(&rwb->readers) != WRITER_BIAS); + return atomic_read(&rwb->readers) == WRITER_BIAS; } static __always_inline bool rw_base_is_contended(const struct rwbase_rt *rwb) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 4f1c18992f76..c8b543d428b0 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -167,14 +167,14 @@ static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem) return rw_base_is_locked(&sem->rwbase); } -static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) +static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!rwsem_is_locked(sem)); } -static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) +static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) { - rw_base_assert_held_write(sem); + WARN_ON(!rw_base_is_write_locked(&sem->rwbase)); } static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) -- cgit v1.2.3 From aca1a5287ea328fd1f7e2bfa6806646486d86a70 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 28 Mar 2024 09:25:40 +0530 Subject: ACPI: bus: allow _UID matching for integer zero Commit b2b32a173881 ("ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types") added _UID matching support for both integer and string types, which satisfies NULL @uid2 argument for string types using inversion, but this logic prevents _UID comparision in case the argument is integer 0, which may result in false positives. Fix this using _Generic(), which will allow NULL @uid2 argument for string types as well as _UID matching for all possible integer values. Fixes: b2b32a173881 ("ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types") Signed-off-by: Raag Jadav [ rjw: Comment adjustment ] Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 5de954e2b18a..e7796f373d0d 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -911,17 +911,19 @@ static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2) * acpi_dev_hid_uid_match - Match device by supplied HID and UID * @adev: ACPI device to match. * @hid2: Hardware ID of the device. - * @uid2: Unique ID of the device, pass 0 or NULL to not check _UID. + * @uid2: Unique ID of the device, pass NULL to not check _UID. * * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2 * will be treated as a match. If user wants to validate @uid2, it should be * done before calling this function. * - * Returns: %true if matches or @uid2 is 0 or NULL, %false otherwise. + * Returns: %true if matches or @uid2 is NULL, %false otherwise. */ #define acpi_dev_hid_uid_match(adev, hid2, uid2) \ (acpi_dev_hid_match(adev, hid2) && \ - (!(uid2) || acpi_dev_uid_match(adev, uid2))) + /* Distinguish integer 0 from NULL @uid2 */ \ + (_Generic(uid2, ACPI_STR_TYPES(!(uid2)), default: 0) || \ + acpi_dev_uid_match(adev, uid2))) void acpi_dev_clear_dependencies(struct acpi_device *supplier); bool acpi_dev_ready_for_enumeration(const struct acpi_device *device); -- cgit v1.2.3 From ab9177d83c040eba58387914077ebca56f14fae6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 26 Mar 2024 22:08:54 +0100 Subject: wifi: mac80211: don't use rate mask for scanning The rate mask is intended for use during operation, and can be set to only have masks for the currently active band. As such, it cannot be used for scanning which can be on other bands as well. Simply ignore the rate masks during scanning to avoid warnings from incorrect settings. Reported-by: syzbot+fdc5123366fb9c3fdc6d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=fdc5123366fb9c3fdc6d Co-developed-by: Dmitry Antipov Signed-off-by: Dmitry Antipov Tested-by: Dmitry Antipov Link: https://msgid.link/20240326220854.9594cbb418ca.I7f86c0ba1f98cf7e27c2bacf6c2d417200ecea5c@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 353488ab94a2..2d7f87bc5324 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -953,6 +953,8 @@ enum mac80211_tx_info_flags { * of their QoS TID or other priority field values. * @IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX: first MLO TX, used mostly internally * for sequence number assignment + * @IEEE80211_TX_CTRL_SCAN_TX: Indicates that this frame is transmitted + * due to scanning, not in normal operation on the interface. * @IEEE80211_TX_CTRL_MLO_LINK: If not @IEEE80211_LINK_UNSPECIFIED, this * frame should be transmitted on the specific link. This really is * only relevant for frames that do not have data present, and is @@ -973,6 +975,7 @@ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_NO_SEQNO = BIT(7), IEEE80211_TX_CTRL_DONT_REORDER = BIT(8), IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX = BIT(9), + IEEE80211_TX_CTRL_SCAN_TX = BIT(10), IEEE80211_TX_CTRL_MLO_LINK = 0xf0000000, }; -- cgit v1.2.3 From c722cea208789d9e2660992bcd05fb9fac3adb56 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 8 Apr 2024 21:43:58 -0700 Subject: fs/proc: Skip bootloader comment if no embedded kernel parameters If the "bootconfig" kernel command-line argument was specified or if the kernel was built with CONFIG_BOOT_CONFIG_FORCE, but if there are no embedded kernel parameter, omit the "# Parameters from bootloader:" comment from the /proc/bootconfig file. This will cause automation to fall back to the /proc/cmdline file, which will be identical to the comment in this no-embedded-kernel-parameters case. Link: https://lore.kernel.org/all/20240409044358.1156477-2-paulmck@kernel.org/ Fixes: 8b8ce6c75430 ("fs/proc: remove redundant comments from /proc/bootconfig") Signed-off-by: Masami Hiramatsu Signed-off-by: Paul E. McKenney Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- include/linux/bootconfig.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index ca73940e26df..e5ee2c694401 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -10,6 +10,7 @@ #ifdef __KERNEL__ #include #include +bool __init cmdline_has_extra_options(void); #else /* !__KERNEL__ */ /* * NOTE: This is only for tools/bootconfig, because tools/bootconfig will -- cgit v1.2.3 From ed09f81eeaa8f9265e1787282cb283f10285c259 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Sat, 6 Apr 2024 15:01:09 +0200 Subject: firmware: qcom: uefisecapp: Fix memory related IO errors and crashes It turns out that while the QSEECOM APP_SEND command has specific fields for request and response buffers, uefisecapp expects them both to be in a single memory region. Failure to adhere to this has (so far) resulted in either no response being written to the response buffer (causing an EIO to be emitted down the line), the SCM call to fail with EINVAL (i.e., directly from TZ/firmware), or the device to be hard-reset. While this issue can be triggered deterministically, in the current form it seems to happen rather sporadically (which is why it has gone unnoticed during earlier testing). This is likely due to the two kzalloc() calls (for request and response) being directly after each other. Which means that those likely return consecutive regions most of the time, especially when not much else is going on in the system. Fix this by allocating a single memory region for both request and response buffers, properly aligning both structs inside it. This unfortunately also means that the qcom_scm_qseecom_app_send() interface needs to be restructured, as it should no longer map the DMA regions separately. Therefore, move the responsibility of DMA allocation (or mapping) to the caller. Fixes: 759e7a2b62eb ("firmware: Add support for Qualcomm UEFI Secure Application") Cc: stable@vger.kernel.org # 6.7 Tested-by: Johan Hovold Reviewed-by: Johan Hovold Signed-off-by: Maximilian Luz Tested-by: Konrad Dybcio # X13s Link: https://lore.kernel.org/r/20240406130125.1047436-1-luzmaximilian@gmail.com Signed-off-by: Bjorn Andersson --- include/linux/firmware/qcom/qcom_qseecom.h | 55 +++++++++++++++++++++++++++--- include/linux/firmware/qcom/qcom_scm.h | 10 +++--- 2 files changed, 56 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/firmware/qcom/qcom_qseecom.h b/include/linux/firmware/qcom/qcom_qseecom.h index 5c28298a98be..366243ee9609 100644 --- a/include/linux/firmware/qcom/qcom_qseecom.h +++ b/include/linux/firmware/qcom/qcom_qseecom.h @@ -10,6 +10,7 @@ #define __QCOM_QSEECOM_H #include +#include #include #include @@ -24,12 +25,57 @@ struct qseecom_client { u32 app_id; }; +/** + * qseecom_scm_dev() - Get the SCM device associated with the QSEECOM client. + * @client: The QSEECOM client device. + * + * Returns the SCM device under which the provided QSEECOM client device + * operates. This function is intended to be used for DMA allocations. + */ +static inline struct device *qseecom_scm_dev(struct qseecom_client *client) +{ + return client->aux_dev.dev.parent->parent; +} + +/** + * qseecom_dma_alloc() - Allocate DMA memory for a QSEECOM client. + * @client: The QSEECOM client to allocate the memory for. + * @size: The number of bytes to allocate. + * @dma_handle: Pointer to where the DMA address should be stored. + * @gfp: Allocation flags. + * + * Wrapper function for dma_alloc_coherent(), allocating DMA memory usable for + * TZ/QSEECOM communication. Refer to dma_alloc_coherent() for details. + */ +static inline void *qseecom_dma_alloc(struct qseecom_client *client, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + return dma_alloc_coherent(qseecom_scm_dev(client), size, dma_handle, gfp); +} + +/** + * dma_free_coherent() - Free QSEECOM DMA memory. + * @client: The QSEECOM client for which the memory has been allocated. + * @size: The number of bytes allocated. + * @cpu_addr: Virtual memory address to free. + * @dma_handle: DMA memory address to free. + * + * Wrapper function for dma_free_coherent(), freeing memory previously + * allocated with qseecom_dma_alloc(). Refer to dma_free_coherent() for + * details. + */ +static inline void qseecom_dma_free(struct qseecom_client *client, size_t size, + void *cpu_addr, dma_addr_t dma_handle) +{ + return dma_free_coherent(qseecom_scm_dev(client), size, cpu_addr, dma_handle); +} + /** * qcom_qseecom_app_send() - Send to and receive data from a given QSEE app. * @client: The QSEECOM client associated with the target app. - * @req: Request buffer sent to the app (must be DMA-mappable). + * @req: DMA address of the request buffer sent to the app. * @req_size: Size of the request buffer. - * @rsp: Response buffer, written to by the app (must be DMA-mappable). + * @rsp: DMA address of the response buffer, written to by the app. * @rsp_size: Size of the response buffer. * * Sends a request to the QSEE app associated with the given client and read @@ -43,8 +89,9 @@ struct qseecom_client { * * Return: Zero on success, nonzero on failure. */ -static inline int qcom_qseecom_app_send(struct qseecom_client *client, void *req, size_t req_size, - void *rsp, size_t rsp_size) +static inline int qcom_qseecom_app_send(struct qseecom_client *client, + dma_addr_t req, size_t req_size, + dma_addr_t rsp, size_t rsp_size) { return qcom_scm_qseecom_app_send(client->app_id, req, req_size, rsp, rsp_size); } diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h index ccaf28846054..aaa19f93ac43 100644 --- a/include/linux/firmware/qcom/qcom_scm.h +++ b/include/linux/firmware/qcom/qcom_scm.h @@ -118,8 +118,8 @@ bool qcom_scm_lmh_dcvsh_available(void); #ifdef CONFIG_QCOM_QSEECOM int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id); -int qcom_scm_qseecom_app_send(u32 app_id, void *req, size_t req_size, void *rsp, - size_t rsp_size); +int qcom_scm_qseecom_app_send(u32 app_id, dma_addr_t req, size_t req_size, + dma_addr_t rsp, size_t rsp_size); #else /* CONFIG_QCOM_QSEECOM */ @@ -128,9 +128,9 @@ static inline int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id) return -EINVAL; } -static inline int qcom_scm_qseecom_app_send(u32 app_id, void *req, - size_t req_size, void *rsp, - size_t rsp_size) +static inline int qcom_scm_qseecom_app_send(u32 app_id, + dma_addr_t req, size_t req_size, + dma_addr_t rsp, size_t rsp_size) { return -EINVAL; } -- cgit v1.2.3 From d7a62d0a9a17e97ce2d4d40431094e09956a0568 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 9 Apr 2024 17:46:23 +0200 Subject: compiler.h: Add missing quote in macro comment Add a missing doublequote in the __is_constexpr() macro comment. Signed-off-by: Thorsten Blum Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c00cc6c0878a..8c252e073bd8 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -268,7 +268,7 @@ static inline void *offset_to_ptr(const int *off) * - When one operand is a null pointer constant (i.e. when x is an integer * constant expression) and the other is an object pointer (i.e. our * third operand), the conditional operator returns the type of the - * object pointer operand (i.e. "int *). Here, within the sizeof(), we + * object pointer operand (i.e. "int *"). Here, within the sizeof(), we * would then get: * sizeof(*((int *)(...)) == sizeof(int) == 4 * - When one operand is a void pointer (i.e. when x is not an integer -- cgit v1.2.3 From ff81dade48608363136d52bb2493a6df76458b28 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Wed, 10 Apr 2024 01:35:28 +0800 Subject: io-uring: correct typo in comment for IOU_F_TWQ_LAZY_WAKE The 'r' key is near to 't' key, that makes 'with' to be 'wirh' ? :) Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/r/20240409173531.846714-1-haiyue.wang@intel.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 05df0e399d7c..ac333ea81d31 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -13,7 +13,7 @@ enum { * A hint to not wake right away but delay until there are enough of * tw's queued to match the number of CQEs the task is waiting for. * - * Must not be used wirh requests generating more than one CQE. + * Must not be used with requests generating more than one CQE. * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. */ IOU_F_TWQ_LAZY_WAKE = 1, -- cgit v1.2.3 From 6309863b31dd80317cd7d6824820b44e254e2a9c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Apr 2024 08:28:43 +0000 Subject: net: add copy_safe_from_sockptr() helper copy_from_sockptr() helper is unsafe, unless callers did the prior check against user provided optlen. Too many callers get this wrong, lets add a helper to fix them and avoid future copy/paste bugs. Instead of : if (optlen < sizeof(opt)) { err = -EINVAL; break; } if (copy_from_sockptr(&opt, optval, sizeof(opt)) { err = -EFAULT; break; } Use : err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240408082845.3957374-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/sockptr.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index 307961b41541..317200cd3a60 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -50,11 +50,36 @@ static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, return 0; } +/* Deprecated. + * This is unsafe, unless caller checked user provided optlen. + * Prefer copy_safe_from_sockptr() instead. + */ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) { return copy_from_sockptr_offset(dst, src, 0, size); } +/** + * copy_safe_from_sockptr: copy a struct from sockptr + * @dst: Destination address, in kernel space. This buffer must be @ksize + * bytes long. + * @ksize: Size of @dst struct. + * @optval: Source address. (in user or kernel space) + * @optlen: Size of @optval data. + * + * Returns: + * * -EINVAL: @optlen < @ksize + * * -EFAULT: access to userspace failed. + * * 0 : @ksize bytes were copied + */ +static inline int copy_safe_from_sockptr(void *dst, size_t ksize, + sockptr_t optval, unsigned int optlen) +{ + if (optlen < ksize) + return -EINVAL; + return copy_from_sockptr(dst, optval, ksize); +} + static inline int copy_struct_from_sockptr(void *dst, size_t ksize, sockptr_t src, size_t usize) { -- cgit v1.2.3 From 7633c4da919ad51164acbf1aa322cc1a3ead6129 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Mon, 8 Apr 2024 16:18:21 +0200 Subject: ipv6: fix race condition between ipv6_get_ifaddr and ipv6_del_addr Although ipv6_get_ifaddr walks inet6_addr_lst under the RCU lock, it still means hlist_for_each_entry_rcu can return an item that got removed from the list. The memory itself of such item is not freed thanks to RCU but nothing guarantees the actual content of the memory is sane. In particular, the reference count can be zero. This can happen if ipv6_del_addr is called in parallel. ipv6_del_addr removes the entry from inet6_addr_lst (hlist_del_init_rcu(&ifp->addr_lst)) and drops all references (__in6_ifa_put(ifp) + in6_ifa_put(ifp)). With bad enough timing, this can happen: 1. In ipv6_get_ifaddr, hlist_for_each_entry_rcu returns an entry. 2. Then, the whole ipv6_del_addr is executed for the given entry. The reference count drops to zero and kfree_rcu is scheduled. 3. ipv6_get_ifaddr continues and tries to increments the reference count (in6_ifa_hold). 4. The rcu is unlocked and the entry is freed. 5. The freed entry is returned. Prevent increasing of the reference count in such case. The name in6_ifa_hold_safe is chosen to mimic the existing fib6_info_hold_safe. [ 41.506330] refcount_t: addition on 0; use-after-free. [ 41.506760] WARNING: CPU: 0 PID: 595 at lib/refcount.c:25 refcount_warn_saturate+0xa5/0x130 [ 41.507413] Modules linked in: veth bridge stp llc [ 41.507821] CPU: 0 PID: 595 Comm: python3 Not tainted 6.9.0-rc2.main-00208-g49563be82afa #14 [ 41.508479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) [ 41.509163] RIP: 0010:refcount_warn_saturate+0xa5/0x130 [ 41.509586] Code: ad ff 90 0f 0b 90 90 c3 cc cc cc cc 80 3d c0 30 ad 01 00 75 a0 c6 05 b7 30 ad 01 01 90 48 c7 c7 38 cc 7a 8c e8 cc 18 ad ff 90 <0f> 0b 90 90 c3 cc cc cc cc 80 3d 98 30 ad 01 00 0f 85 75 ff ff ff [ 41.510956] RSP: 0018:ffffbda3c026baf0 EFLAGS: 00010282 [ 41.511368] RAX: 0000000000000000 RBX: ffff9e9c46914800 RCX: 0000000000000000 [ 41.511910] RDX: ffff9e9c7ec29c00 RSI: ffff9e9c7ec1c900 RDI: ffff9e9c7ec1c900 [ 41.512445] RBP: ffff9e9c43660c9c R08: 0000000000009ffb R09: 00000000ffffdfff [ 41.512998] R10: 00000000ffffdfff R11: ffffffff8ca58a40 R12: ffff9e9c4339a000 [ 41.513534] R13: 0000000000000001 R14: ffff9e9c438a0000 R15: ffffbda3c026bb48 [ 41.514086] FS: 00007fbc4cda1740(0000) GS:ffff9e9c7ec00000(0000) knlGS:0000000000000000 [ 41.514726] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 41.515176] CR2: 000056233b337d88 CR3: 000000000376e006 CR4: 0000000000370ef0 [ 41.515713] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 41.516252] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 41.516799] Call Trace: [ 41.517037] [ 41.517249] ? __warn+0x7b/0x120 [ 41.517535] ? refcount_warn_saturate+0xa5/0x130 [ 41.517923] ? report_bug+0x164/0x190 [ 41.518240] ? handle_bug+0x3d/0x70 [ 41.518541] ? exc_invalid_op+0x17/0x70 [ 41.520972] ? asm_exc_invalid_op+0x1a/0x20 [ 41.521325] ? refcount_warn_saturate+0xa5/0x130 [ 41.521708] ipv6_get_ifaddr+0xda/0xe0 [ 41.522035] inet6_rtm_getaddr+0x342/0x3f0 [ 41.522376] ? __pfx_inet6_rtm_getaddr+0x10/0x10 [ 41.522758] rtnetlink_rcv_msg+0x334/0x3d0 [ 41.523102] ? netlink_unicast+0x30f/0x390 [ 41.523445] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 41.523832] netlink_rcv_skb+0x53/0x100 [ 41.524157] netlink_unicast+0x23b/0x390 [ 41.524484] netlink_sendmsg+0x1f2/0x440 [ 41.524826] __sys_sendto+0x1d8/0x1f0 [ 41.525145] __x64_sys_sendto+0x1f/0x30 [ 41.525467] do_syscall_64+0xa5/0x1b0 [ 41.525794] entry_SYSCALL_64_after_hwframe+0x72/0x7a [ 41.526213] RIP: 0033:0x7fbc4cfcea9a [ 41.526528] Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 7e c3 0f 1f 44 00 00 41 54 48 83 ec 30 44 89 [ 41.527942] RSP: 002b:00007ffcf54012a8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 41.528593] RAX: ffffffffffffffda RBX: 00007ffcf5401368 RCX: 00007fbc4cfcea9a [ 41.529173] RDX: 000000000000002c RSI: 00007fbc4b9d9bd0 RDI: 0000000000000005 [ 41.529786] RBP: 00007fbc4bafb040 R08: 00007ffcf54013e0 R09: 000000000000000c [ 41.530375] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 41.530977] R13: ffffffffc4653600 R14: 0000000000000001 R15: 00007fbc4ca85d1b [ 41.531573] Fixes: 5c578aedcb21d ("IPv6: convert addrconf hash list to RCU") Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jiri Benc Link: https://lore.kernel.org/r/8ab821e36073a4a406c50ec83c9e8dc586c539e4.1712585809.git.jbenc@redhat.com Signed-off-by: Jakub Kicinski --- include/net/addrconf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 9d06eb945509..62a407db1bf5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -438,6 +438,10 @@ static inline void in6_ifa_hold(struct inet6_ifaddr *ifp) refcount_inc(&ifp->refcnt); } +static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp) +{ + return refcount_inc_not_zero(&ifp->refcnt); +} /* * compute link-local solicited-node multicast address -- cgit v1.2.3 From 0871bc0129d403747ea0272a4384895d7ad37a6c Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: mm: Move lowmem_page_address() a little later LoongArch will override page_to_virt() which use page_address() in the KFENCE case (by defining WANT_PAGE_VIRTUAL/HASHED_PAGE_VIRTUAL). So move lowmem_page_address() a little later to avoid such build errors: error: implicit declaration of function 'page_address'. Acked-by: Andrew Morton Signed-off-by: Huacai Chen --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0436b919f1c7..7b0ee64225de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2207,11 +2207,6 @@ static inline int arch_make_folio_accessible(struct folio *folio) */ #include -static __always_inline void *lowmem_page_address(const struct page *page) -{ - return page_to_virt(page); -} - #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif @@ -2234,6 +2229,11 @@ void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif +static __always_inline void *lowmem_page_address(const struct page *page) +{ + return page_to_virt(page); +} + #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) -- cgit v1.2.3 From 51eda36d33e43201e7a4fd35232e069b2c850b01 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 15:41:52 -0400 Subject: Bluetooth: SCO: Fix not validating setsockopt user input syzbot reported sco_sock_setsockopt() is copying data without checking user input length. BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in sco_sock_setsockopt+0xc0b/0xf90 net/bluetooth/sco.c:893 Read of size 4 at addr ffff88805f7b15a3 by task syz-executor.5/12578 Fixes: ad10b1a48754 ("Bluetooth: Add Bluetooth socket voice option") Fixes: b96e9c671b05 ("Bluetooth: Add BT_DEFER_SETUP option to sco socket") Fixes: 00398e1d5183 ("Bluetooth: Add support for BT_PKT_STATUS CMSG data for SCO connections") Fixes: f6873401a608 ("Bluetooth: Allow setting of codec for HFP offload use case") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 9fe95a22abeb..eaec5d6caa29 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -585,6 +585,15 @@ static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk, return skb; } +static inline int bt_copy_from_sockptr(void *dst, size_t dst_size, + sockptr_t src, size_t src_size) +{ + if (dst_size > src_size) + return -EINVAL; + + return copy_from_sockptr(dst, src, dst_size); +} + int bt_to_errno(u16 code); __u8 bt_status(int err); -- cgit v1.2.3 From 5284984a4fbacb0883bfebe905902cdda2891a07 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Apr 2024 18:32:12 +0300 Subject: bug: Fix no-return-statement warning with !CONFIG_BUG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BUG() does not return, and arch implementations of BUG() use unreachable() or other non-returning code. However with !CONFIG_BUG, the default implementation is often used instead, and that does not do that. x86 always uses its own implementation, but powerpc with !CONFIG_BUG gives a build error: kernel/time/timekeeping.c: In function ‘timekeeping_debug_get_ns’: kernel/time/timekeeping.c:286:1: error: no return statement in function returning non-void [-Werror=return-type] Add unreachable() to default !CONFIG_BUG BUG() implementation. Fixes: e8e9d21a5df6 ("timekeeping: Refactor timekeeping helpers") Reported-by: Naresh Kamboju Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Tested-by: Linux Kernel Functional Testing Link: https://lore.kernel.org/r/20240410153212.127477-1-adrian.hunter@intel.com Closes: https://lore.kernel.org/all/CA+G9fYvjdZCW=7ZGxS6A_3bysjQ56YF7S-+PNLQ_8a4DKh1Bhg@mail.gmail.com/ --- include/asm-generic/bug.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 6e794420bd39..b7de3a4eade1 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -156,7 +156,10 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #else /* !CONFIG_BUG */ #ifndef HAVE_ARCH_BUG -#define BUG() do {} while (1) +#define BUG() do { \ + do {} while (1); \ + unreachable(); \ +} while (0) #endif #ifndef HAVE_ARCH_BUG_ON -- cgit v1.2.3 From 211f514ebf1ef5de37b1cf6df9d28a56cfd242ca Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:55 -0700 Subject: Drivers: hv: vmbus: Track decrypted status in vmbus_gpadl In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. In order to make sure callers of vmbus_establish_gpadl() and vmbus_teardown_gpadl() don't return decrypted/shared pages to allocators, add a field in struct vmbus_gpadl to keep track of the decryption status of the buffers. This will allow the callers to know if they should free or leak the pages. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-3-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-3-mhklinux@outlook.com> --- include/linux/hyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 6ef0557b4bff..96ceb4095425 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -832,6 +832,7 @@ struct vmbus_gpadl { u32 gpadl_handle; u32 size; void *buffer; + bool decrypted; }; struct vmbus_channel { -- cgit v1.2.3 From a4833e3abae132d613ce7da0e0c9a9465d1681fa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 10 Apr 2024 12:38:13 -0400 Subject: SUNRPC: Fix rpcgss_context trace event acceptor field The rpcgss_context trace event acceptor field is a dynamically sized string that records the "data" parameter. But this parameter is also dependent on the "len" field to determine the size of the data. It needs to use __string_len() helper macro where the length can be passed in. It also incorrectly uses strncpy() to save it instead of __assign_str(). As these macros can change, it is not wise to open code them in trace events. As of commit c759e609030c ("tracing: Remove __assign_str_len()"), __assign_str() can be used for both __string() and __string_len() fields. Before that commit, __assign_str_len() is required to be used. This needs to be noted for backporting. (In actuality, commit c1fa617caeb0 ("tracing: Rework __assign_str() and __string() to not duplicate getting the string") is the commit that makes __string_str_len() obsolete). Cc: stable@vger.kernel.org Fixes: 0c77668ddb4e ("SUNRPC: Introduce trace points in rpc_auth_gss.ko") Signed-off-by: Steven Rostedt (Google) Signed-off-by: Chuck Lever --- include/trace/events/rpcgss.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index ba2d96a1bc2f..f50fcafc69de 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -609,7 +609,7 @@ TRACE_EVENT(rpcgss_context, __field(unsigned int, timeout) __field(u32, window_size) __field(int, len) - __string(acceptor, data) + __string_len(acceptor, data, len) ), TP_fast_assign( @@ -618,7 +618,7 @@ TRACE_EVENT(rpcgss_context, __entry->timeout = timeout; __entry->window_size = window_size; __entry->len = len; - strncpy(__get_str(acceptor), data, len); + __assign_str(acceptor, data); ), TP_printk("win_size=%u expiry=%lu now=%lu timeout=%u acceptor=%.*s", -- cgit v1.2.3 From 29b359cf6d95fd60730533f7f10464e95bd17c73 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 10 Apr 2024 18:50:45 +0200 Subject: netfilter: nft_set_pipapo: walk over current view on netlink dump The generation mask can be updated while netlink dump is in progress. The pipapo set backend walk iterator cannot rely on it to infer what view of the datastructure is to be used. Add notation to specify if user wants to read/update the set. Based on patch from Florian Westphal. Fixes: 2b84e215f874 ("netfilter: nft_set_pipapo: .walk does not deal with generations") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e27c28b612e4..3f1ed467f951 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -307,9 +307,23 @@ static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) return (void *)priv; } + +/** + * enum nft_iter_type - nftables set iterator type + * + * @NFT_ITER_READ: read-only iteration over set elements + * @NFT_ITER_UPDATE: iteration under mutex to update set element state + */ +enum nft_iter_type { + NFT_ITER_UNSPEC, + NFT_ITER_READ, + NFT_ITER_UPDATE, +}; + struct nft_set; struct nft_set_iter { u8 genmask; + enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; -- cgit v1.2.3 From 87b3593bed1868b2d9fe096c01bcdf0ea86cbebf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Apr 2024 13:47:33 +0200 Subject: netfilter: flowtable: validate pppoe header Ensure there is sufficient room to access the protocol field of the PPPoe header. Validate it once before the flowtable lookup, then use a helper function to access protocol field. Reported-by: syzbot+b6f07e1c07ef40199081@syzkaller.appspotmail.com Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index a763dd327c6e..9abb7ee40d72 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -336,7 +336,7 @@ int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); -static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) +static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb) { __be16 proto; @@ -352,6 +352,16 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) return 0; } +static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto) +{ + if (!pskb_may_pull(skb, PPPOE_SES_HLEN)) + return false; + + *inner_proto = __nf_flow_pppoe_proto(skb); + + return true; +} + #define NF_FLOW_TABLE_STAT_INC(net, count) __this_cpu_inc((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_DEC(net, count) __this_cpu_dec((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count) \ -- cgit v1.2.3 From ebaed6d4def877d2035786ff318379eb750044c8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Mar 2024 11:29:10 -0700 Subject: peci: linux/peci.h: fix Excess kernel-doc description warning Remove the @controller: line to prevent the kernel-doc warning: include/linux/peci.h:84: warning: Excess struct member 'controller' description in 'peci_device' Signed-off-by: Randy Dunlap Cc: Iwona Winiarska Cc: openbmc@lists.ozlabs.org Reviewed-by: Iwona Winiarska Fixes: 6523d3b2ffa2 ("peci: Add core infrastructure") Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240329182910.29495-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/peci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/peci.h b/include/linux/peci.h index 9b3d36aff431..90e241458ef6 100644 --- a/include/linux/peci.h +++ b/include/linux/peci.h @@ -58,7 +58,6 @@ static inline struct peci_controller *to_peci_controller(void *d) /** * struct peci_device - PECI device * @dev: device object to register PECI device to the device model - * @controller: manages the bus segment hosting this PECI device * @info: PECI device characteristics * @info.family: device family * @info.model: device model -- cgit v1.2.3 From 89f9a1e876b5a7ad884918c03a46831af202c8a0 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Sun, 14 Apr 2024 19:49:45 +0800 Subject: bootconfig: use memblock_free_late to free xbc memory to buddy On the time to free xbc memory in xbc_exit(), memblock may has handed over memory to buddy allocator. So it doesn't make sense to free memory back to memblock. memblock_free() called by xbc_exit() even causes UAF bugs on architectures with CONFIG_ARCH_KEEP_MEMBLOCK disabled like x86. Following KASAN logs shows this case. This patch fixes the xbc memory free problem by calling memblock_free() in early xbc init error rewind path and calling memblock_free_late() in xbc exit path to free memory to buddy allocator. [ 9.410890] ================================================================== [ 9.418962] BUG: KASAN: use-after-free in memblock_isolate_range+0x12d/0x260 [ 9.426850] Read of size 8 at addr ffff88845dd30000 by task swapper/0/1 [ 9.435901] CPU: 9 PID: 1 Comm: swapper/0 Tainted: G U 6.9.0-rc3-00208-g586b5dfb51b9 #5 [ 9.446403] Hardware name: Intel Corporation RPLP LP5 (CPU:RaptorLake)/RPLP LP5 (ID:13), BIOS IRPPN02.01.01.00.00.19.015.D-00000000 Dec 28 2023 [ 9.460789] Call Trace: [ 9.463518] [ 9.465859] dump_stack_lvl+0x53/0x70 [ 9.469949] print_report+0xce/0x610 [ 9.473944] ? __virt_addr_valid+0xf5/0x1b0 [ 9.478619] ? memblock_isolate_range+0x12d/0x260 [ 9.483877] kasan_report+0xc6/0x100 [ 9.487870] ? memblock_isolate_range+0x12d/0x260 [ 9.493125] memblock_isolate_range+0x12d/0x260 [ 9.498187] memblock_phys_free+0xb4/0x160 [ 9.502762] ? __pfx_memblock_phys_free+0x10/0x10 [ 9.508021] ? mutex_unlock+0x7e/0xd0 [ 9.512111] ? __pfx_mutex_unlock+0x10/0x10 [ 9.516786] ? kernel_init_freeable+0x2d4/0x430 [ 9.521850] ? __pfx_kernel_init+0x10/0x10 [ 9.526426] xbc_exit+0x17/0x70 [ 9.529935] kernel_init+0x38/0x1e0 [ 9.533829] ? _raw_spin_unlock_irq+0xd/0x30 [ 9.538601] ret_from_fork+0x2c/0x50 [ 9.542596] ? __pfx_kernel_init+0x10/0x10 [ 9.547170] ret_from_fork_asm+0x1a/0x30 [ 9.551552] [ 9.555649] The buggy address belongs to the physical page: [ 9.561875] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x45dd30 [ 9.570821] flags: 0x200000000000000(node=0|zone=2) [ 9.576271] page_type: 0xffffffff() [ 9.580167] raw: 0200000000000000 ffffea0011774c48 ffffea0012ba1848 0000000000000000 [ 9.588823] raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 [ 9.597476] page dumped because: kasan: bad access detected [ 9.605362] Memory state around the buggy address: [ 9.610714] ffff88845dd2ff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 9.618786] ffff88845dd2ff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 9.626857] >ffff88845dd30000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 9.634930] ^ [ 9.638534] ffff88845dd30080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 9.646605] ffff88845dd30100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 9.654675] ================================================================== Link: https://lore.kernel.org/all/20240414114944.1012359-1-qiang4.zhang@linux.intel.com/ Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed") Cc: Stable@vger.kernel.org Signed-off-by: Qiang Zhang Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- include/linux/bootconfig.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index e5ee2c694401..3f4b4ac527ca 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -288,7 +288,12 @@ int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos); int __init xbc_get_info(int *node_size, size_t *data_size); /* XBC cleanup data structures */ -void __init xbc_exit(void); +void __init _xbc_exit(bool early); + +static inline void xbc_exit(void) +{ + _xbc_exit(false); +} /* XBC embedded bootconfig data in kernel */ #ifdef CONFIG_BOOT_CONFIG_EMBED -- cgit v1.2.3 From 1382e3b6a3500c245e5278c66d210c02926f804f Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Thu, 11 Apr 2024 08:11:24 +0300 Subject: net: change maximum number of UDP segments to 128 The commit fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") adds check of potential number of UDP segments vs UDP_MAX_SEGMENTS in linux/virtio_net.h. After this change certification test of USO guest-to-guest transmit on Windows driver for virtio-net device fails, for example with packet size of ~64K and mss of 536 bytes. In general the USO should not be more restrictive than TSO. Indeed, in case of unreasonably small mss a lot of segments can cause queue overflow and packet loss on the destination. Limit of 128 segments is good for any practical purpose, with minimal meaningful mss of 536 the maximal UDP packet will be divided to ~120 segments. The number of segments for UDP packets is validated vs UDP_MAX_SEGMENTS also in udp.c (v4,v6), this does not affect quest-to-guest path but does affect packets sent to host, for example. It is important to mention that UDP_MAX_SEGMENTS is kernel-only define and not available to user mode socket applications. In order to request MSS smaller than MTU the applications just uses setsockopt with SOL_UDP and UDP_SEGMENT and there is no limitations on socket API level. Fixes: fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") Signed-off-by: Yuri Benditovich Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index 17539d089666..e398e1dbd2d3 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,7 +108,7 @@ struct udp_sock { #define udp_assign_bit(nr, sk, val) \ assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val) -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk) -- cgit v1.2.3 From fe90f3967bdb3e13f133e5f44025e15f943a99c5 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 15 Apr 2024 11:21:13 -0400 Subject: sched: Add missing memory barrier in switch_mm_cid Many architectures' switch_mm() (e.g. arm64) do not have an smp_mb() which the core scheduler code has depended upon since commit: commit 223baf9d17f25 ("sched: Fix performance regression introduced by mm_cid") If switch_mm() doesn't call smp_mb(), sched_mm_cid_remote_clear() can unset the actively used cid when it fails to observe active task after it sets lazy_put. There *is* a memory barrier between storing to rq->curr and _return to userspace_ (as required by membarrier), but the rseq mm_cid has stricter requirements: the barrier needs to be issued between store to rq->curr and switch_mm_cid(), which happens earlier than: - spin_unlock(), - switch_to(). So it's fine when the architecture switch_mm() happens to have that barrier already, but less so when the architecture only provides the full barrier in switch_to() or spin_unlock(). It is a bug in the rseq switch_mm_cid() implementation. All architectures that don't have memory barriers in switch_mm(), but rather have the full barrier either in finish_lock_switch() or switch_to() have them too late for the needs of switch_mm_cid(). Introduce a new smp_mb__after_switch_mm(), defined as smp_mb() in the generic barrier.h header, and use it in switch_mm_cid() for scheduler transitions where switch_mm() is expected to provide a memory barrier. Architectures can override smp_mb__after_switch_mm() if their switch_mm() implementation provides an implicit memory barrier. Override it with a no-op on x86 which implicitly provide this memory barrier by writing to CR3. Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") Reported-by: levi.yun Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Reviewed-by: Catalin Marinas # for arm64 Acked-by: Dave Hansen # for x86 Cc: # 6.4.x Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240415152114.59122-2-mathieu.desnoyers@efficios.com --- include/asm-generic/barrier.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 0c0695763bea..d4f581c1e21d 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -294,5 +294,13 @@ do { \ #define io_stop_wc() do { } while (0) #endif +/* + * Architectures that guarantee an implicit smp_mb() in switch_mm() + * can override smp_mb__after_switch_mm. + */ +#ifndef smp_mb__after_switch_mm +# define smp_mb__after_switch_mm() smp_mb() +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_GENERIC_BARRIER_H */ -- cgit v1.2.3 From 07a57a338adb6ec9e766d6a6790f76527f45ceb5 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Sun, 7 Apr 2024 15:05:37 +0200 Subject: mm,swapops: update check in is_pfn_swap_entry for hwpoison entries Tony reported that the Machine check recovery was broken in v6.9-rc1, as he was hitting a VM_BUG_ON when injecting uncorrectable memory errors to DRAM. After some more digging and debugging on his side, he realized that this went back to v6.1, with the introduction of 'commit 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry")'. That commit, among other things, introduced swp_offset_pfn(), replacing hwpoison_entry_to_pfn() in its favour. The patch also introduced a VM_BUG_ON() check for is_pfn_swap_entry(), but is_pfn_swap_entry() never got updated to cover hwpoison entries, which means that we would hit the VM_BUG_ON whenever we would call swp_offset_pfn() for such entries on environments with CONFIG_DEBUG_VM set. Fix this by updating the check to cover hwpoison entries as well, and update the comment while we are it. Link: https://lkml.kernel.org/r/20240407130537.16977-1-osalvador@suse.de Fixes: 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry") Signed-off-by: Oscar Salvador Reported-by: Tony Luck Closes: https://lore.kernel.org/all/Zg8kLSl2yAlA3o5D@agluck-desk3/ Tested-by: Tony Luck Reviewed-by: Peter Xu Reviewed-by: David Hildenbrand Acked-by: Miaohe Lin Cc: [6.1.x] Signed-off-by: Andrew Morton --- include/linux/swapops.h | 65 +++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 48b700ba1d18..a5c560a2f8c2 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -390,6 +390,35 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) } #endif /* CONFIG_MIGRATION */ +#ifdef CONFIG_MEMORY_FAILURE + +/* + * Support for hardware poisoned pages + */ +static inline swp_entry_t make_hwpoison_entry(struct page *page) +{ + BUG_ON(!PageLocked(page)); + return swp_entry(SWP_HWPOISON, page_to_pfn(page)); +} + +static inline int is_hwpoison_entry(swp_entry_t entry) +{ + return swp_type(entry) == SWP_HWPOISON; +} + +#else + +static inline swp_entry_t make_hwpoison_entry(struct page *page) +{ + return swp_entry(0, 0); +} + +static inline int is_hwpoison_entry(swp_entry_t swp) +{ + return 0; +} +#endif + typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) @@ -483,8 +512,9 @@ static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) /* * A pfn swap entry is a special type of swap entry that always has a pfn stored - * in the swap offset. They are used to represent unaddressable device memory - * and to restrict access to a page undergoing migration. + * in the swap offset. They can either be used to represent unaddressable device + * memory, to restrict access to a page undergoing migration or to represent a + * pfn which has been hwpoisoned and unmapped. */ static inline bool is_pfn_swap_entry(swp_entry_t entry) { @@ -492,7 +522,7 @@ static inline bool is_pfn_swap_entry(swp_entry_t entry) BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); return is_migration_entry(entry) || is_device_private_entry(entry) || - is_device_exclusive_entry(entry); + is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); } struct page_vma_mapped_walk; @@ -561,35 +591,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd) } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -#ifdef CONFIG_MEMORY_FAILURE - -/* - * Support for hardware poisoned pages - */ -static inline swp_entry_t make_hwpoison_entry(struct page *page) -{ - BUG_ON(!PageLocked(page)); - return swp_entry(SWP_HWPOISON, page_to_pfn(page)); -} - -static inline int is_hwpoison_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_HWPOISON; -} - -#else - -static inline swp_entry_t make_hwpoison_entry(struct page *page) -{ - return swp_entry(0, 0); -} - -static inline int is_hwpoison_entry(swp_entry_t swp) -{ - return 0; -} -#endif - static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; -- cgit v1.2.3 From 1f737846aa3c45f07a06fa0d018b39e1afb8084a Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Tue, 9 Apr 2024 17:54:07 +0200 Subject: mm/shmem: inline shmem_is_huge() for disabled transparent hugepages In order to minimize code size (CONFIG_CC_OPTIMIZE_FOR_SIZE=y), compiler might choose to make a regular function call (out-of-line) for shmem_is_huge() instead of inlining it. When transparent hugepages are disabled (CONFIG_TRANSPARENT_HUGEPAGE=n), it can cause compilation error. mm/shmem.c: In function `shmem_getattr': ./include/linux/huge_mm.h:383:27: note: in expansion of macro `BUILD_BUG' 383 | #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) | ^~~~~~~~~ mm/shmem.c:1148:33: note: in expansion of macro `HPAGE_PMD_SIZE' 1148 | stat->blksize = HPAGE_PMD_SIZE; To prevent the possible error, always inline shmem_is_huge() when transparent hugepages are disabled. Link: https://lkml.kernel.org/r/20240409155407.2322714-1-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ilya Leoshkevich Cc: Vasily Gorbik Cc: Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a4c15db2f5e5..3fb18f7eb73e 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -110,8 +110,17 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags); +#else +static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags) +{ + return false; +} +#endif + #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); #else -- cgit v1.2.3 From 69ffed4b62523bbc85511f150500329d28aba356 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 17 Apr 2024 17:19:13 +0300 Subject: gpiolib: swnode: Remove wrong header inclusion The flags in the software node properties are supposed to be the GPIO lookup flags, which are provided by gpio/machine.h, as the software nodes are the kernel internal thing and doesn't need to rely to any of ABIs. Fixes: e7f9ff5dc90c ("gpiolib: add support for software nodes") Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/property.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/gpio/property.h b/include/linux/gpio/property.h index 6c75c8bd44a0..1a14e239221f 100644 --- a/include/linux/gpio/property.h +++ b/include/linux/gpio/property.h @@ -2,7 +2,6 @@ #ifndef __LINUX_GPIO_PROPERTY_H #define __LINUX_GPIO_PROPERTY_H -#include /* for GPIO_* flags */ #include #define PROPERTY_ENTRY_GPIO(_name_, _chip_node_, _idx_, _flags_) \ -- cgit v1.2.3 From 0f022d32c3eca477fbf79a205243a6123ed0fe11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Apr 2024 18:07:28 -0300 Subject: net/sched: Fix mirred deadlock on device recursion When the mirred action is used on a classful egress qdisc and a packet is mirrored or redirected to self we hit a qdisc lock deadlock. See trace below. [..... other info removed for brevity....] [ 82.890906] [ 82.890906] ============================================ [ 82.890906] WARNING: possible recursive locking detected [ 82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G W [ 82.890906] -------------------------------------------- [ 82.890906] ping/418 is trying to acquire lock: [ 82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at: __dev_queue_xmit+0x1778/0x3550 [ 82.890906] [ 82.890906] but task is already holding lock: [ 82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at: __dev_queue_xmit+0x1778/0x3550 [ 82.890906] [ 82.890906] other info that might help us debug this: [ 82.890906] Possible unsafe locking scenario: [ 82.890906] [ 82.890906] CPU0 [ 82.890906] ---- [ 82.890906] lock(&sch->q.lock); [ 82.890906] lock(&sch->q.lock); [ 82.890906] [ 82.890906] *** DEADLOCK *** [ 82.890906] [..... other info removed for brevity....] Example setup (eth0->eth0) to recreate tc qdisc add dev eth0 root handle 1: htb default 30 tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth0 Another example(eth0->eth1->eth0) to recreate tc qdisc add dev eth0 root handle 1: htb default 30 tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth1 tc qdisc add dev eth1 root handle 1: htb default 30 tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth0 We fix this by adding an owner field (CPU id) to struct Qdisc set after root qdisc is entered. When the softirq enters it a second time, if the qdisc owner is the same CPU, the packet is dropped to break the loop. Reported-by: Mingshuai Ren Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/ Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()") Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops") Signed-off-by: Eric Dumazet Reviewed-by: Victor Nogueira Reviewed-by: Pedro Tammela Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240415210728.36949-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cefe0c4bdae3..41ca14e81d55 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -117,6 +117,7 @@ struct Qdisc { struct qdisc_skb_head q; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; + int owner; unsigned long state; unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; -- cgit v1.2.3 From 752863bddacab6b5c5164b1df8c8b2e3a175ee28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Apr 2024 16:47:43 +0200 Subject: block: propagate partition scanning errors to the BLKRRPART ioctl Commit 4601b4b130de ("block: reopen the device in blkdev_reread_part") lost the propagation of I/O errors from the low-level read of the partition table to the user space caller of the BLKRRPART. Apparently some user space relies on, so restore the propagation. This isn't exactly pretty as other block device open calls explicitly do not are about these errors, so add a new BLK_OPEN_STRICT_SCAN to opt into the error propagation. Fixes: 4601b4b130de ("block: reopen the device in blkdev_reread_part") Reported-by: Saranya Muruganandam Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Shin'ichiro Kawasaki Tested-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20240417144743.2277601-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c3e8f7cf96be..d16320852c4b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -128,6 +128,8 @@ typedef unsigned int __bitwise blk_mode_t; #define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) /* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */ #define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5)) +/* return partition scanning errors */ +#define BLK_OPEN_STRICT_SCAN ((__force blk_mode_t)(1 << 6)) struct gendisk { /* -- cgit v1.2.3 From 32cf5a4eda464d76d553ee3f1b06c4d33d796c52 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 19 Apr 2024 11:51:12 -0400 Subject: Revert "svcrdma: Add Write chunk WRs to the RPC's Send WR chain" Performance regression reported with NFS/RDMA using Omnipath, bisected to commit e084ee673c77 ("svcrdma: Add Write chunk WRs to the RPC's Send WR chain"). Tracing on the server reports: nfsd-7771 [060] 1758.891809: svcrdma_sq_post_err: cq.id=205 cid=226 sc_sq_avail=13643/851 status=-12 sq_post_err reports ENOMEM, and the rdma->sc_sq_avail (13643) is larger than rdma->sc_sq_depth (851). The number of available Send Queue entries is always supposed to be smaller than the Send Queue depth. That seems like a Send Queue accounting bug in svcrdma. As it's getting to be late in the 6.9-rc cycle, revert this commit. It can be revisited in a subsequent kernel release. Link: https://bugzilla.kernel.org/show_bug.cgi?id=218743 Fixes: e084ee673c77 ("svcrdma: Add Write chunk WRs to the RPC's Send WR chain") Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 24cd199dd6f3..d33bab33099a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -210,7 +210,6 @@ struct svc_rdma_recv_ctxt { */ struct svc_rdma_write_info { struct svcxprt_rdma *wi_rdma; - struct list_head wi_list; const struct svc_rdma_chunk *wi_chunk; @@ -239,10 +238,7 @@ struct svc_rdma_send_ctxt { struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; - - struct list_head sc_write_info_list; struct svc_rdma_write_info sc_reply_info; - void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; @@ -274,14 +270,11 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir); -extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt); extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_pcl *write_pcl, - struct svc_rdma_send_ctxt *sctxt, - const struct xdr_buf *xdr); +extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr); extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_pcl *reply_pcl, -- cgit v1.2.3 From 98a821546b3919a10a58faa12ebe5e9a55cd638e Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Mon, 15 Apr 2024 19:10:47 +0800 Subject: vDPA: code clean for vhost_vdpa uapi This commit cleans up the uapi for vhost_vdpa by better naming some of the enums which report blk information to user space, and they are not in any official releases yet. Fixes: 1ac61ddfee93 ("vDPA: report virtio-blk flush info to user space") Fixes: ae1374b7f72c ("vDPA: report virtio-block read-only info to user space") Fixes: 330b8aea6924 ("vDPA: report virtio-block max segment size to user space") Signed-off-by: Zhu Lingshan Message-Id: <20240415111047.1047774-1-lingshan.zhu@intel.com> Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/vdpa.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h index 43c51698195c..842bf1201ac4 100644 --- a/include/uapi/linux/vdpa.h +++ b/include/uapi/linux/vdpa.h @@ -57,7 +57,7 @@ enum vdpa_attr { VDPA_ATTR_DEV_FEATURES, /* u64 */ VDPA_ATTR_DEV_BLK_CFG_CAPACITY, /* u64 */ - VDPA_ATTR_DEV_BLK_CFG_SEG_SIZE, /* u32 */ + VDPA_ATTR_DEV_BLK_CFG_SIZE_MAX, /* u32 */ VDPA_ATTR_DEV_BLK_CFG_BLK_SIZE, /* u32 */ VDPA_ATTR_DEV_BLK_CFG_SEG_MAX, /* u32 */ VDPA_ATTR_DEV_BLK_CFG_NUM_QUEUES, /* u16 */ @@ -70,8 +70,8 @@ enum vdpa_attr { VDPA_ATTR_DEV_BLK_CFG_DISCARD_SEC_ALIGN,/* u32 */ VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEC, /* u32 */ VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEG, /* u32 */ - VDPA_ATTR_DEV_BLK_CFG_READ_ONLY, /* u8 */ - VDPA_ATTR_DEV_BLK_CFG_FLUSH, /* u8 */ + VDPA_ATTR_DEV_BLK_READ_ONLY, /* u8 */ + VDPA_ATTR_DEV_BLK_FLUSH, /* u8 */ /* new attributes must be added above here */ VDPA_ATTR_MAX, -- cgit v1.2.3 From 96e20adc43c4f81e9163a5188cee75a6dd393e09 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Mon, 22 Apr 2024 09:38:33 +0300 Subject: regulator: change stubbed devm_regulator_get_enable to return Ok The devm_regulator_get_enable() should be a 'call and forget' API, meaning, when it is used to enable the regulators, the API does not provide a handle to do any further control of the regulators. It gives no real benefit to return an error from the stub if CONFIG_REGULATOR is not set. On the contrary, returning and error is causing problems to drivers when hardware is such it works out just fine with no regulator control. Returning an error forces drivers to specifically handle the case where CONFIG_REGULATOR is not set, making the mere existence of the stub questionalble. Furthermore, the stub of the regulator_enable() seems to be returning Ok. Change the stub implementation for the devm_regulator_get_enable() to return Ok so drivers do not separately handle the case where the CONFIG_REGULATOR is not set. Signed-off-by: Matti Vaittinen Reported-by: Aleksander Mazur Suggested-by: Guenter Roeck Fixes: da279e6965b3 ("regulator: Add devm helpers for get and enable") Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/ZiYF6d1V1vSPcsJS@drtxq0yyyyyyyyyyyyyby-3.rev.dnainternet.fi Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 4660582a3302..71232fb7dda3 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -320,7 +320,7 @@ devm_regulator_get_exclusive(struct device *dev, const char *id) static inline int devm_regulator_get_enable(struct device *dev, const char *id) { - return -ENODEV; + return 0; } static inline int devm_regulator_get_enable_optional(struct device *dev, -- cgit v1.2.3 From ff33132605c1a0acea59e4c523cb7c6fabe856b2 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Tue, 23 Apr 2024 14:38:28 +0300 Subject: regulator: change devm_regulator_get_enable_optional() stub to return Ok The devm_regulator_get_enable_optional() should be a 'call and forget' API, meaning, when it is used to enable the regulators, the API does not provide a handle to do any further control of the regulators. It gives no real benefit to return an error from the stub if CONFIG_REGULATOR is not set. On the contrary, returning an error is causing problems to drivers when hardware is such it works out just fine with no regulator control. Returning an error forces drivers to specifically handle the case where CONFIG_REGULATOR is not set, making the mere existence of the stub questionalble. Change the stub implementation for the devm_regulator_get_enable_optional() to return Ok so drivers do not separately handle the case where the CONFIG_REGULATOR is not set. Signed-off-by: Matti Vaittinen Fixes: da279e6965b3 ("regulator: Add devm helpers for get and enable") Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/ZiedtOE00Zozd3XO@fedora Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 71232fb7dda3..ed180ca419da 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -326,7 +326,7 @@ static inline int devm_regulator_get_enable(struct device *dev, const char *id) static inline int devm_regulator_get_enable_optional(struct device *dev, const char *id) { - return -ENODEV; + return 0; } static inline struct regulator *__must_check -- cgit v1.2.3 From 3584718cf2ec7e79b6814f2596dcf398c5fb2eca Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 21 Apr 2024 17:52:48 +0000 Subject: net: fix sk_memory_allocated_{add|sub} vs softirqs Jonathan Heathcote reported a regression caused by blamed commit on aarch64 architecture. x86 happens to have irq-safe __this_cpu_add_return() and __this_cpu_sub(), but this is not generic. I think my confusion came from "struct sock" argument, because these helpers are called with a locked socket. But the memory accounting is per-proto (and per-cpu after the blamed commit). We might cleanup these helpers later to directly accept a "struct proto *proto" argument. Switch to this_cpu_add_return() and this_cpu_xchg() operations, and get rid of preempt_disable()/preempt_enable() pairs. Fast path becomes a bit faster as a result :) Many thanks to Jonathan Heathcote for his awesome report and investigations. Fixes: 3cd3399dd7a8 ("net: implement per-cpu reserves for memory_allocated") Reported-by: Jonathan Heathcote Closes: https://lore.kernel.org/netdev/VI1PR01MB42407D7947B2EA448F1E04EFD10D2@VI1PR01MB4240.eurprd01.prod.exchangelabs.com/ Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/r/20240421175248.1692552-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index f57bfd8a2ad2..b4b553df7870 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1410,32 +1410,34 @@ sk_memory_allocated(const struct sock *sk) #define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) extern int sysctl_mem_pcpu_rsv; +static inline void proto_memory_pcpu_drain(struct proto *proto) +{ + int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0); + + if (val) + atomic_long_add(val, proto->memory_allocated); +} + static inline void -sk_memory_allocated_add(struct sock *sk, int amt) +sk_memory_allocated_add(const struct sock *sk, int val) { - int local_reserve; + struct proto *proto = sk->sk_prot; - preempt_disable(); - local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt); - if (local_reserve >= READ_ONCE(sysctl_mem_pcpu_rsv)) { - __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve); - atomic_long_add(local_reserve, sk->sk_prot->memory_allocated); - } - preempt_enable(); + val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val); + + if (unlikely(val >= READ_ONCE(sysctl_mem_pcpu_rsv))) + proto_memory_pcpu_drain(proto); } static inline void -sk_memory_allocated_sub(struct sock *sk, int amt) +sk_memory_allocated_sub(const struct sock *sk, int val) { - int local_reserve; + struct proto *proto = sk->sk_prot; - preempt_disable(); - local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt); - if (local_reserve <= -READ_ONCE(sysctl_mem_pcpu_rsv)) { - __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve); - atomic_long_add(local_reserve, sk->sk_prot->memory_allocated); - } - preempt_enable(); + val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val); + + if (unlikely(val <= -READ_ONCE(sysctl_mem_pcpu_rsv))) + proto_memory_pcpu_drain(proto); } #define SK_ALLOC_PERCPU_COUNTER_BATCH 16 -- cgit v1.2.3 From 2e7ed5f5e69b6fe93dd3c6b651d041e0a7a456d1 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 16:40:33 -0400 Subject: Bluetooth: hci_sync: Use advertised PHYs on hci_le_ext_create_conn_sync The extended advertising reports do report the PHYs so this store then in hci_conn so it can be later used in hci_le_ext_create_conn_sync to narrow the PHYs to be scanned since the controller will also perform a scan having a smaller set of PHYs shall reduce the time it takes to find and connect peers. Fixes: 288c90224eec ("Bluetooth: Enable all supported LE PHY by default") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 56fb42df44a3..02af7d7013da 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -738,6 +738,8 @@ struct hci_conn { __u8 le_per_adv_data[HCI_MAX_PER_AD_TOT_LEN]; __u16 le_per_adv_data_len; __u16 le_per_adv_data_offset; + __u8 le_adv_phy; + __u8 le_adv_sec_phy; __u8 le_tx_phy; __u8 le_rx_phy; __s8 rssi; @@ -1512,7 +1514,7 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, enum conn_reasons conn_reason); struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, bool dst_resolved, u8 sec_level, - u16 conn_timeout, u8 role); + u16 conn_timeout, u8 role, u8 phy, u8 sec_phy); void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status); struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type, -- cgit v1.2.3 From a9a830a676a9a93c5020f5c61236166931fa4266 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 15 Apr 2024 13:41:01 -0400 Subject: Bluetooth: hci_event: Fix sending HCI_OP_READ_ENC_KEY_SIZE The code shall always check if HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE has been set before attempting to use HCI_OP_READ_ENC_KEY_SIZE. Fixes: c569242cd492 ("Bluetooth: hci_event: set the conn encrypted before conn establishes") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 02af7d7013da..e8f581f3f3ce 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1907,6 +1907,10 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define privacy_mode_capable(dev) (use_ll_privacy(dev) && \ (hdev->commands[39] & 0x04)) +#define read_key_size_capable(dev) \ + ((dev)->commands[20] & 0x10 && \ + !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks)) + /* Use enhanced synchronous connection if command is supported and its quirk * has not been set. */ -- cgit v1.2.3 From 12bbaae7635a56049779db3bef6e7140d9aa5f67 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 21 Mar 2024 14:24:40 +0000 Subject: mm: create FOLIO_FLAG_FALSE and FOLIO_TYPE_OPS macros Following the separation of FOLIO_FLAGS from PAGEFLAGS, separate FOLIO_FLAG_FALSE from PAGEFLAG_FALSE and FOLIO_TYPE_OPS from PAGE_TYPE_OPS. Link: https://lkml.kernel.org/r/20240321142448.1645400-3-willy@infradead.org Fixes: 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Miaohe Lin Cc: Muchun Song Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 70 +++++++++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 652d77805e99..dc1607f1415e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -458,30 +458,51 @@ static __always_inline int TestClearPage##uname(struct page *page) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) +#define FOLIO_TEST_FLAG_FALSE(name) \ +static inline bool folio_test_##name(const struct folio *folio) \ +{ return false; } +#define FOLIO_SET_FLAG_NOOP(name) \ +static inline void folio_set_##name(struct folio *folio) { } +#define FOLIO_CLEAR_FLAG_NOOP(name) \ +static inline void folio_clear_##name(struct folio *folio) { } +#define __FOLIO_SET_FLAG_NOOP(name) \ +static inline void __folio_set_##name(struct folio *folio) { } +#define __FOLIO_CLEAR_FLAG_NOOP(name) \ +static inline void __folio_clear_##name(struct folio *folio) { } +#define FOLIO_TEST_SET_FLAG_FALSE(name) \ +static inline bool folio_test_set_##name(struct folio *folio) \ +{ return false; } +#define FOLIO_TEST_CLEAR_FLAG_FALSE(name) \ +static inline bool folio_test_clear_##name(struct folio *folio) \ +{ return false; } + +#define FOLIO_FLAG_FALSE(name) \ +FOLIO_TEST_FLAG_FALSE(name) \ +FOLIO_SET_FLAG_NOOP(name) \ +FOLIO_CLEAR_FLAG_NOOP(name) + #define TESTPAGEFLAG_FALSE(uname, lname) \ -static inline bool folio_test_##lname(const struct folio *folio) { return false; } \ +FOLIO_TEST_FLAG_FALSE(lname) \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ -static inline void folio_set_##lname(struct folio *folio) { } \ +FOLIO_SET_FLAG_NOOP(lname) \ static inline void SetPage##uname(struct page *page) { } #define CLEARPAGEFLAG_NOOP(uname, lname) \ -static inline void folio_clear_##lname(struct folio *folio) { } \ +FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname, lname) \ -static inline void __folio_clear_##lname(struct folio *folio) { } \ +__FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void __ClearPage##uname(struct page *page) { } #define TESTSETFLAG_FALSE(uname, lname) \ -static inline bool folio_test_set_##lname(struct folio *folio) \ -{ return 0; } \ +FOLIO_TEST_SET_FLAG_FALSE(lname) \ static inline int TestSetPage##uname(struct page *page) { return 0; } #define TESTCLEARFLAG_FALSE(uname, lname) \ -static inline bool folio_test_clear_##lname(struct folio *folio) \ -{ return 0; } \ +FOLIO_TEST_CLEAR_FLAG_FALSE(lname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \ @@ -977,35 +998,38 @@ static inline int page_has_type(const struct page *page) return page_type_has_type(page->page_type); } +#define FOLIO_TYPE_OPS(lname, fname) \ +static __always_inline bool folio_test_##fname(const struct folio *folio)\ +{ \ + return folio_test_type(folio, PG_##lname); \ +} \ +static __always_inline void __folio_set_##fname(struct folio *folio) \ +{ \ + VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \ + folio->page.page_type &= ~PG_##lname; \ +} \ +static __always_inline void __folio_clear_##fname(struct folio *folio) \ +{ \ + VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ + folio->page.page_type |= PG_##lname; \ +} + #define PAGE_TYPE_OPS(uname, lname, fname) \ +FOLIO_TYPE_OPS(lname, fname) \ static __always_inline int Page##uname(const struct page *page) \ { \ return PageType(page, PG_##lname); \ } \ -static __always_inline int folio_test_##fname(const struct folio *folio)\ -{ \ - return folio_test_type(folio, PG_##lname); \ -} \ static __always_inline void __SetPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!PageType(page, 0), page); \ page->page_type &= ~PG_##lname; \ } \ -static __always_inline void __folio_set_##fname(struct folio *folio) \ -{ \ - VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \ - folio->page.page_type &= ~PG_##lname; \ -} \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type |= PG_##lname; \ -} \ -static __always_inline void __folio_clear_##fname(struct folio *folio) \ -{ \ - VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ - folio->page.page_type |= PG_##lname; \ -} \ +} /* * PageBuddy() indicates that the page is free and in the buddy system -- cgit v1.2.3 From fd1a745ce03e37945674c14833870a9af0882e2d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 21 Mar 2024 14:24:42 +0000 Subject: mm: support page_mapcount() on page_has_type() pages Return 0 for pages which can't be mapped. This matches how page_mapped() works. It is more convenient for users to not have to filter out these pages. Link: https://lkml.kernel.org/r/20240321142448.1645400-5-willy@infradead.org Fixes: 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Miaohe Lin Cc: Muchun Song Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 +++++--- include/linux/page-flags.h | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b0ee64225de..b6bdaa18b9e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1223,14 +1223,16 @@ static inline void page_mapcount_reset(struct page *page) * a large folio, it includes the number of times this page is mapped * as part of that folio. * - * The result is undefined for pages which cannot be mapped into userspace. - * For example SLAB or special types of pages. See function page_has_type(). - * They use this field in struct page differently. + * Will report 0 for pages which cannot be mapped into userspace, eg + * slab, page tables and similar. */ static inline int page_mapcount(struct page *page) { int mapcount = atomic_read(&page->_mapcount) + 1; + /* Handle page_has_type() pages */ + if (mapcount < 0) + mapcount = 0; if (unlikely(PageCompound(page))) mapcount += folio_entire_mapcount(page_folio(page)); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index dc1607f1415e..35a0087d0910 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -971,12 +971,12 @@ static inline bool is_page_hwpoison(struct page *page) * page_type may be used. Because it is initialised to -1, we invert the * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and - * low bits so that an underflow or overflow of page_mapcount() won't be + * low bits so that an underflow or overflow of _mapcount won't be * mistaken for a page type value. */ #define PAGE_TYPE_BASE 0xf0000000 -/* Reserve 0x0000007f to catch underflows of page_mapcount */ +/* Reserve 0x0000007f to catch underflows of _mapcount */ #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 #define PG_offline 0x00000100 -- cgit v1.2.3 From d99e3140a4d33e26066183ff727d8f02f56bec64 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 21 Mar 2024 14:24:43 +0000 Subject: mm: turn folio_test_hugetlb into a PageType The current folio_test_hugetlb() can be fooled by a concurrent folio split into returning true for a folio which has never belonged to hugetlbfs. This can't happen if the caller holds a refcount on it, but we have a few places (memory-failure, compaction, procfs) which do not and should not take a speculative reference. Since hugetlb pages do not use individual page mapcounts (they are always fully mapped and use the entire_mapcount field to record the number of mappings), the PageType field is available now that page_mapcount() ignores the value in this field. In compaction and with CONFIG_DEBUG_VM enabled, the current implementation can result in an oops, as reported by Luis. This happens since 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") effectively added some VM_BUG_ON() checks in the PageHuge() testing path. [willy@infradead.org: update vmcoreinfo] Link: https://lkml.kernel.org/r/ZgGZUvsdhaT1Va-T@casper.infradead.org Link: https://lkml.kernel.org/r/20240321142448.1645400-6-willy@infradead.org Fixes: 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Reported-by: Luis Chamberlain Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218227 Cc: Miaohe Lin Cc: Muchun Song Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 70 ++++++++++++++++++++---------------------- include/trace/events/mmflags.h | 1 + 2 files changed, 34 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 35a0087d0910..4bf1c25fd1dc 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -190,7 +190,6 @@ enum pageflags { /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_error, - PG_hugetlb = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ }; @@ -876,29 +875,6 @@ TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable) #define PG_head_mask ((1UL << PG_head)) -#ifdef CONFIG_HUGETLB_PAGE -int PageHuge(const struct page *page); -SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) -CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) - -/** - * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs - * @folio: The folio to test. - * - * Context: Any context. Caller should have a reference on the folio to - * prevent it from being turned into a tail page. - * Return: True for hugetlbfs folios, false for anon folios or folios - * belonging to other filesystems. - */ -static inline bool folio_test_hugetlb(const struct folio *folio) -{ - return folio_test_large(folio) && - test_bit(PG_hugetlb, const_folio_flags(folio, 1)); -} -#else -TESTPAGEFLAG_FALSE(Huge, hugetlb) -#endif - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for @@ -954,18 +930,6 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) #endif -/* - * Check if a page is currently marked HWPoisoned. Note that this check is - * best effort only and inherently racy: there is no way to synchronize with - * failing hardware. - */ -static inline bool is_page_hwpoison(struct page *page) -{ - if (PageHWPoison(page)) - return true; - return PageHuge(page) && PageHWPoison(compound_head(page)); -} - /* * For pages that are never mapped to userspace (and aren't PageSlab), * page_type may be used. Because it is initialised to -1, we invert the @@ -982,6 +946,7 @@ static inline bool is_page_hwpoison(struct page *page) #define PG_offline 0x00000100 #define PG_table 0x00000200 #define PG_guard 0x00000400 +#define PG_hugetlb 0x00000800 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -1076,6 +1041,37 @@ PAGE_TYPE_OPS(Table, table, pgtable) */ PAGE_TYPE_OPS(Guard, guard, guard) +#ifdef CONFIG_HUGETLB_PAGE +FOLIO_TYPE_OPS(hugetlb, hugetlb) +#else +FOLIO_TEST_FLAG_FALSE(hugetlb) +#endif + +/** + * PageHuge - Determine if the page belongs to hugetlbfs + * @page: The page to test. + * + * Context: Any context. + * Return: True for hugetlbfs pages, false for anon pages or pages + * belonging to other filesystems. + */ +static inline bool PageHuge(const struct page *page) +{ + return folio_test_hugetlb(page_folio(page)); +} + +/* + * Check if a page is currently marked HWPoisoned. Note that this check is + * best effort only and inherently racy: there is no way to synchronize with + * failing hardware. + */ +static inline bool is_page_hwpoison(struct page *page) +{ + if (PageHWPoison(page)) + return true; + return PageHuge(page) && PageHWPoison(compound_head(page)); +} + extern bool is_free_buddy_page(struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); @@ -1142,7 +1138,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ - 1UL << PG_hugetlb | 1UL << PG_large_rmappable) + 1UL << PG_large_rmappable) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index d801409b33cf..d55e53ac91bd 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -135,6 +135,7 @@ IF_HAVE_PG_ARCH_X(arch_3) #define DEF_PAGETYPE_NAME(_name) { PG_##_name, __stringify(_name) } #define __def_pagetype_names \ + DEF_PAGETYPE_NAME(hugetlb), \ DEF_PAGETYPE_NAME(offline), \ DEF_PAGETYPE_NAME(guard), \ DEF_PAGETYPE_NAME(table), \ -- cgit v1.2.3 From ce0abef6a1d540acef85068e0e82bdf1fbeeb0e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Apr 2024 17:05:55 -0700 Subject: cpu: Ignore "mitigations" kernel parameter if CPU_MITIGATIONS=n Explicitly disallow enabling mitigations at runtime for kernels that were built with CONFIG_CPU_MITIGATIONS=n, as some architectures may omit code entirely if mitigations are disabled at compile time. E.g. on x86, a large pile of Kconfigs are buried behind CPU_MITIGATIONS, and trying to provide sane behavior for retroactively enabling mitigations is extremely difficult, bordering on impossible. E.g. page table isolation and call depth tracking require build-time support, BHI mitigations will still be off without additional kernel parameters, etc. [ bp: Touchups. ] Signed-off-by: Sean Christopherson Signed-off-by: Borislav Petkov (AMD) Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240420000556.2645001-3-seanjc@google.com --- include/linux/cpu.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 272e4e79e15c..861c3bfc5f17 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -221,7 +221,18 @@ void cpuhp_report_idle_dead(void); static inline void cpuhp_report_idle_dead(void) { } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_CPU_MITIGATIONS extern bool cpu_mitigations_off(void); extern bool cpu_mitigations_auto_nosmt(void); +#else +static inline bool cpu_mitigations_off(void) +{ + return true; +} +static inline bool cpu_mitigations_auto_nosmt(void) +{ + return false; +} +#endif #endif /* _LINUX_CPU_H_ */ -- cgit v1.2.3 From e877d705704d7c8fe17b6b5ebdfdb14b84c207a7 Mon Sep 17 00:00:00 2001 From: Christian Gmeiner Date: Sat, 20 Apr 2024 15:41:58 +0200 Subject: Revert "drm/etnaviv: Expose a few more chipspecs to userspace" This reverts commit 1dccdba084897443d116508a8ed71e0ac8a031a4. In userspace a different approach was choosen - hwdb. As a result, there is no need for these values. Signed-off-by: Christian Gmeiner Reviewed-by: Tomeu Vizoso Signed-off-by: Lucas Stach --- include/uapi/drm/etnaviv_drm.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/uapi/drm/etnaviv_drm.h b/include/uapi/drm/etnaviv_drm.h index d87410a8443a..af024d90453d 100644 --- a/include/uapi/drm/etnaviv_drm.h +++ b/include/uapi/drm/etnaviv_drm.h @@ -77,11 +77,6 @@ struct drm_etnaviv_timespec { #define ETNAVIV_PARAM_GPU_PRODUCT_ID 0x1c #define ETNAVIV_PARAM_GPU_CUSTOMER_ID 0x1d #define ETNAVIV_PARAM_GPU_ECO_ID 0x1e -#define ETNAVIV_PARAM_GPU_NN_CORE_COUNT 0x1f -#define ETNAVIV_PARAM_GPU_NN_MAD_PER_CORE 0x20 -#define ETNAVIV_PARAM_GPU_TP_CORE_COUNT 0x21 -#define ETNAVIV_PARAM_GPU_ON_CHIP_SRAM_SIZE 0x22 -#define ETNAVIV_PARAM_GPU_AXI_SRAM_SIZE 0x23 #define ETNA_MAX_PIPES 4 -- cgit v1.2.3 From 475747a19316b08e856c666a20503e73d7ed67ed Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 23 Apr 2024 11:13:02 -0700 Subject: macsec: Enable devices to advertise whether they update sk_buff md_dst during offloads Cannot know whether a Rx skb missing md_dst is intended for MACsec or not without knowing whether the device is able to update this field during an offload. Assume that an offload to a MACsec device cannot support updating md_dst by default. Capable devices can advertise that they do indicate that an skb is related to a MACsec offloaded packet using the md_dst. Cc: Sabrina Dubroca Cc: stable@vger.kernel.org Fixes: 860ead89b851 ("net/macsec: Add MACsec skb_metadata_dst Rx Data path support") Signed-off-by: Rahul Rameshbabu Reviewed-by: Benjamin Poirier Reviewed-by: Cosmin Ratiu Reviewed-by: Sabrina Dubroca Link: https://lore.kernel.org/r/20240423181319.115860-2-rrameshbabu@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/macsec.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/macsec.h b/include/net/macsec.h index dbd22180cc5c..de216cbc6b05 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -321,6 +321,7 @@ struct macsec_context { * for the TX tag * @needed_tailroom: number of bytes reserved at the end of the sk_buff for the * TX tag + * @rx_uses_md_dst: whether MACsec device offload supports sk_buff md_dst */ struct macsec_ops { /* Device wide */ @@ -352,6 +353,7 @@ struct macsec_ops { struct sk_buff *skb); unsigned int needed_headroom; unsigned int needed_tailroom; + bool rx_uses_md_dst; }; void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa); -- cgit v1.2.3 From 6e159fd653d7ebf6290358e0330a0cb8a75cf73b Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 23 Apr 2024 11:13:03 -0700 Subject: ethernet: Add helper for assigning packet type when dest address does not match device address Enable reuse of logic in eth_type_trans for determining packet type. Suggested-by: Sabrina Dubroca Cc: stable@vger.kernel.org Signed-off-by: Rahul Rameshbabu Reviewed-by: Sabrina Dubroca Link: https://lore.kernel.org/r/20240423181319.115860-3-rrameshbabu@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/etherdevice.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 224645f17c33..297231854ada 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -607,6 +607,31 @@ static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr, eth_hw_addr_set(dev, addr); } +/** + * eth_skb_pkt_type - Assign packet type if destination address does not match + * @skb: Assigned a packet type if address does not match @dev address + * @dev: Network device used to compare packet address against + * + * If the destination MAC address of the packet does not match the network + * device address, assign an appropriate packet type. + */ +static inline void eth_skb_pkt_type(struct sk_buff *skb, + const struct net_device *dev) +{ + const struct ethhdr *eth = eth_hdr(skb); + + if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) { + if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { + if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } else { + skb->pkt_type = PACKET_OTHERHOST; + } + } +} + /** * eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame * @skb: Buffer to pad -- cgit v1.2.3 From 0844370f8945086eb9335739d10205dcea8d707b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 24 Apr 2024 12:25:47 +0200 Subject: tls: fix lockless read of strp->msg_ready in ->poll tls_sk_poll is called without locking the socket, and needs to read strp->msg_ready (via tls_strp_msg_ready). Convert msg_ready to a bool and use READ_ONCE/WRITE_ONCE where needed. The remaining reads are only performed when the socket is locked. Fixes: 121dca784fc0 ("tls: suppress wakeups unless we have a full record") Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/0b7ee062319037cf86af6b317b3d72f7bfcd2e97.1713797701.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- include/net/tls.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 340ad43971e4..33f657d3c051 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -111,7 +111,8 @@ struct tls_strparser { u32 stopped : 1; u32 copy_mode : 1; u32 mixed_decrypted : 1; - u32 msg_ready : 1; + + bool msg_ready; struct strp_msg stm; -- cgit v1.2.3 From 1971d13ffa84a551d29a81fdf5b5ec5be166ac83 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Apr 2024 10:04:43 -0700 Subject: af_unix: Suppress false-positive lockdep splat for spin_lock() in __unix_gc(). syzbot reported a lockdep splat regarding unix_gc_lock and unix_state_lock(). One is called from recvmsg() for a connected socket, and another is called from GC for TCP_LISTEN socket. So, the splat is false-positive. Let's add a dedicated lock class for the latter to suppress the splat. Note that this change is not necessary for net-next.git as the issue is only applied to the old GC impl. [0]: WARNING: possible circular locking dependency detected 6.9.0-rc5-syzkaller-00007-g4d2008430ce8 #0 Not tainted ----------------------------------------------------- kworker/u8:1/11 is trying to acquire lock: ffff88807cea4e70 (&u->lock){+.+.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] ffff88807cea4e70 (&u->lock){+.+.}-{2:2}, at: __unix_gc+0x40e/0xf70 net/unix/garbage.c:302 but task is already holding lock: ffffffff8f6ab638 (unix_gc_lock){+.+.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] ffffffff8f6ab638 (unix_gc_lock){+.+.}-{2:2}, at: __unix_gc+0x117/0xf70 net/unix/garbage.c:261 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (unix_gc_lock){+.+.}-{2:2}: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:351 [inline] unix_notinflight+0x13d/0x390 net/unix/garbage.c:140 unix_detach_fds net/unix/af_unix.c:1819 [inline] unix_destruct_scm+0x221/0x350 net/unix/af_unix.c:1876 skb_release_head_state+0x100/0x250 net/core/skbuff.c:1188 skb_release_all net/core/skbuff.c:1200 [inline] __kfree_skb net/core/skbuff.c:1216 [inline] kfree_skb_reason+0x16d/0x3b0 net/core/skbuff.c:1252 kfree_skb include/linux/skbuff.h:1262 [inline] manage_oob net/unix/af_unix.c:2672 [inline] unix_stream_read_generic+0x1125/0x2700 net/unix/af_unix.c:2749 unix_stream_splice_read+0x239/0x320 net/unix/af_unix.c:2981 do_splice_read fs/splice.c:985 [inline] splice_file_to_pipe+0x299/0x500 fs/splice.c:1295 do_splice+0xf2d/0x1880 fs/splice.c:1379 __do_splice fs/splice.c:1436 [inline] __do_sys_splice fs/splice.c:1652 [inline] __se_sys_splice+0x331/0x4a0 fs/splice.c:1634 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (&u->lock){+.+.}-{2:2}: check_prev_add kernel/locking/lockdep.c:3134 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869 __lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:351 [inline] __unix_gc+0x40e/0xf70 net/unix/garbage.c:302 process_one_work kernel/workqueue.c:3254 [inline] process_scheduled_works+0xa10/0x17c0 kernel/workqueue.c:3335 worker_thread+0x86d/0xd70 kernel/workqueue.c:3416 kthread+0x2f0/0x390 kernel/kthread.c:388 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(unix_gc_lock); lock(&u->lock); lock(unix_gc_lock); lock(&u->lock); *** DEADLOCK *** 3 locks held by kworker/u8:1/11: #0: ffff888015089148 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work kernel/workqueue.c:3229 [inline] #0: ffff888015089148 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_scheduled_works+0x8e0/0x17c0 kernel/workqueue.c:3335 #1: ffffc90000107d00 (unix_gc_work){+.+.}-{0:0}, at: process_one_work kernel/workqueue.c:3230 [inline] #1: ffffc90000107d00 (unix_gc_work){+.+.}-{0:0}, at: process_scheduled_works+0x91b/0x17c0 kernel/workqueue.c:3335 #2: ffffffff8f6ab638 (unix_gc_lock){+.+.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] #2: ffffffff8f6ab638 (unix_gc_lock){+.+.}-{2:2}, at: __unix_gc+0x117/0xf70 net/unix/garbage.c:261 stack backtrace: CPU: 0 PID: 11 Comm: kworker/u8:1 Not tainted 6.9.0-rc5-syzkaller-00007-g4d2008430ce8 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Workqueue: events_unbound __unix_gc Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2187 check_prev_add kernel/locking/lockdep.c:3134 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869 __lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:351 [inline] __unix_gc+0x40e/0xf70 net/unix/garbage.c:302 process_one_work kernel/workqueue.c:3254 [inline] process_scheduled_works+0xa10/0x17c0 kernel/workqueue.c:3335 worker_thread+0x86d/0xd70 kernel/workqueue.c:3416 kthread+0x2f0/0x390 kernel/kthread.c:388 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fixes: 47d8ac011fe1 ("af_unix: Fix garbage collector racing against connect()") Reported-and-tested-by: syzbot+fa379358c28cc87cc307@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=fa379358c28cc87cc307 Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240424170443.9832-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 627ea8e2d915..3dee0b2721aa 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -85,6 +85,9 @@ enum unix_socket_lock_class { U_LOCK_NORMAL, U_LOCK_SECOND, /* for double locking, see unix_state_double_lock(). */ U_LOCK_DIAG, /* used while dumping icons, see sk_diag_dump_icons(). */ + U_LOCK_GC_LISTENER, /* used for listening socket while determining gc + * candidates to close a small race window. + */ }; static inline void unix_state_lock_nested(struct sock *sk, -- cgit v1.2.3 From 66e13b615a0ce76b785d780ecc9776ba71983629 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 24 Apr 2024 10:02:08 +0000 Subject: bpf: verifier: prevent userspace memory access With BPF_PROBE_MEM, BPF allows de-referencing an untrusted pointer. To thwart invalid memory accesses, the JITs add an exception table entry for all such accesses. But in case the src_reg + offset is a userspace address, the BPF program might read that memory if the user has mapped it. Make the verifier add guard instructions around such memory accesses and skip the load if the address falls into the userspace region. The JITs need to implement bpf_arch_uaddress_limit() to define where the userspace addresses end for that architecture or TASK_SIZE is taken as default. The implementation is as follows: REG_AX = SRC_REG if(offset) REG_AX += offset; REG_AX >>= 32; if (REG_AX <= (uaddress_limit >> 32)) DST_REG = 0; else DST_REG = *(size *)(SRC_REG + offset); Comparing just the upper 32 bits of the load address with the upper 32 bits of uaddress_limit implies that the values are being aligned down to a 4GB boundary before comparison. The above means that all loads with address <= uaddress_limit + 4GB are skipped. This is acceptable because there is a large hole (much larger than 4GB) between userspace and kernel space memory, therefore a correctly functioning BPF program should not access this 4GB memory above the userspace. Let's analyze what this patch does to the following fentry program dereferencing an untrusted pointer: SEC("fentry/tcp_v4_connect") int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk) { *(volatile long *)sk; return 0; } BPF Program before | BPF Program after ------------------ | ----------------- 0: (79) r1 = *(u64 *)(r1 +0) 0: (79) r1 = *(u64 *)(r1 +0) ----------------------------------------------------------------------- 1: (79) r1 = *(u64 *)(r1 +0) --\ 1: (bf) r11 = r1 ----------------------------\ \ 2: (77) r11 >>= 32 2: (b7) r0 = 0 \ \ 3: (b5) if r11 <= 0x8000 goto pc+2 3: (95) exit \ \-> 4: (79) r1 = *(u64 *)(r1 +0) \ 5: (05) goto pc+1 \ 6: (b7) r1 = 0 \-------------------------------------- 7: (b7) r0 = 0 8: (95) exit As you can see from above, in the best case (off=0), 5 extra instructions are emitted. Now, we analyze the same program after it has gone through the JITs of ARM64 and RISC-V architectures. We follow the single load instruction that has the untrusted pointer and see what instrumentation has been added around it. x86-64 JIT ========== JIT's Instrumentation (upstream) --------------------- 0: nopl 0x0(%rax,%rax,1) 5: xchg %ax,%ax 7: push %rbp 8: mov %rsp,%rbp b: mov 0x0(%rdi),%rdi --------------------------------- f: movabs $0x800000000000,%r11 19: cmp %r11,%rdi 1c: jb 0x000000000000002a 1e: mov %rdi,%r11 21: add $0x0,%r11 28: jae 0x000000000000002e 2a: xor %edi,%edi 2c: jmp 0x0000000000000032 2e: mov 0x0(%rdi),%rdi --------------------------------- 32: xor %eax,%eax 34: leave 35: ret The x86-64 JIT already emits some instructions to protect against user memory access. This patch doesn't make any changes for the x86-64 JIT. ARM64 JIT ========= No Intrumentation Verifier's Instrumentation (upstream) (This patch) ----------------- -------------------------- 0: add x9, x30, #0x0 0: add x9, x30, #0x0 4: nop 4: nop 8: paciasp 8: paciasp c: stp x29, x30, [sp, #-16]! c: stp x29, x30, [sp, #-16]! 10: mov x29, sp 10: mov x29, sp 14: stp x19, x20, [sp, #-16]! 14: stp x19, x20, [sp, #-16]! 18: stp x21, x22, [sp, #-16]! 18: stp x21, x22, [sp, #-16]! 1c: stp x25, x26, [sp, #-16]! 1c: stp x25, x26, [sp, #-16]! 20: stp x27, x28, [sp, #-16]! 20: stp x27, x28, [sp, #-16]! 24: mov x25, sp 24: mov x25, sp 28: mov x26, #0x0 28: mov x26, #0x0 2c: sub x27, x25, #0x0 2c: sub x27, x25, #0x0 30: sub sp, sp, #0x0 30: sub sp, sp, #0x0 34: ldr x0, [x0] 34: ldr x0, [x0] -------------------------------------------------------------------------------- 38: ldr x0, [x0] ----------\ 38: add x9, x0, #0x0 -----------------------------------\\ 3c: lsr x9, x9, #32 3c: mov x7, #0x0 \\ 40: cmp x9, #0x10, lsl #12 40: mov sp, sp \\ 44: b.ls 0x0000000000000050 44: ldp x27, x28, [sp], #16 \\--> 48: ldr x0, [x0] 48: ldp x25, x26, [sp], #16 \ 4c: b 0x0000000000000054 4c: ldp x21, x22, [sp], #16 \ 50: mov x0, #0x0 50: ldp x19, x20, [sp], #16 \--------------------------------------- 54: ldp x29, x30, [sp], #16 54: mov x7, #0x0 58: add x0, x7, #0x0 58: mov sp, sp 5c: autiasp 5c: ldp x27, x28, [sp], #16 60: ret 60: ldp x25, x26, [sp], #16 64: nop 64: ldp x21, x22, [sp], #16 68: ldr x10, 0x0000000000000070 68: ldp x19, x20, [sp], #16 6c: br x10 6c: ldp x29, x30, [sp], #16 70: add x0, x7, #0x0 74: autiasp 78: ret 7c: nop 80: ldr x10, 0x0000000000000088 84: br x10 There are 6 extra instructions added in ARM64 in the best case. This will become 7 in the worst case (off != 0). RISC-V JIT (RISCV_ISA_C Disabled) ========== No Intrumentation Verifier's Instrumentation (upstream) (This patch) ----------------- -------------------------- 0: nop 0: nop 4: nop 4: nop 8: li a6, 33 8: li a6, 33 c: addi sp, sp, -16 c: addi sp, sp, -16 10: sd s0, 8(sp) 10: sd s0, 8(sp) 14: addi s0, sp, 16 14: addi s0, sp, 16 18: ld a0, 0(a0) 18: ld a0, 0(a0) --------------------------------------------------------------- 1c: ld a0, 0(a0) --\ 1c: mv t0, a0 --------------------------\ \ 20: srli t0, t0, 32 20: li a5, 0 \ \ 24: lui t1, 4096 24: ld s0, 8(sp) \ \ 28: sext.w t1, t1 28: addi sp, sp, 16 \ \ 2c: bgeu t1, t0, 12 2c: sext.w a0, a5 \ \--> 30: ld a0, 0(a0) 30: ret \ 34: j 8 \ 38: li a0, 0 \------------------------------ 3c: li a5, 0 40: ld s0, 8(sp) 44: addi sp, sp, 16 48: sext.w a0, a5 4c: ret There are 7 extra instructions added in RISC-V. Fixes: 800834285361 ("bpf, arm64: Add BPF exception tables") Reported-by: Breno Leitao Suggested-by: Alexei Starovoitov Acked-by: Ilya Leoshkevich Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240424100210.11982-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index c99bc3df2d28..219ee7a76874 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -963,6 +963,7 @@ bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); +u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); -- cgit v1.2.3 From 2e5449f4f21a1b0bd9beec4c4b580eb1f9b9ed7f Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 27 Apr 2024 15:27:58 +0900 Subject: profiling: Remove create_prof_cpu_mask(). create_prof_cpu_mask() is no longer used after commit 1f44a225777e ("s390: convert interrupt handling to use generic hardirq"). Signed-off-by: Tetsuo Handa Signed-off-by: Linus Torvalds --- include/linux/profile.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/profile.h b/include/linux/profile.h index 11db1ec516e2..04ae5ebcb637 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -18,13 +18,8 @@ struct proc_dir_entry; struct notifier_block; #if defined(CONFIG_PROFILING) && defined(CONFIG_PROC_FS) -void create_prof_cpu_mask(void); int create_proc_profile(void); #else -static inline void create_prof_cpu_mask(void) -{ -} - static inline int create_proc_profile(void) { return 0; -- cgit v1.2.3 From f848337cd801c7106a4ec0d61765771dab2a5909 Mon Sep 17 00:00:00 2001 From: Oswald Buddenhagen Date: Sun, 28 Apr 2024 11:37:13 +0200 Subject: ALSA: emu10k1: move the whole GPIO event handling to the workqueue The actual event processing was already done by workqueue items. We can move the event dispatching there as well, rather than doing it already in the interrupt handler callback. This change has a rather profound "side effect" on the reliability of the FPGA programming: once we enter programming mode, we must not issue any snd_emu1010_fpga_{read,write}() calls until we're done, as these would badly mess up the programming protocol. But exactly that would happen when trying to program the dock, as that triggers GPIO interrupts as a side effect. This is mitigated by deferring the actual interrupt handling, as workqueue items are not re-entrant. To avoid scheduling the dispatcher on non-events, we now explicitly ignore GPIO IRQs triggered by "uninteresting" pins, which happens a lot as a side effect of calling snd_emu1010_fpga_{read,write}(). Fixes: fbb64eedf5a3 ("ALSA: emu10k1: make E-MU dock monitoring interrupt-driven") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218584 Signed-off-by: Oswald Buddenhagen Signed-off-by: Takashi Iwai Message-ID: <20240428093716.3198666-4-oswald.buddenhagen@gmx.de> --- include/sound/emu10k1.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h index 1af9e6819392..9cc10fab01a8 100644 --- a/include/sound/emu10k1.h +++ b/include/sound/emu10k1.h @@ -1684,8 +1684,7 @@ struct snd_emu1010 { unsigned int clock_fallback; unsigned int optical_in; /* 0:SPDIF, 1:ADAT */ unsigned int optical_out; /* 0:SPDIF, 1:ADAT */ - struct work_struct firmware_work; - struct work_struct clock_work; + struct work_struct work; }; struct snd_emu10k1 { -- cgit v1.2.3 From 2d3f4810886eb7c319cec41b6d725d2953bfa88a Mon Sep 17 00:00:00 2001 From: Oswald Buddenhagen Date: Sun, 28 Apr 2024 11:37:14 +0200 Subject: ALSA: emu10k1: use mutex for E-MU FPGA access locking The FPGA access through the GPIO port does not interfere with other sound processor register access, so there is no need to subject it to emu_lock. And after moving all FPGA access out of the interrupt handler, it does not need to be IRQ-safe, either. What's more, attaching the dock causes a firmware upload, which takes several seconds. We really don't want to disable IRQs for this long, and even less also have someone else spin with IRQs disabled waiting for us. Therefore, use a mutex for FPGA access locking. This makes the code somewhat more noisy, as we need to wrap bigger sections into the mutex, as it needs to enclose the spinlocks. The latter has the "side effect" of fixing dock FPGA programming in a corner case: a really badly timed mixer access right between entering FPGA programming mode and uploading the netlist would mess up the protocol. Signed-off-by: Oswald Buddenhagen Signed-off-by: Takashi Iwai Message-ID: <20240428093716.3198666-5-oswald.buddenhagen@gmx.de> --- include/sound/emu10k1.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h index 9cc10fab01a8..234b5baea69c 100644 --- a/include/sound/emu10k1.h +++ b/include/sound/emu10k1.h @@ -1685,6 +1685,7 @@ struct snd_emu1010 { unsigned int optical_in; /* 0:SPDIF, 1:ADAT */ unsigned int optical_out; /* 0:SPDIF, 1:ADAT */ struct work_struct work; + struct mutex lock; }; struct snd_emu10k1 { @@ -1833,6 +1834,9 @@ unsigned int snd_emu10k1_ptr20_read(struct snd_emu10k1 * emu, unsigned int reg, void snd_emu10k1_ptr20_write(struct snd_emu10k1 *emu, unsigned int reg, unsigned int chn, unsigned int data); int snd_emu10k1_spi_write(struct snd_emu10k1 * emu, unsigned int data); int snd_emu10k1_i2c_write(struct snd_emu10k1 *emu, u32 reg, u32 value); +static inline void snd_emu1010_fpga_lock(struct snd_emu10k1 *emu) { mutex_lock(&emu->emu1010.lock); }; +static inline void snd_emu1010_fpga_unlock(struct snd_emu10k1 *emu) { mutex_unlock(&emu->emu1010.lock); }; +void snd_emu1010_fpga_write_lock(struct snd_emu10k1 *emu, u32 reg, u32 value); void snd_emu1010_fpga_write(struct snd_emu10k1 *emu, u32 reg, u32 value); void snd_emu1010_fpga_read(struct snd_emu10k1 *emu, u32 reg, u32 *value); void snd_emu1010_fpga_link_dst_src_write(struct snd_emu10k1 *emu, u32 dst, u32 src); -- cgit v1.2.3 From 5ef31ea5d053a8f493a772ebad3f3ce82c35d845 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 30 Apr 2024 16:35:54 +0200 Subject: net: gro: fix udp bad offset in socket lookup by adding {inner_}network_offset to napi_gro_cb Commits a602456 ("udp: Add GRO functions to UDP socket") and 57c67ff ("udp: additional GRO support") introduce incorrect usage of {ip,ipv6}_hdr in the complete phase of gro. The functions always return skb->network_header, which in the case of encapsulated packets at the gro complete phase, is always set to the innermost L3 of the packet. That means that calling {ip,ipv6}_hdr for skbs which completed the GRO receive phase (both in gro_list and *_gro_complete) when parsing an encapsulated packet's _outer_ L3/L4 may return an unexpected value. This incorrect usage leads to a bug in GRO's UDP socket lookup. udp{4,6}_lib_lookup_skb functions use ip_hdr/ipv6_hdr respectively. These *_hdr functions return network_header which will point to the innermost L3, resulting in the wrong offset being used in __udp{4,6}_lib_lookup with encapsulated packets. This patch adds network_offset and inner_network_offset to napi_gro_cb, and makes sure both are set correctly. To fix the issue, network_offsets union is used inside napi_gro_cb, in which both the outer and the inner network offsets are saved. Reproduction example: Endpoint configuration example (fou + local address bind) # ip fou add port 6666 ipproto 4 # ip link add name tun1 type ipip remote 2.2.2.1 local 2.2.2.2 encap fou encap-dport 5555 encap-sport 6666 mode ipip # ip link set tun1 up # ip a add 1.1.1.2/24 dev tun1 Netperf TCP_STREAM result on net-next before patch is applied: net-next main, GRO enabled: $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 131072 16384 16384 5.28 2.37 net-next main, GRO disabled: $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 131072 16384 16384 5.01 2745.06 patch applied, GRO enabled: $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 131072 16384 16384 5.01 2877.38 Fixes: a6024562ffd7 ("udp: Add GRO functions to UDP socket") Signed-off-by: Richard Gobert Reviewed-by: Eric Dumazet Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni --- include/net/gro.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h index 50f1e403dbbb..c1d4ca0463a1 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -87,6 +87,15 @@ struct napi_gro_cb { /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; + + /* L3 offsets */ + union { + struct { + u16 network_offset; + u16 inner_network_offset; + }; + u16 network_offsets[2]; + }; }; #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) -- cgit v1.2.3 From b63db58e2fa5d6963db9c45df88e60060f0ff35f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 2 May 2024 09:03:15 -0400 Subject: eventfs/tracing: Add callback for release of an eventfs_inode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Synthetic events create and destroy tracefs files when they are created and removed. The tracing subsystem has its own file descriptor representing the state of the events attached to the tracefs files. There's a race between the eventfs files and this file descriptor of the tracing system where the following can cause an issue: With two scripts 'A' and 'B' doing: Script 'A': echo "hello int aaa" > /sys/kernel/tracing/synthetic_events while : do echo 0 > /sys/kernel/tracing/events/synthetic/hello/enable done Script 'B': echo > /sys/kernel/tracing/synthetic_events Script 'A' creates a synthetic event "hello" and then just writes zero into its enable file. Script 'B' removes all synthetic events (including the newly created "hello" event). What happens is that the opening of the "enable" file has: { struct trace_event_file *file = inode->i_private; int ret; ret = tracing_check_open_get_tr(file->tr); [..] But deleting the events frees the "file" descriptor, and a "use after free" happens with the dereference at "file->tr". The file descriptor does have a reference counter, but there needs to be a way to decrement it from the eventfs when the eventfs_inode is removed that represents this file descriptor. Add an optional "release" callback to the eventfs_entry array structure, that gets called when the eventfs file is about to be removed. This allows for the creating on the eventfs file to increment the tracing file descriptor ref counter. When the eventfs file is deleted, it can call the release function that will call the put function for the tracing file descriptor. This will protect the tracing file from being freed while a eventfs file that references it is being opened. Link: https://lore.kernel.org/linux-trace-kernel/20240426073410.17154-1-Tze-nan.Wu@mediatek.com/ Link: https://lore.kernel.org/linux-trace-kernel/20240502090315.448cba46@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: Tze-nan wu Tested-by: Tze-nan Wu (吳澤南) Signed-off-by: Steven Rostedt (Google) --- include/linux/tracefs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 7a5fe17b6bf9..d03f74658716 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -62,6 +62,8 @@ struct eventfs_file; typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, const struct file_operations **fops); +typedef void (*eventfs_release)(const char *name, void *data); + /** * struct eventfs_entry - dynamically created eventfs file call back handler * @name: Then name of the dynamic file in an eventfs directory @@ -72,6 +74,7 @@ typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, struct eventfs_entry { const char *name; eventfs_callback callback; + eventfs_release release; }; struct eventfs_inode; -- cgit v1.2.3