From b9a49520679e98700d3d89689cc91c08a1c88c1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 Jan 2025 00:55:32 +0100 Subject: rcuref: Plug slowpath race in rcuref_put() Kernel test robot reported an "imbalanced put" in the rcuref_put() slow path, which turned out to be a false positive. Consider the following race: ref = 0 (via rcuref_init(ref, 1)) T1 T2 rcuref_put(ref) -> atomic_add_negative_release(-1, ref) # ref -> 0xffffffff -> rcuref_put_slowpath(ref) rcuref_get(ref) -> atomic_add_negative_relaxed(1, &ref->refcnt) -> return true; # ref -> 0 rcuref_put(ref) -> atomic_add_negative_release(-1, ref) # ref -> 0xffffffff -> rcuref_put_slowpath() -> cnt = atomic_read(&ref->refcnt); # cnt -> 0xffffffff / RCUREF_NOREF -> atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD)) # ref -> 0xe0000000 / RCUREF_DEAD -> return true -> cnt = atomic_read(&ref->refcnt); # cnt -> 0xe0000000 / RCUREF_DEAD -> if (cnt > RCUREF_RELEASED) # 0xe0000000 > 0xc0000000 -> WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()") The problem is the additional read in the slow path (after it decremented to RCUREF_NOREF) which can happen after the counter has been marked RCUREF_DEAD. Prevent this by reusing the return value of the decrement. Now every "final" put uses RCUREF_NOREF in the slow path and attempts the final cmpxchg() to RCUREF_DEAD. [ bigeasy: Add changelog ] Fixes: ee1ee6db07795 ("atomics: Provide rcuref - scalable reference counting") Reported-by: kernel test robot Debugged-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Closes: https://lore.kernel.org/oe-lkp/202412311453.9d7636a2-lkp@intel.com --- include/linux/rcuref.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h index 2c8bfd0f1b6b..6322d8c1c6b4 100644 --- a/include/linux/rcuref.h +++ b/include/linux/rcuref.h @@ -71,27 +71,30 @@ static inline __must_check bool rcuref_get(rcuref_t *ref) return rcuref_get_slowpath(ref); } -extern __must_check bool rcuref_put_slowpath(rcuref_t *ref); +extern __must_check bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt); /* * Internal helper. Do not invoke directly. */ static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) { + int cnt; + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), "suspicious rcuref_put_rcusafe() usage"); /* * Unconditionally decrease the reference count. The saturation and * dead zones provide enough tolerance for this. */ - if (likely(!atomic_add_negative_release(-1, &ref->refcnt))) + cnt = atomic_sub_return_release(1, &ref->refcnt); + if (likely(cnt >= 0)) return false; /* * Handle the last reference drop and cases inside the saturation * and dead zones. */ - return rcuref_put_slowpath(ref); + return rcuref_put_slowpath(ref, cnt); } /** -- cgit v1.2.3 From 0532a79efd68a4d9686b0385e4993af4b130ff82 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 22 Jan 2025 18:09:13 +0800 Subject: strparser: Add read_sock callback Added a new read_sock handler, allowing users to customize read operations instead of relying on the native socket's read_sock. Signed-off-by: Jiayuan Chen Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://patch.msgid.link/20250122100917.49845-2-mrpre@163.com --- include/net/strparser.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/strparser.h b/include/net/strparser.h index 41e2ce9e9e10..0a83010b3a64 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -43,6 +43,8 @@ struct strparser; struct strp_callbacks { int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); + int (*read_sock)(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor); int (*read_sock_done)(struct strparser *strp, int err); void (*abort_parser)(struct strparser *strp, int err); void (*lock)(struct strparser *strp); -- cgit v1.2.3 From 36b62df5683c315ba58c950f1a9c771c796c30ec Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 22 Jan 2025 18:09:14 +0800 Subject: bpf: Fix wrong copied_seq calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'sk->copied_seq' was updated in the tcp_eat_skb() function when the action of a BPF program was SK_REDIRECT. For other actions, like SK_PASS, the update logic for 'sk->copied_seq' was moved to tcp_bpf_recvmsg_parser() to ensure the accuracy of the 'fionread' feature. It works for a single stream_verdict scenario, as it also modified sk_data_ready->sk_psock_verdict_data_ready->tcp_read_skb to remove updating 'sk->copied_seq'. However, for programs where both stream_parser and stream_verdict are active (strparser purpose), tcp_read_sock() was used instead of tcp_read_skb() (sk_data_ready->strp_data_ready->tcp_read_sock). tcp_read_sock() now still updates 'sk->copied_seq', leading to duplicate updates. In summary, for strparser + SK_PASS, copied_seq is redundantly calculated in both tcp_read_sock() and tcp_bpf_recvmsg_parser(). The issue causes incorrect copied_seq calculations, which prevent correct data reads from the recv() interface in user-land. We do not want to add new proto_ops to implement a new version of tcp_read_sock, as this would introduce code complexity [1]. We could have added noack and copied_seq to desc, and then called ops->read_sock. However, unfortunately, other modules didn’t fully initialize desc to zero. So, for now, we are directly calling tcp_read_sock_noack() in tcp_bpf.c. [1]: https://lore.kernel.org/bpf/20241218053408.437295-1-mrpre@163.com Fixes: e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq") Suggested-by: Jakub Sitnicki Signed-off-by: Jiayuan Chen Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://patch.msgid.link/20250122100917.49845-3-mrpre@163.com --- include/linux/skmsg.h | 2 ++ include/net/tcp.h | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 2cbe0c22a32f..0b9095a281b8 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -91,6 +91,8 @@ struct sk_psock { struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; + u32 copied_seq; + u32 ingress_bytes; #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; diff --git a/include/net/tcp.h b/include/net/tcp.h index 5b2b04835688..9c044fb9ab26 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -729,6 +729,9 @@ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off); void tcp_read_done(struct sock *sk, size_t len); @@ -2599,6 +2602,11 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); +#ifdef CONFIG_BPF_STREAM_PARSER +struct strparser; +int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor); +#endif /* CONFIG_BPF_STREAM_PARSER */ #endif /* CONFIG_BPF_SYSCALL */ #ifdef CONFIG_INET -- cgit v1.2.3 From b69bb476dee99d564d65d418e9a20acca6f32c3f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 30 Jan 2025 16:05:42 -0800 Subject: cgroup: fix race between fork and cgroup.kill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tejun reported the following race between fork() and cgroup.kill at [1]. Tejun: I was looking at cgroup.kill implementation and wondering whether there could be a race window. So, __cgroup_kill() does the following: k1. Set CGRP_KILL. k2. Iterate tasks and deliver SIGKILL. k3. Clear CGRP_KILL. The copy_process() does the following: c1. Copy a bunch of stuff. c2. Grab siglock. c3. Check fatal_signal_pending(). c4. Commit to forking. c5. Release siglock. c6. Call cgroup_post_fork() which puts the task on the css_set and tests CGRP_KILL. The intention seems to be that either a forking task gets SIGKILL and terminates on c3 or it sees CGRP_KILL on c6 and kills the child. However, I don't see what guarantees that k3 can't happen before c6. ie. After a forking task passes c5, k2 can take place and then before the forking task reaches c6, k3 can happen. Then, nobody would send SIGKILL to the child. What am I missing? This is indeed a race. One way to fix this race is by taking cgroup_threadgroup_rwsem in write mode in __cgroup_kill() as the fork() side takes cgroup_threadgroup_rwsem in read mode from cgroup_can_fork() to cgroup_post_fork(). However that would be heavy handed as this adds one more potential stall scenario for cgroup.kill which is usually called under extreme situation like memory pressure. To fix this race, let's maintain a sequence number per cgroup which gets incremented on __cgroup_kill() call. On the fork() side, the cgroup_can_fork() will cache the sequence number locally and recheck it against the cgroup's sequence number at cgroup_post_fork() site. If the sequence numbers mismatch, it means __cgroup_kill() can been called and we should send SIGKILL to the newly created task. Reported-by: Tejun Heo Closes: https://lore.kernel.org/all/Z5QHE2Qn-QZ6M-KW@slm.duckdns.org/ [1] Fixes: 661ee6280931 ("cgroup: introduce cgroup.kill") Cc: stable@vger.kernel.org # v5.14+ Signed-off-by: Shakeel Butt Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 6 +++--- include/linux/sched/task.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1b20d2d8ef7c..17960a1e858d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -71,9 +71,6 @@ enum { /* Cgroup is frozen. */ CGRP_FROZEN, - - /* Control group has to be killed. */ - CGRP_KILL, }; /* cgroup_root->flags */ @@ -461,6 +458,9 @@ struct cgroup { int nr_threaded_children; /* # of live threaded child cgroups */ + /* sequence number for cgroup.kill, serialized by css_set_lock. */ + unsigned int kill_seq; + struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 0f2aeb37bbb0..ca1db4b92c32 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -43,6 +43,7 @@ struct kernel_clone_args { void *fn_arg; struct cgroup *cgrp; struct css_set *cset; + unsigned int kill_seq; }; /* -- cgit v1.2.3 From f0ada00a9b3801b71d203b0033b7612b687b7d72 Mon Sep 17 00:00:00 2001 From: Imran Shaik Date: Thu, 9 Jan 2025 14:27:44 +0530 Subject: dt-bindings: clock: qcom: Add GPU clocks for QCS8300 The QCS8300 GPU clock controller is a derivative of SA8775P, but has few additional clocks and minor differences. Hence, reuse gpucc bindings of SA8775P and add additional clocks required for QCS8300. Acked-by: Krzysztof Kozlowski Signed-off-by: Imran Shaik Link: https://lore.kernel.org/r/20250109-qcs8300-mm-patches-new-v4-1-63e8ac268b02@quicinc.com Signed-off-by: Rob Herring (Arm) --- include/dt-bindings/clock/qcom,qcs8300-gpucc.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,qcs8300-gpucc.h (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,qcs8300-gpucc.h b/include/dt-bindings/clock/qcom,qcs8300-gpucc.h new file mode 100644 index 000000000000..afa187467b4c --- /dev/null +++ b/include/dt-bindings/clock/qcom,qcs8300-gpucc.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_GPUCC_QCS8300_H +#define _DT_BINDINGS_CLK_QCOM_GPUCC_QCS8300_H + +#include "qcom,sa8775p-gpucc.h" + +/* QCS8300 introduces below new clocks compared to SA8775P */ + +/* GPU_CC clocks */ +#define GPU_CC_CX_ACCU_SHIFT_CLK 23 +#define GPU_CC_GX_ACCU_SHIFT_CLK 24 + +#endif -- cgit v1.2.3 From 0e193cc558e32a879c717bb2d53a1cf8628b5d20 Mon Sep 17 00:00:00 2001 From: Imran Shaik Date: Thu, 9 Jan 2025 14:27:46 +0530 Subject: dt-bindings: clock: qcom: Add CAMCC clocks for QCS8300 The QCS8300 camera clock controller is a derivative of SA8775P, but has an additional clock and minor differences. Hence, reuse the SA8775P camera bindings and add additional clock required for QCS8300. Reviewed-by: Vladimir Zapolskiy Acked-by: Krzysztof Kozlowski Signed-off-by: Imran Shaik Link: https://lore.kernel.org/r/20250109-qcs8300-mm-patches-new-v4-3-63e8ac268b02@quicinc.com Signed-off-by: Rob Herring (Arm) --- include/dt-bindings/clock/qcom,qcs8300-camcc.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,qcs8300-camcc.h (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,qcs8300-camcc.h b/include/dt-bindings/clock/qcom,qcs8300-camcc.h new file mode 100644 index 000000000000..fc535c847859 --- /dev/null +++ b/include/dt-bindings/clock/qcom,qcs8300-camcc.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_QCS8300_CAM_CC_H +#define _DT_BINDINGS_CLK_QCOM_QCS8300_CAM_CC_H + +#include "qcom,sa8775p-camcc.h" + +/* QCS8300 introduces below new clocks compared to SA8775P */ + +/* CAM_CC clocks */ +#define CAM_CC_TITAN_TOP_ACCU_SHIFT_CLK 86 + +#endif -- cgit v1.2.3 From ba69e0750b0362870294adab09339a0c39c3beaf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 1 Feb 2025 18:21:35 +0100 Subject: efi: Avoid cold plugged memory for placing the kernel UEFI 2.11 introduced EFI_MEMORY_HOT_PLUGGABLE to annotate system memory regions that are 'cold plugged' at boot, i.e., hot pluggable memory that is available from early boot, and described as system RAM by the firmware. Existing loaders and EFI applications running in the boot context will happily use this memory for allocating data structures that cannot be freed or moved at runtime, and this prevents the memory from being unplugged. Going forward, the new EFI_MEMORY_HOT_PLUGGABLE attribute should be tested, and memory annotated as such should be avoided for such allocations. In the EFI stub, there are a couple of occurrences where, instead of the high-level AllocatePages() UEFI boot service, a low-level code sequence is used that traverses the EFI memory map and carves out the requested number of pages from a free region. This is needed, e.g., for allocating as low as possible, or for allocating pages at random. While AllocatePages() should presumably avoid special purpose memory and cold plugged regions, this manual approach needs to incorporate this logic itself, in order to prevent the kernel itself from ending up in a hot unpluggable region, preventing it from being unplugged. So add the EFI_MEMORY_HOTPLUGGABLE macro definition, and check for it where appropriate. Cc: stable@vger.kernel.org Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index 053c57e61869..db293d7de686 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -128,6 +128,7 @@ typedef struct { #define EFI_MEMORY_RO ((u64)0x0000000000020000ULL) /* read-only */ #define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ #define EFI_MEMORY_CPU_CRYPTO ((u64)0x0000000000080000ULL) /* supports encryption */ +#define EFI_MEMORY_HOT_PLUGGABLE BIT_ULL(20) /* supports unplugging at runtime */ #define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ #define EFI_MEMORY_DESCRIPTOR_VERSION 1 -- cgit v1.2.3 From bbc4578537e350d5bf8a7a2c7d054d6b163b3c41 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 1 Feb 2025 18:21:36 +0100 Subject: efi: Use BIT_ULL() constants for memory attributes For legibility, use the existing BIT_ULL() to generate the u64 type EFI memory attribute macros. Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index db293d7de686..7d63d1d75f22 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -114,22 +114,22 @@ typedef struct { #define EFI_MAX_MEMORY_TYPE 16 /* Attribute values: */ -#define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */ -#define EFI_MEMORY_WC ((u64)0x0000000000000002ULL) /* write-coalescing */ -#define EFI_MEMORY_WT ((u64)0x0000000000000004ULL) /* write-through */ -#define EFI_MEMORY_WB ((u64)0x0000000000000008ULL) /* write-back */ -#define EFI_MEMORY_UCE ((u64)0x0000000000000010ULL) /* uncached, exported */ -#define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ -#define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ -#define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ -#define EFI_MEMORY_NV ((u64)0x0000000000008000ULL) /* non-volatile */ -#define EFI_MEMORY_MORE_RELIABLE \ - ((u64)0x0000000000010000ULL) /* higher reliability */ -#define EFI_MEMORY_RO ((u64)0x0000000000020000ULL) /* read-only */ -#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ -#define EFI_MEMORY_CPU_CRYPTO ((u64)0x0000000000080000ULL) /* supports encryption */ +#define EFI_MEMORY_UC BIT_ULL(0) /* uncached */ +#define EFI_MEMORY_WC BIT_ULL(1) /* write-coalescing */ +#define EFI_MEMORY_WT BIT_ULL(2) /* write-through */ +#define EFI_MEMORY_WB BIT_ULL(3) /* write-back */ +#define EFI_MEMORY_UCE BIT_ULL(4) /* uncached, exported */ +#define EFI_MEMORY_WP BIT_ULL(12) /* write-protect */ +#define EFI_MEMORY_RP BIT_ULL(13) /* read-protect */ +#define EFI_MEMORY_XP BIT_ULL(14) /* execute-protect */ +#define EFI_MEMORY_NV BIT_ULL(15) /* non-volatile */ +#define EFI_MEMORY_MORE_RELIABLE BIT_ULL(16) /* higher reliability */ +#define EFI_MEMORY_RO BIT_ULL(17) /* read-only */ +#define EFI_MEMORY_SP BIT_ULL(18) /* soft reserved */ +#define EFI_MEMORY_CPU_CRYPTO BIT_ULL(19) /* supports encryption */ #define EFI_MEMORY_HOT_PLUGGABLE BIT_ULL(20) /* supports unplugging at runtime */ -#define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ +#define EFI_MEMORY_RUNTIME BIT_ULL(63) /* range requires runtime mapping */ + #define EFI_MEMORY_DESCRIPTOR_VERSION 1 #define EFI_PAGE_SHIFT 12 -- cgit v1.2.3 From 482ad2a4ace2740ca0ff1cbc8f3c7f862f3ab507 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:09 +0000 Subject: net: add dev_net_rcu() helper dev->nd_net can change, readers should either use rcu_read_lock() or RTNL. We currently use a generic helper, dev_net() with no debugging support. We probably have many hidden bugs. Add dev_net_rcu() helper for callers using rcu_read_lock() protection. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250205155120.1676781-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 ++++++ include/net/net_namespace.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 03bb584c62cf..c0a86afb85da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2663,6 +2663,12 @@ struct net *dev_net(const struct net_device *dev) return read_pnet(&dev->nd_net); } +static inline +struct net *dev_net_rcu(const struct net_device *dev) +{ + return read_pnet_rcu(&dev->nd_net); +} + static inline void dev_net_set(struct net_device *dev, struct net *net) { diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 0f5eb9db0c62..7ba1402ca779 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -398,7 +398,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif } -static inline struct net *read_pnet_rcu(possible_net_t *pnet) +static inline struct net *read_pnet_rcu(const possible_net_t *pnet) { #ifdef CONFIG_NET_NS return rcu_dereference(pnet->net); -- cgit v1.2.3 From 469308552ca4560176cfc100e7ca84add1bebd7c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:10 +0000 Subject: ipv4: add RCU protection to ip4_dst_hoplimit() ip4_dst_hoplimit() must use RCU protection to make sure the net structure it reads does not disappear. Fixes: fa50d974d104 ("ipv4: Namespaceify ip_default_ttl sysctl knob") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250205155120.1676781-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index f86775be3e29..c605fd5ec0c0 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -382,10 +382,15 @@ static inline int inet_iif(const struct sk_buff *skb) static inline int ip4_dst_hoplimit(const struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); - struct net *net = dev_net(dst->dev); - if (hoplimit == 0) + if (hoplimit == 0) { + const struct net *net; + + rcu_read_lock(); + net = dev_net_rcu(dst->dev); hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); + rcu_read_unlock(); + } return hoplimit; } -- cgit v1.2.3 From 071d8012869b6af352acca346ade13e7be90a49f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:11 +0000 Subject: ipv4: use RCU protection in ip_dst_mtu_maybe_forward() ip_dst_mtu_maybe_forward() must use RCU protection to make sure the net structure it reads does not disappear. Fixes: f87c10a8aa1e8 ("ipv4: introduce ip_dst_mtu_maybe_forward and protect forwarding path against pmtu spoofing") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250205155120.1676781-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 9f5e33e371fc..ba7b43447775 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -471,9 +471,12 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { const struct rtable *rt = dst_rtable(dst); - struct net *net = dev_net(dst->dev); - unsigned int mtu; + unsigned int mtu, res; + struct net *net; + + rcu_read_lock(); + net = dev_net_rcu(dst->dev); if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { @@ -497,7 +500,11 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, out: mtu = min_t(unsigned int, mtu, IP_MAX_MTU); - return mtu - lwtunnel_headroom(dst->lwtstate, mtu); + res = mtu - lwtunnel_headroom(dst->lwtstate, mtu); + + rcu_read_unlock(); + + return res; } static inline unsigned int ip_skb_dst_mtu(struct sock *sk, -- cgit v1.2.3 From 6a774228e890ee04a0ee13f4e6e731ec8554b9c2 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 5 Feb 2025 12:03:01 +0100 Subject: net: ethtool: tsconfig: Fix netlink type of hwtstamp flags Fix the netlink type for hardware timestamp flags, which are represented as a bitset of flags. Although only one flag is supported currently, the correct netlink bitset type should be used instead of u32 to keep consistency with other fields. Address this by adding a new named string set description for the hwtstamp flag structure. The code has been introduced in the current release so the uAPI change is still okay. Signed-off-by: Kory Maincent Fixes: 6e9e2eed4f39 ("net: ethtool: Add support for tsconfig command to get/set hwtstamp config") Link: https://patch.msgid.link/20250205110304.375086-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index d1089b88efc7..9b18c4cfe56f 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -682,6 +682,7 @@ enum ethtool_link_ext_substate_module { * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics * @ETH_SS_STATS_RMON: names of RMON statistics * @ETH_SS_STATS_PHY: names of PHY(dev) statistics + * @ETH_SS_TS_FLAGS: hardware timestamping flags * * @ETH_SS_COUNT: number of defined string sets */ @@ -708,6 +709,7 @@ enum ethtool_stringset { ETH_SS_STATS_ETH_CTRL, ETH_SS_STATS_RMON, ETH_SS_STATS_PHY, + ETH_SS_TS_FLAGS, /* add new constants above here */ ETH_SS_COUNT -- cgit v1.2.3 From 011b0335903832facca86cd8ed05d7d8d94c9c76 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 6 Feb 2025 22:28:48 +0100 Subject: Revert "net: skb: introduce and use a single page frag cache" This reverts commit dbae2b062824 ("net: skb: introduce and use a single page frag cache"). The intended goal of such change was to counter a performance regression introduced by commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs"). Unfortunately, the blamed commit introduces another regression for the virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny size, so that the whole head frag could fit a 512-byte block. The single page frag cache uses a 1K fragment for such allocation, and the additional overhead, under small UDP packets flood, makes the page allocator a bottleneck. Thanks to commit bf9f1baa279f ("net: add dedicated kmem_cache for typical/small skb->head"), this revert does not re-introduce the original regression. Actually, in the relevant test on top of this revert, I measure a small but noticeable positive delta, just above noise level. The revert itself required some additional mangling due to the introduction of the SKB_HEAD_ALIGN() helper and local lock infra in the affected code. Suggested-by: Eric Dumazet Fixes: dbae2b062824 ("net: skb: introduce and use a single page frag cache") Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/e649212fde9f0fdee23909ca0d14158d32bb7425.1738877290.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c0a86afb85da..365f0e2098d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4115,7 +4115,6 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); -void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) -- cgit v1.2.3 From a1f7b7ff0e10ae574d388131596390157222f986 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 10 Feb 2025 10:17:27 +0200 Subject: PCI: pci_ids: add INTEL_HDA_PTL_H Add Intel PTL-H audio Device ID. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Bard Liao Acked-by: Bjorn Helgaas Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250210081730.22916-2-peter.ujfalusi@linux.intel.com --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index de5deb1a0118..1a2594a38199 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3134,6 +3134,7 @@ #define PCI_DEVICE_ID_INTEL_HDA_LNL_P 0xa828 #define PCI_DEVICE_ID_INTEL_S21152BB 0xb152 #define PCI_DEVICE_ID_INTEL_HDA_BMG 0xe2f7 +#define PCI_DEVICE_ID_INTEL_HDA_PTL_H 0xe328 #define PCI_DEVICE_ID_INTEL_HDA_PTL 0xe428 #define PCI_DEVICE_ID_INTEL_HDA_CML_R 0xf0c8 #define PCI_DEVICE_ID_INTEL_HDA_RKL_S 0xf1c8 -- cgit v1.2.3 From 6d0ce46a93135d96b7fa075a94a88fe0da8e8773 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 13:58:38 +0000 Subject: vrf: use RCU protection in l3mdev_l3_out() l3mdev_l3_out() can be called without RCU being held: raw_sendmsg() ip_push_pending_frames() ip_send_skb() ip_local_out() __ip_local_out() l3mdev_ip_out() Add rcu_read_lock() / rcu_read_unlock() pair to avoid a potential UAF. Fixes: a8e3e1a9f020 ("net: l3mdev: Add hook to output path") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250207135841.1948589-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/l3mdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 2d6141f28b53..f7fe796e8429 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -198,10 +198,12 @@ struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) if (netif_is_l3_slave(dev)) { struct net_device *master; + rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); if (master && master->l3mdev_ops->l3mdev_l3_out) skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, skb, proto); + rcu_read_unlock(); } return skb; -- cgit v1.2.3 From c195b9c6ab9c383d7aa3f4a65879b3ca90cb378b Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 8 Feb 2025 15:49:07 +0800 Subject: thermal/netlink: Prevent userspace segmentation fault by adjusting UAPI header The intel-lpmd tool [1], which uses the THERMAL_GENL_ATTR_CPU_CAPABILITY attribute to receive HFI events from kernel space, encounters a segmentation fault after commit 1773572863c4 ("thermal: netlink: Add the commands and the events for the thresholds"). The issue arises because the THERMAL_GENL_ATTR_CPU_CAPABILITY raw value was changed while intel_lpmd still uses the old value. Although intel_lpmd can be updated to check the THERMAL_GENL_VERSION and use the appropriate THERMAL_GENL_ATTR_CPU_CAPABILITY value, the commit itself is questionable. The commit introduced a new element in the middle of enum thermal_genl_attr, which affects many existing attributes and introduces potential risks and unnecessary maintenance burdens for userspace thermal netlink event users. Solve the issue by moving the newly introduced THERMAL_GENL_ATTR_TZ_PREV_TEMP attribute to the end of the enum thermal_genl_attr. This ensures that all existing thermal generic netlink attributes remain unaffected. Link: https://github.com/intel/intel-lpmd [1] Fixes: 1773572863c4 ("thermal: netlink: Add the commands and the events for the thresholds") Signed-off-by: Zhang Rui Reviewed-by: Daniel Lezcano Link: https://patch.msgid.link/20250208074907.5679-1-rui.zhang@intel.com [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- include/uapi/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index 349718c271eb..46a2633d33aa 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -30,7 +30,6 @@ enum thermal_genl_attr { THERMAL_GENL_ATTR_TZ, THERMAL_GENL_ATTR_TZ_ID, THERMAL_GENL_ATTR_TZ_TEMP, - THERMAL_GENL_ATTR_TZ_PREV_TEMP, THERMAL_GENL_ATTR_TZ_TRIP, THERMAL_GENL_ATTR_TZ_TRIP_ID, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, @@ -54,6 +53,7 @@ enum thermal_genl_attr { THERMAL_GENL_ATTR_THRESHOLD, THERMAL_GENL_ATTR_THRESHOLD_TEMP, THERMAL_GENL_ATTR_THRESHOLD_DIRECTION, + THERMAL_GENL_ATTR_TZ_PREV_TEMP, __THERMAL_GENL_ATTR_MAX, }; #define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) -- cgit v1.2.3 From e00a2e5d485faf53c7a24b9d1b575a642227947f Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 12 Feb 2025 18:18:51 +0200 Subject: drm: Fix DSC BPP increment decoding Starting with DPCD version 2.0 bits 6:3 of the DP_DSC_BITS_PER_PIXEL_INC DPCD register contains the NativeYCbCr422_MAX_bpp_DELTA field, which can be non-zero as opposed to earlier DPCD versions, hence decoding the bit_per_pixel increment value at bits 2:0 in the same register requires applying a mask, do so. Cc: Ankit Nautiyal Fixes: 0c2287c96521 ("drm/display/dp: Add helper function to get DSC bpp precision") Reviewed-by: Jani Nikula Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20250212161851.4007005-1-imre.deak@intel.com --- include/drm/display/drm_dp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/drm/display/drm_dp.h b/include/drm/display/drm_dp.h index a6f8b098c56f..3bd9f482f0c3 100644 --- a/include/drm/display/drm_dp.h +++ b/include/drm/display/drm_dp.h @@ -359,6 +359,7 @@ # define DP_DSC_BITS_PER_PIXEL_1_4 0x2 # define DP_DSC_BITS_PER_PIXEL_1_2 0x3 # define DP_DSC_BITS_PER_PIXEL_1_1 0x4 +# define DP_DSC_BITS_PER_PIXEL_MASK 0x7 #define DP_PSR_SUPPORT 0x070 /* XXX 1.2? */ # define DP_PSR_IS_SUPPORTED 1 -- cgit v1.2.3 From 1d0013962d220b166d9f7c9fe2746f1542e459a3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2025 22:23:59 +0000 Subject: netfs: Fix a number of read-retry hangs Fix a number of hangs in the netfslib read-retry code, including: (1) netfs_reissue_read() doubles up the getting of references on subrequests, thereby leaking the subrequest and causing inode eviction to wait indefinitely. This can lead to the kernel reporting a hang in the filesystem's evict_inode(). Fix this by removing the get from netfs_reissue_read() and adding one to netfs_retry_read_subrequests() to deal with the one place that didn't double up. (2) The loop in netfs_retry_read_subrequests() that retries a sequence of failed subrequests doesn't record whether or not it retried the one that the "subreq" pointer points to when it leaves the loop. It may not if renegotiation/repreparation of the subrequests means that fewer subrequests are needed to span the cumulative range of the sequence. Because it doesn't record this, the piece of code that discards now-superfluous subrequests doesn't know whether it should discard the one "subreq" points to - and so it doesn't. Fix this by noting whether the last subreq it examines is superfluous and if it is, then getting rid of it and all subsequent subrequests. If that one one wasn't superfluous, then we would have tried to go round the previous loop again and so there can be no further unretried subrequests in the sequence. (3) netfs_retry_read_subrequests() gets yet an extra ref on any additional subrequests it has to get because it ran out of ones it could reuse to to renegotiation/repreparation shrinking the subrequests. Fix this by removing that extra ref. (4) In netfs_retry_reads(), it was using wait_on_bit() to wait for NETFS_SREQ_IN_PROGRESS to be cleared on all subrequests in the sequence - but netfs_read_subreq_terminated() is now using a wait queue on the request instead and so this wait will never finish. Fix this by waiting on the wait queue instead. To make this work, a new flag, NETFS_RREQ_RETRYING, is now set around the wait loop to tell the wake-up code to wake up the wait queue rather than requeuing the request's work item. Note that this flag replaces the NETFS_RREQ_NEED_RETRY flag which is no longer used. (5) Whilst not strictly anything to do with the hang, netfs_retry_read_subrequests() was also doubly incrementing the subreq_counter and re-setting the debug index, leaving a gap in the trace. This is also fixed. One of these hangs was observed with 9p and with cifs. Others were forced by manual code injection into fs/afs/file.c. Firstly, afs_prepare_read() was created to provide an changing pattern of maximum subrequest sizes: static int afs_prepare_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; if (!S_ISREG(subreq->rreq->inode->i_mode)) return 0; if (subreq->retry_count < 20) rreq->io_streams[0].sreq_max_len = umax(200, 2222 - subreq->retry_count * 40); else rreq->io_streams[0].sreq_max_len = 3333; return 0; } and pointed to by afs_req_ops. Then the following: struct netfs_io_subrequest *subreq = op->fetch.subreq; if (subreq->error == 0 && S_ISREG(subreq->rreq->inode->i_mode) && subreq->retry_count < 20) { subreq->transferred = subreq->already_done; __clear_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); afs_fetch_data_notify(op); return; } was inserted into afs_fetch_data_success() at the beginning and struct netfs_io_subrequest given an extra field, "already_done" that was set to the value in "subreq->transferred" by netfs_reissue_read(). When reading a 4K file, the subrequests would get gradually smaller, a new subrequest would be allocated around the 3rd retry and then eventually be rendered superfluous when the 20th retry was hit and the limit on the first subrequest was eased. Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250212222402.3618494-2-dhowells@redhat.com Tested-by: Marc Dionne Tested-by: Steve French cc: Ihor Solodrai cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Paulo Alcantara cc: Jeff Layton cc: v9fs@lists.linux.dev cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 2 +- include/trace/events/netfs.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 071d05d81d38..c86a11cfc4a3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -278,7 +278,7 @@ struct netfs_io_request { #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ -#define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */ +#define NETFS_RREQ_RETRYING 14 /* Set if we're in the retry path */ #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark * write to cache on read */ const struct netfs_request_ops *netfs_ops; diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 6e699cadcb29..f880835f7695 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -99,7 +99,7 @@ EM(netfs_sreq_trace_limited, "LIMIT") \ EM(netfs_sreq_trace_need_clear, "N-CLR") \ EM(netfs_sreq_trace_partial_read, "PARTR") \ - EM(netfs_sreq_trace_need_retry, "NRTRY") \ + EM(netfs_sreq_trace_need_retry, "ND-RT") \ EM(netfs_sreq_trace_prepare, "PREP ") \ EM(netfs_sreq_trace_prep_failed, "PRPFL") \ EM(netfs_sreq_trace_progress, "PRGRS") \ @@ -108,7 +108,9 @@ EM(netfs_sreq_trace_short, "SHORT") \ EM(netfs_sreq_trace_split, "SPLIT") \ EM(netfs_sreq_trace_submit, "SUBMT") \ + EM(netfs_sreq_trace_superfluous, "SPRFL") \ EM(netfs_sreq_trace_terminated, "TERM ") \ + EM(netfs_sreq_trace_wait_for, "_WAIT") \ EM(netfs_sreq_trace_write, "WRITE") \ EM(netfs_sreq_trace_write_skip, "SKIP ") \ E_(netfs_sreq_trace_write_term, "WTERM") -- cgit v1.2.3 From 1f47ed294a2bd577d5ae43e6e28e1c9a3be4a833 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Feb 2025 08:18:46 -0700 Subject: block: cleanup and fix batch completion adding conditions The conditions for whether or not a request is allowed adding to a completion batch are a bit hard to read, and they also have a few issues. One is that ioerror may indeed be a random value on passthrough, and it's being checked unconditionally of whether or not the given request is a passthrough request or not. Rewrite the conditions to be separate for easier reading, and only check ioerror for non-passthrough requests. This fixes an issue with bio unmapping on passthrough, where it fails getting added to a batch. This both leads to suboptimal performance, and may trigger a potential schedule-under-atomic condition for polled passthrough IO. Fixes: f794f3351f26 ("block: add support for blk_mq_end_request_batch()") Link: https://lore.kernel.org/r/20575f0a-656e-4bb3-9d82-dec6c7e3a35c@kernel.dk Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9ebb53f031cd..fa2a76cc2f73 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -861,12 +861,22 @@ static inline bool blk_mq_add_to_batch(struct request *req, void (*complete)(struct io_comp_batch *)) { /* - * blk_mq_end_request_batch() can't end request allocated from - * sched tags + * Check various conditions that exclude batch processing: + * 1) No batch container + * 2) Has scheduler data attached + * 3) Not a passthrough request and end_io set + * 4) Not a passthrough request and an ioerror */ - if (!iob || (req->rq_flags & RQF_SCHED_TAGS) || ioerror || - (req->end_io && !blk_rq_is_passthrough(req))) + if (!iob) return false; + if (req->rq_flags & RQF_SCHED_TAGS) + return false; + if (!blk_rq_is_passthrough(req)) { + if (req->end_io) + return false; + if (ioerror < 0) + return false; + } if (!iob->complete) iob->complete = complete; -- cgit v1.2.3 From 35fa2d88ca9481e5caf533d58b99ca259c63b2fe Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 10 Feb 2025 13:30:25 +0100 Subject: driver core: add a faux bus for use when a simple device/bus is needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many drivers abuse the platform driver/bus system as it provides a simple way to create and bind a device to a driver-specific set of probe/release functions. Instead of doing that, and wasting all of the memory associated with a platform device, here is a "faux" bus that can be used instead. Reviewed-by: Jonathan Cameron Reviewed-by: Danilo Krummrich Reviewed-by: Lyude Paul Reviewed-by: Thomas Weißschuh Reviewed-by: Zijun Hu Link: https://lore.kernel.org/r/2025021026-atlantic-gibberish-3f0c@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/device/faux.h | 69 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 include/linux/device/faux.h (limited to 'include') diff --git a/include/linux/device/faux.h b/include/linux/device/faux.h new file mode 100644 index 000000000000..9f43c0e46aa4 --- /dev/null +++ b/include/linux/device/faux.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025 Greg Kroah-Hartman + * Copyright (c) 2025 The Linux Foundation + * + * A "simple" faux bus that allows devices to be created and added + * automatically to it. This is to be used whenever you need to create a + * device that is not associated with any "real" system resources, and do + * not want to have to deal with a bus/driver binding logic. It is + * intended to be very simple, with only a create and a destroy function + * available. + */ +#ifndef _FAUX_DEVICE_H_ +#define _FAUX_DEVICE_H_ + +#include +#include + +/** + * struct faux_device - a "faux" device + * @dev: internal struct device of the object + * + * A simple faux device that can be created/destroyed. To be used when a + * driver only needs to have a device to "hang" something off. This can be + * used for downloading firmware or other basic tasks. Use this instead of + * a struct platform_device if the device has no resources assigned to + * it at all. + */ +struct faux_device { + struct device dev; +}; +#define to_faux_device(x) container_of_const((x), struct faux_device, dev) + +/** + * struct faux_device_ops - a set of callbacks for a struct faux_device + * @probe: called when a faux device is probed by the driver core + * before the device is fully bound to the internal faux bus + * code. If probe succeeds, return 0, otherwise return a + * negative error number to stop the probe sequence from + * succeeding. + * @remove: called when a faux device is removed from the system + * + * Both @probe and @remove are optional, if not needed, set to NULL. + */ +struct faux_device_ops { + int (*probe)(struct faux_device *faux_dev); + void (*remove)(struct faux_device *faux_dev); +}; + +struct faux_device *faux_device_create(const char *name, + struct device *parent, + const struct faux_device_ops *faux_ops); +struct faux_device *faux_device_create_with_groups(const char *name, + struct device *parent, + const struct faux_device_ops *faux_ops, + const struct attribute_group **groups); +void faux_device_destroy(struct faux_device *faux_dev); + +static inline void *faux_device_get_drvdata(const struct faux_device *faux_dev) +{ + return dev_get_drvdata(&faux_dev->dev); +} + +static inline void faux_device_set_drvdata(struct faux_device *faux_dev, void *data) +{ + dev_set_drvdata(&faux_dev->dev, data); +} + +#endif /* _FAUX_DEVICE_H_ */ -- cgit v1.2.3 From ab4eedb790cae44313759b50fe47da285e2519d5 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 6 Feb 2025 15:54:45 -0500 Subject: Bluetooth: L2CAP: Fix corrupted list in hci_chan_del This fixes the following trace by reworking the locking of l2cap_conn so instead of only locking when changing the chan_l list this promotes chan_lock to a general lock of l2cap_conn so whenever it is being held it would prevents the likes of l2cap_conn_del to run: list_del corruption, ffff888021297e00->prev is LIST_POISON2 (dead000000000122) ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:61! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 5896 Comm: syz-executor213 Not tainted 6.14.0-rc1-next-20250204-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 12/27/2024 RIP: 0010:__list_del_entry_valid_or_report+0x12c/0x190 lib/list_debug.c:59 Code: 8c 4c 89 fe 48 89 da e8 32 8c 37 fc 90 0f 0b 48 89 df e8 27 9f 14 fd 48 c7 c7 a0 c0 60 8c 4c 89 fe 48 89 da e8 15 8c 37 fc 90 <0f> 0b 4c 89 e7 e8 0a 9f 14 fd 42 80 3c 2b 00 74 08 4c 89 e7 e8 cb RSP: 0018:ffffc90003f6f998 EFLAGS: 00010246 RAX: 000000000000004e RBX: dead000000000122 RCX: 01454d423f7fbf00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffffff819f077c R09: 1ffff920007eded0 R10: dffffc0000000000 R11: fffff520007eded1 R12: dead000000000122 R13: dffffc0000000000 R14: ffff8880352248d8 R15: ffff888021297e00 FS: 00007f7ace6686c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7aceeeb1d0 CR3: 000000003527c000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __list_del_entry_valid include/linux/list.h:124 [inline] __list_del_entry include/linux/list.h:215 [inline] list_del_rcu include/linux/rculist.h:168 [inline] hci_chan_del+0x70/0x1b0 net/bluetooth/hci_conn.c:2858 l2cap_conn_free net/bluetooth/l2cap_core.c:1816 [inline] kref_put include/linux/kref.h:65 [inline] l2cap_conn_put+0x70/0xe0 net/bluetooth/l2cap_core.c:1830 l2cap_sock_shutdown+0xa8a/0x1020 net/bluetooth/l2cap_sock.c:1377 l2cap_sock_release+0x79/0x1d0 net/bluetooth/l2cap_sock.c:1416 __sock_release net/socket.c:642 [inline] sock_close+0xbc/0x240 net/socket.c:1393 __fput+0x3e9/0x9f0 fs/file_table.c:448 task_work_run+0x24f/0x310 kernel/task_work.c:227 ptrace_notify+0x2d2/0x380 kernel/signal.c:2522 ptrace_report_syscall include/linux/ptrace.h:415 [inline] ptrace_report_syscall_exit include/linux/ptrace.h:477 [inline] syscall_exit_work+0xc7/0x1d0 kernel/entry/common.c:173 syscall_exit_to_user_mode_prepare kernel/entry/common.c:200 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:205 [inline] syscall_exit_to_user_mode+0x24a/0x340 kernel/entry/common.c:218 do_syscall_64+0x100/0x230 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f7aceeaf449 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 41 19 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f7ace668218 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: fffffffffffffffc RBX: 00007f7acef39328 RCX: 00007f7aceeaf449 RDX: 000000000000000e RSI: 0000000020000100 RDI: 0000000000000004 RBP: 00007f7acef39320 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000003 R13: 0000000000000004 R14: 00007f7ace668670 R15: 000000000000000b Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:__list_del_entry_valid_or_report+0x12c/0x190 lib/list_debug.c:59 Code: 8c 4c 89 fe 48 89 da e8 32 8c 37 fc 90 0f 0b 48 89 df e8 27 9f 14 fd 48 c7 c7 a0 c0 60 8c 4c 89 fe 48 89 da e8 15 8c 37 fc 90 <0f> 0b 4c 89 e7 e8 0a 9f 14 fd 42 80 3c 2b 00 74 08 4c 89 e7 e8 cb RSP: 0018:ffffc90003f6f998 EFLAGS: 00010246 RAX: 000000000000004e RBX: dead000000000122 RCX: 01454d423f7fbf00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffffff819f077c R09: 1ffff920007eded0 R10: dffffc0000000000 R11: fffff520007eded1 R12: dead000000000122 R13: dffffc0000000000 R14: ffff8880352248d8 R15: ffff888021297e00 FS: 00007f7ace6686c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7acef05b08 CR3: 000000003527c000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Reported-by: syzbot+10bd8fe6741eedd2be2e@syzkaller.appspotmail.com Tested-by: syzbot+10bd8fe6741eedd2be2e@syzkaller.appspotmail.com Fixes: b4f82f9ed43a ("Bluetooth: L2CAP: Fix slab-use-after-free Read in l2cap_send_cmd") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Dan Carpenter --- include/net/bluetooth/l2cap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index d9c767cf773d..9189354c568f 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -668,7 +668,7 @@ struct l2cap_conn { struct l2cap_chan *smp; struct list_head chan_l; - struct mutex chan_lock; + struct mutex lock; struct kref ref; struct list_head users; }; @@ -970,6 +970,7 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err); void l2cap_send_conn_req(struct l2cap_chan *chan); struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn); +struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user); -- cgit v1.2.3 From 0892b840318daa6ae739b7cdec5ecdfca4006689 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 13 Feb 2025 08:49:44 -0800 Subject: Reapply "net: skb: introduce and use a single page frag cache" This reverts commit 011b0335903832facca86cd8ed05d7d8d94c9c76. Sabrina reports that the revert may trigger warnings due to intervening changes, especially the ability to rise MAX_SKB_FRAGS. Let's drop it and revisit once that part is also ironed out. Fixes: 011b03359038 ("Revert "net: skb: introduce and use a single page frag cache"") Reported-by: Sabrina Dubroca Link: https://lore.kernel.org/6bf54579233038bc0e76056c5ea459872ce362ab.1739375933.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 365f0e2098d1..c0a86afb85da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4115,6 +4115,7 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); +void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) -- cgit v1.2.3 From 192b7ff29b1fb0335a9b9107991e6286f462f361 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 24 Jan 2025 15:44:44 +0000 Subject: landlock: Minor typo and grammar fixes in IPC scoping documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix some whitespace, punctuation and minor grammar. * Add a missing sentence about the minimum ABI version, to stay in line with the section next to it. Cc: Tahera Fahimi Cc: Tanya Agarwal Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20250124154445.162841-1-gnoack@google.com [mic: Add newlines, update doc date] Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 33745642f787..e1d2c27533b4 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -268,7 +268,9 @@ struct landlock_net_port_attr { * ~~~~~~~~~~~~~~~~ * * These flags enable to restrict a sandboxed process to a set of network - * actions. This is supported since the Landlock ABI version 4. + * actions. + * + * This is supported since Landlock ABI version 4. * * The following access rights apply to TCP port numbers: * @@ -291,11 +293,13 @@ struct landlock_net_port_attr { * Setting a flag for a ruleset will isolate the Landlock domain to forbid * connections to resources outside the domain. * + * This is supported since Landlock ABI version 6. + * * Scopes: * * - %LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET: Restrict a sandboxed process from * connecting to an abstract UNIX socket created by a process outside the - * related Landlock domain (e.g. a parent domain or a non-sandboxed process). + * related Landlock domain (e.g., a parent domain or a non-sandboxed process). * - %LANDLOCK_SCOPE_SIGNAL: Restrict a sandboxed process from sending a signal * to another process outside the domain. */ -- cgit v1.2.3 From 362ff1e7c6c20f8d6ebe20682870d471373c608b Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 13 Feb 2025 17:18:25 +0100 Subject: virtio_snd.h: clarify that `controls` depends on VIRTIO_SND_F_CTLS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As defined in the specification, the `controls` field in the configuration space is only valid/present if VIRTIO_SND_F_CTLS is negotiated. From https://docs.oasis-open.org/virtio/virtio/v1.3/virtio-v1.3.html: 5.14.4 Device Configuration Layout ... controls (driver-read-only) indicates a total number of all available control elements if VIRTIO_SND_F_CTLS has been negotiated. Let's use the same style used in virtio_blk.h to clarify this and to avoid confusion as happened in QEMU (see link). Link: https://gitlab.com/qemu-project/qemu/-/issues/2805 Signed-off-by: Stefano Garzarella Acked-by: Eugenio Pérez Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250213161825.139952-1-sgarzare@redhat.com --- include/uapi/linux/virtio_snd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/virtio_snd.h b/include/uapi/linux/virtio_snd.h index 5f4100c2cf04..a4cfb9f6561a 100644 --- a/include/uapi/linux/virtio_snd.h +++ b/include/uapi/linux/virtio_snd.h @@ -25,7 +25,7 @@ struct virtio_snd_config { __le32 streams; /* # of available channel maps */ __le32 chmaps; - /* # of available control elements */ + /* # of available control elements (if VIRTIO_SND_F_CTLS) */ __le32 controls; }; -- cgit v1.2.3 From 435b344a7042e91fb4719d589f18310e8919e39f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 10 Feb 2025 22:53:47 +0000 Subject: crypto: ccp: Add external API interface for PSP module initialization KVM is dependent on the PSP SEV driver and PSP SEV driver needs to be loaded before KVM module. In case of module loading any dependent modules are automatically loaded but in case of built-in modules there is no inherent mechanism available to specify dependencies between modules and ensure that any dependent modules are loaded implicitly. Add a new external API interface for PSP module initialization which allows PSP SEV driver to be loaded explicitly if KVM is built-in. Signed-off-by: Sean Christopherson Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra Reviewed-by: Tom Lendacky Message-ID: <15279ca0cad56a07cf12834ec544310f85ff5edc.1739226950.git.ashish.kalra@amd.com> Signed-off-by: Paolo Bonzini --- include/linux/psp-sev.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 903ddfea8585..f3cad182d4ef 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -814,6 +814,15 @@ struct sev_data_snp_commit { #ifdef CONFIG_CRYPTO_DEV_SP_PSP +/** + * sev_module_init - perform PSP SEV module initialization + * + * Returns: + * 0 if the PSP module is successfully initialized + * negative value if the PSP module initialization fails + */ +int sev_module_init(void); + /** * sev_platform_init - perform SEV INIT command * -- cgit v1.2.3 From b016d0873777462e55af4c615104cc684fce086d Mon Sep 17 00:00:00 2001 From: Wang Yaxin Date: Sat, 8 Feb 2025 14:49:01 +0800 Subject: taskstats: modify taskstats version After adding "delay max" and "delay min" to the taskstats structure, the taskstats version needs to be updated. Link: https://lkml.kernel.org/r/20250208144901218Q5ptVpqsQkb2MOEmW4Ujn@zte.com.cn Fixes: f65c64f311ee ("delayacct: add delay min to record delay peak") Signed-off-by: Wang Yaxin Signed-off-by: Kun Jiang Reviewed-by: xu xin Signed-off-by: Andrew Morton --- include/uapi/linux/taskstats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 934e20ef7f79..95762232e018 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 14 +#define TASKSTATS_VERSION 15 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ -- cgit v1.2.3 From 02d954c0fdf91845169cdacc7405b120f90afe01 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 10 Feb 2025 16:32:50 +0100 Subject: sched: Compact RSEQ concurrency IDs with reduced threads and affinity When a process reduces its number of threads or clears bits in its CPU affinity mask, the mm_cid allocation should eventually converge towards smaller values. However, the change introduced by: commit 7e019dcc470f ("sched: Improve cache locality of RSEQ concurrency IDs for intermittent workloads") adds a per-mm/CPU recent_cid which is never unset unless a thread migrates. This is a tradeoff between: A) Preserving cache locality after a transition from many threads to few threads, or after reducing the hamming weight of the allowed CPU mask. B) Making the mm_cid upper bounds wrt nr threads and allowed CPU mask easy to document and understand. C) Allowing applications to eventually react to mm_cid compaction after reduction of the nr threads or allowed CPU mask, making the tracking of mm_cid compaction easier by shrinking it back towards 0 or not. D) Making sure applications that periodically reduce and then increase again the nr threads or allowed CPU mask still benefit from good cache locality with mm_cid. Introduce the following changes: * After shrinking the number of threads or reducing the number of allowed CPUs, reduce the value of max_nr_cid so expansion of CID allocation will preserve cache locality if the number of threads or allowed CPUs increase again. * Only re-use a recent_cid if it is within the max_nr_cid upper bound, else find the first available CID. Fixes: 7e019dcc470f ("sched: Improve cache locality of RSEQ concurrency IDs for intermittent workloads") Signed-off-by: Mathieu Desnoyers Signed-off-by: Gabriele Monaco Signed-off-by: Peter Zijlstra (Intel) Tested-by: Gabriele Monaco Link: https://lkml.kernel.org/r/20250210153253.460471-2-gmonaco@redhat.com --- include/linux/mm_types.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b27db7f9496..0234f14f2aa6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -875,10 +875,11 @@ struct mm_struct { */ unsigned int nr_cpus_allowed; /** - * @max_nr_cid: Maximum number of concurrency IDs allocated. + * @max_nr_cid: Maximum number of allowed concurrency + * IDs allocated. * - * Track the highest number of concurrency IDs allocated for the - * mm. + * Track the highest number of allowed concurrency IDs + * allocated for the mm. */ atomic_t max_nr_cid; /** -- cgit v1.2.3 From 84e009042d0f3dfe91bec60bcd208ee3f866cbcd Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Mon, 17 Feb 2025 17:08:27 +0100 Subject: nvme-tcp: add basic support for the C2HTermReq PDU Previously, the NVMe/TCP host driver did not handle the C2HTermReq PDU, instead printing "unsupported pdu type (3)" when received. This patch adds support for processing the C2HTermReq PDU, allowing the driver to print the Fatal Error Status field. Example of output: nvme nvme4: Received C2HTermReq (FES = Invalid PDU Header Field) Signed-off-by: Maurizio Lombardi Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- include/linux/nvme-tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h index e07e8978d691..e435250fcb4d 100644 --- a/include/linux/nvme-tcp.h +++ b/include/linux/nvme-tcp.h @@ -13,6 +13,8 @@ #define NVME_TCP_ADMIN_CCSZ SZ_8K #define NVME_TCP_DIGEST_LENGTH 4 #define NVME_TCP_MIN_MAXH2CDATA 4096 +#define NVME_TCP_MIN_C2HTERM_PLEN 24 +#define NVME_TCP_MAX_C2HTERM_PLEN 152 enum nvme_tcp_pfv { NVME_TCP_PFV_1_0 = 0x0, -- cgit v1.2.3 From d422247d14a53fe825b1778edf104167d8fd8f3f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:49:59 +0900 Subject: nvme: Cleanup the definition of the controller config register fields Reorganized the enum used to define the fields of the contrller configuration (CC) register in include/linux/nvme.h to: 1) Group together all the values defined for each field. 2) Add the missing field masks definitions. 3) Add comments to describe the enum and each field. Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fe3b60818fdc..2dc05b1c3283 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -199,28 +199,54 @@ enum { #define NVME_NVM_IOSQES 6 #define NVME_NVM_IOCQES 4 +/* + * Controller Configuration (CC) register (Offset 14h) + */ enum { + /* Enable (EN): bit 0 */ NVME_CC_ENABLE = 1 << 0, NVME_CC_EN_SHIFT = 0, + + /* Bits 03:01 are reserved (NVMe Base Specification rev 2.1) */ + + /* I/O Command Set Selected (CSS): bits 06:04 */ NVME_CC_CSS_SHIFT = 4, - NVME_CC_MPS_SHIFT = 7, - NVME_CC_AMS_SHIFT = 11, - NVME_CC_SHN_SHIFT = 14, - NVME_CC_IOSQES_SHIFT = 16, - NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT, NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT, - NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, + + /* Memory Page Size (MPS): bits 10:07 */ + NVME_CC_MPS_SHIFT = 7, + NVME_CC_MPS_MASK = 0xf << NVME_CC_MPS_SHIFT, + + /* Arbitration Mechanism Selected (AMS): bits 13:11 */ + NVME_CC_AMS_SHIFT = 11, + NVME_CC_AMS_MASK = 7 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, + + /* Shutdown Notification (SHN): bits 15:14 */ + NVME_CC_SHN_SHIFT = 14, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, - NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + + /* I/O Submission Queue Entry Size (IOSQES): bits 19:16 */ + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOSQES_MASK = 0xf << NVME_CC_IOSQES_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + + /* I/O Completion Queue Entry Size (IOCQES): bits 23:20 */ + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_IOCQES_MASK = 0xf << NVME_CC_IOCQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + + /* Controller Ready Independent of Media Enable (CRIME): bit 24 */ NVME_CC_CRIME = 1 << 24, + + /* Bits 25:31 are reserved (NVMe Base Specification rev 2.1) */ }; enum { -- cgit v1.2.3 From 1fc61eeefe10d9996d2b875214d89f0909d03417 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Feb 2025 16:47:40 -0700 Subject: io_uring: fix spelling error in uapi io_uring.h This is obviously not that important, but when changes are synced back from the kernel to liburing, the codespell CI ends up erroring because of this misspelling. Let's just correct it and avoid this biting us again on an import. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e11c82638527..050fa8eb2e8f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -380,7 +380,7 @@ enum io_uring_op { * result will be the number of buffers send, with * the starting buffer ID in cqe->flags as per * usual for provided buffer usage. The buffers - * will be contigious from the starting buffer ID. + * will be contiguous from the starting buffer ID. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) -- cgit v1.2.3 From e57a6320215c3967f51ab0edeff87db2095440e4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 11:11:27 -0800 Subject: net: Add net_passive_inc() and net_passive_dec(). net_drop_ns() is NULL when CONFIG_NET_NS is disabled. The next patch introduces a function that increments and decrements net->passive. As a prep, let's rename and export net_free() to net_passive_dec() and add net_passive_inc(). Suggested-by: Eric Dumazet Link: https://lore.kernel.org/netdev/CANn89i+oUCt2VGvrbrweniTendZFEh+nwS=uonc004-aPkWy-Q@mail.gmail.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250217191129.19967-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 7ba1402ca779..f467a66abc6b 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -297,6 +297,7 @@ static inline int check_net(const struct net *net) } void net_drop_ns(void *); +void net_passive_dec(struct net *net); #else @@ -326,8 +327,18 @@ static inline int check_net(const struct net *net) } #define net_drop_ns NULL + +static inline void net_passive_dec(struct net *net) +{ + refcount_dec(&net->passive); +} #endif +static inline void net_passive_inc(struct net *net) +{ + refcount_inc(&net->passive); +} + /* Returns true if the netns initialization is completed successfully */ static inline bool net_initialized(const struct net *net) { -- cgit v1.2.3 From 5bbd6e863b15a85221e49b9bdb2d5d8f0bb91f3d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 1 Feb 2025 15:00:02 -0500 Subject: SUNRPC: Prevent looping due to rpc_signal_task() races If rpc_signal_task() is called while a task is in an rpc_call_done() callback function, and the latter calls rpc_restart_call(), the task can end up looping due to the RPC_TASK_SIGNALLED flag being set without the tk_rpc_status being set. Removing the redundant mechanism for signalling the task fixes the looping behaviour. Reported-by: Li Lingfeng Fixes: 39494194f93b ("SUNRPC: Fix races with rpc_killall_tasks()") Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 3 +-- include/trace/events/sunrpc.h | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index fec1e8a1570c..eac57914dcf3 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -158,7 +158,6 @@ enum { RPC_TASK_NEED_XMIT, RPC_TASK_NEED_RECV, RPC_TASK_MSG_PIN_WAIT, - RPC_TASK_SIGNALLED, }; #define rpc_test_and_set_running(t) \ @@ -171,7 +170,7 @@ enum { #define RPC_IS_ACTIVATED(t) test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate) -#define RPC_SIGNALLED(t) test_bit(RPC_TASK_SIGNALLED, &(t)->tk_runstate) +#define RPC_SIGNALLED(t) (READ_ONCE(task->tk_rpc_status) == -ERESTARTSYS) /* * Task priorities. diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index b13dc275ef4a..851841336ee6 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -360,8 +360,7 @@ TRACE_EVENT(rpc_request, { (1UL << RPC_TASK_ACTIVE), "ACTIVE" }, \ { (1UL << RPC_TASK_NEED_XMIT), "NEED_XMIT" }, \ { (1UL << RPC_TASK_NEED_RECV), "NEED_RECV" }, \ - { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }, \ - { (1UL << RPC_TASK_SIGNALLED), "SIGNALLED" }) + { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }) DECLARE_EVENT_CLASS(rpc_task_running, -- cgit v1.2.3 From 4b5a28b38c4a0106c64416a1b2042405166b26ce Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 18 Feb 2025 05:49:30 -0800 Subject: net: Add non-RCU dev_getbyhwaddr() helper Add dedicated helper for finding devices by hardware address when holding rtnl_lock, similar to existing dev_getbyhwaddr_rcu(). This prevents PROVE_LOCKING warnings when rtnl_lock is held but RCU read lock is not. Extract common address comparison logic into dev_addr_cmp(). The context about this change could be found in the following discussion: Link: https://lore.kernel.org/all/20250206-scarlet-ermine-of-improvement-1fcac5@leitao/ Cc: kuniyu@amazon.com Cc: ushankar@purestorage.com Suggested-by: Eric Dumazet Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250218-arm_fix_selftest-v5-1-d3d6892db9e1@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c0a86afb85da..94b7d4eca003 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3275,6 +3275,8 @@ static inline struct net_device *first_net_device_rcu(struct net *net) } int netdev_boot_setup_check(struct net_device *dev); +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *hwaddr); struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); -- cgit v1.2.3 From 9b6412e6979f6f9e0632075f8f008937b5cd4efd Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 17 Feb 2025 11:23:35 +0100 Subject: tcp: drop secpath at the same time as we currently drop dst Xiumei reported hitting the WARN in xfrm6_tunnel_net_exit while running tests that boil down to: - create a pair of netns - run a basic TCP test over ipcomp6 - delete the pair of netns The xfrm_state found on spi_byaddr was not deleted at the time we delete the netns, because we still have a reference on it. This lingering reference comes from a secpath (which holds a ref on the xfrm_state), which is still attached to an skb. This skb is not leaked, it ends up on sk_receive_queue and then gets defer-free'd by skb_attempt_defer_free. The problem happens when we defer freeing an skb (push it on one CPU's defer_list), and don't flush that list before the netns is deleted. In that case, we still have a reference on the xfrm_state that we don't expect at this point. We already drop the skb's dst in the TCP receive path when it's no longer needed, so let's also drop the secpath. At this point, tcp_filter has already called into the LSM hooks that may require the secpath, so it should not be needed anymore. However, in some of those places, the MPTCP extension has just been attached to the skb, so we cannot simply drop all extensions. Fixes: 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists") Reported-by: Xiumei Mu Signed-off-by: Sabrina Dubroca Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/5055ba8f8f72bdcb602faa299faca73c280b7735.1739743613.git.sd@queasysnail.net Signed-off-by: Paolo Abeni --- include/net/tcp.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5b2b04835688..930cda5b5eb9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -683,6 +684,19 @@ void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); void tcp_sack_compress_send_ack(struct sock *sk); +static inline void tcp_cleanup_skb(struct sk_buff *skb) +{ + skb_dst_drop(skb); + secpath_reset(skb); +} + +static inline void tcp_add_receive_queue(struct sock *sk, struct sk_buff *skb) +{ + DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); + DEBUG_NET_WARN_ON_ONCE(secpath_exists(skb)); + __skb_queue_tail(&sk->sk_receive_queue, skb); +} + /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) -- cgit v1.2.3 From 14ad6ed30a10afbe91b0749d6378285f4225d482 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2025 19:29:39 +0100 Subject: net: allow small head cache usage with large MAX_SKB_FRAGS values Sabrina reported the following splat: WARNING: CPU: 0 PID: 1 at net/core/dev.c:6935 netif_napi_add_weight_locked+0x8f2/0xba0 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.14.0-rc1-net-00092-g011b03359038 #996 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 RIP: 0010:netif_napi_add_weight_locked+0x8f2/0xba0 Code: e8 c3 e6 6a fe 48 83 c4 28 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc c7 44 24 10 ff ff ff ff e9 8f fb ff ff e8 9e e6 6a fe <0f> 0b e9 d3 fe ff ff e8 92 e6 6a fe 48 8b 04 24 be ff ff ff ff 48 RSP: 0000:ffffc9000001fc60 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff88806ce48128 RCX: 1ffff11001664b9e RDX: ffff888008f00040 RSI: ffffffff8317ca42 RDI: ffff88800b325cb6 RBP: ffff88800b325c40 R08: 0000000000000001 R09: ffffed100167502c R10: ffff88800b3a8163 R11: 0000000000000000 R12: ffff88800ac1c168 R13: ffff88800ac1c168 R14: ffff88800ac1c168 R15: 0000000000000007 FS: 0000000000000000(0000) GS:ffff88806ce00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff888008201000 CR3: 0000000004c94001 CR4: 0000000000370ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: gro_cells_init+0x1ba/0x270 xfrm_input_init+0x4b/0x2a0 xfrm_init+0x38/0x50 ip_rt_init+0x2d7/0x350 ip_init+0xf/0x20 inet_init+0x406/0x590 do_one_initcall+0x9d/0x2e0 do_initcalls+0x23b/0x280 kernel_init_freeable+0x445/0x490 kernel_init+0x20/0x1d0 ret_from_fork+0x46/0x80 ret_from_fork_asm+0x1a/0x30 irq event stamp: 584330 hardirqs last enabled at (584338): [] __up_console_sem+0x77/0xb0 hardirqs last disabled at (584345): [] __up_console_sem+0x5c/0xb0 softirqs last enabled at (583242): [] netlink_insert+0x14d/0x470 softirqs last disabled at (583754): [] netif_napi_add_weight_locked+0x77d/0xba0 on kernel built with MAX_SKB_FRAGS=45, where SKB_WITH_OVERHEAD(1024) is smaller than GRO_MAX_HEAD. Such built additionally contains the revert of the single page frag cache so that napi_get_frags() ends up using the page frag allocator, triggering the splat. Note that the underlying issue is independent from the mentioned revert; address it ensuring that the small head cache will fit either TCP and GRO allocation and updating napi_alloc_skb() and __netdev_alloc_skb() to select kmalloc() usage for any allocation fitting such cache. Reported-by: Sabrina Dubroca Suggested-by: Eric Dumazet Fixes: 3948b05950fd ("net: introduce a config option to tweak MAX_SKB_FRAGS") Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/gro.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h index b9b58c1f8d19..7b548f91754b 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -11,6 +11,9 @@ #include #include +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) + struct napi_gro_cb { union { struct { -- cgit v1.2.3 From 6bc7e4eb0499562ccd291712fd7be0d1a5aad00a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2025 19:29:40 +0100 Subject: Revert "net: skb: introduce and use a single page frag cache" After the previous commit is finally safe to revert commit dbae2b062824 ("net: skb: introduce and use a single page frag cache"): do it here. The intended goal of such change was to counter a performance regression introduced by commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs"). Unfortunately, the blamed commit introduces another regression for the virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny size, so that the whole head frag could fit a 512-byte block. The single page frag cache uses a 1K fragment for such allocation, and the additional overhead, under small UDP packets flood, makes the page allocator a bottleneck. Thanks to commit bf9f1baa279f ("net: add dedicated kmem_cache for typical/small skb->head"), this revert does not re-introduce the original regression. Actually, in the relevant test on top of this revert, I measure a small but noticeable positive delta, just above noise level. The revert itself required some additional mangling due to recent updates in the affected code. Suggested-by: Eric Dumazet Fixes: dbae2b062824 ("net: skb: introduce and use a single page frag cache") Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 94b7d4eca003..ab550a89b9bf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4117,7 +4117,6 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); -void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) -- cgit v1.2.3 From b4c173dfbb6c78568578ff18f9e8822d7bd0e31b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Feb 2025 11:02:58 +0100 Subject: fuse: don't truncate cached, mutated symlink Fuse allows the value of a symlink to change and this property is exploited by some filesystems (e.g. CVMFS). It has been observed, that sometimes after changing the symlink contents, the value is truncated to the old size. This is caused by fuse_getattr() racing with fuse_reverse_inval_inode(). fuse_reverse_inval_inode() updates the fuse_inode's attr_version, which results in fuse_change_attributes() exiting before updating the cached attributes This is okay, as the cached attributes remain invalid and the next call to fuse_change_attributes() will likely update the inode with the correct values. The reason this causes problems is that cached symlinks will be returned through page_get_link(), which truncates the symlink to inode->i_size. This is correct for filesystems that don't mutate symlinks, but in this case it causes bad behavior. The solution is to just remove this truncation. This can cause a regression in a filesystem that relies on supplying a symlink larger than the file size, but this is unlikely. If that happens we'd need to make this behavior conditional. Reported-by: Laura Promberger Tested-by: Sam Lewis Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20250220100258.793363-1-mszeredi@redhat.com Reviewed-by: Bernd Schubert Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c3b2f8a621f..9346adf28f7b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3452,6 +3452,8 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); +extern const char *page_get_link_raw(struct dentry *, struct inode *, + struct delayed_call *); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); -- cgit v1.2.3 From 8510edf191d2df0822ea22d6226e4eef87562271 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Tue, 18 Feb 2025 20:02:08 +0800 Subject: mm/filemap: fix miscalculated file range for filemap_fdatawrite_range_kick() iocb->ki_pos has been updated with the number of written bytes since generic_perform_write(). Besides __filemap_fdatawrite_range() accepts the inclusive end of the data range. Fixes: 1d4457576570 ("mm: call filemap_fdatawrite_range_kick() after IOCB_DONTCACHE issue") Signed-off-by: Jingbo Xu Link: https://lore.kernel.org/r/20250218120209.88093-2-jefflexu@linux.alibaba.com Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9346adf28f7b..2788df98080f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2975,8 +2975,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; - filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, - iocb->ki_pos + count); + filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count, + iocb->ki_pos - 1); } return count; -- cgit v1.2.3 From 1f0fc3374f3345ff1d150c5c56ac5016e5d3826a Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 18 Feb 2025 19:22:48 +0000 Subject: afs: Give an afs_server object a ref on the afs_cell object it points to Give an afs_server object a ref on the afs_cell object it points to so that the cell doesn't get deleted before the server record. Whilst this is circular (cell -> vol -> server_list -> server -> cell), the ref only pins the memory, not the lifetime as that's controlled by the activity counter. When the volume's activity counter reaches 0, it detaches from the cell and discards its server list; when a cell's activity counter reaches 0, it discards its root volume. At that point, the circularity is cut. Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation") Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250218192250.296870-6-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/afs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index b0db89058c91..958a2460330c 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -174,6 +174,7 @@ enum yfs_cm_operation { EM(afs_cell_trace_get_queue_dns, "GET q-dns ") \ EM(afs_cell_trace_get_queue_manage, "GET q-mng ") \ EM(afs_cell_trace_get_queue_new, "GET q-new ") \ + EM(afs_cell_trace_get_server, "GET server") \ EM(afs_cell_trace_get_vol, "GET vol ") \ EM(afs_cell_trace_insert, "INSERT ") \ EM(afs_cell_trace_manage, "MANAGE ") \ @@ -182,6 +183,7 @@ enum yfs_cm_operation { EM(afs_cell_trace_put_destroy, "PUT destry") \ EM(afs_cell_trace_put_queue_work, "PUT q-work") \ EM(afs_cell_trace_put_queue_fail, "PUT q-fail") \ + EM(afs_cell_trace_put_server, "PUT server") \ EM(afs_cell_trace_put_vol, "PUT vol ") \ EM(afs_cell_trace_see_source, "SEE source") \ EM(afs_cell_trace_see_ws, "SEE ws ") \ -- cgit v1.2.3 From 5c70eb5c593d64d93b178905da215a9fd288a4b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Feb 2025 13:18:54 +0000 Subject: net: better track kernel sockets lifetime While kernel sockets are dismantled during pernet_operations->exit(), their freeing can be delayed by any tx packets still held in qdisc or device queues, due to skb_set_owner_w() prior calls. This then trigger the following warning from ref_tracker_dir_exit() [1] To fix this, make sure that kernel sockets own a reference on net->passive. Add sk_net_refcnt_upgrade() helper, used whenever a kernel socket is converted to a refcounted one. [1] [ 136.263918][ T35] ref_tracker: net notrefcnt@ffff8880638f01e0 has 1/2 users at [ 136.263918][ T35] sk_alloc+0x2b3/0x370 [ 136.263918][ T35] inet6_create+0x6ce/0x10f0 [ 136.263918][ T35] __sock_create+0x4c0/0xa30 [ 136.263918][ T35] inet_ctl_sock_create+0xc2/0x250 [ 136.263918][ T35] igmp6_net_init+0x39/0x390 [ 136.263918][ T35] ops_init+0x31e/0x590 [ 136.263918][ T35] setup_net+0x287/0x9e0 [ 136.263918][ T35] copy_net_ns+0x33f/0x570 [ 136.263918][ T35] create_new_namespaces+0x425/0x7b0 [ 136.263918][ T35] unshare_nsproxy_namespaces+0x124/0x180 [ 136.263918][ T35] ksys_unshare+0x57d/0xa70 [ 136.263918][ T35] __x64_sys_unshare+0x38/0x40 [ 136.263918][ T35] do_syscall_64+0xf3/0x230 [ 136.263918][ T35] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 136.263918][ T35] [ 136.343488][ T35] ref_tracker: net notrefcnt@ffff8880638f01e0 has 1/2 users at [ 136.343488][ T35] sk_alloc+0x2b3/0x370 [ 136.343488][ T35] inet6_create+0x6ce/0x10f0 [ 136.343488][ T35] __sock_create+0x4c0/0xa30 [ 136.343488][ T35] inet_ctl_sock_create+0xc2/0x250 [ 136.343488][ T35] ndisc_net_init+0xa7/0x2b0 [ 136.343488][ T35] ops_init+0x31e/0x590 [ 136.343488][ T35] setup_net+0x287/0x9e0 [ 136.343488][ T35] copy_net_ns+0x33f/0x570 [ 136.343488][ T35] create_new_namespaces+0x425/0x7b0 [ 136.343488][ T35] unshare_nsproxy_namespaces+0x124/0x180 [ 136.343488][ T35] ksys_unshare+0x57d/0xa70 [ 136.343488][ T35] __x64_sys_unshare+0x38/0x40 [ 136.343488][ T35] do_syscall_64+0xf3/0x230 [ 136.343488][ T35] entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 0cafd77dcd03 ("net: add a refcount tracker for kernel sockets") Reported-by: syzbot+30a19e01a97420719891@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67b72aeb.050a0220.14d86d.0283.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250220131854.4048077-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 8036b3b79cd8..7ef728324e4e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1751,6 +1751,7 @@ static inline bool sock_allow_reclassification(const struct sock *csk) struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); +void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); void sk_free_unlock_clone(struct sock *sk); -- cgit v1.2.3 From 769c1b79295c38d60fde4c0a8f5f31e01360c54f Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 25 Feb 2025 13:18:43 +0000 Subject: ASoC: cs35l56: Prevent races when soft-resetting using SPI control When SPI is used for control, the driver must hold the SPI bus lock while issuing the sequence of writes to perform a soft reset. >From the time the driver writes the SYSTEM_RESET command until the driver does a write to terminate the reset, there must not be any activity on the SPI bus lines. If there is any SPI activity during the soft-reset, another soft-reset will be triggered. The state of the SPI chip select is irrelevant. A repeated soft-reset does not in itself cause any problems, and it is not an infinite loop. The problem is a race between these resets and the driver polling for boot completion. There is a time window between soft resets where the driver could read HALO_STATE as 2 (fully booted) while the chip is actually soft-resetting. Although this window is small, it is long enough that it is possible to hit it in normal operation. To prevent this race and ensure the chip really is fully booted, the driver calls spi_bus_lock() to prevent other activity while resetting. It then issues the SYSTEM_RESET mailbox command. After allowing sufficient time for reset to take effect, the driver issues a PING mailbox command, which will force completion of the full soft-reset sequence. The SPI bus lock can then be released. The mailbox is checked for any boot or wakeup response from the firmware, before the value in HALO_STATE will be trusted. This does not affect SoundWire or I2C control. Fixes: 8a731fd37f8b ("ASoC: cs35l56: Move utility functions to shared file") Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20250225131843.113752-3-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs35l56.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include') diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 3dc7a1551ac3..5d653a3491d0 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #define CS35L56_DEVID 0x0000000 @@ -61,6 +62,7 @@ #define CS35L56_IRQ1_MASK_8 0x000E0AC #define CS35L56_IRQ1_MASK_18 0x000E0D4 #define CS35L56_IRQ1_MASK_20 0x000E0DC +#define CS35L56_DSP_MBOX_1_RAW 0x0011000 #define CS35L56_DSP_VIRTUAL1_MBOX_1 0x0011020 #define CS35L56_DSP_VIRTUAL1_MBOX_2 0x0011024 #define CS35L56_DSP_VIRTUAL1_MBOX_3 0x0011028 @@ -224,6 +226,7 @@ #define CS35L56_HALO_STATE_SHUTDOWN 1 #define CS35L56_HALO_STATE_BOOT_DONE 2 +#define CS35L56_MBOX_CMD_PING 0x0A000000 #define CS35L56_MBOX_CMD_AUDIO_PLAY 0x0B000001 #define CS35L56_MBOX_CMD_AUDIO_PAUSE 0x0B000002 #define CS35L56_MBOX_CMD_AUDIO_REINIT 0x0B000003 @@ -254,6 +257,16 @@ #define CS35L56_NUM_BULK_SUPPLIES 3 #define CS35L56_NUM_DSP_REGIONS 5 +/* Additional margin for SYSTEM_RESET to control port ready on SPI */ +#define CS35L56_SPI_RESET_TO_PORT_READY_US (CS35L56_CONTROL_PORT_READY_US + 2500) + +struct cs35l56_spi_payload { + __be32 addr; + __be16 pad; + __be32 value; +} __packed; +static_assert(sizeof(struct cs35l56_spi_payload) == 10); + struct cs35l56_base { struct device *dev; struct regmap *regmap; @@ -269,6 +282,7 @@ struct cs35l56_base { s8 cal_index; struct cirrus_amp_cal_data cal_data; struct gpio_desc *reset_gpio; + struct cs35l56_spi_payload *spi_payload_buf; }; static inline bool cs35l56_is_otp_register(unsigned int reg) @@ -276,6 +290,23 @@ static inline bool cs35l56_is_otp_register(unsigned int reg) return (reg >> 16) == 3; } +static inline int cs35l56_init_config_for_spi(struct cs35l56_base *cs35l56, + struct spi_device *spi) +{ + cs35l56->spi_payload_buf = devm_kzalloc(&spi->dev, + sizeof(*cs35l56->spi_payload_buf), + GFP_KERNEL | GFP_DMA); + if (!cs35l56->spi_payload_buf) + return -ENOMEM; + + return 0; +} + +static inline bool cs35l56_is_spi(struct cs35l56_base *cs35l56) +{ + return IS_ENABLED(CONFIG_SPI_MASTER) && !!cs35l56->spi_payload_buf; +} + extern const struct regmap_config cs35l56_regmap_i2c; extern const struct regmap_config cs35l56_regmap_spi; extern const struct regmap_config cs35l56_regmap_sdw; -- cgit v1.2.3 From 889c57066ceee5e9172232da0608a8ac053bb6e5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 25 Feb 2025 10:21:41 +0800 Subject: block: make segment size limit workable for > 4K PAGE_SIZE Using PAGE_SIZE as a minimum expected DMA segment size in consideration of devices which have a max DMA segment size of < 64k when used on 64k PAGE_SIZE systems leads to devices not being able to probe such as eMMC and Exynos UFS controller [0] [1] you can end up with a probe failure as follows: WARNING: CPU: 2 PID: 397 at block/blk-settings.c:339 blk_validate_limits+0x364/0x3c0 Ensure we use min(max_seg_size, seg_boundary_mask + 1) as the new min segment size when max segment size is < PAGE_SIZE for 16k and 64k base page size systems. If anyone need to backport this patch, the following commits are depended: commit 6aeb4f836480 ("block: remove bio_add_pc_page") commit 02ee5d69e3ba ("block: remove blk_rq_bio_prep") commit b7175e24d6ac ("block: add a dma mapping iterator") Link: https://lore.kernel.org/linux-block/20230612203314.17820-1-bvanassche@acm.org/ # [0] Link: https://lore.kernel.org/linux-block/1d55e942-5150-de4c-3a02-c3d066f87028@acm.org/ # [1] Cc: Yi Zhang Cc: John Garry Cc: Keith Busch Tested-by: Paul Bunyan Reviewed-by: Daniel Gomez Reviewed-by: Luis Chamberlain Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250225022141.2154581-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 248416ecd01c..58ff5aca83b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -367,6 +367,7 @@ struct queue_limits { unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; + unsigned int min_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; -- cgit v1.2.3 From 68f3ea7ee199ef77551e090dfef5a49046ea8443 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 21 Feb 2025 14:57:06 +0100 Subject: vmlinux.lds: Ensure that const vars with relocations are mapped R/O In the kernel, there are architectures (x86, arm64) that perform boot-time relocation (for KASLR) without relying on PIE codegen. In this case, all const global objects are emitted into .rodata, including const objects with fields that will be fixed up by the boot-time relocation code. This implies that .rodata (and .text in some cases) need to be writable at boot, but they will usually be mapped read-only as soon as the boot completes. When using PIE codegen, the compiler will emit const global objects into .data.rel.ro rather than .rodata if the object contains fields that need such fixups at boot-time. This permits the linker to annotate such regions as requiring read-write access only at load time, but not at execution time (in user space), while keeping .rodata truly const (in user space, this is important for reducing the CoW footprint of dynamic executables). This distinction does not matter for the kernel, but it does imply that const data will end up in writable memory if the .data.rel.ro sections are not treated in a special way, as they will end up in the writable .data segment by default. So emit .data.rel.ro into the .rodata segment. Cc: stable@vger.kernel.org Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250221135704.431269-5-ardb+git@google.com Signed-off-by: Josh Poimboeuf --- include/asm-generic/vmlinux.lds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 54504013c749..337d3336e175 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -457,7 +457,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) . = ALIGN((align)); \ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ __start_rodata = .; \ - *(.rodata) *(.rodata.*) \ + *(.rodata) *(.rodata.*) *(.data.rel.ro*) \ SCHED_DATA \ RO_AFTER_INIT_DATA /* Read only after init */ \ . = ALIGN(8); \ -- cgit v1.2.3 From 73cfc53cc3b6380eccf013049574485f64cb83ca Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 21 Feb 2025 14:57:07 +0100 Subject: objtool: Fix C jump table annotations for Clang A C jump table (such as the one used by the BPF interpreter) is a const global array of absolute code addresses, and this means that the actual values in the table may not be known until the kernel is booted (e.g., when using KASLR or when the kernel VA space is sized dynamically). When using PIE codegen, the compiler will default to placing such const global objects in .data.rel.ro (which is annotated as writable), rather than .rodata (which is annotated as read-only). As C jump tables are explicitly emitted into .rodata, this used to result in warnings for LoongArch builds (which uses PIE codegen for the entire kernel) like Warning: setting incorrect section attributes for .rodata..c_jump_table due to the fact that the explicitly specified .rodata section inherited the read-write annotation that the compiler uses for such objects when using PIE codegen. This warning was suppressed by explicitly adding the read-only annotation to the __attribute__((section(""))) string, by commit c5b1184decc8 ("compiler.h: specify correct attribute for .rodata..c_jump_table") Unfortunately, this hack does not work on Clang's integrated assembler, which happily interprets the appended section type and permission specifiers as part of the section name, which therefore no longer matches the hard-coded pattern '.rodata..c_jump_table' that objtool expects, causing it to emit a warning kernel/bpf/core.o: warning: objtool: ___bpf_prog_run+0x20: sibling call from callable instruction with modified stack frame Work around this, by emitting C jump tables into .data.rel.ro instead, which is treated as .rodata by the linker script for all builds, not just PIE based ones. Fixes: c5b1184decc8 ("compiler.h: specify correct attribute for .rodata..c_jump_table") Tested-by: Tiezhu Yang # on LoongArch Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250221135704.431269-6-ardb+git@google.com Signed-off-by: Josh Poimboeuf --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index b087de2f3e94..0c25f3e429bb 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -110,7 +110,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Unreachable code */ #ifdef CONFIG_OBJTOOL /* Annotate a C jump table to allow objtool to follow the code flow */ -#define __annotate_jump_table __section(".rodata..c_jump_table,\"a\",@progbits #") +#define __annotate_jump_table __section(".data.rel.ro.c_jump_table") #else /* !CONFIG_OBJTOOL */ #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ -- cgit v1.2.3 From 9084ed79ddaaaa1ec01cd304af9fb532c26252db Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Thu, 20 Feb 2025 14:29:36 -0500 Subject: lsm,nfs: fix memory leak of lsm_context commit b530104f50e8 ("lsm: lsm_context in security_dentry_init_security") did not preserve the lsm id for subsequent release calls, which results in a memory leak. Fix it by saving the lsm id in the nfs4_label and providing it on the subsequent release call. Fixes: b530104f50e8 ("lsm: lsm_context in security_dentry_init_security") Signed-off-by: Stephen Smalley Acked-by: Paul Moore Acked-by: Casey Schaufler Signed-off-by: Anna Schumaker --- include/linux/nfs4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 71fbebfa43c7..9ac83ca88326 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -47,6 +47,7 @@ struct nfs4_acl { struct nfs4_label { uint32_t lfs; uint32_t pi; + u32 lsmid; u32 len; char *label; }; -- cgit v1.2.3 From 18912c520674ec4d920fe3826e7e4fefeecdf5ae Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 24 Feb 2025 09:44:01 -0800 Subject: tcp: devmem: don't write truncated dmabuf CMSGs to userspace Currently, we report -ETOOSMALL (err) only on the first iteration (!sent). When we get put_cmsg error after a bunch of successful put_cmsg calls, we don't signal the error at all. This might be confusing on the userspace side which will see truncated CMSGs but no MSG_CTRUNC signal. Consider the following case: - sizeof(struct cmsghdr) = 16 - sizeof(struct dmabuf_cmsg) = 24 - total cmsg size (CMSG_LEN) = 40 (16+24) When calling recvmsg with msg_controllen=60, the userspace will receive two(!) dmabuf_cmsg(s), the first one will be a valid one and the second one will be silently truncated. There is no easy way to discover the truncation besides doing something like "cm->cmsg_len != CMSG_LEN(sizeof(dmabuf_cmsg))". Introduce new put_devmem_cmsg wrapper that reports an error instead of doing the truncation. Mina suggests that it's the intended way this API should work. Note that we might now report MSG_CTRUNC when the users (incorrectly) call us with msg_control == NULL. Fixes: 8f0b3cc9a4c1 ("tcp: RX path for devmem TCP") Reviewed-by: Mina Almasry Signed-off-by: Stanislav Fomichev Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250224174401.3582695-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index d18cc47e89bd..c3322eb3d686 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -392,6 +392,8 @@ struct ucred { extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); +extern int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data); struct timespec64; struct __kernel_timespec; -- cgit v1.2.3 From a6aa36e957a1bfb5341986dec32d013d23228fe1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 14 Feb 2025 13:14:34 +0900 Subject: block: Remove zone write plugs when handling native zone append writes For devices that natively support zone append operations, REQ_OP_ZONE_APPEND BIOs are not processed through zone write plugging and are immediately issued to the zoned device. This means that there is no write pointer offset tracking done for these operations and that a zone write plug is not necessary. However, when receiving a zone append BIO, we may already have a zone write plug for the target zone if that zone was previously partially written using regular write operations. In such case, since the write pointer offset of the zone write plug is not incremented by the amount of sectors appended to the zone, 2 issues arise: 1) we risk leaving the plug in the disk hash table if the zone is fully written using zone append or regular write operations, because the write pointer offset will never reach the "zone full" state. 2) Regular write operations that are issued after zone append operations will always be failed by blk_zone_wplug_prepare_bio() as the write pointer alignment check will fail, even if the user correctly accounted for the zone append operations and issued the regular writes with a correct sector. Avoid these issues by immediately removing the zone write plug of zones that are the target of zone append operations when blk_zone_plug_bio() is called. The new function blk_zone_wplug_handle_native_zone_append() implements this for devices that natively support zone append. The removal of the zone write plug using disk_remove_zone_wplug() requires aborting all plugged regular write using disk_zone_wplug_abort() as otherwise the plugged write BIOs would never be executed (with the plug removed, the completion path will never see again the zone write plug as disk_get_zone_wplug() will return NULL). Rate-limited warnings are added to blk_zone_wplug_handle_native_zone_append() and to disk_zone_wplug_abort() to signal this. Since blk_zone_wplug_handle_native_zone_append() is called in the hot path for operations that will not be plugged, disk_get_zone_wplug() is optimized under the assumption that a user issuing zone append operations is not at the same time issuing regular writes and that there are no hashed zone write plugs. The struct gendisk atomic counter nr_zone_wplugs is added to check this, with this counter incremented in disk_insert_zone_wplug() and decremented in disk_remove_zone_wplug(). To be consistent with this fix, we do not need to fill the zone write plug hash table with zone write plugs for zones that are partially written for a device that supports native zone append operations. So modify blk_revalidate_seq_zone() to return early to avoid allocating and inserting a zone write plug for partially written sequential zones if the device natively supports zone append. Reported-by: Jorgen Hansen Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Tested-by: Jorgen Hansen Link: https://lore.kernel.org/r/20250214041434.82564-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 58ff5aca83b6..d37751789bf5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -196,10 +196,11 @@ struct gendisk { unsigned int zone_capacity; unsigned int last_zone_capacity; unsigned long __rcu *conv_zones_bitmap; - unsigned int zone_wplugs_hash_bits; - spinlock_t zone_wplugs_lock; + unsigned int zone_wplugs_hash_bits; + atomic_t nr_zone_wplugs; + spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; - struct hlist_head *zone_wplugs_hash; + struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ -- cgit v1.2.3 From 02410ac72ac3707936c07ede66e94360d0d65319 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 26 Feb 2025 12:06:51 +0000 Subject: mm: hugetlb: Add huge page size param to huge_ptep_get_and_clear() In order to fix a bug, arm64 needs to be told the size of the huge page for which the huge_pte is being cleared in huge_ptep_get_and_clear(). Provide for this by adding an `unsigned long sz` parameter to the function. This follows the same pattern as huge_pte_clear() and set_huge_pte_at(). This commit makes the required interface modifications to the core mm as well as all arches that implement this function (arm64, loongarch, mips, parisc, powerpc, riscv, s390, sparc). The actual arm64 bug will be fixed in a separate commit. Cc: stable@vger.kernel.org Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") Acked-by: David Hildenbrand Reviewed-by: Alexandre Ghiti # riscv Reviewed-by: Christophe Leroy Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Signed-off-by: Ryan Roberts Acked-by: Alexander Gordeev # s390 Link: https://lore.kernel.org/r/20250226120656.2400136-2-ryan.roberts@arm.com Signed-off-by: Will Deacon --- include/asm-generic/hugetlb.h | 2 +- include/linux/hugetlb.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index f42133dae68e..2afc95bf1655 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -90,7 +90,7 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, unsigned long sz) { return ptep_get_and_clear(mm, addr, ptep); } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ec8c0ccc8f95..bf5f7256bd28 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1004,7 +1004,9 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + unsigned long psize = huge_page_size(hstate_vma(vma)); + + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize); } #endif -- cgit v1.2.3 From 916b7f42b3b3b539a71c204a9b49fdc4ca92cd82 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 15:06:31 -0800 Subject: kvm: retry nx_huge_page_recovery_thread creation A VMM may send a non-fatal signal to its threads, including vCPU tasks, at any time, and thus may signal vCPU tasks during KVM_RUN. If a vCPU task receives the signal while its trying to spawn the huge page recovery vhost task, then KVM_RUN will fail due to copy_process() returning -ERESTARTNOINTR. Rework call_once() to mark the call complete if and only if the called function succeeds, and plumb the function's true error code back to the call_once() invoker. This provides userspace with the correct, non-fatal error code so that the VMM doesn't terminate the VM on -ENOMEM, and allows subsequent KVM_RUN a succeed by virtue of retrying creation of the NX huge page task. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson [implemented the kvm user side] Signed-off-by: Keith Busch Message-ID: <20250227230631.303431-3-kbusch@meta.com> Signed-off-by: Paolo Bonzini --- include/linux/call_once.h | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/call_once.h b/include/linux/call_once.h index 6261aa0b3fb0..13cd6469e7e5 100644 --- a/include/linux/call_once.h +++ b/include/linux/call_once.h @@ -26,20 +26,41 @@ do { \ __once_init((once), #once, &__key); \ } while (0) -static inline void call_once(struct once *once, void (*cb)(struct once *)) +/* + * call_once - Ensure a function has been called exactly once + * + * @once: Tracking struct + * @cb: Function to be called + * + * If @once has never completed successfully before, call @cb and, if + * it returns a zero or positive value, mark @once as completed. Return + * the value returned by @cb + * + * If @once has completed succesfully before, return 0. + * + * The call to @cb is implicitly surrounded by a mutex, though for + * efficiency the * function avoids taking it after the first call. + */ +static inline int call_once(struct once *once, int (*cb)(struct once *)) { - /* Pairs with atomic_set_release() below. */ - if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) - return; - - guard(mutex)(&once->lock); - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); - if (atomic_read(&once->state) != ONCE_NOT_STARTED) - return; - - atomic_set(&once->state, ONCE_RUNNING); - cb(once); - atomic_set_release(&once->state, ONCE_COMPLETED); + int r, state; + + /* Pairs with atomic_set_release() below. */ + if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) + return 0; + + guard(mutex)(&once->lock); + state = atomic_read(&once->state); + if (unlikely(state != ONCE_NOT_STARTED)) + return WARN_ON_ONCE(state != ONCE_COMPLETED) ? -EINVAL : 0; + + atomic_set(&once->state, ONCE_RUNNING); + r = cb(once); + if (r < 0) + atomic_set(&once->state, ONCE_NOT_STARTED); + else + atomic_set_release(&once->state, ONCE_COMPLETED); + return r; } #endif /* _LINUX_CALL_ONCE_H */ -- cgit v1.2.3 From e04918dc594669068f5d59d567d08db531167188 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 2 Mar 2025 15:18:24 +0800 Subject: cred: Fix RCU warnings in override/revert_creds Fix RCU warnings in override_creds and revert_creds by turning the RCU pointer into a normal pointer using rcu_replace_pointer. These warnings were previously private to the cred code, but due to the move into the header file they are now polluting unrelated subsystems. Fixes: 49dffdfde462 ("cred: Add a light version of override/revert_creds()") Signed-off-by: Herbert Xu Link: https://lore.kernel.org/r/Z8QGQGW0IaSklKG7@gondor.apana.org.au Signed-off-by: Christian Brauner --- include/linux/cred.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index 0c3c4b16b469..5658a3bfe803 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -172,18 +172,12 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) static inline const struct cred *override_creds(const struct cred *override_cred) { - const struct cred *old = current->cred; - - rcu_assign_pointer(current->cred, override_cred); - return old; + return rcu_replace_pointer(current->cred, override_cred, 1); } static inline const struct cred *revert_creds(const struct cred *revert_cred) { - const struct cred *override_cred = current->cred; - - rcu_assign_pointer(current->cred, revert_cred); - return override_cred; + return rcu_replace_pointer(current->cred, revert_cred, 1); } /** -- cgit v1.2.3 From 3d252160b818045f3a152b13756f6f37ca34639d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 4 Mar 2025 13:51:38 +0000 Subject: fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex pipe_readable(), pipe_writable(), and pipe_poll() can read "pipe->head" and "pipe->tail" outside of "pipe->mutex" critical section. When the head and the tail are read individually in that order, there is a window for interruption between the two reads in which both the head and the tail can be updated by concurrent readers and writers. One of the problematic scenarios observed with hackbench running multiple groups on a large server on a particular pipe inode is as follows: pipe->head = 36 pipe->tail = 36 hackbench-118762 [057] ..... 1029.550548: pipe_write: *wakes up: pipe not full* hackbench-118762 [057] ..... 1029.550548: pipe_write: head: 36 -> 37 [tail: 36] hackbench-118762 [057] ..... 1029.550548: pipe_write: *wake up next reader 118740* hackbench-118762 [057] ..... 1029.550548: pipe_write: *wake up next writer 118768* hackbench-118768 [206] ..... 1029.55055X: pipe_write: *writer wakes up* hackbench-118768 [206] ..... 1029.55055X: pipe_write: head = READ_ONCE(pipe->head) [37] ... CPU 206 interrupted (exact wakeup was not traced but 118768 did read head at 37 in traces) hackbench-118740 [057] ..... 1029.550558: pipe_read: *reader wakes up: pipe is not empty* hackbench-118740 [057] ..... 1029.550558: pipe_read: tail: 36 -> 37 [head = 37] hackbench-118740 [057] ..... 1029.550559: pipe_read: *pipe is empty; wakeup writer 118768* hackbench-118740 [057] ..... 1029.550559: pipe_read: *sleeps* hackbench-118766 [185] ..... 1029.550592: pipe_write: *New writer comes in* hackbench-118766 [185] ..... 1029.550592: pipe_write: head: 37 -> 38 [tail: 37] hackbench-118766 [185] ..... 1029.550592: pipe_write: *wakes up reader 118766* hackbench-118740 [185] ..... 1029.550598: pipe_read: *reader wakes up; pipe not empty* hackbench-118740 [185] ..... 1029.550599: pipe_read: tail: 37 -> 38 [head: 38] hackbench-118740 [185] ..... 1029.550599: pipe_read: *pipe is empty* hackbench-118740 [185] ..... 1029.550599: pipe_read: *reader sleeps; wakeup writer 118768* ... CPU 206 switches back to writer hackbench-118768 [206] ..... 1029.550601: pipe_write: tail = READ_ONCE(pipe->tail) [38] hackbench-118768 [206] ..... 1029.550601: pipe_write: pipe_full()? (u32)(37 - 38) >= 16? Yes hackbench-118768 [206] ..... 1029.550601: pipe_write: *writer goes back to sleep* [ Tasks 118740 and 118768 can then indefinitely wait on each other. ] The unsigned arithmetic in pipe_occupancy() wraps around when "pipe->tail > pipe->head" leading to pipe_full() returning true despite the pipe being empty. The case of genuine wraparound of "pipe->head" is handled since pipe buffer has data allowing readers to make progress until the pipe->tail wraps too after which the reader will wakeup a sleeping writer, however, mistaking the pipe to be full when it is in fact empty can lead to readers and writers waiting on each other indefinitely. This issue became more problematic and surfaced as a hang in hackbench after the optimization in commit aaec5a95d596 ("pipe_read: don't wake up the writer if the pipe is still full") significantly reduced the number of spurious wakeups of writers that had previously helped mask the issue. To avoid missing any updates between the reads of "pipe->head" and "pipe->write", unionize the two with a single unsigned long "pipe->head_tail" member that can be loaded atomically. Using "pipe->head_tail" to read the head and the tail ensures the lockless checks do not miss any updates to the head or the tail and since those two are only updated under "pipe->mutex", it ensures that the head is always ahead of, or equal to the tail resulting in correct calculations. [ prateek: commit log, testing on x86 platforms. ] Reported-and-debugged-by: Swapnil Sapkal Closes: https://lore.kernel.org/lkml/e813814e-7094-4673-bc69-731af065a0eb@amd.com/ Reported-by: Alexey Gladkov Closes: https://lore.kernel.org/all/Z8Wn0nTvevLRG_4m@example.org/ Fixes: 8cefc107ca54 ("pipe: Use head and tail pointers for the ring, not cursor and length") Tested-by: Swapnil Sapkal Reviewed-by: Oleg Nesterov Tested-by: Alexey Gladkov Signed-off-by: K Prateek Nayak Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 8ff23bf5a819..3cc4f8eab853 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -31,6 +31,33 @@ struct pipe_buffer { unsigned long private; }; +/* + * Really only alpha needs 32-bit fields, but + * might as well do it for 64-bit architectures + * since that's what we've historically done, + * and it makes 'head_tail' always be a simple + * 'unsigned long'. + */ +#ifdef CONFIG_64BIT +typedef unsigned int pipe_index_t; +#else +typedef unsigned short pipe_index_t; +#endif + +/* + * We have to declare this outside 'struct pipe_inode_info', + * but then we can't use 'union pipe_index' for an anonymous + * union, so we end up having to duplicate this declaration + * below. Annoying. + */ +union pipe_index { + unsigned long head_tail; + struct { + pipe_index_t head; + pipe_index_t tail; + }; +}; + /** * struct pipe_inode_info - a linux kernel pipe * @mutex: mutex protecting the whole thing @@ -58,8 +85,16 @@ struct pipe_buffer { struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; - unsigned int head; - unsigned int tail; + + /* This has to match the 'union pipe_index' above */ + union { + unsigned long head_tail; + struct { + pipe_index_t head; + pipe_index_t tail; + }; + }; + unsigned int max_usage; unsigned int ring_size; unsigned int nr_accounted; -- cgit v1.2.3 From 778b94d7ac17b5800aa857222911f09cc986b509 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 28 Feb 2025 11:01:53 -0600 Subject: ACPI: platform_profile: Add support for hidden choices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When two drivers don't support all the same profiles the legacy interface only exports the common profiles. This causes problems for cases where one driver uses low-power but another uses quiet because the result is that neither is exported to sysfs. To allow two drivers to disagree, add support for "hidden choices". Hidden choices are platform profiles that a driver supports to be compatible with the platform profile of another driver. Fixes: 688834743d67 ("ACPI: platform_profile: Allow multiple handlers") Reported-by: Antheas Kapenekakis Closes: https://lore.kernel.org/platform-driver-x86/e64b771e-3255-42ad-9257-5b8fc6c24ac9@gmx.de/T/#mc068042dd29df36c16c8af92664860fc4763974b Signed-off-by: Mario Limonciello Tested-by: Antheas Kapenekakis Tested-by: Derek J. Clark Acked-by: Ilpo Järvinen Link: https://patch.msgid.link/20250228170155.2623386-2-superm1@kernel.org Signed-off-by: Rafael J. Wysocki --- include/linux/platform_profile.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 8ab5b0e8eb2c..8c9df7dadd5d 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -33,6 +33,8 @@ enum platform_profile_option { * @probe: Callback to setup choices available to the new class device. These * choices will only be enforced when setting a new profile, not when * getting the current one. + * @hidden_choices: Callback to setup choices that are not visible to the user + * but can be set by the driver. * @profile_get: Callback that will be called when showing the current platform * profile in sysfs. * @profile_set: Callback that will be called when storing a new platform @@ -40,6 +42,7 @@ enum platform_profile_option { */ struct platform_profile_ops { int (*probe)(void *drvdata, unsigned long *choices); + int (*hidden_choices)(void *drvdata, unsigned long *choices); int (*profile_get)(struct device *dev, enum platform_profile_option *profile); int (*profile_set)(struct device *dev, enum platform_profile_option profile); }; -- cgit v1.2.3 From c27c66afc449b80f3b4b84d123358c0248f2cf63 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Mar 2025 07:08:09 -1000 Subject: fs/pipe: Fix pipe_occupancy() with 16-bit indexes The pipe_occupancy() logic implicitly relied on the natural unsigned modulo arithmetic in C, but that doesn't work for the new 'pipe_index_t' case, since any arithmetic will be done in 'int' (and here we had also made it 'unsigned int' due to the function call boundary). So make the modulo arithmetic explicit by casting the result to the proper type. Cc: Oleg Nesterov Cc: Mateusz Guzik Cc: Manfred Spraul Cc: Christian Brauner Cc: Swapnil Sapkal Cc: Alexey Gladkov Cc: K Prateek Nayak Link: https://lore.kernel.org/all/CAHk-=wjyHsGLx=rxg6PKYBNkPYAejgo7=CbyL3=HGLZLsAaJFQ@mail.gmail.com/ Fixes: 3d252160b818 ("fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex") Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 3cc4f8eab853..1f013ed7577e 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -192,7 +192,7 @@ static inline bool pipe_empty(unsigned int head, unsigned int tail) */ static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) { - return head - tail; + return (pipe_index_t)(head - tail); } /** -- cgit v1.2.3 From cfced12f5100e50d56bc587299393fd33c1169a9 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 5 Mar 2025 11:23:01 +0000 Subject: include/linux/pipe_fs_i: Add htmldoc annotation for "head_tail" member Add htmldoc annotation for the newly introduced "head_tail" member describing it to be a union of the pipe_inode_info's @head and @tail members. Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20250305204609.5e64768e@canb.auug.org.au/ Fixes: 3d252160b818 ("fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex") Signed-off-by: K Prateek Nayak Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 1f013ed7577e..05ccbc5d0129 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -65,6 +65,7 @@ union pipe_index { * @wr_wait: writer wait point in case of full pipe * @head: The point of buffer production * @tail: The point of buffer consumption + * @head_tail: unsigned long union of @head and @tail * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) -- cgit v1.2.3 From 0eba2a7e858907a746ba69cd002eb9eb4dbd7bf3 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 28 Feb 2025 15:14:56 +0000 Subject: ASoC: ops: Consistently treat platform_max as control value This reverts commit 9bdd10d57a88 ("ASoC: ops: Shift tested values in snd_soc_put_volsw() by +min"), and makes some additional related updates. There are two ways the platform_max could be interpreted; the maximum register value, or the maximum value the control can be set to. The patch moved from treating the value as a control value to a register one. When the patch was applied it was technically correct as snd_soc_limit_volume() also used the register interpretation. However, even then most of the other usages treated platform_max as a control value, and snd_soc_limit_volume() has since been updated to also do so in commit fb9ad24485087 ("ASoC: ops: add correct range check for limiting volume"). That patch however, missed updating snd_soc_put_volsw() back to the control interpretation, and fixing snd_soc_info_volsw_range(). The control interpretation makes more sense as limiting is typically done from the machine driver, so it is appropriate to use the customer facing representation rather than the internal codec representation. Update all the code to consistently use this interpretation of platform_max. Finally, also add some comments to the soc_mixer_control struct to hopefully avoid further patches switching between the two approaches. Fixes: fb9ad24485087 ("ASoC: ops: add correct range check for limiting volume") Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250228151456.3703342-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/soc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index fcdb5adfcd5e..b3e84bc47c6f 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1261,7 +1261,10 @@ void snd_soc_close_delayed_work(struct snd_soc_pcm_runtime *rtd); /* mixer control */ struct soc_mixer_control { - int min, max, platform_max; + /* Minimum and maximum specified as written to the hardware */ + int min, max; + /* Limited maximum value specified as presented through the control */ + int platform_max; int reg, rreg; unsigned int shift, rshift; unsigned int sign_bit; -- cgit v1.2.3 From 0d2d0f3d93ddd6556f23c917d910becd9925ddeb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Mar 2025 07:35:40 -1000 Subject: fs/pipe: remove buggy and unused 'helper' function While looking for incorrect users of the pipe head/tail fields (see commit c27c66afc449: "fs/pipe: Fix pipe_occupancy() with 16-bit indexes"), I found a bug in pipe_discard_from() that looked entirely broken. However, the fix is trivial: this buggy function isn't actually called by anything, so let's just remove it ASAP. Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 05ccbc5d0129..e572e6fc4f81 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -281,15 +281,6 @@ static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe, return buf->ops->try_steal(pipe, buf); } -static inline void pipe_discard_from(struct pipe_inode_info *pipe, - unsigned int old_head) -{ - unsigned int mask = pipe->ring_size - 1; - - while (pipe->head > old_head) - pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]); -} - /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ #define PIPE_SIZE PAGE_SIZE -- cgit v1.2.3 From 67bab13307c83fb742c2556b06cdc39dbad27f07 Mon Sep 17 00:00:00 2001 From: Ge Yang Date: Wed, 19 Feb 2025 11:46:44 +0800 Subject: mm/hugetlb: wait for hugetlb folios to be freed Since the introduction of commit c77c0a8ac4c52 ("mm/hugetlb: defer freeing of huge pages if in non-task context"), which supports deferring the freeing of hugetlb pages, the allocation of contiguous memory through cma_alloc() may fail probabilistically. In the CMA allocation process, if it is found that the CMA area is occupied by in-use hugetlb folios, these in-use hugetlb folios need to be migrated to another location. When there are no available hugetlb folios in the free hugetlb pool during the migration of in-use hugetlb folios, new folios are allocated from the buddy system. A temporary state is set on the newly allocated folio. Upon completion of the hugetlb folio migration, the temporary state is transferred from the new folios to the old folios. Normally, when the old folios with the temporary state are freed, it is directly released back to the buddy system. However, due to the deferred freeing of hugetlb pages, the PageBuddy() check fails, ultimately leading to the failure of cma_alloc(). Here is a simplified call trace illustrating the process: cma_alloc() ->__alloc_contig_migrate_range() // Migrate in-use hugetlb folios ->unmap_and_move_huge_page() ->folio_putback_hugetlb() // Free old folios ->test_pages_isolated() ->__test_page_isolated_in_pageblock() ->PageBuddy(page) // Check if the page is in buddy To resolve this issue, we have implemented a function named wait_for_freed_hugetlb_folios(). This function ensures that the hugetlb folios are properly released back to the buddy system after their migration is completed. By invoking wait_for_freed_hugetlb_folios() before calling PageBuddy(), we ensure that PageBuddy() will succeed. Link: https://lkml.kernel.org/r/1739936804-18199-1-git-send-email-yangge1116@126.com Fixes: c77c0a8ac4c5 ("mm/hugetlb: defer freeing of huge pages if in non-task context") Signed-off-by: Ge Yang Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Barry Song <21cnbao@gmail.com> Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ec8c0ccc8f95..dbe76d4f1bfc 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -682,6 +682,7 @@ struct huge_bootmem_page { int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); +void wait_for_freed_hugetlb_folios(void); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, @@ -1066,6 +1067,10 @@ static inline int replace_free_hugepage_folios(unsigned long start_pfn, return 0; } +static inline void wait_for_freed_hugetlb_folios(void) +{ +} + static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner) -- cgit v1.2.3 From ce6d9c1c2b5cc785016faa11b48b6cd317eb367e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 24 Feb 2025 21:20:02 -0500 Subject: NFS: fix nfs_release_folio() to not deadlock via kcompactd writeback Add PF_KCOMPACTD flag and current_is_kcompactd() helper to check for it so nfs_release_folio() can skip calling nfs_wb_folio() from kcompactd. Otherwise NFS can deadlock waiting for kcompactd enduced writeback which recurses back to NFS (which triggers writeback to NFSD via NFS loopback mount on the same host, NFSD blocks waiting for XFS's call to __filemap_get_folio): 6070.550357] INFO: task kcompactd0:58 blocked for more than 4435 seconds. {--- [58] "kcompactd0" [<0>] folio_wait_bit+0xe8/0x200 [<0>] folio_wait_writeback+0x2b/0x80 [<0>] nfs_wb_folio+0x80/0x1b0 [nfs] [<0>] nfs_release_folio+0x68/0x130 [nfs] [<0>] split_huge_page_to_list_to_order+0x362/0x840 [<0>] migrate_pages_batch+0x43d/0xb90 [<0>] migrate_pages_sync+0x9a/0x240 [<0>] migrate_pages+0x93c/0x9f0 [<0>] compact_zone+0x8e2/0x1030 [<0>] compact_node+0xdb/0x120 [<0>] kcompactd+0x121/0x2e0 [<0>] kthread+0xcf/0x100 [<0>] ret_from_fork+0x31/0x40 [<0>] ret_from_fork_asm+0x1a/0x30 ---} [akpm@linux-foundation.org: fix build] Link: https://lkml.kernel.org/r/20250225022002.26141-1-snitzer@kernel.org Fixes: 96780ca55e3c ("NFS: fix up nfs_release_folio() to try to release the page") Signed-off-by: Mike Snitzer Cc: Anna Schumaker Cc: Trond Myklebust Cc: Signed-off-by: Andrew Morton --- include/linux/compaction.h | 5 +++++ include/linux/sched.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index e94776496049..7bf0c521db63 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -80,6 +80,11 @@ static inline unsigned long compact_gap(unsigned int order) return 2UL << order; } +static inline int current_is_kcompactd(void) +{ + return current->flags & PF_KCOMPACTD; +} + #ifdef CONFIG_COMPACTION extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order); diff --git a/include/linux/sched.h b/include/linux/sched.h index 9632e3318e0d..9c15365a30c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1701,7 +1701,7 @@ extern struct pid *cad_pid; #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ -#define PF__HOLE__00010000 0x00010000 +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ -- cgit v1.2.3 From c29564d8b46f64f5e6e6f1c9c02f7761b7b90963 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Fri, 21 Feb 2025 15:16:25 +0800 Subject: include/linux/log2.h: mark is_power_of_2() with __always_inline When building kernel with randconfig, there is an error: In function `kvm_is_cr4_bit_set',inlined from `kvm_update_cpuid_runtime' at arch/x86/kvm/cpuid.c:310:9: include/linux/compiler_types.h:542:38: error: call to `__compiletime_assert_380' declared with attribute error: BUILD_BUG_ON failed: !is_power_of_2(cr4_bit). '!is_power_of_2(X86_CR4_OSXSAVE)' is False, but gcc treats is_power_of_2() as non-inline function and a compilation error happens. Fix this by marking is_power_of_2() with __always_inline. Link: https://lkml.kernel.org/r/20250221071624.1356899-1-suhui@nfschina.com Signed-off-by: Su Hui Cc: Binbin Wu Cc: Paolo Bonzini Cc: Sean Christopherson Signed-off-by: Andrew Morton --- include/linux/log2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/log2.h b/include/linux/log2.h index 9f30d087a128..1366cb688a6d 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -41,7 +41,7 @@ int __ilog2_u64(u64 n) * *not* considered a power of two. * Return: true if @n is a power of 2, otherwise false. */ -static inline __attribute__((const)) +static __always_inline __attribute__((const)) bool is_power_of_2(unsigned long n) { return (n != 0 && ((n & (n - 1)) == 0)); -- cgit v1.2.3 From fb8286562ecfb585e26b033c5e32e6fb85efb0b3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 6 Mar 2025 04:05:26 +0100 Subject: netfilter: nf_tables: make destruction work queue pernet The call to flush_work before tearing down a table from the netlink notifier was supposed to make sure that all earlier updates (e.g. rule add) that might reference that table have been processed. Unfortunately, flush_work() waits for the last queued instance. This could be an instance that is different from the one that we must wait for. This is because transactions are protected with a pernet mutex, but the work item is global, so holding the transaction mutex doesn't prevent another netns from queueing more work. Make the work item pernet so that flush_work() will wait for all transactions queued from this netns. A welcome side effect is that we no longer need to wait for transaction objects from foreign netns. The gc work queue is still global. This seems to be ok because nft_set structures are reference counted and each container structure owns a reference on the net namespace. The destroy_list is still protected by a global spinlock rather than pernet one but the hold time is very short anyway. v2: call cancel_work_sync before reaping the remaining tables (Pablo). Fixes: 9f6958ba2e90 ("netfilter: nf_tables: unconditionally flush pending work before notifier") Reported-by: syzbot+5d8c5789c8cb076b2c25@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 60d5dcdb289c..803d5f1601f9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1891,7 +1891,7 @@ void nft_chain_filter_fini(void); void __init nft_chain_route_init(void); void nft_chain_route_fini(void); -void nf_tables_trans_destroy_flush_work(void); +void nf_tables_trans_destroy_flush_work(struct net *net); int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result); __be64 nf_jiffies64_to_msecs(u64 input); @@ -1905,6 +1905,7 @@ static inline int nft_request_module(struct net *net, const char *fmt, ...) { re struct nftables_pernet { struct list_head tables; struct list_head commit_list; + struct list_head destroy_list; struct list_head commit_set_list; struct list_head binding_list; struct list_head module_list; @@ -1915,6 +1916,7 @@ struct nftables_pernet { unsigned int base_seq; unsigned int gc_seq; u8 validate_state; + struct work_struct destroy_work; }; extern unsigned int nf_tables_net_id; -- cgit v1.2.3 From 74d42bdb3a4673b1c10d1f457184e4d3c9cb0196 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Mar 2025 07:30:42 -1000 Subject: fs/pipe: express 'pipe_empty()' in terms of 'pipe_occupancy()' That's what 'pipe_full()' does, so it's more consistent. But more importantly it gets the type limits right when the pipe head and tail are no longer necessarily 'unsigned int'. Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index e572e6fc4f81..4d0a2267e6ef 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -177,23 +177,23 @@ static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe) } /** - * pipe_empty - Return true if the pipe is empty + * pipe_occupancy - Return number of slots used in the pipe * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ -static inline bool pipe_empty(unsigned int head, unsigned int tail) +static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) { - return head == tail; + return (pipe_index_t)(head - tail); } /** - * pipe_occupancy - Return number of slots used in the pipe + * pipe_empty - Return true if the pipe is empty * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ -static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) +static inline bool pipe_empty(unsigned int head, unsigned int tail) { - return (pipe_index_t)(head - tail); + return !pipe_occupancy(head, tail); } /** -- cgit v1.2.3 From e7112524e5e885181cc5ae4d258f33b9dbe0b907 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 6 Mar 2025 08:27:51 -0800 Subject: block: Name the RQF flags enum Commit 5f89154e8e9e3445f9b59 ("block: Use enum to define RQF_x bit indexes") converted the RQF flags to an anonymous enum, which was a beneficial change. This patch goes one step further by naming the enum as "rqf_flags". This naming enables exporting these flags to BPF clients, eliminating the need to duplicate these flags in BPF code. Instead, BPF clients can now access the same kernel-side values through CO:RE (Compile Once, Run Everywhere), as shown in this example: rqf_stats = bpf_core_enum_value(enum rqf_flags, __RQF_STATS) Suggested-by: Yonghong Song Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20250306-rqf_flags-v1-1-bbd64918b406@debian.org Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fa2a76cc2f73..71f4f0cc3dac 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -28,7 +28,7 @@ typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); typedef __u32 __bitwise req_flags_t; /* Keep rqf_name[] in sync with the definitions below */ -enum { +enum rqf_flags { /* drive already may have started this one */ __RQF_STARTED, /* request for flush sequence */ -- cgit v1.2.3 From 00a7d39898c8010bfd5ff62af31ca5db34421b38 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Mar 2025 18:25:35 -1000 Subject: fs/pipe: add simpler helpers for common cases The fix to atomically read the pipe head and tail state when not holding the pipe mutex has caused a number of headaches due to the size change of the involved types. It turns out that we don't have _that_ many places that access these fields directly and were affected, but we have more than we strictly should have, because our low-level helper functions have been designed to have intimate knowledge of how the pipes work. And as a result, that random noise of direct 'pipe->head' and 'pipe->tail' accesses makes it harder to pinpoint any actual potential problem spots remaining. For example, we didn't have a "is the pipe full" helper function, but instead had a "given these pipe buffer indexes and this pipe size, is the pipe full". That's because some low-level pipe code does actually want that much more complicated interface. But most other places literally just want a "is the pipe full" helper, and not having it meant that those places ended up being unnecessarily much too aware of this all. It would have been much better if only the very core pipe code that cared had been the one aware of this all. So let's fix it - better late than never. This just introduces the trivial wrappers for "is this pipe full or empty" and to get how many pipe buffers are used, so that instead of writing if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) the places that literally just want to know if a pipe is full can just say if (pipe_is_full(pipe)) instead. The existing trivial cases were converted with a 'sed' script. This cuts down on the places that access pipe->head and pipe->tail directly outside of the pipe code (and core splice code) quite a lot. The splice code in particular still revels in doing the direct low-level accesses, and the fuse fuse_dev_splice_write() code also seems a bit unnecessarily eager to go very low-level, but it's at least a bit better than it used to be. Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 4d0a2267e6ef..b698758000f8 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -208,6 +208,33 @@ static inline bool pipe_full(unsigned int head, unsigned int tail, return pipe_occupancy(head, tail) >= limit; } +/** + * pipe_is_full - Return true if the pipe is full + * @pipe: the pipe + */ +static inline bool pipe_is_full(const struct pipe_inode_info *pipe) +{ + return pipe_full(pipe->head, pipe->tail, pipe->max_usage); +} + +/** + * pipe_is_empty - Return true if the pipe is empty + * @pipe: the pipe + */ +static inline bool pipe_is_empty(const struct pipe_inode_info *pipe) +{ + return pipe_empty(pipe->head, pipe->tail); +} + +/** + * pipe_buf_usage - Return how many pipe buffers are in use + * @pipe: the pipe + */ +static inline unsigned int pipe_buf_usage(const struct pipe_inode_info *pipe) +{ + return pipe_occupancy(pipe->head, pipe->tail); +} + /** * pipe_buf - Return the pipe buffer for the specified slot in the pipe ring * @pipe: The pipe to access -- cgit v1.2.3 From ab6ab707a4d060a51c45fc13e3b2228d5f7c0b87 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 4 Mar 2025 10:06:10 -0500 Subject: Revert "Bluetooth: hci_core: Fix sleeping function called from invalid context" This reverts commit 4d94f05558271654670d18c26c912da0c1c15549 which has problems (see [1]) and is no longer needed since 581dd2dc168f ("Bluetooth: hci_event: Fix using rcu_read_(un)lock while iterating") has reworked the code where the original bug has been found. [1] Link: https://lore.kernel.org/linux-bluetooth/877c55ci1r.wl-tiwai@suse.de/T/#t Fixes: 4d94f0555827 ("Bluetooth: hci_core: Fix sleeping function called from invalid context") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 108 ++++++++++++++------------------------- 1 file changed, 38 insertions(+), 70 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index f756fac95488..6281063cbd8e 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -804,6 +804,7 @@ struct hci_conn_params { extern struct list_head hci_dev_list; extern struct list_head hci_cb_list; extern rwlock_t hci_dev_list_lock; +extern struct mutex hci_cb_list_lock; #define hci_dev_set_flag(hdev, nr) set_bit((nr), (hdev)->dev_flags) #define hci_dev_clear_flag(hdev, nr) clear_bit((nr), (hdev)->dev_flags) @@ -2010,47 +2011,24 @@ struct hci_cb { char *name; - bool (*match) (struct hci_conn *conn); void (*connect_cfm) (struct hci_conn *conn, __u8 status); void (*disconn_cfm) (struct hci_conn *conn, __u8 status); void (*security_cfm) (struct hci_conn *conn, __u8 status, - __u8 encrypt); + __u8 encrypt); void (*key_change_cfm) (struct hci_conn *conn, __u8 status); void (*role_switch_cfm) (struct hci_conn *conn, __u8 status, __u8 role); }; -static inline void hci_cb_lookup(struct hci_conn *conn, struct list_head *list) -{ - struct hci_cb *cb, *cpy; - - rcu_read_lock(); - list_for_each_entry_rcu(cb, &hci_cb_list, list) { - if (cb->match && cb->match(conn)) { - cpy = kmalloc(sizeof(*cpy), GFP_ATOMIC); - if (!cpy) - break; - - *cpy = *cb; - INIT_LIST_HEAD(&cpy->list); - list_add_rcu(&cpy->list, list); - } - } - rcu_read_unlock(); -} - static inline void hci_connect_cfm(struct hci_conn *conn, __u8 status) { - struct list_head list; - struct hci_cb *cb, *tmp; - - INIT_LIST_HEAD(&list); - hci_cb_lookup(conn, &list); + struct hci_cb *cb; - list_for_each_entry_safe(cb, tmp, &list, list) { + mutex_lock(&hci_cb_list_lock); + list_for_each_entry(cb, &hci_cb_list, list) { if (cb->connect_cfm) cb->connect_cfm(conn, status); - kfree(cb); } + mutex_unlock(&hci_cb_list_lock); if (conn->connect_cfm_cb) conn->connect_cfm_cb(conn, status); @@ -2058,43 +2036,22 @@ static inline void hci_connect_cfm(struct hci_conn *conn, __u8 status) static inline void hci_disconn_cfm(struct hci_conn *conn, __u8 reason) { - struct list_head list; - struct hci_cb *cb, *tmp; - - INIT_LIST_HEAD(&list); - hci_cb_lookup(conn, &list); + struct hci_cb *cb; - list_for_each_entry_safe(cb, tmp, &list, list) { + mutex_lock(&hci_cb_list_lock); + list_for_each_entry(cb, &hci_cb_list, list) { if (cb->disconn_cfm) cb->disconn_cfm(conn, reason); - kfree(cb); } + mutex_unlock(&hci_cb_list_lock); if (conn->disconn_cfm_cb) conn->disconn_cfm_cb(conn, reason); } -static inline void hci_security_cfm(struct hci_conn *conn, __u8 status, - __u8 encrypt) -{ - struct list_head list; - struct hci_cb *cb, *tmp; - - INIT_LIST_HEAD(&list); - hci_cb_lookup(conn, &list); - - list_for_each_entry_safe(cb, tmp, &list, list) { - if (cb->security_cfm) - cb->security_cfm(conn, status, encrypt); - kfree(cb); - } - - if (conn->security_cfm_cb) - conn->security_cfm_cb(conn, status); -} - static inline void hci_auth_cfm(struct hci_conn *conn, __u8 status) { + struct hci_cb *cb; __u8 encrypt; if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) @@ -2102,11 +2059,20 @@ static inline void hci_auth_cfm(struct hci_conn *conn, __u8 status) encrypt = test_bit(HCI_CONN_ENCRYPT, &conn->flags) ? 0x01 : 0x00; - hci_security_cfm(conn, status, encrypt); + mutex_lock(&hci_cb_list_lock); + list_for_each_entry(cb, &hci_cb_list, list) { + if (cb->security_cfm) + cb->security_cfm(conn, status, encrypt); + } + mutex_unlock(&hci_cb_list_lock); + + if (conn->security_cfm_cb) + conn->security_cfm_cb(conn, status); } static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status) { + struct hci_cb *cb; __u8 encrypt; if (conn->state == BT_CONFIG) { @@ -2133,38 +2099,40 @@ static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status) conn->sec_level = conn->pending_sec_level; } - hci_security_cfm(conn, status, encrypt); + mutex_lock(&hci_cb_list_lock); + list_for_each_entry(cb, &hci_cb_list, list) { + if (cb->security_cfm) + cb->security_cfm(conn, status, encrypt); + } + mutex_unlock(&hci_cb_list_lock); + + if (conn->security_cfm_cb) + conn->security_cfm_cb(conn, status); } static inline void hci_key_change_cfm(struct hci_conn *conn, __u8 status) { - struct list_head list; - struct hci_cb *cb, *tmp; - - INIT_LIST_HEAD(&list); - hci_cb_lookup(conn, &list); + struct hci_cb *cb; - list_for_each_entry_safe(cb, tmp, &list, list) { + mutex_lock(&hci_cb_list_lock); + list_for_each_entry(cb, &hci_cb_list, list) { if (cb->key_change_cfm) cb->key_change_cfm(conn, status); - kfree(cb); } + mutex_unlock(&hci_cb_list_lock); } static inline void hci_role_switch_cfm(struct hci_conn *conn, __u8 status, __u8 role) { - struct list_head list; - struct hci_cb *cb, *tmp; - - INIT_LIST_HEAD(&list); - hci_cb_lookup(conn, &list); + struct hci_cb *cb; - list_for_each_entry_safe(cb, tmp, &list, list) { + mutex_lock(&hci_cb_list_lock); + list_for_each_entry(cb, &hci_cb_list, list) { if (cb->role_switch_cfm) cb->role_switch_cfm(conn, status, role); - kfree(cb); } + mutex_unlock(&hci_cb_list_lock); } static inline bool hci_bdaddr_is_rpa(bdaddr_t *bdaddr, u8 addr_type) -- cgit v1.2.3 From 366fef794bd2b7c2e9df933f6828dd9739bfba84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 10 Mar 2025 14:21:58 +0200 Subject: : Allow the passing of both iomem and non-iomem pointers to no_free_ptr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling no_free_ptr() for an __iomem pointer results in Sparse complaining about the types: warning: incorrect type in argument 1 (different address spaces) expected void const volatile *val got void [noderef] __iomem *__val [ The example is from drivers/platform/x86/intel/pmc/core_ssram.c:283 ] The problem is caused by the signature of __must_check_fn() added in: 85be6d842447 ("cleanup: Make no_free_ptr() __must_check") ... to enforce that the return value is always used. Use __force to allow both iomem and non-iomem pointers to be given for no_free_ptr(). Reported-by: kernel test robot Signed-off-by: Ilpo Järvinen Signed-off-by: Ingo Molnar Reviewed-by: Andy Shevchenko Reviewed-by: Dan Williams Cc: "H. Peter Anvin" Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250310122158.20966-1-ilpo.jarvinen@linux.intel.com Closes: https://lore.kernel.org/oe-kbuild-all/202403050547.qnZtuNlN-lkp@intel.com/ --- include/linux/cleanup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index ec00e3f7af2b..ee2614adb785 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -212,7 +212,7 @@ const volatile void * __must_check_fn(const volatile void *val) { return val; } #define no_free_ptr(p) \ - ((typeof(p)) __must_check_fn(__get_and_null(p, NULL))) + ((typeof(p)) __must_check_fn((__force const volatile void *)__get_and_null(p, NULL))) #define return_ptr(p) return no_free_ptr(p) -- cgit v1.2.3 From 9bce6b5f8987678b9c6c1fe433af6b5fe41feadc Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Tue, 11 Mar 2025 19:43:59 +0900 Subject: block: change blk_mq_add_to_batch() third argument type to bool Commit 1f47ed294a2b ("block: cleanup and fix batch completion adding conditions") modified the evaluation criteria for the third argument, 'ioerror', in the blk_mq_add_to_batch() function. Initially, the function had checked if 'ioerror' equals zero. Following the commit, it started checking for negative error values, with the presumption that such values, for instance -EIO, would be passed in. However, blk_mq_add_to_batch() callers do not pass negative error values. Instead, they pass status codes defined in various ways: - NVMe PCI and Apple drivers pass NVMe status code - virtio_blk driver passes the virtblk request header status byte - null_blk driver passes blk_status_t These codes are either zero or positive, therefore the revised check fails to function as intended. Specifically, with the NVMe PCI driver, this modification led to the failure of the blktests test case nvme/039. In this test scenario, errors are artificially injected to the NVMe driver, resulting in positive NVMe status codes passed to blk_mq_add_to_batch(), which unexpectedly processes the failed I/O in a batch. Hence the failure. To correct the ioerror check within blk_mq_add_to_batch(), make all callers to uniformly pass the argument as boolean. Modify the callers to check their specific status codes and pass the boolean value 'is_error'. Also describe the arguments of blK_mq_add_to_batch as kerneldoc. Fixes: 1f47ed294a2b ("block: cleanup and fix batch completion adding conditions") Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20250311104359.1767728-3-shinichiro.kawasaki@wdc.com [axboe: fold in documentation update] Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 71f4f0cc3dac..aba9c24486aa 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -852,12 +852,20 @@ static inline bool blk_mq_is_reserved_rq(struct request *rq) return rq->rq_flags & RQF_RESV; } -/* +/** + * blk_mq_add_to_batch() - add a request to the completion batch + * @req: The request to add to batch + * @iob: The batch to add the request + * @is_error: Specify true if the request failed with an error + * @complete: The completaion handler for the request + * * Batched completions only work when there is no I/O error and no special * ->end_io handler. + * + * Return: true when the request was added to the batch, otherwise false */ static inline bool blk_mq_add_to_batch(struct request *req, - struct io_comp_batch *iob, int ioerror, + struct io_comp_batch *iob, bool is_error, void (*complete)(struct io_comp_batch *)) { /* @@ -865,7 +873,7 @@ static inline bool blk_mq_add_to_batch(struct request *req, * 1) No batch container * 2) Has scheduler data attached * 3) Not a passthrough request and end_io set - * 4) Not a passthrough request and an ioerror + * 4) Not a passthrough request and failed with an error */ if (!iob) return false; @@ -874,7 +882,7 @@ static inline bool blk_mq_add_to_batch(struct request *req, if (!blk_rq_is_passthrough(req)) { if (req->end_io) return false; - if (ioerror < 0) + if (is_error) return false; } -- cgit v1.2.3 From 066e053fe208a3b83ee89dc5a192146add688861 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 12 Mar 2025 08:38:47 +0100 Subject: fsnotify: add pre-content hooks on mmap() Pre-content hooks in page faults introduces potential deadlock of HSM handler in userspace with filesystem freezing. The requirement with pre-content event is that for every accessed file range an event covering at least this range will be generated at least once before the file data is accesses. In preparation to disabling pre-content event hooks on page faults, add pre-content hooks at mmap() variants for the entire mmaped range, so HSM can fill content when user requests to map a portion of the file. Note that exec() variant also calls vm_mmap_pgoff() internally to map code sections, so pre-content hooks are also generated in this case. Link: https://lore.kernel.org/linux-fsdevel/7ehxrhbvehlrjwvrduoxsao5k3x4aw275patsb3krkwuq573yv@o2hskrfawbnc/ Suggested-by: Josef Bacik Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/20250312073852.2123409-2-amir73il@gmail.com --- include/linux/fsnotify.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 6a33288bd6a1..83d3ac97f826 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -170,6 +170,21 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } +/* + * fsnotify_mmap_perm - permission hook before mmap of file range + */ +static inline int fsnotify_mmap_perm(struct file *file, int prot, + const loff_t off, size_t len) +{ + /* + * mmap() generates only pre-content events. + */ + if (!file || likely(!FMODE_FSNOTIFY_HSM(file->f_mode))) + return 0; + + return fsnotify_pre_content(&file->f_path, &off, len); +} + /* * fsnotify_truncate_perm - permission hook before file truncate */ @@ -223,6 +238,12 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return 0; } +static inline int fsnotify_mmap_perm(struct file *file, int prot, + const loff_t off, size_t len) +{ + return 0; +} + static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) { return 0; -- cgit v1.2.3 From 955fbe0ef19df4197595a98d0906c94025c4beef Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 12 Mar 2025 08:38:50 +0100 Subject: Revert "fsnotify: generate pre-content permission event on page fault" This reverts commit 8392bc2ff8c8bf7c4c5e6dfa71ccd893a3c046f6. In the use case of buffered write whose input buffer is mmapped file on a filesystem with a pre-content mark, the prefaulting of the buffer can happen under the filesystem freeze protection (obtained in vfs_write()) which breaks assumptions of pre-content hook and introduces potential deadlock of HSM handler in userspace with filesystem freezing. Now that we have pre-content hooks at file mmap() time, disable the pre-content event hooks on page fault to avoid the potential deadlock. Reported-by: syzbot+7229071b47908b19d5b7@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-fsdevel/7ehxrhbvehlrjwvrduoxsao5k3x4aw275patsb3krkwuq573yv@o2hskrfawbnc/ Fixes: 8392bc2ff8c8 ("fsnotify: generate pre-content permission event on page fault") Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/20250312073852.2123409-5-amir73il@gmail.com --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..8483e09aeb2c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3420,7 +3420,6 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); -extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ -- cgit v1.2.3 From e794dc30be8b69e44646a162db09b43f719525b7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 13 Feb 2025 16:07:15 +0200 Subject: i2c: Introduce i2c_10bit_addr_*_from_msg() helpers There are already a lot of drivers that have been using i2c_8bit_addr_from_msg() for 7-bit addresses, now it's time to have the similar for 10-bit addresses. Signed-off-by: Andy Shevchenko Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250213141045.2716943-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andi Shyti --- include/linux/i2c.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 2b2af24d2a43..1e35b1e23165 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -952,6 +952,21 @@ static inline u8 i2c_8bit_addr_from_msg(const struct i2c_msg *msg) return (msg->addr << 1) | (msg->flags & I2C_M_RD); } +/* + * 10-bit address + * addr_1: 5'b11110 | addr[9:8] | (R/nW) + * addr_2: addr[7:0] + */ +static inline u8 i2c_10bit_addr_hi_from_msg(const struct i2c_msg *msg) +{ + return 0xf0 | ((msg->addr & GENMASK(9, 8)) >> 7) | (msg->flags & I2C_M_RD); +} + +static inline u8 i2c_10bit_addr_lo_from_msg(const struct i2c_msg *msg) +{ + return msg->addr & GENMASK(7, 0); +} + u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold); void i2c_put_dma_safe_msg_buf(u8 *buf, struct i2c_msg *msg, bool xferred); -- cgit v1.2.3