From b9a49520679e98700d3d89689cc91c08a1c88c1d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 19 Jan 2025 00:55:32 +0100
Subject: rcuref: Plug slowpath race in rcuref_put()

Kernel test robot reported an "imbalanced put" in the rcuref_put() slow
path, which turned out to be a false positive. Consider the following race:

            ref  = 0 (via rcuref_init(ref, 1))
 T1                                      T2
 rcuref_put(ref)
 -> atomic_add_negative_release(-1, ref)                                         # ref -> 0xffffffff
 -> rcuref_put_slowpath(ref)
                                         rcuref_get(ref)
                                         -> atomic_add_negative_relaxed(1, &ref->refcnt)
                                           -> return true;                       # ref -> 0

                                         rcuref_put(ref)
                                         -> atomic_add_negative_release(-1, ref) # ref -> 0xffffffff
                                         -> rcuref_put_slowpath()

    -> cnt = atomic_read(&ref->refcnt);                                          # cnt -> 0xffffffff / RCUREF_NOREF
    -> atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD))              # ref -> 0xe0000000 / RCUREF_DEAD
       -> return true
                                           -> cnt = atomic_read(&ref->refcnt);   # cnt -> 0xe0000000 / RCUREF_DEAD
                                           -> if (cnt > RCUREF_RELEASED)         # 0xe0000000 > 0xc0000000
                                             -> WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")

The problem is the additional read in the slow path (after it
decremented to RCUREF_NOREF) which can happen after the counter has been
marked RCUREF_DEAD.

Prevent this by reusing the return value of the decrement. Now every "final"
put uses RCUREF_NOREF in the slow path and attempts the final cmpxchg() to
RCUREF_DEAD.

[ bigeasy: Add changelog ]

Fixes: ee1ee6db07795 ("atomics: Provide rcuref - scalable reference counting")
Reported-by: kernel test robot <oliver.sang@intel.com>
Debugged-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Closes: https://lore.kernel.org/oe-lkp/202412311453.9d7636a2-lkp@intel.com
---
 include/linux/rcuref.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h
index 2c8bfd0f1b6b..6322d8c1c6b4 100644
--- a/include/linux/rcuref.h
+++ b/include/linux/rcuref.h
@@ -71,27 +71,30 @@ static inline __must_check bool rcuref_get(rcuref_t *ref)
 	return rcuref_get_slowpath(ref);
 }
 
-extern __must_check bool rcuref_put_slowpath(rcuref_t *ref);
+extern __must_check bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt);
 
 /*
  * Internal helper. Do not invoke directly.
  */
 static __always_inline __must_check bool __rcuref_put(rcuref_t *ref)
 {
+	int cnt;
+
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(),
 			 "suspicious rcuref_put_rcusafe() usage");
 	/*
 	 * Unconditionally decrease the reference count. The saturation and
 	 * dead zones provide enough tolerance for this.
 	 */
-	if (likely(!atomic_add_negative_release(-1, &ref->refcnt)))
+	cnt = atomic_sub_return_release(1, &ref->refcnt);
+	if (likely(cnt >= 0))
 		return false;
 
 	/*
 	 * Handle the last reference drop and cases inside the saturation
 	 * and dead zones.
 	 */
-	return rcuref_put_slowpath(ref);
+	return rcuref_put_slowpath(ref, cnt);
 }
 
 /**
-- 
cgit v1.2.3


From 0532a79efd68a4d9686b0385e4993af4b130ff82 Mon Sep 17 00:00:00 2001
From: Jiayuan Chen <mrpre@163.com>
Date: Wed, 22 Jan 2025 18:09:13 +0800
Subject: strparser: Add read_sock callback

Added a new read_sock handler, allowing users to customize read operations
instead of relying on the native socket's read_sock.

Signed-off-by: Jiayuan Chen <mrpre@163.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://patch.msgid.link/20250122100917.49845-2-mrpre@163.com
---
 include/net/strparser.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/strparser.h b/include/net/strparser.h
index 41e2ce9e9e10..0a83010b3a64 100644
--- a/include/net/strparser.h
+++ b/include/net/strparser.h
@@ -43,6 +43,8 @@ struct strparser;
 struct strp_callbacks {
 	int (*parse_msg)(struct strparser *strp, struct sk_buff *skb);
 	void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb);
+	int (*read_sock)(struct strparser *strp, read_descriptor_t *desc,
+			 sk_read_actor_t recv_actor);
 	int (*read_sock_done)(struct strparser *strp, int err);
 	void (*abort_parser)(struct strparser *strp, int err);
 	void (*lock)(struct strparser *strp);
-- 
cgit v1.2.3


From 36b62df5683c315ba58c950f1a9c771c796c30ec Mon Sep 17 00:00:00 2001
From: Jiayuan Chen <mrpre@163.com>
Date: Wed, 22 Jan 2025 18:09:14 +0800
Subject: bpf: Fix wrong copied_seq calculation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

'sk->copied_seq' was updated in the tcp_eat_skb() function when the action
of a BPF program was SK_REDIRECT. For other actions, like SK_PASS, the
update logic for 'sk->copied_seq' was moved to tcp_bpf_recvmsg_parser()
to ensure the accuracy of the 'fionread' feature.

It works for a single stream_verdict scenario, as it also modified
sk_data_ready->sk_psock_verdict_data_ready->tcp_read_skb
to remove updating 'sk->copied_seq'.

However, for programs where both stream_parser and stream_verdict are
active (strparser purpose), tcp_read_sock() was used instead of
tcp_read_skb() (sk_data_ready->strp_data_ready->tcp_read_sock).
tcp_read_sock() now still updates 'sk->copied_seq', leading to duplicate
updates.

In summary, for strparser + SK_PASS, copied_seq is redundantly calculated
in both tcp_read_sock() and tcp_bpf_recvmsg_parser().

The issue causes incorrect copied_seq calculations, which prevent
correct data reads from the recv() interface in user-land.

We do not want to add new proto_ops to implement a new version of
tcp_read_sock, as this would introduce code complexity [1].

We could have added noack and copied_seq to desc, and then called
ops->read_sock. However, unfortunately, other modules didn’t fully
initialize desc to zero. So, for now, we are directly calling
tcp_read_sock_noack() in tcp_bpf.c.

[1]: https://lore.kernel.org/bpf/20241218053408.437295-1-mrpre@163.com

Fixes: e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq")
Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Jiayuan Chen <mrpre@163.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://patch.msgid.link/20250122100917.49845-3-mrpre@163.com
---
 include/linux/skmsg.h | 2 ++
 include/net/tcp.h     | 8 ++++++++
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 2cbe0c22a32f..0b9095a281b8 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -91,6 +91,8 @@ struct sk_psock {
 	struct sk_psock_progs		progs;
 #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 	struct strparser		strp;
+	u32				copied_seq;
+	u32				ingress_bytes;
 #endif
 	struct sk_buff_head		ingress_skb;
 	struct list_head		ingress_msg;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5b2b04835688..9c044fb9ab26 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -729,6 +729,9 @@ void tcp_get_info(struct sock *, struct tcp_info *);
 /* Read 'sendfile()'-style from a TCP socket */
 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		  sk_read_actor_t recv_actor);
+int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
+			sk_read_actor_t recv_actor, bool noack,
+			u32 *copied_seq);
 int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off);
 void tcp_read_done(struct sock *sk, size_t len);
@@ -2599,6 +2602,11 @@ struct sk_psock;
 #ifdef CONFIG_BPF_SYSCALL
 int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
 void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
+#ifdef CONFIG_BPF_STREAM_PARSER
+struct strparser;
+int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc,
+			   sk_read_actor_t recv_actor);
+#endif /* CONFIG_BPF_STREAM_PARSER */
 #endif /* CONFIG_BPF_SYSCALL */
 
 #ifdef CONFIG_INET
-- 
cgit v1.2.3


From b69bb476dee99d564d65d418e9a20acca6f32c3f Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Thu, 30 Jan 2025 16:05:42 -0800
Subject: cgroup: fix race between fork and cgroup.kill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tejun reported the following race between fork() and cgroup.kill at [1].

Tejun:
  I was looking at cgroup.kill implementation and wondering whether there
  could be a race window. So, __cgroup_kill() does the following:

   k1. Set CGRP_KILL.
   k2. Iterate tasks and deliver SIGKILL.
   k3. Clear CGRP_KILL.

  The copy_process() does the following:

   c1. Copy a bunch of stuff.
   c2. Grab siglock.
   c3. Check fatal_signal_pending().
   c4. Commit to forking.
   c5. Release siglock.
   c6. Call cgroup_post_fork() which puts the task on the css_set and tests
       CGRP_KILL.

  The intention seems to be that either a forking task gets SIGKILL and
  terminates on c3 or it sees CGRP_KILL on c6 and kills the child. However, I
  don't see what guarantees that k3 can't happen before c6. ie. After a
  forking task passes c5, k2 can take place and then before the forking task
  reaches c6, k3 can happen. Then, nobody would send SIGKILL to the child.
  What am I missing?

This is indeed a race. One way to fix this race is by taking
cgroup_threadgroup_rwsem in write mode in __cgroup_kill() as the fork()
side takes cgroup_threadgroup_rwsem in read mode from cgroup_can_fork()
to cgroup_post_fork(). However that would be heavy handed as this adds
one more potential stall scenario for cgroup.kill which is usually
called under extreme situation like memory pressure.

To fix this race, let's maintain a sequence number per cgroup which gets
incremented on __cgroup_kill() call. On the fork() side, the
cgroup_can_fork() will cache the sequence number locally and recheck it
against the cgroup's sequence number at cgroup_post_fork() site. If the
sequence numbers mismatch, it means __cgroup_kill() can been called and
we should send SIGKILL to the newly created task.

Reported-by: Tejun Heo <tj@kernel.org>
Closes: https://lore.kernel.org/all/Z5QHE2Qn-QZ6M-KW@slm.duckdns.org/ [1]
Fixes: 661ee6280931 ("cgroup: introduce cgroup.kill")
Cc: stable@vger.kernel.org # v5.14+
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 6 +++---
 include/linux/sched/task.h  | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b20d2d8ef7c..17960a1e858d 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -71,9 +71,6 @@ enum {
 
 	/* Cgroup is frozen. */
 	CGRP_FROZEN,
-
-	/* Control group has to be killed. */
-	CGRP_KILL,
 };
 
 /* cgroup_root->flags */
@@ -461,6 +458,9 @@ struct cgroup {
 
 	int nr_threaded_children;	/* # of live threaded child cgroups */
 
+	/* sequence number for cgroup.kill, serialized by css_set_lock. */
+	unsigned int kill_seq;
+
 	struct kernfs_node *kn;		/* cgroup kernfs entry */
 	struct cgroup_file procs_file;	/* handle for "cgroup.procs" */
 	struct cgroup_file events_file;	/* handle for "cgroup.events" */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 0f2aeb37bbb0..ca1db4b92c32 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -43,6 +43,7 @@ struct kernel_clone_args {
 	void *fn_arg;
 	struct cgroup *cgrp;
 	struct css_set *cset;
+	unsigned int kill_seq;
 };
 
 /*
-- 
cgit v1.2.3


From f0ada00a9b3801b71d203b0033b7612b687b7d72 Mon Sep 17 00:00:00 2001
From: Imran Shaik <quic_imrashai@quicinc.com>
Date: Thu, 9 Jan 2025 14:27:44 +0530
Subject: dt-bindings: clock: qcom: Add GPU clocks for QCS8300

The QCS8300 GPU clock controller is a derivative of SA8775P, but has few
additional clocks and minor differences. Hence, reuse gpucc bindings of
SA8775P and add additional clocks required for QCS8300.

Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Imran Shaik <quic_imrashai@quicinc.com>
Link: https://lore.kernel.org/r/20250109-qcs8300-mm-patches-new-v4-1-63e8ac268b02@quicinc.com
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 include/dt-bindings/clock/qcom,qcs8300-gpucc.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 include/dt-bindings/clock/qcom,qcs8300-gpucc.h

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,qcs8300-gpucc.h b/include/dt-bindings/clock/qcom,qcs8300-gpucc.h
new file mode 100644
index 000000000000..afa187467b4c
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,qcs8300-gpucc.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_GPUCC_QCS8300_H
+#define _DT_BINDINGS_CLK_QCOM_GPUCC_QCS8300_H
+
+#include "qcom,sa8775p-gpucc.h"
+
+/* QCS8300 introduces below new clocks compared to SA8775P */
+
+/* GPU_CC clocks */
+#define GPU_CC_CX_ACCU_SHIFT_CLK		23
+#define GPU_CC_GX_ACCU_SHIFT_CLK		24
+
+#endif
-- 
cgit v1.2.3


From 0e193cc558e32a879c717bb2d53a1cf8628b5d20 Mon Sep 17 00:00:00 2001
From: Imran Shaik <quic_imrashai@quicinc.com>
Date: Thu, 9 Jan 2025 14:27:46 +0530
Subject: dt-bindings: clock: qcom: Add CAMCC clocks for QCS8300

The QCS8300 camera clock controller is a derivative of SA8775P, but has
an additional clock and minor differences. Hence, reuse the SA8775P
camera bindings and add additional clock required for QCS8300.

Reviewed-by: Vladimir Zapolskiy <vladimir.zapolskiy@linaro.org>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Imran Shaik <quic_imrashai@quicinc.com>
Link: https://lore.kernel.org/r/20250109-qcs8300-mm-patches-new-v4-3-63e8ac268b02@quicinc.com
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 include/dt-bindings/clock/qcom,qcs8300-camcc.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 include/dt-bindings/clock/qcom,qcs8300-camcc.h

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,qcs8300-camcc.h b/include/dt-bindings/clock/qcom,qcs8300-camcc.h
new file mode 100644
index 000000000000..fc535c847859
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,qcs8300-camcc.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_QCS8300_CAM_CC_H
+#define _DT_BINDINGS_CLK_QCOM_QCS8300_CAM_CC_H
+
+#include "qcom,sa8775p-camcc.h"
+
+/* QCS8300 introduces below new clocks compared to SA8775P */
+
+/* CAM_CC clocks */
+#define CAM_CC_TITAN_TOP_ACCU_SHIFT_CLK				86
+
+#endif
-- 
cgit v1.2.3


From ba69e0750b0362870294adab09339a0c39c3beaf Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Sat, 1 Feb 2025 18:21:35 +0100
Subject: efi: Avoid cold plugged memory for placing the kernel

UEFI 2.11 introduced EFI_MEMORY_HOT_PLUGGABLE to annotate system memory
regions that are 'cold plugged' at boot, i.e., hot pluggable memory that
is available from early boot, and described as system RAM by the
firmware.

Existing loaders and EFI applications running in the boot context will
happily use this memory for allocating data structures that cannot be
freed or moved at runtime, and this prevents the memory from being
unplugged. Going forward, the new EFI_MEMORY_HOT_PLUGGABLE attribute
should be tested, and memory annotated as such should be avoided for
such allocations.

In the EFI stub, there are a couple of occurrences where, instead of the
high-level AllocatePages() UEFI boot service, a low-level code sequence
is used that traverses the EFI memory map and carves out the requested
number of pages from a free region. This is needed, e.g., for allocating
as low as possible, or for allocating pages at random.

While AllocatePages() should presumably avoid special purpose memory and
cold plugged regions, this manual approach needs to incorporate this
logic itself, in order to prevent the kernel itself from ending up in a
hot unpluggable region, preventing it from being unplugged.

So add the EFI_MEMORY_HOTPLUGGABLE macro definition, and check for it
where appropriate.

Cc: stable@vger.kernel.org
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/efi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 053c57e61869..db293d7de686 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -128,6 +128,7 @@ typedef	struct {
 #define EFI_MEMORY_RO		((u64)0x0000000000020000ULL)	/* read-only */
 #define EFI_MEMORY_SP		((u64)0x0000000000040000ULL)	/* soft reserved */
 #define EFI_MEMORY_CPU_CRYPTO	((u64)0x0000000000080000ULL)	/* supports encryption */
+#define EFI_MEMORY_HOT_PLUGGABLE	BIT_ULL(20)	/* supports unplugging at runtime */
 #define EFI_MEMORY_RUNTIME	((u64)0x8000000000000000ULL)	/* range requires runtime mapping */
 #define EFI_MEMORY_DESCRIPTOR_VERSION	1
 
-- 
cgit v1.2.3


From bbc4578537e350d5bf8a7a2c7d054d6b163b3c41 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Sat, 1 Feb 2025 18:21:36 +0100
Subject: efi: Use BIT_ULL() constants for memory attributes

For legibility, use the existing BIT_ULL() to generate the u64 type EFI
memory attribute macros.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/efi.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index db293d7de686..7d63d1d75f22 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -114,22 +114,22 @@ typedef	struct {
 #define EFI_MAX_MEMORY_TYPE		16
 
 /* Attribute values: */
-#define EFI_MEMORY_UC		((u64)0x0000000000000001ULL)	/* uncached */
-#define EFI_MEMORY_WC		((u64)0x0000000000000002ULL)	/* write-coalescing */
-#define EFI_MEMORY_WT		((u64)0x0000000000000004ULL)	/* write-through */
-#define EFI_MEMORY_WB		((u64)0x0000000000000008ULL)	/* write-back */
-#define EFI_MEMORY_UCE		((u64)0x0000000000000010ULL)	/* uncached, exported */
-#define EFI_MEMORY_WP		((u64)0x0000000000001000ULL)	/* write-protect */
-#define EFI_MEMORY_RP		((u64)0x0000000000002000ULL)	/* read-protect */
-#define EFI_MEMORY_XP		((u64)0x0000000000004000ULL)	/* execute-protect */
-#define EFI_MEMORY_NV		((u64)0x0000000000008000ULL)	/* non-volatile */
-#define EFI_MEMORY_MORE_RELIABLE \
-				((u64)0x0000000000010000ULL)	/* higher reliability */
-#define EFI_MEMORY_RO		((u64)0x0000000000020000ULL)	/* read-only */
-#define EFI_MEMORY_SP		((u64)0x0000000000040000ULL)	/* soft reserved */
-#define EFI_MEMORY_CPU_CRYPTO	((u64)0x0000000000080000ULL)	/* supports encryption */
+#define EFI_MEMORY_UC			BIT_ULL(0)	/* uncached */
+#define EFI_MEMORY_WC			BIT_ULL(1)	/* write-coalescing */
+#define EFI_MEMORY_WT			BIT_ULL(2)	/* write-through */
+#define EFI_MEMORY_WB			BIT_ULL(3)	/* write-back */
+#define EFI_MEMORY_UCE			BIT_ULL(4)	/* uncached, exported */
+#define EFI_MEMORY_WP			BIT_ULL(12)	/* write-protect */
+#define EFI_MEMORY_RP			BIT_ULL(13)	/* read-protect */
+#define EFI_MEMORY_XP			BIT_ULL(14)	/* execute-protect */
+#define EFI_MEMORY_NV			BIT_ULL(15)	/* non-volatile */
+#define EFI_MEMORY_MORE_RELIABLE	BIT_ULL(16)	/* higher reliability */
+#define EFI_MEMORY_RO			BIT_ULL(17)	/* read-only */
+#define EFI_MEMORY_SP			BIT_ULL(18)	/* soft reserved */
+#define EFI_MEMORY_CPU_CRYPTO		BIT_ULL(19)	/* supports encryption */
 #define EFI_MEMORY_HOT_PLUGGABLE	BIT_ULL(20)	/* supports unplugging at runtime */
-#define EFI_MEMORY_RUNTIME	((u64)0x8000000000000000ULL)	/* range requires runtime mapping */
+#define EFI_MEMORY_RUNTIME		BIT_ULL(63)	/* range requires runtime mapping */
+
 #define EFI_MEMORY_DESCRIPTOR_VERSION	1
 
 #define EFI_PAGE_SHIFT		12
-- 
cgit v1.2.3


From 482ad2a4ace2740ca0ff1cbc8f3c7f862f3ab507 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 5 Feb 2025 15:51:09 +0000
Subject: net: add dev_net_rcu() helper

dev->nd_net can change, readers should either
use rcu_read_lock() or RTNL.

We currently use a generic helper, dev_net() with
no debugging support. We probably have many hidden bugs.

Add dev_net_rcu() helper for callers using rcu_read_lock()
protection.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250205155120.1676781-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h   | 6 ++++++
 include/net/net_namespace.h | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 03bb584c62cf..c0a86afb85da 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2663,6 +2663,12 @@ struct net *dev_net(const struct net_device *dev)
 	return read_pnet(&dev->nd_net);
 }
 
+static inline
+struct net *dev_net_rcu(const struct net_device *dev)
+{
+	return read_pnet_rcu(&dev->nd_net);
+}
+
 static inline
 void dev_net_set(struct net_device *dev, struct net *net)
 {
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 0f5eb9db0c62..7ba1402ca779 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -398,7 +398,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
 #endif
 }
 
-static inline struct net *read_pnet_rcu(possible_net_t *pnet)
+static inline struct net *read_pnet_rcu(const possible_net_t *pnet)
 {
 #ifdef CONFIG_NET_NS
 	return rcu_dereference(pnet->net);
-- 
cgit v1.2.3


From 469308552ca4560176cfc100e7ca84add1bebd7c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 5 Feb 2025 15:51:10 +0000
Subject: ipv4: add RCU protection to ip4_dst_hoplimit()

ip4_dst_hoplimit() must use RCU protection to make
sure the net structure it reads does not disappear.

Fixes: fa50d974d104 ("ipv4: Namespaceify ip_default_ttl sysctl knob")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250205155120.1676781-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/route.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index f86775be3e29..c605fd5ec0c0 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -382,10 +382,15 @@ static inline int inet_iif(const struct sk_buff *skb)
 static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
 {
 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
-	struct net *net = dev_net(dst->dev);
 
-	if (hoplimit == 0)
+	if (hoplimit == 0) {
+		const struct net *net;
+
+		rcu_read_lock();
+		net = dev_net_rcu(dst->dev);
 		hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
+		rcu_read_unlock();
+	}
 	return hoplimit;
 }
 
-- 
cgit v1.2.3


From 071d8012869b6af352acca346ade13e7be90a49f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 5 Feb 2025 15:51:11 +0000
Subject: ipv4: use RCU protection in ip_dst_mtu_maybe_forward()

ip_dst_mtu_maybe_forward() must use RCU protection to make
sure the net structure it reads does not disappear.

Fixes: f87c10a8aa1e8 ("ipv4: introduce ip_dst_mtu_maybe_forward and protect forwarding path against pmtu spoofing")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250205155120.1676781-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 9f5e33e371fc..ba7b43447775 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -471,9 +471,12 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 						    bool forwarding)
 {
 	const struct rtable *rt = dst_rtable(dst);
-	struct net *net = dev_net(dst->dev);
-	unsigned int mtu;
+	unsigned int mtu, res;
+	struct net *net;
+
+	rcu_read_lock();
 
+	net = dev_net_rcu(dst->dev);
 	if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) ||
 	    ip_mtu_locked(dst) ||
 	    !forwarding) {
@@ -497,7 +500,11 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 out:
 	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
 
-	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
+	res = mtu - lwtunnel_headroom(dst->lwtstate, mtu);
+
+	rcu_read_unlock();
+
+	return res;
 }
 
 static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
-- 
cgit v1.2.3


From 6a774228e890ee04a0ee13f4e6e731ec8554b9c2 Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Wed, 5 Feb 2025 12:03:01 +0100
Subject: net: ethtool: tsconfig: Fix netlink type of hwtstamp flags

Fix the netlink type for hardware timestamp flags, which are represented
as a bitset of flags. Although only one flag is supported currently, the
correct netlink bitset type should be used instead of u32 to keep
consistency with other fields. Address this by adding a new named string
set description for the hwtstamp flag structure.

The code has been introduced in the current release so the uAPI change is
still okay.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Fixes: 6e9e2eed4f39 ("net: ethtool: Add support for tsconfig command to get/set hwtstamp config")
Link: https://patch.msgid.link/20250205110304.375086-1-kory.maincent@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index d1089b88efc7..9b18c4cfe56f 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -682,6 +682,7 @@ enum ethtool_link_ext_substate_module {
  * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics
  * @ETH_SS_STATS_RMON: names of RMON statistics
  * @ETH_SS_STATS_PHY: names of PHY(dev) statistics
+ * @ETH_SS_TS_FLAGS: hardware timestamping flags
  *
  * @ETH_SS_COUNT: number of defined string sets
  */
@@ -708,6 +709,7 @@ enum ethtool_stringset {
 	ETH_SS_STATS_ETH_CTRL,
 	ETH_SS_STATS_RMON,
 	ETH_SS_STATS_PHY,
+	ETH_SS_TS_FLAGS,
 
 	/* add new constants above here */
 	ETH_SS_COUNT
-- 
cgit v1.2.3


From 011b0335903832facca86cd8ed05d7d8d94c9c76 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 6 Feb 2025 22:28:48 +0100
Subject: Revert "net: skb: introduce and use a single page frag cache"

This reverts commit dbae2b062824 ("net: skb: introduce and use a single
page frag cache"). The intended goal of such change was to counter a
performance regression introduced by commit 3226b158e67c ("net: avoid
32 x truesize under-estimation for tiny skbs").

Unfortunately, the blamed commit introduces another regression for the
virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny
size, so that the whole head frag could fit a 512-byte block.

The single page frag cache uses a 1K fragment for such allocation, and
the additional overhead, under small UDP packets flood, makes the page
allocator a bottleneck.

Thanks to commit bf9f1baa279f ("net: add dedicated kmem_cache for
typical/small skb->head"), this revert does not re-introduce the
original regression. Actually, in the relevant test on top of this
revert, I measure a small but noticeable positive delta, just above
noise level.

The revert itself required some additional mangling due to the
introduction of the SKB_HEAD_ALIGN() helper and local lock infra in the
affected code.

Suggested-by: Eric Dumazet <edumazet@google.com>
Fixes: dbae2b062824 ("net: skb: introduce and use a single page frag cache")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/e649212fde9f0fdee23909ca0d14158d32bb7425.1738877290.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c0a86afb85da..365f0e2098d1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4115,7 +4115,6 @@ void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
-void napi_get_frags_check(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 
 static inline void napi_free_frags(struct napi_struct *napi)
-- 
cgit v1.2.3


From a1f7b7ff0e10ae574d388131596390157222f986 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 10 Feb 2025 10:17:27 +0200
Subject: PCI: pci_ids: add INTEL_HDA_PTL_H

Add Intel PTL-H audio Device ID.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20250210081730.22916-2-peter.ujfalusi@linux.intel.com
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index de5deb1a0118..1a2594a38199 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -3134,6 +3134,7 @@
 #define PCI_DEVICE_ID_INTEL_HDA_LNL_P	0xa828
 #define PCI_DEVICE_ID_INTEL_S21152BB	0xb152
 #define PCI_DEVICE_ID_INTEL_HDA_BMG	0xe2f7
+#define PCI_DEVICE_ID_INTEL_HDA_PTL_H	0xe328
 #define PCI_DEVICE_ID_INTEL_HDA_PTL	0xe428
 #define PCI_DEVICE_ID_INTEL_HDA_CML_R	0xf0c8
 #define PCI_DEVICE_ID_INTEL_HDA_RKL_S	0xf1c8
-- 
cgit v1.2.3


From 6d0ce46a93135d96b7fa075a94a88fe0da8e8773 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 7 Feb 2025 13:58:38 +0000
Subject: vrf: use RCU protection in l3mdev_l3_out()

l3mdev_l3_out() can be called without RCU being held:

raw_sendmsg()
 ip_push_pending_frames()
  ip_send_skb()
   ip_local_out()
    __ip_local_out()
     l3mdev_ip_out()

Add rcu_read_lock() / rcu_read_unlock() pair to avoid
a potential UAF.

Fixes: a8e3e1a9f020 ("net: l3mdev: Add hook to output path")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250207135841.1948589-7-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/l3mdev.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index 2d6141f28b53..f7fe796e8429 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -198,10 +198,12 @@ struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto)
 	if (netif_is_l3_slave(dev)) {
 		struct net_device *master;
 
+		rcu_read_lock();
 		master = netdev_master_upper_dev_get_rcu(dev);
 		if (master && master->l3mdev_ops->l3mdev_l3_out)
 			skb = master->l3mdev_ops->l3mdev_l3_out(master, sk,
 								skb, proto);
+		rcu_read_unlock();
 	}
 
 	return skb;
-- 
cgit v1.2.3


From c195b9c6ab9c383d7aa3f4a65879b3ca90cb378b Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Sat, 8 Feb 2025 15:49:07 +0800
Subject: thermal/netlink: Prevent userspace segmentation fault by adjusting
 UAPI header

The intel-lpmd tool [1], which uses the THERMAL_GENL_ATTR_CPU_CAPABILITY
attribute to receive HFI events from kernel space, encounters a
segmentation fault after commit 1773572863c4 ("thermal: netlink: Add the
commands and the events for the thresholds").

The issue arises because the THERMAL_GENL_ATTR_CPU_CAPABILITY raw value
was changed while intel_lpmd still uses the old value.

Although intel_lpmd can be updated to check the THERMAL_GENL_VERSION and
use the appropriate THERMAL_GENL_ATTR_CPU_CAPABILITY value, the commit
itself is questionable.

The commit introduced a new element in the middle of enum thermal_genl_attr,
which affects many existing attributes and introduces potential risks
and unnecessary maintenance burdens for userspace thermal netlink event
users.

Solve the issue by moving the newly introduced
THERMAL_GENL_ATTR_TZ_PREV_TEMP attribute to the end of the
enum thermal_genl_attr. This ensures that all existing thermal generic
netlink attributes remain unaffected.

Link: https://github.com/intel/intel-lpmd [1]
Fixes: 1773572863c4 ("thermal: netlink: Add the commands and the events for the thresholds")
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/20250208074907.5679-1-rui.zhang@intel.com
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/uapi/linux/thermal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h
index 349718c271eb..46a2633d33aa 100644
--- a/include/uapi/linux/thermal.h
+++ b/include/uapi/linux/thermal.h
@@ -30,7 +30,6 @@ enum thermal_genl_attr {
 	THERMAL_GENL_ATTR_TZ,
 	THERMAL_GENL_ATTR_TZ_ID,
 	THERMAL_GENL_ATTR_TZ_TEMP,
-	THERMAL_GENL_ATTR_TZ_PREV_TEMP,
 	THERMAL_GENL_ATTR_TZ_TRIP,
 	THERMAL_GENL_ATTR_TZ_TRIP_ID,
 	THERMAL_GENL_ATTR_TZ_TRIP_TYPE,
@@ -54,6 +53,7 @@ enum thermal_genl_attr {
 	THERMAL_GENL_ATTR_THRESHOLD,
 	THERMAL_GENL_ATTR_THRESHOLD_TEMP,
 	THERMAL_GENL_ATTR_THRESHOLD_DIRECTION,
+	THERMAL_GENL_ATTR_TZ_PREV_TEMP,
 	__THERMAL_GENL_ATTR_MAX,
 };
 #define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1)
-- 
cgit v1.2.3


From e00a2e5d485faf53c7a24b9d1b575a642227947f Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Wed, 12 Feb 2025 18:18:51 +0200
Subject: drm: Fix DSC BPP increment decoding

Starting with DPCD version 2.0 bits 6:3 of the DP_DSC_BITS_PER_PIXEL_INC
DPCD register contains the NativeYCbCr422_MAX_bpp_DELTA field, which can
be non-zero as opposed to earlier DPCD versions, hence decoding the
bit_per_pixel increment value at bits 2:0 in the same register requires
applying a mask, do so.

Cc: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Fixes: 0c2287c96521 ("drm/display/dp: Add helper function to get DSC bpp precision")
Reviewed-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Imre Deak <imre.deak@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250212161851.4007005-1-imre.deak@intel.com
---
 include/drm/display/drm_dp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/drm/display/drm_dp.h b/include/drm/display/drm_dp.h
index a6f8b098c56f..3bd9f482f0c3 100644
--- a/include/drm/display/drm_dp.h
+++ b/include/drm/display/drm_dp.h
@@ -359,6 +359,7 @@
 # define DP_DSC_BITS_PER_PIXEL_1_4          0x2
 # define DP_DSC_BITS_PER_PIXEL_1_2          0x3
 # define DP_DSC_BITS_PER_PIXEL_1_1          0x4
+# define DP_DSC_BITS_PER_PIXEL_MASK         0x7
 
 #define DP_PSR_SUPPORT                      0x070   /* XXX 1.2? */
 # define DP_PSR_IS_SUPPORTED                1
-- 
cgit v1.2.3


From 1d0013962d220b166d9f7c9fe2746f1542e459a3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 12 Feb 2025 22:23:59 +0000
Subject: netfs: Fix a number of read-retry hangs

Fix a number of hangs in the netfslib read-retry code, including:

 (1) netfs_reissue_read() doubles up the getting of references on
     subrequests, thereby leaking the subrequest and causing inode eviction
     to wait indefinitely.  This can lead to the kernel reporting a hang in
     the filesystem's evict_inode().

     Fix this by removing the get from netfs_reissue_read() and adding one
     to netfs_retry_read_subrequests() to deal with the one place that
     didn't double up.

 (2) The loop in netfs_retry_read_subrequests() that retries a sequence of
     failed subrequests doesn't record whether or not it retried the one
     that the "subreq" pointer points to when it leaves the loop.  It may
     not if renegotiation/repreparation of the subrequests means that fewer
     subrequests are needed to span the cumulative range of the sequence.

     Because it doesn't record this, the piece of code that discards
     now-superfluous subrequests doesn't know whether it should discard the
     one "subreq" points to - and so it doesn't.

     Fix this by noting whether the last subreq it examines is superfluous
     and if it is, then getting rid of it and all subsequent subrequests.

     If that one one wasn't superfluous, then we would have tried to go
     round the previous loop again and so there can be no further unretried
     subrequests in the sequence.

 (3) netfs_retry_read_subrequests() gets yet an extra ref on any additional
     subrequests it has to get because it ran out of ones it could reuse to
     to renegotiation/repreparation shrinking the subrequests.

     Fix this by removing that extra ref.

 (4) In netfs_retry_reads(), it was using wait_on_bit() to wait for
     NETFS_SREQ_IN_PROGRESS to be cleared on all subrequests in the
     sequence - but netfs_read_subreq_terminated() is now using a wait
     queue on the request instead and so this wait will never finish.

     Fix this by waiting on the wait queue instead.  To make this work, a
     new flag, NETFS_RREQ_RETRYING, is now set around the wait loop to tell
     the wake-up code to wake up the wait queue rather than requeuing the
     request's work item.

     Note that this flag replaces the NETFS_RREQ_NEED_RETRY flag which is
     no longer used.

 (5) Whilst not strictly anything to do with the hang,
     netfs_retry_read_subrequests() was also doubly incrementing the
     subreq_counter and re-setting the debug index, leaving a gap in the
     trace.  This is also fixed.

One of these hangs was observed with 9p and with cifs.  Others were forced
by manual code injection into fs/afs/file.c.  Firstly, afs_prepare_read()
was created to provide an changing pattern of maximum subrequest sizes:

	static int afs_prepare_read(struct netfs_io_subrequest *subreq)
	{
		struct netfs_io_request *rreq = subreq->rreq;
		if (!S_ISREG(subreq->rreq->inode->i_mode))
			return 0;
		if (subreq->retry_count < 20)
			rreq->io_streams[0].sreq_max_len =
				umax(200, 2222 - subreq->retry_count * 40);
		else
			rreq->io_streams[0].sreq_max_len = 3333;
		return 0;
	}

and pointed to by afs_req_ops.  Then the following:

	struct netfs_io_subrequest *subreq = op->fetch.subreq;
	if (subreq->error == 0 &&
	    S_ISREG(subreq->rreq->inode->i_mode) &&
	    subreq->retry_count < 20) {
		subreq->transferred = subreq->already_done;
		__clear_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
		__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
		afs_fetch_data_notify(op);
		return;
	}

was inserted into afs_fetch_data_success() at the beginning and struct
netfs_io_subrequest given an extra field, "already_done" that was set to
the value in "subreq->transferred" by netfs_reissue_read().

When reading a 4K file, the subrequests would get gradually smaller, a new
subrequest would be allocated around the 3rd retry and then eventually be
rendered superfluous when the 20th retry was hit and the limit on the first
subrequest was eased.

Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item")
Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20250212222402.3618494-2-dhowells@redhat.com
Tested-by: Marc Dionne <marc.dionne@auristor.com>
Tested-by: Steve French <stfrench@microsoft.com>
cc: Ihor Solodrai <ihor.solodrai@pm.me>
cc: Eric Van Hensbergen <ericvh@kernel.org>
cc: Latchesar Ionkov <lucho@ionkov.net>
cc: Dominique Martinet <asmadeus@codewreck.org>
cc: Christian Schoenebeck <linux_oss@crudebyte.com>
cc: Paulo Alcantara <pc@manguebit.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: v9fs@lists.linux.dev
cc: linux-cifs@vger.kernel.org
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/netfs.h        | 2 +-
 include/trace/events/netfs.h | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 071d05d81d38..c86a11cfc4a3 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -278,7 +278,7 @@ struct netfs_io_request {
 #define NETFS_RREQ_PAUSE		11	/* Pause subrequest generation */
 #define NETFS_RREQ_USE_IO_ITER		12	/* Use ->io_iter rather than ->i_pages */
 #define NETFS_RREQ_ALL_QUEUED		13	/* All subreqs are now queued */
-#define NETFS_RREQ_NEED_RETRY		14	/* Need to try retrying */
+#define NETFS_RREQ_RETRYING		14	/* Set if we're in the retry path */
 #define NETFS_RREQ_USE_PGPRIV2		31	/* [DEPRECATED] Use PG_private_2 to mark
 						 * write to cache on read */
 	const struct netfs_request_ops *netfs_ops;
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 6e699cadcb29..f880835f7695 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -99,7 +99,7 @@
 	EM(netfs_sreq_trace_limited,		"LIMIT")	\
 	EM(netfs_sreq_trace_need_clear,		"N-CLR")	\
 	EM(netfs_sreq_trace_partial_read,	"PARTR")	\
-	EM(netfs_sreq_trace_need_retry,		"NRTRY")	\
+	EM(netfs_sreq_trace_need_retry,		"ND-RT")	\
 	EM(netfs_sreq_trace_prepare,		"PREP ")	\
 	EM(netfs_sreq_trace_prep_failed,	"PRPFL")	\
 	EM(netfs_sreq_trace_progress,		"PRGRS")	\
@@ -108,7 +108,9 @@
 	EM(netfs_sreq_trace_short,		"SHORT")	\
 	EM(netfs_sreq_trace_split,		"SPLIT")	\
 	EM(netfs_sreq_trace_submit,		"SUBMT")	\
+	EM(netfs_sreq_trace_superfluous,	"SPRFL")	\
 	EM(netfs_sreq_trace_terminated,		"TERM ")	\
+	EM(netfs_sreq_trace_wait_for,		"_WAIT")	\
 	EM(netfs_sreq_trace_write,		"WRITE")	\
 	EM(netfs_sreq_trace_write_skip,		"SKIP ")	\
 	E_(netfs_sreq_trace_write_term,		"WTERM")
-- 
cgit v1.2.3


From 1f47ed294a2bd577d5ae43e6e28e1c9a3be4a833 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 13 Feb 2025 08:18:46 -0700
Subject: block: cleanup and fix batch completion adding conditions

The conditions for whether or not a request is allowed adding to a
completion batch are a bit hard to read, and they also have a few
issues. One is that ioerror may indeed be a random value on passthrough,
and it's being checked unconditionally of whether or not the given
request is a passthrough request or not.

Rewrite the conditions to be separate for easier reading, and only check
ioerror for non-passthrough requests. This fixes an issue with bio
unmapping on passthrough, where it fails getting added to a batch. This
both leads to suboptimal performance, and may trigger a potential
schedule-under-atomic condition for polled passthrough IO.

Fixes: f794f3351f26 ("block: add support for blk_mq_end_request_batch()")
Link: https://lore.kernel.org/r/20575f0a-656e-4bb3-9d82-dec6c7e3a35c@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9ebb53f031cd..fa2a76cc2f73 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -861,12 +861,22 @@ static inline bool blk_mq_add_to_batch(struct request *req,
 				       void (*complete)(struct io_comp_batch *))
 {
 	/*
-	 * blk_mq_end_request_batch() can't end request allocated from
-	 * sched tags
+	 * Check various conditions that exclude batch processing:
+	 * 1) No batch container
+	 * 2) Has scheduler data attached
+	 * 3) Not a passthrough request and end_io set
+	 * 4) Not a passthrough request and an ioerror
 	 */
-	if (!iob || (req->rq_flags & RQF_SCHED_TAGS) || ioerror ||
-			(req->end_io && !blk_rq_is_passthrough(req)))
+	if (!iob)
 		return false;
+	if (req->rq_flags & RQF_SCHED_TAGS)
+		return false;
+	if (!blk_rq_is_passthrough(req)) {
+		if (req->end_io)
+			return false;
+		if (ioerror < 0)
+			return false;
+	}
 
 	if (!iob->complete)
 		iob->complete = complete;
-- 
cgit v1.2.3


From 35fa2d88ca9481e5caf533d58b99ca259c63b2fe Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 10 Feb 2025 13:30:25 +0100
Subject: driver core: add a faux bus for use when a simple device/bus is
 needed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Many drivers abuse the platform driver/bus system as it provides a
simple way to create and bind a device to a driver-specific set of
probe/release functions.  Instead of doing that, and wasting all of the
memory associated with a platform device, here is a "faux" bus that
can be used instead.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Danilo Krummrich <dakr@kernel.org>
Reviewed-by: Lyude Paul <lyude@redhat.com>
Reviewed-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: Zijun Hu <quic_zijuhu@quicinc.com>
Link: https://lore.kernel.org/r/2025021026-atlantic-gibberish-3f0c@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/faux.h | 69 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 include/linux/device/faux.h

(limited to 'include')

diff --git a/include/linux/device/faux.h b/include/linux/device/faux.h
new file mode 100644
index 000000000000..9f43c0e46aa4
--- /dev/null
+++ b/include/linux/device/faux.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2025 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (c) 2025 The Linux Foundation
+ *
+ * A "simple" faux bus that allows devices to be created and added
+ * automatically to it.  This is to be used whenever you need to create a
+ * device that is not associated with any "real" system resources, and do
+ * not want to have to deal with a bus/driver binding logic.  It is
+ * intended to be very simple, with only a create and a destroy function
+ * available.
+ */
+#ifndef _FAUX_DEVICE_H_
+#define _FAUX_DEVICE_H_
+
+#include <linux/container_of.h>
+#include <linux/device.h>
+
+/**
+ * struct faux_device - a "faux" device
+ * @dev:	internal struct device of the object
+ *
+ * A simple faux device that can be created/destroyed.  To be used when a
+ * driver only needs to have a device to "hang" something off.  This can be
+ * used for downloading firmware or other basic tasks.  Use this instead of
+ * a struct platform_device if the device has no resources assigned to
+ * it at all.
+ */
+struct faux_device {
+	struct device dev;
+};
+#define to_faux_device(x) container_of_const((x), struct faux_device, dev)
+
+/**
+ * struct faux_device_ops - a set of callbacks for a struct faux_device
+ * @probe:	called when a faux device is probed by the driver core
+ *		before the device is fully bound to the internal faux bus
+ *		code.  If probe succeeds, return 0, otherwise return a
+ *		negative error number to stop the probe sequence from
+ *		succeeding.
+ * @remove:	called when a faux device is removed from the system
+ *
+ * Both @probe and @remove are optional, if not needed, set to NULL.
+ */
+struct faux_device_ops {
+	int (*probe)(struct faux_device *faux_dev);
+	void (*remove)(struct faux_device *faux_dev);
+};
+
+struct faux_device *faux_device_create(const char *name,
+				       struct device *parent,
+				       const struct faux_device_ops *faux_ops);
+struct faux_device *faux_device_create_with_groups(const char *name,
+						   struct device *parent,
+						   const struct faux_device_ops *faux_ops,
+						   const struct attribute_group **groups);
+void faux_device_destroy(struct faux_device *faux_dev);
+
+static inline void *faux_device_get_drvdata(const struct faux_device *faux_dev)
+{
+	return dev_get_drvdata(&faux_dev->dev);
+}
+
+static inline void faux_device_set_drvdata(struct faux_device *faux_dev, void *data)
+{
+	dev_set_drvdata(&faux_dev->dev, data);
+}
+
+#endif /* _FAUX_DEVICE_H_ */
-- 
cgit v1.2.3


From ab4eedb790cae44313759b50fe47da285e2519d5 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 6 Feb 2025 15:54:45 -0500
Subject: Bluetooth: L2CAP: Fix corrupted list in hci_chan_del

This fixes the following trace by reworking the locking of l2cap_conn
so instead of only locking when changing the chan_l list this promotes
chan_lock to a general lock of l2cap_conn so whenever it is being held
it would prevents the likes of l2cap_conn_del to run:

list_del corruption, ffff888021297e00->prev is LIST_POISON2 (dead000000000122)
------------[ cut here ]------------
kernel BUG at lib/list_debug.c:61!
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI
CPU: 1 UID: 0 PID: 5896 Comm: syz-executor213 Not tainted 6.14.0-rc1-next-20250204-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 12/27/2024
RIP: 0010:__list_del_entry_valid_or_report+0x12c/0x190 lib/list_debug.c:59
Code: 8c 4c 89 fe 48 89 da e8 32 8c 37 fc 90 0f 0b 48 89 df e8 27 9f 14 fd 48 c7 c7 a0 c0 60 8c 4c 89 fe 48 89 da e8 15 8c 37 fc 90 <0f> 0b 4c 89 e7 e8 0a 9f 14 fd 42 80 3c 2b 00 74 08 4c 89 e7 e8 cb
RSP: 0018:ffffc90003f6f998 EFLAGS: 00010246
RAX: 000000000000004e RBX: dead000000000122 RCX: 01454d423f7fbf00
RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000
RBP: dffffc0000000000 R08: ffffffff819f077c R09: 1ffff920007eded0
R10: dffffc0000000000 R11: fffff520007eded1 R12: dead000000000122
R13: dffffc0000000000 R14: ffff8880352248d8 R15: ffff888021297e00
FS:  00007f7ace6686c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f7aceeeb1d0 CR3: 000000003527c000 CR4: 00000000003526f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 __list_del_entry_valid include/linux/list.h:124 [inline]
 __list_del_entry include/linux/list.h:215 [inline]
 list_del_rcu include/linux/rculist.h:168 [inline]
 hci_chan_del+0x70/0x1b0 net/bluetooth/hci_conn.c:2858
 l2cap_conn_free net/bluetooth/l2cap_core.c:1816 [inline]
 kref_put include/linux/kref.h:65 [inline]
 l2cap_conn_put+0x70/0xe0 net/bluetooth/l2cap_core.c:1830
 l2cap_sock_shutdown+0xa8a/0x1020 net/bluetooth/l2cap_sock.c:1377
 l2cap_sock_release+0x79/0x1d0 net/bluetooth/l2cap_sock.c:1416
 __sock_release net/socket.c:642 [inline]
 sock_close+0xbc/0x240 net/socket.c:1393
 __fput+0x3e9/0x9f0 fs/file_table.c:448
 task_work_run+0x24f/0x310 kernel/task_work.c:227
 ptrace_notify+0x2d2/0x380 kernel/signal.c:2522
 ptrace_report_syscall include/linux/ptrace.h:415 [inline]
 ptrace_report_syscall_exit include/linux/ptrace.h:477 [inline]
 syscall_exit_work+0xc7/0x1d0 kernel/entry/common.c:173
 syscall_exit_to_user_mode_prepare kernel/entry/common.c:200 [inline]
 __syscall_exit_to_user_mode_work kernel/entry/common.c:205 [inline]
 syscall_exit_to_user_mode+0x24a/0x340 kernel/entry/common.c:218
 do_syscall_64+0x100/0x230 arch/x86/entry/common.c:89
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f7aceeaf449
Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 41 19 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f7ace668218 EFLAGS: 00000246 ORIG_RAX: 000000000000002a
RAX: fffffffffffffffc RBX: 00007f7acef39328 RCX: 00007f7aceeaf449
RDX: 000000000000000e RSI: 0000000020000100 RDI: 0000000000000004
RBP: 00007f7acef39320 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000003
R13: 0000000000000004 R14: 00007f7ace668670 R15: 000000000000000b
 </TASK>
Modules linked in:
---[ end trace 0000000000000000 ]---
RIP: 0010:__list_del_entry_valid_or_report+0x12c/0x190 lib/list_debug.c:59
Code: 8c 4c 89 fe 48 89 da e8 32 8c 37 fc 90 0f 0b 48 89 df e8 27 9f 14 fd 48 c7 c7 a0 c0 60 8c 4c 89 fe 48 89 da e8 15 8c 37 fc 90 <0f> 0b 4c 89 e7 e8 0a 9f 14 fd 42 80 3c 2b 00 74 08 4c 89 e7 e8 cb
RSP: 0018:ffffc90003f6f998 EFLAGS: 00010246
RAX: 000000000000004e RBX: dead000000000122 RCX: 01454d423f7fbf00
RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000
RBP: dffffc0000000000 R08: ffffffff819f077c R09: 1ffff920007eded0
R10: dffffc0000000000 R11: fffff520007eded1 R12: dead000000000122
R13: dffffc0000000000 R14: ffff8880352248d8 R15: ffff888021297e00
FS:  00007f7ace6686c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f7acef05b08 CR3: 000000003527c000 CR4: 00000000003526f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400

Reported-by: syzbot+10bd8fe6741eedd2be2e@syzkaller.appspotmail.com
Tested-by: syzbot+10bd8fe6741eedd2be2e@syzkaller.appspotmail.com
Fixes: b4f82f9ed43a ("Bluetooth: L2CAP: Fix slab-use-after-free Read in l2cap_send_cmd")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
---
 include/net/bluetooth/l2cap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index d9c767cf773d..9189354c568f 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -668,7 +668,7 @@ struct l2cap_conn {
 	struct l2cap_chan	*smp;
 
 	struct list_head	chan_l;
-	struct mutex		chan_lock;
+	struct mutex		lock;
 	struct kref		ref;
 	struct list_head	users;
 };
@@ -970,6 +970,7 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err);
 void l2cap_send_conn_req(struct l2cap_chan *chan);
 
 struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn);
+struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *conn);
 void l2cap_conn_put(struct l2cap_conn *conn);
 
 int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user);
-- 
cgit v1.2.3


From 0892b840318daa6ae739b7cdec5ecdfca4006689 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 13 Feb 2025 08:49:44 -0800
Subject: Reapply "net: skb: introduce and use a single page frag cache"

This reverts commit 011b0335903832facca86cd8ed05d7d8d94c9c76.

Sabrina reports that the revert may trigger warnings due to intervening
changes, especially the ability to rise MAX_SKB_FRAGS. Let's drop it
and revisit once that part is also ironed out.

Fixes: 011b03359038 ("Revert "net: skb: introduce and use a single page frag cache"")
Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Link: https://lore.kernel.org/6bf54579233038bc0e76056c5ea459872ce362ab.1739375933.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 365f0e2098d1..c0a86afb85da 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4115,6 +4115,7 @@ void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
+void napi_get_frags_check(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 
 static inline void napi_free_frags(struct napi_struct *napi)
-- 
cgit v1.2.3


From 192b7ff29b1fb0335a9b9107991e6286f462f361 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Noack?= <gnoack@google.com>
Date: Fri, 24 Jan 2025 15:44:44 +0000
Subject: landlock: Minor typo and grammar fixes in IPC scoping documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix some whitespace, punctuation and minor grammar.
* Add a missing sentence about the minimum ABI version,
  to stay in line with the section next to it.

Cc: Tahera Fahimi <fahimitahera@gmail.com>
Cc: Tanya Agarwal <tanyaagarwal25699@gmail.com>
Signed-off-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20250124154445.162841-1-gnoack@google.com
[mic: Add newlines, update doc date]
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 include/uapi/linux/landlock.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index 33745642f787..e1d2c27533b4 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -268,7 +268,9 @@ struct landlock_net_port_attr {
  * ~~~~~~~~~~~~~~~~
  *
  * These flags enable to restrict a sandboxed process to a set of network
- * actions. This is supported since the Landlock ABI version 4.
+ * actions.
+ *
+ * This is supported since Landlock ABI version 4.
  *
  * The following access rights apply to TCP port numbers:
  *
@@ -291,11 +293,13 @@ struct landlock_net_port_attr {
  * Setting a flag for a ruleset will isolate the Landlock domain to forbid
  * connections to resources outside the domain.
  *
+ * This is supported since Landlock ABI version 6.
+ *
  * Scopes:
  *
  * - %LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET: Restrict a sandboxed process from
  *   connecting to an abstract UNIX socket created by a process outside the
- *   related Landlock domain (e.g. a parent domain or a non-sandboxed process).
+ *   related Landlock domain (e.g., a parent domain or a non-sandboxed process).
  * - %LANDLOCK_SCOPE_SIGNAL: Restrict a sandboxed process from sending a signal
  *   to another process outside the domain.
  */
-- 
cgit v1.2.3


From 362ff1e7c6c20f8d6ebe20682870d471373c608b Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 13 Feb 2025 17:18:25 +0100
Subject: virtio_snd.h: clarify that `controls` depends on VIRTIO_SND_F_CTLS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As defined in the specification, the `controls` field in the configuration
space is only valid/present if VIRTIO_SND_F_CTLS is negotiated.

  From https://docs.oasis-open.org/virtio/virtio/v1.3/virtio-v1.3.html:

  5.14.4 Device Configuration Layout
    ...
    controls
       (driver-read-only) indicates a total number of all available control
       elements if VIRTIO_SND_F_CTLS has been negotiated.

Let's use the same style used in virtio_blk.h to clarify this and to avoid
confusion as happened in QEMU (see link).

Link: https://gitlab.com/qemu-project/qemu/-/issues/2805
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20250213161825.139952-1-sgarzare@redhat.com
---
 include/uapi/linux/virtio_snd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/virtio_snd.h b/include/uapi/linux/virtio_snd.h
index 5f4100c2cf04..a4cfb9f6561a 100644
--- a/include/uapi/linux/virtio_snd.h
+++ b/include/uapi/linux/virtio_snd.h
@@ -25,7 +25,7 @@ struct virtio_snd_config {
 	__le32 streams;
 	/* # of available channel maps */
 	__le32 chmaps;
-	/* # of available control elements */
+	/* # of available control elements (if VIRTIO_SND_F_CTLS) */
 	__le32 controls;
 };
 
-- 
cgit v1.2.3


From 435b344a7042e91fb4719d589f18310e8919e39f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 10 Feb 2025 22:53:47 +0000
Subject: crypto: ccp: Add external API interface for PSP module initialization

KVM is dependent on the PSP SEV driver and PSP SEV driver needs to be
loaded before KVM module. In case of module loading any dependent
modules are automatically loaded but in case of built-in modules there
is no inherent mechanism available to specify dependencies between
modules and ensure that any dependent modules are loaded implicitly.

Add a new external API interface for PSP module initialization which
allows PSP SEV driver to be loaded explicitly if KVM is built-in.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ashish Kalra <ashish.kalra@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Message-ID: <15279ca0cad56a07cf12834ec544310f85ff5edc.1739226950.git.ashish.kalra@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/psp-sev.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 903ddfea8585..f3cad182d4ef 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -814,6 +814,15 @@ struct sev_data_snp_commit {
 
 #ifdef CONFIG_CRYPTO_DEV_SP_PSP
 
+/**
+ * sev_module_init - perform PSP SEV module initialization
+ *
+ * Returns:
+ * 0 if the PSP module is successfully initialized
+ * negative value if the PSP module initialization fails
+ */
+int sev_module_init(void);
+
 /**
  * sev_platform_init - perform SEV INIT command
  *
-- 
cgit v1.2.3


From b016d0873777462e55af4c615104cc684fce086d Mon Sep 17 00:00:00 2001
From: Wang Yaxin <wang.yaxin@zte.com.cn>
Date: Sat, 8 Feb 2025 14:49:01 +0800
Subject: taskstats: modify taskstats version

After adding "delay max" and "delay min" to the taskstats structure, the
taskstats version needs to be updated.

Link: https://lkml.kernel.org/r/20250208144901218Q5ptVpqsQkb2MOEmW4Ujn@zte.com.cn
Fixes: f65c64f311ee ("delayacct: add delay min to record delay peak")
Signed-off-by: Wang Yaxin <wang.yaxin@zte.com.cn>
Signed-off-by: Kun Jiang <jiang.kun2@zte.com.cn>
Reviewed-by: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/uapi/linux/taskstats.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 934e20ef7f79..95762232e018 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -34,7 +34,7 @@
  */
 
 
-#define TASKSTATS_VERSION	14
+#define TASKSTATS_VERSION	15
 #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
-- 
cgit v1.2.3


From 02d954c0fdf91845169cdacc7405b120f90afe01 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 10 Feb 2025 16:32:50 +0100
Subject: sched: Compact RSEQ concurrency IDs with reduced threads and affinity

When a process reduces its number of threads or clears bits in its CPU
affinity mask, the mm_cid allocation should eventually converge towards
smaller values.

However, the change introduced by:

commit 7e019dcc470f ("sched: Improve cache locality of RSEQ concurrency
IDs for intermittent workloads")

adds a per-mm/CPU recent_cid which is never unset unless a thread
migrates.

This is a tradeoff between:

A) Preserving cache locality after a transition from many threads to few
   threads, or after reducing the hamming weight of the allowed CPU mask.

B) Making the mm_cid upper bounds wrt nr threads and allowed CPU mask
   easy to document and understand.

C) Allowing applications to eventually react to mm_cid compaction after
   reduction of the nr threads or allowed CPU mask, making the tracking
   of mm_cid compaction easier by shrinking it back towards 0 or not.

D) Making sure applications that periodically reduce and then increase
   again the nr threads or allowed CPU mask still benefit from good
   cache locality with mm_cid.

Introduce the following changes:

* After shrinking the number of threads or reducing the number of
  allowed CPUs, reduce the value of max_nr_cid so expansion of CID
  allocation will preserve cache locality if the number of threads or
  allowed CPUs increase again.

* Only re-use a recent_cid if it is within the max_nr_cid upper bound,
  else find the first available CID.

Fixes: 7e019dcc470f ("sched: Improve cache locality of RSEQ concurrency IDs for intermittent workloads")
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Gabriele Monaco <gmonaco@redhat.com>
Link: https://lkml.kernel.org/r/20250210153253.460471-2-gmonaco@redhat.com
---
 include/linux/mm_types.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6b27db7f9496..0234f14f2aa6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -875,10 +875,11 @@ struct mm_struct {
 		 */
 		unsigned int nr_cpus_allowed;
 		/**
-		 * @max_nr_cid: Maximum number of concurrency IDs allocated.
+		 * @max_nr_cid: Maximum number of allowed concurrency
+		 *              IDs allocated.
 		 *
-		 * Track the highest number of concurrency IDs allocated for the
-		 * mm.
+		 * Track the highest number of allowed concurrency IDs
+		 * allocated for the mm.
 		 */
 		atomic_t max_nr_cid;
 		/**
-- 
cgit v1.2.3


From 84e009042d0f3dfe91bec60bcd208ee3f866cbcd Mon Sep 17 00:00:00 2001
From: Maurizio Lombardi <mlombard@redhat.com>
Date: Mon, 17 Feb 2025 17:08:27 +0100
Subject: nvme-tcp: add basic support for the C2HTermReq PDU

Previously, the NVMe/TCP host driver did not handle the C2HTermReq PDU,
instead printing "unsupported pdu type (3)" when received. This patch adds
support for processing the C2HTermReq PDU, allowing the driver
to print the Fatal Error Status field.

Example of output:
nvme nvme4: Received C2HTermReq (FES = Invalid PDU Header Field)

Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme-tcp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h
index e07e8978d691..e435250fcb4d 100644
--- a/include/linux/nvme-tcp.h
+++ b/include/linux/nvme-tcp.h
@@ -13,6 +13,8 @@
 #define NVME_TCP_ADMIN_CCSZ	SZ_8K
 #define NVME_TCP_DIGEST_LENGTH	4
 #define NVME_TCP_MIN_MAXH2CDATA 4096
+#define NVME_TCP_MIN_C2HTERM_PLEN	24
+#define NVME_TCP_MAX_C2HTERM_PLEN	152
 
 enum nvme_tcp_pfv {
 	NVME_TCP_PFV_1_0 = 0x0,
-- 
cgit v1.2.3


From d422247d14a53fe825b1778edf104167d8fd8f3f Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Thu, 13 Feb 2025 15:49:59 +0900
Subject: nvme: Cleanup the definition of the controller config register fields

Reorganized the enum used to define the fields of the contrller
configuration (CC) register in include/linux/nvme.h to:
1) Group together all the values defined for each field.
2) Add the missing field masks definitions.
3) Add comments to describe the enum and each field.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme.h | 40 +++++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index fe3b60818fdc..2dc05b1c3283 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -199,28 +199,54 @@ enum {
 #define NVME_NVM_IOSQES		6
 #define NVME_NVM_IOCQES		4
 
+/*
+ * Controller Configuration (CC) register (Offset 14h)
+ */
 enum {
+	/* Enable (EN): bit 0 */
 	NVME_CC_ENABLE		= 1 << 0,
 	NVME_CC_EN_SHIFT	= 0,
+
+	/* Bits 03:01 are reserved (NVMe Base Specification rev 2.1) */
+
+	/* I/O Command Set Selected (CSS): bits 06:04 */
 	NVME_CC_CSS_SHIFT	= 4,
-	NVME_CC_MPS_SHIFT	= 7,
-	NVME_CC_AMS_SHIFT	= 11,
-	NVME_CC_SHN_SHIFT	= 14,
-	NVME_CC_IOSQES_SHIFT	= 16,
-	NVME_CC_IOCQES_SHIFT	= 20,
+	NVME_CC_CSS_MASK	= 7 << NVME_CC_CSS_SHIFT,
 	NVME_CC_CSS_NVM		= 0 << NVME_CC_CSS_SHIFT,
 	NVME_CC_CSS_CSI		= 6 << NVME_CC_CSS_SHIFT,
-	NVME_CC_CSS_MASK	= 7 << NVME_CC_CSS_SHIFT,
+
+	/* Memory Page Size (MPS): bits 10:07 */
+	NVME_CC_MPS_SHIFT	= 7,
+	NVME_CC_MPS_MASK	= 0xf << NVME_CC_MPS_SHIFT,
+
+	/* Arbitration Mechanism Selected (AMS): bits 13:11 */
+	NVME_CC_AMS_SHIFT	= 11,
+	NVME_CC_AMS_MASK	= 7 << NVME_CC_AMS_SHIFT,
 	NVME_CC_AMS_RR		= 0 << NVME_CC_AMS_SHIFT,
 	NVME_CC_AMS_WRRU	= 1 << NVME_CC_AMS_SHIFT,
 	NVME_CC_AMS_VS		= 7 << NVME_CC_AMS_SHIFT,
+
+	/* Shutdown Notification (SHN): bits 15:14 */
+	NVME_CC_SHN_SHIFT	= 14,
+	NVME_CC_SHN_MASK	= 3 << NVME_CC_SHN_SHIFT,
 	NVME_CC_SHN_NONE	= 0 << NVME_CC_SHN_SHIFT,
 	NVME_CC_SHN_NORMAL	= 1 << NVME_CC_SHN_SHIFT,
 	NVME_CC_SHN_ABRUPT	= 2 << NVME_CC_SHN_SHIFT,
-	NVME_CC_SHN_MASK	= 3 << NVME_CC_SHN_SHIFT,
+
+	/* I/O Submission Queue Entry Size (IOSQES): bits 19:16 */
+	NVME_CC_IOSQES_SHIFT	= 16,
+	NVME_CC_IOSQES_MASK	= 0xf << NVME_CC_IOSQES_SHIFT,
 	NVME_CC_IOSQES		= NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT,
+
+	/* I/O Completion Queue Entry Size (IOCQES): bits 23:20 */
+	NVME_CC_IOCQES_SHIFT	= 20,
+	NVME_CC_IOCQES_MASK	= 0xf << NVME_CC_IOCQES_SHIFT,
 	NVME_CC_IOCQES		= NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT,
+
+	/* Controller Ready Independent of Media Enable (CRIME): bit 24 */
 	NVME_CC_CRIME		= 1 << 24,
+
+	/* Bits 25:31 are reserved (NVMe Base Specification rev 2.1) */
 };
 
 enum {
-- 
cgit v1.2.3


From 1fc61eeefe10d9996d2b875214d89f0909d03417 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 18 Feb 2025 16:47:40 -0700
Subject: io_uring: fix spelling error in uapi io_uring.h

This is obviously not that important, but when changes are synced back
from the kernel to liburing, the codespell CI ends up erroring because
of this misspelling. Let's just correct it and avoid this biting us
again on an import.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e11c82638527..050fa8eb2e8f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -380,7 +380,7 @@ enum io_uring_op {
  *				result 	will be the number of buffers send, with
  *				the starting buffer ID in cqe->flags as per
  *				usual for provided buffer usage. The buffers
- *				will be	contigious from the starting buffer ID.
+ *				will be	contiguous from the starting buffer ID.
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 #define IORING_RECV_MULTISHOT		(1U << 1)
-- 
cgit v1.2.3


From e57a6320215c3967f51ab0edeff87db2095440e4 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 17 Feb 2025 11:11:27 -0800
Subject: net: Add net_passive_inc() and net_passive_dec().

net_drop_ns() is NULL when CONFIG_NET_NS is disabled.

The next patch introduces a function that increments
and decrements net->passive.

As a prep, let's rename and export net_free() to
net_passive_dec() and add net_passive_inc().

Suggested-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/netdev/CANn89i+oUCt2VGvrbrweniTendZFEh+nwS=uonc004-aPkWy-Q@mail.gmail.com/
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250217191129.19967-2-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/net_namespace.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 7ba1402ca779..f467a66abc6b 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -297,6 +297,7 @@ static inline int check_net(const struct net *net)
 }
 
 void net_drop_ns(void *);
+void net_passive_dec(struct net *net);
 
 #else
 
@@ -326,8 +327,18 @@ static inline int check_net(const struct net *net)
 }
 
 #define net_drop_ns NULL
+
+static inline void net_passive_dec(struct net *net)
+{
+	refcount_dec(&net->passive);
+}
 #endif
 
+static inline void net_passive_inc(struct net *net)
+{
+	refcount_inc(&net->passive);
+}
+
 /* Returns true if the netns initialization is completed successfully */
 static inline bool net_initialized(const struct net *net)
 {
-- 
cgit v1.2.3


From 5bbd6e863b15a85221e49b9bdb2d5d8f0bb91f3d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 1 Feb 2025 15:00:02 -0500
Subject: SUNRPC: Prevent looping due to rpc_signal_task() races

If rpc_signal_task() is called while a task is in an rpc_call_done()
callback function, and the latter calls rpc_restart_call(), the task can
end up looping due to the RPC_TASK_SIGNALLED flag being set without the
tk_rpc_status being set.
Removing the redundant mechanism for signalling the task fixes the
looping behaviour.

Reported-by: Li Lingfeng <lilingfeng3@huawei.com>
Fixes: 39494194f93b ("SUNRPC: Fix races with rpc_killall_tasks()")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/sunrpc/sched.h  | 3 +--
 include/trace/events/sunrpc.h | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index fec1e8a1570c..eac57914dcf3 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -158,7 +158,6 @@ enum {
 	RPC_TASK_NEED_XMIT,
 	RPC_TASK_NEED_RECV,
 	RPC_TASK_MSG_PIN_WAIT,
-	RPC_TASK_SIGNALLED,
 };
 
 #define rpc_test_and_set_running(t) \
@@ -171,7 +170,7 @@ enum {
 
 #define RPC_IS_ACTIVATED(t)	test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate)
 
-#define RPC_SIGNALLED(t)	test_bit(RPC_TASK_SIGNALLED, &(t)->tk_runstate)
+#define RPC_SIGNALLED(t)	(READ_ONCE(task->tk_rpc_status) == -ERESTARTSYS)
 
 /*
  * Task priorities.
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index b13dc275ef4a..851841336ee6 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -360,8 +360,7 @@ TRACE_EVENT(rpc_request,
 		{ (1UL << RPC_TASK_ACTIVE), "ACTIVE" },			\
 		{ (1UL << RPC_TASK_NEED_XMIT), "NEED_XMIT" },		\
 		{ (1UL << RPC_TASK_NEED_RECV), "NEED_RECV" },		\
-		{ (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" },	\
-		{ (1UL << RPC_TASK_SIGNALLED), "SIGNALLED" })
+		{ (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" })
 
 DECLARE_EVENT_CLASS(rpc_task_running,
 
-- 
cgit v1.2.3


From 4b5a28b38c4a0106c64416a1b2042405166b26ce Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Tue, 18 Feb 2025 05:49:30 -0800
Subject: net: Add non-RCU dev_getbyhwaddr() helper

Add dedicated helper for finding devices by hardware address when
holding rtnl_lock, similar to existing dev_getbyhwaddr_rcu(). This prevents
PROVE_LOCKING warnings when rtnl_lock is held but RCU read lock is not.

Extract common address comparison logic into dev_addr_cmp().

The context about this change could be found in the following
discussion:

Link: https://lore.kernel.org/all/20250206-scarlet-ermine-of-improvement-1fcac5@leitao/

Cc: kuniyu@amazon.com
Cc: ushankar@purestorage.com
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250218-arm_fix_selftest-v5-1-d3d6892db9e1@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c0a86afb85da..94b7d4eca003 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3275,6 +3275,8 @@ static inline struct net_device *first_net_device_rcu(struct net *net)
 }
 
 int netdev_boot_setup_check(struct net_device *dev);
+struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
+				   const char *hwaddr);
 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 				       const char *hwaddr);
 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type);
-- 
cgit v1.2.3


From 9b6412e6979f6f9e0632075f8f008937b5cd4efd Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Mon, 17 Feb 2025 11:23:35 +0100
Subject: tcp: drop secpath at the same time as we currently drop dst

Xiumei reported hitting the WARN in xfrm6_tunnel_net_exit while
running tests that boil down to:
 - create a pair of netns
 - run a basic TCP test over ipcomp6
 - delete the pair of netns

The xfrm_state found on spi_byaddr was not deleted at the time we
delete the netns, because we still have a reference on it. This
lingering reference comes from a secpath (which holds a ref on the
xfrm_state), which is still attached to an skb. This skb is not
leaked, it ends up on sk_receive_queue and then gets defer-free'd by
skb_attempt_defer_free.

The problem happens when we defer freeing an skb (push it on one CPU's
defer_list), and don't flush that list before the netns is deleted. In
that case, we still have a reference on the xfrm_state that we don't
expect at this point.

We already drop the skb's dst in the TCP receive path when it's no
longer needed, so let's also drop the secpath. At this point,
tcp_filter has already called into the LSM hooks that may require the
secpath, so it should not be needed anymore. However, in some of those
places, the MPTCP extension has just been attached to the skb, so we
cannot simply drop all extensions.

Fixes: 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists")
Reported-by: Xiumei Mu <xmu@redhat.com>
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/5055ba8f8f72bdcb602faa299faca73c280b7735.1739743613.git.sd@queasysnail.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/tcp.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5b2b04835688..930cda5b5eb9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -41,6 +41,7 @@
 #include <net/inet_ecn.h>
 #include <net/dst.h>
 #include <net/mptcp.h>
+#include <net/xfrm.h>
 
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
@@ -683,6 +684,19 @@ void tcp_fin(struct sock *sk);
 void tcp_check_space(struct sock *sk);
 void tcp_sack_compress_send_ack(struct sock *sk);
 
+static inline void tcp_cleanup_skb(struct sk_buff *skb)
+{
+	skb_dst_drop(skb);
+	secpath_reset(skb);
+}
+
+static inline void tcp_add_receive_queue(struct sock *sk, struct sk_buff *skb)
+{
+	DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
+	DEBUG_NET_WARN_ON_ONCE(secpath_exists(skb));
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+}
+
 /* tcp_timer.c */
 void tcp_init_xmit_timers(struct sock *);
 static inline void tcp_clear_xmit_timers(struct sock *sk)
-- 
cgit v1.2.3


From 14ad6ed30a10afbe91b0749d6378285f4225d482 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 18 Feb 2025 19:29:39 +0100
Subject: net: allow small head cache usage with large MAX_SKB_FRAGS values

Sabrina reported the following splat:

    WARNING: CPU: 0 PID: 1 at net/core/dev.c:6935 netif_napi_add_weight_locked+0x8f2/0xba0
    Modules linked in:
    CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.14.0-rc1-net-00092-g011b03359038 #996
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
    RIP: 0010:netif_napi_add_weight_locked+0x8f2/0xba0
    Code: e8 c3 e6 6a fe 48 83 c4 28 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc c7 44 24 10 ff ff ff ff e9 8f fb ff ff e8 9e e6 6a fe <0f> 0b e9 d3 fe ff ff e8 92 e6 6a fe 48 8b 04 24 be ff ff ff ff 48
    RSP: 0000:ffffc9000001fc60 EFLAGS: 00010293
    RAX: 0000000000000000 RBX: ffff88806ce48128 RCX: 1ffff11001664b9e
    RDX: ffff888008f00040 RSI: ffffffff8317ca42 RDI: ffff88800b325cb6
    RBP: ffff88800b325c40 R08: 0000000000000001 R09: ffffed100167502c
    R10: ffff88800b3a8163 R11: 0000000000000000 R12: ffff88800ac1c168
    R13: ffff88800ac1c168 R14: ffff88800ac1c168 R15: 0000000000000007
    FS:  0000000000000000(0000) GS:ffff88806ce00000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: ffff888008201000 CR3: 0000000004c94001 CR4: 0000000000370ef0
    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
    DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
    Call Trace:
    <TASK>
    gro_cells_init+0x1ba/0x270
    xfrm_input_init+0x4b/0x2a0
    xfrm_init+0x38/0x50
    ip_rt_init+0x2d7/0x350
    ip_init+0xf/0x20
    inet_init+0x406/0x590
    do_one_initcall+0x9d/0x2e0
    do_initcalls+0x23b/0x280
    kernel_init_freeable+0x445/0x490
    kernel_init+0x20/0x1d0
    ret_from_fork+0x46/0x80
    ret_from_fork_asm+0x1a/0x30
    </TASK>
    irq event stamp: 584330
    hardirqs last  enabled at (584338): [<ffffffff8168bf87>] __up_console_sem+0x77/0xb0
    hardirqs last disabled at (584345): [<ffffffff8168bf6c>] __up_console_sem+0x5c/0xb0
    softirqs last  enabled at (583242): [<ffffffff833ee96d>] netlink_insert+0x14d/0x470
    softirqs last disabled at (583754): [<ffffffff8317c8cd>] netif_napi_add_weight_locked+0x77d/0xba0

on kernel built with MAX_SKB_FRAGS=45, where SKB_WITH_OVERHEAD(1024)
is smaller than GRO_MAX_HEAD.

Such built additionally contains the revert of the single page frag cache
so that napi_get_frags() ends up using the page frag allocator, triggering
the splat.

Note that the underlying issue is independent from the mentioned
revert; address it ensuring that the small head cache will fit either TCP
and GRO allocation and updating napi_alloc_skb() and __netdev_alloc_skb()
to select kmalloc() usage for any allocation fitting such cache.

Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Suggested-by: Eric Dumazet <edumazet@google.com>
Fixes: 3948b05950fd ("net: introduce a config option to tweak MAX_SKB_FRAGS")
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/gro.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/gro.h b/include/net/gro.h
index b9b58c1f8d19..7b548f91754b 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -11,6 +11,9 @@
 #include <net/udp.h>
 #include <net/hotdata.h>
 
+/* This should be increased if a protocol with a bigger head is added. */
+#define GRO_MAX_HEAD (MAX_HEADER + 128)
+
 struct napi_gro_cb {
 	union {
 		struct {
-- 
cgit v1.2.3


From 6bc7e4eb0499562ccd291712fd7be0d1a5aad00a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 18 Feb 2025 19:29:40 +0100
Subject: Revert "net: skb: introduce and use a single page frag cache"

After the previous commit is finally safe to revert commit dbae2b062824
("net: skb: introduce and use a single page frag cache"): do it here.

The intended goal of such change was to counter a performance regression
introduced by commit 3226b158e67c ("net: avoid 32 x truesize
under-estimation for tiny skbs").

Unfortunately, the blamed commit introduces another regression for the
virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny
size, so that the whole head frag could fit a 512-byte block.

The single page frag cache uses a 1K fragment for such allocation, and
the additional overhead, under small UDP packets flood, makes the page
allocator a bottleneck.

Thanks to commit bf9f1baa279f ("net: add dedicated kmem_cache for
typical/small skb->head"), this revert does not re-introduce the
original regression. Actually, in the relevant test on top of this
revert, I measure a small but noticeable positive delta, just above
noise level.

The revert itself required some additional mangling due to recent updates
in the affected code.

Suggested-by: Eric Dumazet <edumazet@google.com>
Fixes: dbae2b062824 ("net: skb: introduce and use a single page frag cache")
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 94b7d4eca003..ab550a89b9bf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4117,7 +4117,6 @@ void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
-void napi_get_frags_check(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 
 static inline void napi_free_frags(struct napi_struct *napi)
-- 
cgit v1.2.3


From b4c173dfbb6c78568578ff18f9e8822d7bd0e31b Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Thu, 20 Feb 2025 11:02:58 +0100
Subject: fuse: don't truncate cached, mutated symlink

Fuse allows the value of a symlink to change and this property is exploited
by some filesystems (e.g. CVMFS).

It has been observed, that sometimes after changing the symlink contents,
the value is truncated to the old size.

This is caused by fuse_getattr() racing with fuse_reverse_inval_inode().
fuse_reverse_inval_inode() updates the fuse_inode's attr_version, which
results in fuse_change_attributes() exiting before updating the cached
attributes

This is okay, as the cached attributes remain invalid and the next call to
fuse_change_attributes() will likely update the inode with the correct
values.

The reason this causes problems is that cached symlinks will be
returned through page_get_link(), which truncates the symlink to
inode->i_size.  This is correct for filesystems that don't mutate
symlinks, but in this case it causes bad behavior.

The solution is to just remove this truncation.  This can cause a
regression in a filesystem that relies on supplying a symlink larger than
the file size, but this is unlikely.  If that happens we'd need to make
this behavior conditional.

Reported-by: Laura Promberger <laura.promberger@cern.ch>
Tested-by: Sam Lewis <samclewis@google.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20250220100258.793363-1-mszeredi@redhat.com
Reviewed-by: Bernd Schubert <bschubert@ddn.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2c3b2f8a621f..9346adf28f7b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3452,6 +3452,8 @@ extern const struct file_operations generic_ro_fops;
 
 extern int readlink_copy(char __user *, int, const char *, int);
 extern int page_readlink(struct dentry *, char __user *, int);
+extern const char *page_get_link_raw(struct dentry *, struct inode *,
+				     struct delayed_call *);
 extern const char *page_get_link(struct dentry *, struct inode *,
 				 struct delayed_call *);
 extern void page_put_link(void *);
-- 
cgit v1.2.3


From 8510edf191d2df0822ea22d6226e4eef87562271 Mon Sep 17 00:00:00 2001
From: Jingbo Xu <jefflexu@linux.alibaba.com>
Date: Tue, 18 Feb 2025 20:02:08 +0800
Subject: mm/filemap: fix miscalculated file range for
 filemap_fdatawrite_range_kick()

iocb->ki_pos has been updated with the number of written bytes since
generic_perform_write().

Besides __filemap_fdatawrite_range() accepts the inclusive end of the
data range.

Fixes: 1d4457576570 ("mm: call filemap_fdatawrite_range_kick() after IOCB_DONTCACHE issue")
Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Link: https://lore.kernel.org/r/20250218120209.88093-2-jefflexu@linux.alibaba.com
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9346adf28f7b..2788df98080f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2975,8 +2975,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
 	} else if (iocb->ki_flags & IOCB_DONTCACHE) {
 		struct address_space *mapping = iocb->ki_filp->f_mapping;
 
-		filemap_fdatawrite_range_kick(mapping, iocb->ki_pos,
-					      iocb->ki_pos + count);
+		filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count,
+					      iocb->ki_pos - 1);
 	}
 
 	return count;
-- 
cgit v1.2.3


From 1f0fc3374f3345ff1d150c5c56ac5016e5d3826a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 18 Feb 2025 19:22:48 +0000
Subject: afs: Give an afs_server object a ref on the afs_cell object it points
 to

Give an afs_server object a ref on the afs_cell object it points to so that
the cell doesn't get deleted before the server record.

Whilst this is circular (cell -> vol -> server_list -> server -> cell), the
ref only pins the memory, not the lifetime as that's controlled by the
activity counter.  When the volume's activity counter reaches 0, it
detaches from the cell and discards its server list; when a cell's activity
counter reaches 0, it discards its root volume.  At that point, the
circularity is cut.

Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Simon Horman <horms@kernel.org>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20250218192250.296870-6-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/afs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h
index b0db89058c91..958a2460330c 100644
--- a/include/trace/events/afs.h
+++ b/include/trace/events/afs.h
@@ -174,6 +174,7 @@ enum yfs_cm_operation {
 	EM(afs_cell_trace_get_queue_dns,	"GET q-dns ") \
 	EM(afs_cell_trace_get_queue_manage,	"GET q-mng ") \
 	EM(afs_cell_trace_get_queue_new,	"GET q-new ") \
+	EM(afs_cell_trace_get_server,		"GET server") \
 	EM(afs_cell_trace_get_vol,		"GET vol   ") \
 	EM(afs_cell_trace_insert,		"INSERT    ") \
 	EM(afs_cell_trace_manage,		"MANAGE    ") \
@@ -182,6 +183,7 @@ enum yfs_cm_operation {
 	EM(afs_cell_trace_put_destroy,		"PUT destry") \
 	EM(afs_cell_trace_put_queue_work,	"PUT q-work") \
 	EM(afs_cell_trace_put_queue_fail,	"PUT q-fail") \
+	EM(afs_cell_trace_put_server,		"PUT server") \
 	EM(afs_cell_trace_put_vol,		"PUT vol   ") \
 	EM(afs_cell_trace_see_source,		"SEE source") \
 	EM(afs_cell_trace_see_ws,		"SEE ws    ") \
-- 
cgit v1.2.3


From 5c70eb5c593d64d93b178905da215a9fd288a4b5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 20 Feb 2025 13:18:54 +0000
Subject: net: better track kernel sockets lifetime

While kernel sockets are dismantled during pernet_operations->exit(),
their freeing can be delayed by any tx packets still held in qdisc
or device queues, due to skb_set_owner_w() prior calls.

This then trigger the following warning from ref_tracker_dir_exit() [1]

To fix this, make sure that kernel sockets own a reference on net->passive.

Add sk_net_refcnt_upgrade() helper, used whenever a kernel socket
is converted to a refcounted one.

[1]

[  136.263918][   T35] ref_tracker: net notrefcnt@ffff8880638f01e0 has 1/2 users at
[  136.263918][   T35]      sk_alloc+0x2b3/0x370
[  136.263918][   T35]      inet6_create+0x6ce/0x10f0
[  136.263918][   T35]      __sock_create+0x4c0/0xa30
[  136.263918][   T35]      inet_ctl_sock_create+0xc2/0x250
[  136.263918][   T35]      igmp6_net_init+0x39/0x390
[  136.263918][   T35]      ops_init+0x31e/0x590
[  136.263918][   T35]      setup_net+0x287/0x9e0
[  136.263918][   T35]      copy_net_ns+0x33f/0x570
[  136.263918][   T35]      create_new_namespaces+0x425/0x7b0
[  136.263918][   T35]      unshare_nsproxy_namespaces+0x124/0x180
[  136.263918][   T35]      ksys_unshare+0x57d/0xa70
[  136.263918][   T35]      __x64_sys_unshare+0x38/0x40
[  136.263918][   T35]      do_syscall_64+0xf3/0x230
[  136.263918][   T35]      entry_SYSCALL_64_after_hwframe+0x77/0x7f
[  136.263918][   T35]
[  136.343488][   T35] ref_tracker: net notrefcnt@ffff8880638f01e0 has 1/2 users at
[  136.343488][   T35]      sk_alloc+0x2b3/0x370
[  136.343488][   T35]      inet6_create+0x6ce/0x10f0
[  136.343488][   T35]      __sock_create+0x4c0/0xa30
[  136.343488][   T35]      inet_ctl_sock_create+0xc2/0x250
[  136.343488][   T35]      ndisc_net_init+0xa7/0x2b0
[  136.343488][   T35]      ops_init+0x31e/0x590
[  136.343488][   T35]      setup_net+0x287/0x9e0
[  136.343488][   T35]      copy_net_ns+0x33f/0x570
[  136.343488][   T35]      create_new_namespaces+0x425/0x7b0
[  136.343488][   T35]      unshare_nsproxy_namespaces+0x124/0x180
[  136.343488][   T35]      ksys_unshare+0x57d/0xa70
[  136.343488][   T35]      __x64_sys_unshare+0x38/0x40
[  136.343488][   T35]      do_syscall_64+0xf3/0x230
[  136.343488][   T35]      entry_SYSCALL_64_after_hwframe+0x77/0x7f

Fixes: 0cafd77dcd03 ("net: add a refcount tracker for kernel sockets")
Reported-by: syzbot+30a19e01a97420719891@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/67b72aeb.050a0220.14d86d.0283.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250220131854.4048077-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 8036b3b79cd8..7ef728324e4e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1751,6 +1751,7 @@ static inline bool sock_allow_reclassification(const struct sock *csk)
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		      struct proto *prot, int kern);
 void sk_free(struct sock *sk);
+void sk_net_refcnt_upgrade(struct sock *sk);
 void sk_destruct(struct sock *sk);
 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
 void sk_free_unlock_clone(struct sock *sk);
-- 
cgit v1.2.3


From 769c1b79295c38d60fde4c0a8f5f31e01360c54f Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 25 Feb 2025 13:18:43 +0000
Subject: ASoC: cs35l56: Prevent races when soft-resetting using SPI control

When SPI is used for control, the driver must hold the SPI bus lock
while issuing the sequence of writes to perform a soft reset.

>From the time the driver writes the SYSTEM_RESET command until the
driver does a write to terminate the reset, there must not be any
activity on the SPI bus lines. If there is any SPI activity during the
soft-reset, another soft-reset will be triggered. The state of the SPI
chip select is irrelevant.

A repeated soft-reset does not in itself cause any problems, and it is
not an infinite loop. The problem is a race between these resets and
the driver polling for boot completion. There is a time window between
soft resets where the driver could read HALO_STATE as 2 (fully booted)
while the chip is actually soft-resetting. Although this window is
small, it is long enough that it is possible to hit it in normal
operation.

To prevent this race and ensure the chip really is fully booted, the
driver calls spi_bus_lock() to prevent other activity while resetting.
It then issues the SYSTEM_RESET mailbox command. After allowing
sufficient time for reset to take effect, the driver issues a PING
mailbox command, which will force completion of the full soft-reset
sequence. The SPI bus lock can then be released. The mailbox is
checked for any boot or wakeup response from the firmware, before the
value in HALO_STATE will be trusted.

This does not affect SoundWire or I2C control.

Fixes: 8a731fd37f8b ("ASoC: cs35l56: Move utility functions to shared file")
Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20250225131843.113752-3-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index 3dc7a1551ac3..5d653a3491d0 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -12,6 +12,7 @@
 #include <linux/firmware/cirrus/cs_dsp.h>
 #include <linux/regulator/consumer.h>
 #include <linux/regmap.h>
+#include <linux/spi/spi.h>
 #include <sound/cs-amp-lib.h>
 
 #define CS35L56_DEVID					0x0000000
@@ -61,6 +62,7 @@
 #define CS35L56_IRQ1_MASK_8				0x000E0AC
 #define CS35L56_IRQ1_MASK_18				0x000E0D4
 #define CS35L56_IRQ1_MASK_20				0x000E0DC
+#define CS35L56_DSP_MBOX_1_RAW				0x0011000
 #define CS35L56_DSP_VIRTUAL1_MBOX_1			0x0011020
 #define CS35L56_DSP_VIRTUAL1_MBOX_2			0x0011024
 #define CS35L56_DSP_VIRTUAL1_MBOX_3			0x0011028
@@ -224,6 +226,7 @@
 #define CS35L56_HALO_STATE_SHUTDOWN			1
 #define CS35L56_HALO_STATE_BOOT_DONE			2
 
+#define CS35L56_MBOX_CMD_PING				0x0A000000
 #define CS35L56_MBOX_CMD_AUDIO_PLAY			0x0B000001
 #define CS35L56_MBOX_CMD_AUDIO_PAUSE			0x0B000002
 #define CS35L56_MBOX_CMD_AUDIO_REINIT			0x0B000003
@@ -254,6 +257,16 @@
 #define CS35L56_NUM_BULK_SUPPLIES			3
 #define CS35L56_NUM_DSP_REGIONS				5
 
+/* Additional margin for SYSTEM_RESET to control port ready on SPI */
+#define CS35L56_SPI_RESET_TO_PORT_READY_US (CS35L56_CONTROL_PORT_READY_US + 2500)
+
+struct cs35l56_spi_payload {
+	__be32	addr;
+	__be16	pad;
+	__be32	value;
+} __packed;
+static_assert(sizeof(struct cs35l56_spi_payload) == 10);
+
 struct cs35l56_base {
 	struct device *dev;
 	struct regmap *regmap;
@@ -269,6 +282,7 @@ struct cs35l56_base {
 	s8 cal_index;
 	struct cirrus_amp_cal_data cal_data;
 	struct gpio_desc *reset_gpio;
+	struct cs35l56_spi_payload *spi_payload_buf;
 };
 
 static inline bool cs35l56_is_otp_register(unsigned int reg)
@@ -276,6 +290,23 @@ static inline bool cs35l56_is_otp_register(unsigned int reg)
 	return (reg >> 16) == 3;
 }
 
+static inline int cs35l56_init_config_for_spi(struct cs35l56_base *cs35l56,
+					      struct spi_device *spi)
+{
+	cs35l56->spi_payload_buf = devm_kzalloc(&spi->dev,
+						sizeof(*cs35l56->spi_payload_buf),
+						GFP_KERNEL | GFP_DMA);
+	if (!cs35l56->spi_payload_buf)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static inline bool cs35l56_is_spi(struct cs35l56_base *cs35l56)
+{
+	return IS_ENABLED(CONFIG_SPI_MASTER) && !!cs35l56->spi_payload_buf;
+}
+
 extern const struct regmap_config cs35l56_regmap_i2c;
 extern const struct regmap_config cs35l56_regmap_spi;
 extern const struct regmap_config cs35l56_regmap_sdw;
-- 
cgit v1.2.3


From 889c57066ceee5e9172232da0608a8ac053bb6e5 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 25 Feb 2025 10:21:41 +0800
Subject: block: make segment size limit workable for > 4K PAGE_SIZE

Using PAGE_SIZE as a minimum expected DMA segment size in consideration
of devices which have a max DMA segment size of < 64k when used on 64k
PAGE_SIZE systems leads to devices not being able to probe such as
eMMC and Exynos UFS controller [0] [1] you can end up with a probe failure
as follows:

WARNING: CPU: 2 PID: 397 at block/blk-settings.c:339 blk_validate_limits+0x364/0x3c0

Ensure we use min(max_seg_size, seg_boundary_mask + 1) as the new min segment
size when max segment size is < PAGE_SIZE for 16k and 64k base page size systems.

If anyone need to backport this patch, the following commits are depended:

	commit 6aeb4f836480 ("block: remove bio_add_pc_page")
	commit 02ee5d69e3ba ("block: remove blk_rq_bio_prep")
	commit b7175e24d6ac ("block: add a dma mapping iterator")

Link: https://lore.kernel.org/linux-block/20230612203314.17820-1-bvanassche@acm.org/ # [0]
Link: https://lore.kernel.org/linux-block/1d55e942-5150-de4c-3a02-c3d066f87028@acm.org/ # [1]
Cc: Yi Zhang <yi.zhang@redhat.com>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Keith Busch <kbusch@kernel.org>
Tested-by: Paul Bunyan <pbunyan@redhat.com>
Reviewed-by: Daniel Gomez <da.gomez@kernel.org>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250225022141.2154581-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 248416ecd01c..58ff5aca83b6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -367,6 +367,7 @@ struct queue_limits {
 	unsigned int		max_sectors;
 	unsigned int		max_user_sectors;
 	unsigned int		max_segment_size;
+	unsigned int		min_segment_size;
 	unsigned int		physical_block_size;
 	unsigned int		logical_block_size;
 	unsigned int		alignment_offset;
-- 
cgit v1.2.3


From 68f3ea7ee199ef77551e090dfef5a49046ea8443 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 21 Feb 2025 14:57:06 +0100
Subject: vmlinux.lds: Ensure that const vars with relocations are mapped R/O

In the kernel, there are architectures (x86, arm64) that perform
boot-time relocation (for KASLR) without relying on PIE codegen. In this
case, all const global objects are emitted into .rodata, including const
objects with fields that will be fixed up by the boot-time relocation
code.  This implies that .rodata (and .text in some cases) need to be
writable at boot, but they will usually be mapped read-only as soon as
the boot completes.

When using PIE codegen, the compiler will emit const global objects into
.data.rel.ro rather than .rodata if the object contains fields that need
such fixups at boot-time. This permits the linker to annotate such
regions as requiring read-write access only at load time, but not at
execution time (in user space), while keeping .rodata truly const (in
user space, this is important for reducing the CoW footprint of dynamic
executables).

This distinction does not matter for the kernel, but it does imply that
const data will end up in writable memory if the .data.rel.ro sections
are not treated in a special way, as they will end up in the writable
.data segment by default.

So emit .data.rel.ro into the .rodata segment.

Cc: stable@vger.kernel.org
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250221135704.431269-5-ardb+git@google.com
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/asm-generic/vmlinux.lds.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 54504013c749..337d3336e175 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -457,7 +457,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
 	. = ALIGN((align));						\
 	.rodata           : AT(ADDR(.rodata) - LOAD_OFFSET) {		\
 		__start_rodata = .;					\
-		*(.rodata) *(.rodata.*)					\
+		*(.rodata) *(.rodata.*) *(.data.rel.ro*)		\
 		SCHED_DATA						\
 		RO_AFTER_INIT_DATA	/* Read only after init */	\
 		. = ALIGN(8);						\
-- 
cgit v1.2.3


From 73cfc53cc3b6380eccf013049574485f64cb83ca Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 21 Feb 2025 14:57:07 +0100
Subject: objtool: Fix C jump table annotations for Clang

A C jump table (such as the one used by the BPF interpreter) is a const
global array of absolute code addresses, and this means that the actual
values in the table may not be known until the kernel is booted (e.g.,
when using KASLR or when the kernel VA space is sized dynamically).

When using PIE codegen, the compiler will default to placing such const
global objects in .data.rel.ro (which is annotated as writable), rather
than .rodata (which is annotated as read-only). As C jump tables are
explicitly emitted into .rodata, this used to result in warnings for
LoongArch builds (which uses PIE codegen for the entire kernel) like

  Warning: setting incorrect section attributes for .rodata..c_jump_table

due to the fact that the explicitly specified .rodata section inherited
the read-write annotation that the compiler uses for such objects when
using PIE codegen.

This warning was suppressed by explicitly adding the read-only
annotation to the __attribute__((section(""))) string, by commit

  c5b1184decc8 ("compiler.h: specify correct attribute for .rodata..c_jump_table")

Unfortunately, this hack does not work on Clang's integrated assembler,
which happily interprets the appended section type and permission
specifiers as part of the section name, which therefore no longer
matches the hard-coded pattern '.rodata..c_jump_table' that objtool
expects, causing it to emit a warning

  kernel/bpf/core.o: warning: objtool: ___bpf_prog_run+0x20: sibling call from callable instruction with modified stack frame

Work around this, by emitting C jump tables into .data.rel.ro instead,
which is treated as .rodata by the linker script for all builds, not
just PIE based ones.

Fixes: c5b1184decc8 ("compiler.h: specify correct attribute for .rodata..c_jump_table")
Tested-by: Tiezhu Yang <yangtiezhu@loongson.cn> # on LoongArch
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250221135704.431269-6-ardb+git@google.com
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b087de2f3e94..0c25f3e429bb 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -110,7 +110,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 /* Unreachable code */
 #ifdef CONFIG_OBJTOOL
 /* Annotate a C jump table to allow objtool to follow the code flow */
-#define __annotate_jump_table __section(".rodata..c_jump_table,\"a\",@progbits #")
+#define __annotate_jump_table __section(".data.rel.ro.c_jump_table")
 #else /* !CONFIG_OBJTOOL */
 #define __annotate_jump_table
 #endif /* CONFIG_OBJTOOL */
-- 
cgit v1.2.3


From 9084ed79ddaaaa1ec01cd304af9fb532c26252db Mon Sep 17 00:00:00 2001
From: Stephen Smalley <stephen.smalley.work@gmail.com>
Date: Thu, 20 Feb 2025 14:29:36 -0500
Subject: lsm,nfs: fix memory leak of lsm_context

commit b530104f50e8 ("lsm: lsm_context in security_dentry_init_security")
did not preserve the lsm id for subsequent release calls, which results
in a memory leak. Fix it by saving the lsm id in the nfs4_label and
providing it on the subsequent release call.

Fixes: b530104f50e8 ("lsm: lsm_context in security_dentry_init_security")
Signed-off-by: Stephen Smalley <stephen.smalley.work@gmail.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/nfs4.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 71fbebfa43c7..9ac83ca88326 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -47,6 +47,7 @@ struct nfs4_acl {
 struct nfs4_label {
 	uint32_t	lfs;
 	uint32_t	pi;
+	u32		lsmid;
 	u32		len;
 	char	*label;
 };
-- 
cgit v1.2.3


From 18912c520674ec4d920fe3826e7e4fefeecdf5ae Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Mon, 24 Feb 2025 09:44:01 -0800
Subject: tcp: devmem: don't write truncated dmabuf CMSGs to userspace

Currently, we report -ETOOSMALL (err) only on the first iteration
(!sent). When we get put_cmsg error after a bunch of successful
put_cmsg calls, we don't signal the error at all. This might be
confusing on the userspace side which will see truncated CMSGs
but no MSG_CTRUNC signal.

Consider the following case:
- sizeof(struct cmsghdr) = 16
- sizeof(struct dmabuf_cmsg) = 24
- total cmsg size (CMSG_LEN) = 40 (16+24)

When calling recvmsg with msg_controllen=60, the userspace
will receive two(!) dmabuf_cmsg(s), the first one will
be a valid one and the second one will be silently truncated. There is no
easy way to discover the truncation besides doing something like
"cm->cmsg_len != CMSG_LEN(sizeof(dmabuf_cmsg))".

Introduce new put_devmem_cmsg wrapper that reports an error instead
of doing the truncation. Mina suggests that it's the intended way
this API should work.

Note that we might now report MSG_CTRUNC when the users (incorrectly)
call us with msg_control == NULL.

Fixes: 8f0b3cc9a4c1 ("tcp: RX path for devmem TCP")
Reviewed-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250224174401.3582695-1-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/socket.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index d18cc47e89bd..c3322eb3d686 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -392,6 +392,8 @@ struct ucred {
 
 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
+extern int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len,
+			    void *data);
 
 struct timespec64;
 struct __kernel_timespec;
-- 
cgit v1.2.3


From a6aa36e957a1bfb5341986dec32d013d23228fe1 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Fri, 14 Feb 2025 13:14:34 +0900
Subject: block: Remove zone write plugs when handling native zone append
 writes

For devices that natively support zone append operations,
REQ_OP_ZONE_APPEND BIOs are not processed through zone write plugging
and are immediately issued to the zoned device. This means that there is
no write pointer offset tracking done for these operations and that a
zone write plug is not necessary.

However, when receiving a zone append BIO, we may already have a zone
write plug for the target zone if that zone was previously partially
written using regular write operations. In such case, since the write
pointer offset of the zone write plug is not incremented by the amount
of sectors appended to the zone, 2 issues arise:
1) we risk leaving the plug in the disk hash table if the zone is fully
   written using zone append or regular write operations, because the
   write pointer offset will never reach the "zone full" state.
2) Regular write operations that are issued after zone append operations
   will always be failed by blk_zone_wplug_prepare_bio() as the write
   pointer alignment check will fail, even if the user correctly
   accounted for the zone append operations and issued the regular
   writes with a correct sector.

Avoid these issues by immediately removing the zone write plug of zones
that are the target of zone append operations when blk_zone_plug_bio()
is called. The new function blk_zone_wplug_handle_native_zone_append()
implements this for devices that natively support zone append. The
removal of the zone write plug using disk_remove_zone_wplug() requires
aborting all plugged regular write using disk_zone_wplug_abort() as
otherwise the plugged write BIOs would never be executed (with the plug
removed, the completion path will never see again the zone write plug as
disk_get_zone_wplug() will return NULL). Rate-limited warnings are added
to blk_zone_wplug_handle_native_zone_append() and to
disk_zone_wplug_abort() to signal this.

Since blk_zone_wplug_handle_native_zone_append() is called in the hot
path for operations that will not be plugged, disk_get_zone_wplug() is
optimized under the assumption that a user issuing zone append
operations is not at the same time issuing regular writes and that there
are no hashed zone write plugs. The struct gendisk atomic counter
nr_zone_wplugs is added to check this, with this counter incremented in
disk_insert_zone_wplug() and decremented in disk_remove_zone_wplug().

To be consistent with this fix, we do not need to fill the zone write
plug hash table with zone write plugs for zones that are partially
written for a device that supports native zone append operations.
So modify blk_revalidate_seq_zone() to return early to avoid allocating
and inserting a zone write plug for partially written sequential zones
if the device natively supports zone append.

Reported-by: Jorgen Hansen <Jorgen.Hansen@wdc.com>
Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Tested-by: Jorgen Hansen <Jorgen.Hansen@wdc.com>
Link: https://lore.kernel.org/r/20250214041434.82564-1-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 58ff5aca83b6..d37751789bf5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -196,10 +196,11 @@ struct gendisk {
 	unsigned int		zone_capacity;
 	unsigned int		last_zone_capacity;
 	unsigned long __rcu	*conv_zones_bitmap;
-	unsigned int            zone_wplugs_hash_bits;
-	spinlock_t              zone_wplugs_lock;
+	unsigned int		zone_wplugs_hash_bits;
+	atomic_t		nr_zone_wplugs;
+	spinlock_t		zone_wplugs_lock;
 	struct mempool_s	*zone_wplugs_pool;
-	struct hlist_head       *zone_wplugs_hash;
+	struct hlist_head	*zone_wplugs_hash;
 	struct workqueue_struct *zone_wplugs_wq;
 #endif /* CONFIG_BLK_DEV_ZONED */
 
-- 
cgit v1.2.3


From 02410ac72ac3707936c07ede66e94360d0d65319 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Wed, 26 Feb 2025 12:06:51 +0000
Subject: mm: hugetlb: Add huge page size param to huge_ptep_get_and_clear()

In order to fix a bug, arm64 needs to be told the size of the huge page
for which the huge_pte is being cleared in huge_ptep_get_and_clear().
Provide for this by adding an `unsigned long sz` parameter to the
function. This follows the same pattern as huge_pte_clear() and
set_huge_pte_at().

This commit makes the required interface modifications to the core mm as
well as all arches that implement this function (arm64, loongarch, mips,
parisc, powerpc, riscv, s390, sparc). The actual arm64 bug will be fixed
in a separate commit.

Cc: stable@vger.kernel.org
Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit")
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com> # riscv
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: Alexander Gordeev <agordeev@linux.ibm.com> # s390
Link: https://lore.kernel.org/r/20250226120656.2400136-2-ryan.roberts@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/asm-generic/hugetlb.h | 2 +-
 include/linux/hugetlb.h       | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index f42133dae68e..2afc95bf1655 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -90,7 +90,7 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 
 #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
-		unsigned long addr, pte_t *ptep)
+		unsigned long addr, pte_t *ptep, unsigned long sz)
 {
 	return ptep_get_and_clear(mm, addr, ptep);
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ec8c0ccc8f95..bf5f7256bd28 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1004,7 +1004,9 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
 static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
 						unsigned long addr, pte_t *ptep)
 {
-	return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
+	unsigned long psize = huge_page_size(hstate_vma(vma));
+
+	return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize);
 }
 #endif
 
-- 
cgit v1.2.3


From 916b7f42b3b3b539a71c204a9b49fdc4ca92cd82 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Thu, 27 Feb 2025 15:06:31 -0800
Subject: kvm: retry nx_huge_page_recovery_thread creation

A VMM may send a non-fatal signal to its threads, including vCPU tasks,
at any time, and thus may signal vCPU tasks during KVM_RUN.  If a vCPU
task receives the signal while its trying to spawn the huge page recovery
vhost task, then KVM_RUN will fail due to copy_process() returning
-ERESTARTNOINTR.

Rework call_once() to mark the call complete if and only if the called
function succeeds, and plumb the function's true error code back to the
call_once() invoker.  This provides userspace with the correct, non-fatal
error code so that the VMM doesn't terminate the VM on -ENOMEM, and allows
subsequent KVM_RUN a succeed by virtue of retrying creation of the NX huge
page task.

Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
[implemented the kvm user side]
Signed-off-by: Keith Busch <kbusch@kernel.org>
Message-ID: <20250227230631.303431-3-kbusch@meta.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/call_once.h | 47 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/call_once.h b/include/linux/call_once.h
index 6261aa0b3fb0..13cd6469e7e5 100644
--- a/include/linux/call_once.h
+++ b/include/linux/call_once.h
@@ -26,20 +26,41 @@ do {									\
 	__once_init((once), #once, &__key);				\
 } while (0)
 
-static inline void call_once(struct once *once, void (*cb)(struct once *))
+/*
+ * call_once - Ensure a function has been called exactly once
+ *
+ * @once: Tracking struct
+ * @cb: Function to be called
+ *
+ * If @once has never completed successfully before, call @cb and, if
+ * it returns a zero or positive value, mark @once as completed.  Return
+ * the value returned by @cb
+ *
+ * If @once has completed succesfully before, return 0.
+ *
+ * The call to @cb is implicitly surrounded by a mutex, though for
+ * efficiency the * function avoids taking it after the first call.
+ */
+static inline int call_once(struct once *once, int (*cb)(struct once *))
 {
-        /* Pairs with atomic_set_release() below.  */
-        if (atomic_read_acquire(&once->state) == ONCE_COMPLETED)
-                return;
-
-        guard(mutex)(&once->lock);
-        WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
-        if (atomic_read(&once->state) != ONCE_NOT_STARTED)
-                return;
-
-        atomic_set(&once->state, ONCE_RUNNING);
-        cb(once);
-        atomic_set_release(&once->state, ONCE_COMPLETED);
+	int r, state;
+
+	/* Pairs with atomic_set_release() below.  */
+	if (atomic_read_acquire(&once->state) == ONCE_COMPLETED)
+		return 0;
+
+	guard(mutex)(&once->lock);
+	state = atomic_read(&once->state);
+	if (unlikely(state != ONCE_NOT_STARTED))
+		return WARN_ON_ONCE(state != ONCE_COMPLETED) ? -EINVAL : 0;
+
+	atomic_set(&once->state, ONCE_RUNNING);
+	r = cb(once);
+	if (r < 0)
+		atomic_set(&once->state, ONCE_NOT_STARTED);
+	else
+		atomic_set_release(&once->state, ONCE_COMPLETED);
+	return r;
 }
 
 #endif /* _LINUX_CALL_ONCE_H */
-- 
cgit v1.2.3


From e04918dc594669068f5d59d567d08db531167188 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 2 Mar 2025 15:18:24 +0800
Subject: cred: Fix RCU warnings in override/revert_creds

Fix RCU warnings in override_creds and revert_creds by turning
the RCU pointer into a normal pointer using rcu_replace_pointer.

These warnings were previously private to the cred code, but due
to the move into the header file they are now polluting unrelated
subsystems.

Fixes: 49dffdfde462 ("cred: Add a light version of override/revert_creds()")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Link: https://lore.kernel.org/r/Z8QGQGW0IaSklKG7@gondor.apana.org.au
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/cred.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 0c3c4b16b469..5658a3bfe803 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -172,18 +172,12 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
 
 static inline const struct cred *override_creds(const struct cred *override_cred)
 {
-	const struct cred *old = current->cred;
-
-	rcu_assign_pointer(current->cred, override_cred);
-	return old;
+	return rcu_replace_pointer(current->cred, override_cred, 1);
 }
 
 static inline const struct cred *revert_creds(const struct cred *revert_cred)
 {
-	const struct cred *override_cred = current->cred;
-
-	rcu_assign_pointer(current->cred, revert_cred);
-	return override_cred;
+	return rcu_replace_pointer(current->cred, revert_cred, 1);
 }
 
 /**
-- 
cgit v1.2.3


From 3d252160b818045f3a152b13756f6f37ca34639d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 4 Mar 2025 13:51:38 +0000
Subject: fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex

pipe_readable(), pipe_writable(), and pipe_poll() can read "pipe->head"
and "pipe->tail" outside of "pipe->mutex" critical section. When the
head and the tail are read individually in that order, there is a window
for interruption between the two reads in which both the head and the
tail can be updated by concurrent readers and writers.

One of the problematic scenarios observed with hackbench running
multiple groups on a large server on a particular pipe inode is as
follows:

    pipe->head = 36
    pipe->tail = 36

    hackbench-118762  [057] .....  1029.550548: pipe_write: *wakes up: pipe not full*
    hackbench-118762  [057] .....  1029.550548: pipe_write: head: 36 -> 37 [tail: 36]
    hackbench-118762  [057] .....  1029.550548: pipe_write: *wake up next reader 118740*
    hackbench-118762  [057] .....  1029.550548: pipe_write: *wake up next writer 118768*

    hackbench-118768  [206] .....  1029.55055X: pipe_write: *writer wakes up*
    hackbench-118768  [206] .....  1029.55055X: pipe_write: head = READ_ONCE(pipe->head) [37]
    ... CPU 206 interrupted (exact wakeup was not traced but 118768 did read head at 37 in traces)

    hackbench-118740  [057] .....  1029.550558: pipe_read:  *reader wakes up: pipe is not empty*
    hackbench-118740  [057] .....  1029.550558: pipe_read:  tail: 36 -> 37 [head = 37]
    hackbench-118740  [057] .....  1029.550559: pipe_read:  *pipe is empty; wakeup writer 118768*
    hackbench-118740  [057] .....  1029.550559: pipe_read:  *sleeps*

    hackbench-118766  [185] .....  1029.550592: pipe_write: *New writer comes in*
    hackbench-118766  [185] .....  1029.550592: pipe_write: head: 37 -> 38 [tail: 37]
    hackbench-118766  [185] .....  1029.550592: pipe_write: *wakes up reader 118766*

    hackbench-118740  [185] .....  1029.550598: pipe_read:  *reader wakes up; pipe not empty*
    hackbench-118740  [185] .....  1029.550599: pipe_read:  tail: 37 -> 38 [head: 38]
    hackbench-118740  [185] .....  1029.550599: pipe_read:  *pipe is empty*
    hackbench-118740  [185] .....  1029.550599: pipe_read:  *reader sleeps; wakeup writer 118768*

    ... CPU 206 switches back to writer
    hackbench-118768  [206] .....  1029.550601: pipe_write: tail = READ_ONCE(pipe->tail) [38]
    hackbench-118768  [206] .....  1029.550601: pipe_write: pipe_full()? (u32)(37 - 38) >= 16? Yes
    hackbench-118768  [206] .....  1029.550601: pipe_write: *writer goes back to sleep*

    [ Tasks 118740 and 118768 can then indefinitely wait on each other. ]

The unsigned arithmetic in pipe_occupancy() wraps around when
"pipe->tail > pipe->head" leading to pipe_full() returning true despite
the pipe being empty.

The case of genuine wraparound of "pipe->head" is handled since pipe
buffer has data allowing readers to make progress until the pipe->tail
wraps too after which the reader will wakeup a sleeping writer, however,
mistaking the pipe to be full when it is in fact empty can lead to
readers and writers waiting on each other indefinitely.

This issue became more problematic and surfaced as a hang in hackbench
after the optimization in commit aaec5a95d596 ("pipe_read: don't wake up
the writer if the pipe is still full") significantly reduced the number
of spurious wakeups of writers that had previously helped mask the
issue.

To avoid missing any updates between the reads of "pipe->head" and
"pipe->write", unionize the two with a single unsigned long
"pipe->head_tail" member that can be loaded atomically.

Using "pipe->head_tail" to read the head and the tail ensures the
lockless checks do not miss any updates to the head or the tail and
since those two are only updated under "pipe->mutex", it ensures that
the head is always ahead of, or equal to the tail resulting in correct
calculations.

  [ prateek: commit log, testing on x86 platforms. ]

Reported-and-debugged-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Closes: https://lore.kernel.org/lkml/e813814e-7094-4673-bc69-731af065a0eb@amd.com/
Reported-by: Alexey Gladkov <legion@kernel.org>
Closes: https://lore.kernel.org/all/Z8Wn0nTvevLRG_4m@example.org/
Fixes: 8cefc107ca54 ("pipe: Use head and tail pointers for the ring, not cursor and length")
Tested-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Tested-by: Alexey Gladkov <legion@kernel.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pipe_fs_i.h | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 8ff23bf5a819..3cc4f8eab853 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -31,6 +31,33 @@ struct pipe_buffer {
 	unsigned long private;
 };
 
+/*
+ * Really only alpha needs 32-bit fields, but
+ * might as well do it for 64-bit architectures
+ * since that's what we've historically done,
+ * and it makes 'head_tail' always be a simple
+ * 'unsigned long'.
+ */
+#ifdef CONFIG_64BIT
+typedef unsigned int pipe_index_t;
+#else
+typedef unsigned short pipe_index_t;
+#endif
+
+/*
+ * We have to declare this outside 'struct pipe_inode_info',
+ * but then we can't use 'union pipe_index' for an anonymous
+ * union, so we end up having to duplicate this declaration
+ * below. Annoying.
+ */
+union pipe_index {
+	unsigned long head_tail;
+	struct {
+		pipe_index_t head;
+		pipe_index_t tail;
+	};
+};
+
 /**
  *	struct pipe_inode_info - a linux kernel pipe
  *	@mutex: mutex protecting the whole thing
@@ -58,8 +85,16 @@ struct pipe_buffer {
 struct pipe_inode_info {
 	struct mutex mutex;
 	wait_queue_head_t rd_wait, wr_wait;
-	unsigned int head;
-	unsigned int tail;
+
+	/* This has to match the 'union pipe_index' above */
+	union {
+		unsigned long head_tail;
+		struct {
+			pipe_index_t head;
+			pipe_index_t tail;
+		};
+	};
+
 	unsigned int max_usage;
 	unsigned int ring_size;
 	unsigned int nr_accounted;
-- 
cgit v1.2.3


From 778b94d7ac17b5800aa857222911f09cc986b509 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Fri, 28 Feb 2025 11:01:53 -0600
Subject: ACPI: platform_profile: Add support for hidden choices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When two drivers don't support all the same profiles the legacy interface
only exports the common profiles.

This causes problems for cases where one driver uses low-power but another
uses quiet because the result is that neither is exported to sysfs.

To allow two drivers to disagree, add support for "hidden choices".
Hidden choices are platform profiles that a driver supports to be
compatible with the platform profile of another driver.

Fixes: 688834743d67 ("ACPI: platform_profile: Allow multiple handlers")
Reported-by: Antheas Kapenekakis <lkml@antheas.dev>
Closes: https://lore.kernel.org/platform-driver-x86/e64b771e-3255-42ad-9257-5b8fc6c24ac9@gmx.de/T/#mc068042dd29df36c16c8af92664860fc4763974b
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Tested-by: Antheas Kapenekakis <lkml@antheas.dev>
Tested-by: Derek J. Clark <derekjohn.clark@gmail.com>
Acked-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://patch.msgid.link/20250228170155.2623386-2-superm1@kernel.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/platform_profile.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h
index 8ab5b0e8eb2c..8c9df7dadd5d 100644
--- a/include/linux/platform_profile.h
+++ b/include/linux/platform_profile.h
@@ -33,6 +33,8 @@ enum platform_profile_option {
  * @probe: Callback to setup choices available to the new class device. These
  *	   choices will only be enforced when setting a new profile, not when
  *	   getting the current one.
+ * @hidden_choices: Callback to setup choices that are not visible to the user
+ *		    but can be set by the driver.
  * @profile_get: Callback that will be called when showing the current platform
  *		 profile in sysfs.
  * @profile_set: Callback that will be called when storing a new platform
@@ -40,6 +42,7 @@ enum platform_profile_option {
  */
 struct platform_profile_ops {
 	int (*probe)(void *drvdata, unsigned long *choices);
+	int (*hidden_choices)(void *drvdata, unsigned long *choices);
 	int (*profile_get)(struct device *dev, enum platform_profile_option *profile);
 	int (*profile_set)(struct device *dev, enum platform_profile_option profile);
 };
-- 
cgit v1.2.3


From c27c66afc449b80f3b4b84d123358c0248f2cf63 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 5 Mar 2025 07:08:09 -1000
Subject: fs/pipe: Fix pipe_occupancy() with 16-bit indexes

The pipe_occupancy() logic implicitly relied on the natural unsigned
modulo arithmetic in C, but that doesn't work for the new 'pipe_index_t'
case, since any arithmetic will be done in 'int' (and here we had also
made it 'unsigned int' due to the function call boundary).

So make the modulo arithmetic explicit by casting the result to the
proper type.

Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Alexey Gladkov <legion@kernel.org>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/all/CAHk-=wjyHsGLx=rxg6PKYBNkPYAejgo7=CbyL3=HGLZLsAaJFQ@mail.gmail.com/
Fixes: 3d252160b818 ("fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex")
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pipe_fs_i.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 3cc4f8eab853..1f013ed7577e 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -192,7 +192,7 @@ static inline bool pipe_empty(unsigned int head, unsigned int tail)
  */
 static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
 {
-	return head - tail;
+	return (pipe_index_t)(head - tail);
 }
 
 /**
-- 
cgit v1.2.3


From cfced12f5100e50d56bc587299393fd33c1169a9 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Wed, 5 Mar 2025 11:23:01 +0000
Subject: include/linux/pipe_fs_i: Add htmldoc annotation for "head_tail"
 member

Add htmldoc annotation for the newly introduced "head_tail" member
describing it to be a union of the pipe_inode_info's @head and @tail
members.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/lkml/20250305204609.5e64768e@canb.auug.org.au/
Fixes: 3d252160b818 ("fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex")
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pipe_fs_i.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 1f013ed7577e..05ccbc5d0129 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -65,6 +65,7 @@ union pipe_index {
  *	@wr_wait: writer wait point in case of full pipe
  *	@head: The point of buffer production
  *	@tail: The point of buffer consumption
+ *	@head_tail: unsigned long union of @head and @tail
  *	@note_loss: The next read() should insert a data-lost message
  *	@max_usage: The maximum number of slots that may be used in the ring
  *	@ring_size: total number of buffers (should be a power of 2)
-- 
cgit v1.2.3


From 0eba2a7e858907a746ba69cd002eb9eb4dbd7bf3 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Fri, 28 Feb 2025 15:14:56 +0000
Subject: ASoC: ops: Consistently treat platform_max as control value

This reverts commit 9bdd10d57a88 ("ASoC: ops: Shift tested values in
snd_soc_put_volsw() by +min"), and makes some additional related
updates.

There are two ways the platform_max could be interpreted; the maximum
register value, or the maximum value the control can be set to. The
patch moved from treating the value as a control value to a register
one. When the patch was applied it was technically correct as
snd_soc_limit_volume() also used the register interpretation. However,
even then most of the other usages treated platform_max as a
control value, and snd_soc_limit_volume() has since been updated to
also do so in commit fb9ad24485087 ("ASoC: ops: add correct range
check for limiting volume"). That patch however, missed updating
snd_soc_put_volsw() back to the control interpretation, and fixing
snd_soc_info_volsw_range(). The control interpretation makes more
sense as limiting is typically done from the machine driver, so it is
appropriate to use the customer facing representation rather than the
internal codec representation. Update all the code to consistently use
this interpretation of platform_max.

Finally, also add some comments to the soc_mixer_control struct to
hopefully avoid further patches switching between the two approaches.

Fixes: fb9ad24485087 ("ASoC: ops: add correct range check for limiting volume")
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20250228151456.3703342-1-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index fcdb5adfcd5e..b3e84bc47c6f 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -1261,7 +1261,10 @@ void snd_soc_close_delayed_work(struct snd_soc_pcm_runtime *rtd);
 
 /* mixer control */
 struct soc_mixer_control {
-	int min, max, platform_max;
+	/* Minimum and maximum specified as written to the hardware */
+	int min, max;
+	/* Limited maximum value specified as presented through the control */
+	int platform_max;
 	int reg, rreg;
 	unsigned int shift, rshift;
 	unsigned int sign_bit;
-- 
cgit v1.2.3


From 0d2d0f3d93ddd6556f23c917d910becd9925ddeb Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 5 Mar 2025 07:35:40 -1000
Subject: fs/pipe: remove buggy and unused 'helper' function

While looking for incorrect users of the pipe head/tail fields (see
commit c27c66afc449: "fs/pipe: Fix pipe_occupancy() with 16-bit
indexes"), I found a bug in pipe_discard_from() that looked entirely
broken.

However, the fix is trivial: this buggy function isn't actually called
by anything, so let's just remove it ASAP.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pipe_fs_i.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 05ccbc5d0129..e572e6fc4f81 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -281,15 +281,6 @@ static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe,
 	return buf->ops->try_steal(pipe, buf);
 }
 
-static inline void pipe_discard_from(struct pipe_inode_info *pipe,
-		unsigned int old_head)
-{
-	unsigned int mask = pipe->ring_size - 1;
-
-	while (pipe->head > old_head)
-		pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]);
-}
-
 /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
    memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
 #define PIPE_SIZE		PAGE_SIZE
-- 
cgit v1.2.3


From 67bab13307c83fb742c2556b06cdc39dbad27f07 Mon Sep 17 00:00:00 2001
From: Ge Yang <yangge1116@126.com>
Date: Wed, 19 Feb 2025 11:46:44 +0800
Subject: mm/hugetlb: wait for hugetlb folios to be freed

Since the introduction of commit c77c0a8ac4c52 ("mm/hugetlb: defer freeing
of huge pages if in non-task context"), which supports deferring the
freeing of hugetlb pages, the allocation of contiguous memory through
cma_alloc() may fail probabilistically.

In the CMA allocation process, if it is found that the CMA area is
occupied by in-use hugetlb folios, these in-use hugetlb folios need to be
migrated to another location.  When there are no available hugetlb folios
in the free hugetlb pool during the migration of in-use hugetlb folios,
new folios are allocated from the buddy system.  A temporary state is set
on the newly allocated folio.  Upon completion of the hugetlb folio
migration, the temporary state is transferred from the new folios to the
old folios.  Normally, when the old folios with the temporary state are
freed, it is directly released back to the buddy system.  However, due to
the deferred freeing of hugetlb pages, the PageBuddy() check fails,
ultimately leading to the failure of cma_alloc().

Here is a simplified call trace illustrating the process:
cma_alloc()
    ->__alloc_contig_migrate_range() // Migrate in-use hugetlb folios
        ->unmap_and_move_huge_page()
            ->folio_putback_hugetlb() // Free old folios
    ->test_pages_isolated()
        ->__test_page_isolated_in_pageblock()
             ->PageBuddy(page) // Check if the page is in buddy

To resolve this issue, we have implemented a function named
wait_for_freed_hugetlb_folios().  This function ensures that the hugetlb
folios are properly released back to the buddy system after their
migration is completed.  By invoking wait_for_freed_hugetlb_folios()
before calling PageBuddy(), we ensure that PageBuddy() will succeed.

Link: https://lkml.kernel.org/r/1739936804-18199-1-git-send-email-yangge1116@126.com
Fixes: c77c0a8ac4c5 ("mm/hugetlb: defer freeing of huge pages if in non-task context")
Signed-off-by: Ge Yang <yangge1116@126.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ec8c0ccc8f95..dbe76d4f1bfc 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -682,6 +682,7 @@ struct huge_bootmem_page {
 
 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
 int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
+void wait_for_freed_hugetlb_folios(void);
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				unsigned long addr, bool cow_from_owner);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
@@ -1066,6 +1067,10 @@ static inline int replace_free_hugepage_folios(unsigned long start_pfn,
 	return 0;
 }
 
+static inline void wait_for_freed_hugetlb_folios(void)
+{
+}
+
 static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 					   unsigned long addr,
 					   bool cow_from_owner)
-- 
cgit v1.2.3


From ce6d9c1c2b5cc785016faa11b48b6cd317eb367e Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Mon, 24 Feb 2025 21:20:02 -0500
Subject: NFS: fix nfs_release_folio() to not deadlock via kcompactd writeback

Add PF_KCOMPACTD flag and current_is_kcompactd() helper to check for it so
nfs_release_folio() can skip calling nfs_wb_folio() from kcompactd.

Otherwise NFS can deadlock waiting for kcompactd enduced writeback which
recurses back to NFS (which triggers writeback to NFSD via NFS loopback
mount on the same host, NFSD blocks waiting for XFS's call to
__filemap_get_folio):

6070.550357] INFO: task kcompactd0:58 blocked for more than 4435 seconds.

{---
[58] "kcompactd0"
[<0>] folio_wait_bit+0xe8/0x200
[<0>] folio_wait_writeback+0x2b/0x80
[<0>] nfs_wb_folio+0x80/0x1b0 [nfs]
[<0>] nfs_release_folio+0x68/0x130 [nfs]
[<0>] split_huge_page_to_list_to_order+0x362/0x840
[<0>] migrate_pages_batch+0x43d/0xb90
[<0>] migrate_pages_sync+0x9a/0x240
[<0>] migrate_pages+0x93c/0x9f0
[<0>] compact_zone+0x8e2/0x1030
[<0>] compact_node+0xdb/0x120
[<0>] kcompactd+0x121/0x2e0
[<0>] kthread+0xcf/0x100
[<0>] ret_from_fork+0x31/0x40
[<0>] ret_from_fork_asm+0x1a/0x30
---}

[akpm@linux-foundation.org: fix build]
Link: https://lkml.kernel.org/r/20250225022002.26141-1-snitzer@kernel.org
Fixes: 96780ca55e3c ("NFS: fix up nfs_release_folio() to try to release the page")
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/compaction.h | 5 +++++
 include/linux/sched.h      | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index e94776496049..7bf0c521db63 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -80,6 +80,11 @@ static inline unsigned long compact_gap(unsigned int order)
 	return 2UL << order;
 }
 
+static inline int current_is_kcompactd(void)
+{
+	return current->flags & PF_KCOMPACTD;
+}
+
 #ifdef CONFIG_COMPACTION
 
 extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9632e3318e0d..9c15365a30c0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1701,7 +1701,7 @@ extern struct pid *cad_pid;
 #define PF_USED_MATH		0x00002000	/* If unset the fpu must be initialized before use */
 #define PF_USER_WORKER		0x00004000	/* Kernel thread cloned from userspace thread */
 #define PF_NOFREEZE		0x00008000	/* This thread should not be frozen */
-#define PF__HOLE__00010000	0x00010000
+#define PF_KCOMPACTD		0x00010000	/* I am kcompactd */
 #define PF_KSWAPD		0x00020000	/* I am kswapd */
 #define PF_MEMALLOC_NOFS	0x00040000	/* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */
 #define PF_MEMALLOC_NOIO	0x00080000	/* All allocations inherit GFP_NOIO. See memalloc_noio_save() */
-- 
cgit v1.2.3


From c29564d8b46f64f5e6e6f1c9c02f7761b7b90963 Mon Sep 17 00:00:00 2001
From: Su Hui <suhui@nfschina.com>
Date: Fri, 21 Feb 2025 15:16:25 +0800
Subject: include/linux/log2.h: mark is_power_of_2() with __always_inline

When building kernel with randconfig, there is an error:

In function `kvm_is_cr4_bit_set',inlined from
`kvm_update_cpuid_runtime' at arch/x86/kvm/cpuid.c:310:9:

include/linux/compiler_types.h:542:38: error: call to
`__compiletime_assert_380' declared with attribute error:
BUILD_BUG_ON failed: !is_power_of_2(cr4_bit).

'!is_power_of_2(X86_CR4_OSXSAVE)' is False, but gcc treats is_power_of_2()
as non-inline function and a compilation error happens. Fix this by marking
is_power_of_2() with __always_inline.

Link: https://lkml.kernel.org/r/20250221071624.1356899-1-suhui@nfschina.com
Signed-off-by: Su Hui <suhui@nfschina.com>
Cc: Binbin Wu <binbin.wu@linux.intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/log2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/log2.h b/include/linux/log2.h
index 9f30d087a128..1366cb688a6d 100644
--- a/include/linux/log2.h
+++ b/include/linux/log2.h
@@ -41,7 +41,7 @@ int __ilog2_u64(u64 n)
  * *not* considered a power of two.
  * Return: true if @n is a power of 2, otherwise false.
  */
-static inline __attribute__((const))
+static __always_inline __attribute__((const))
 bool is_power_of_2(unsigned long n)
 {
 	return (n != 0 && ((n & (n - 1)) == 0));
-- 
cgit v1.2.3


From fb8286562ecfb585e26b033c5e32e6fb85efb0b3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 6 Mar 2025 04:05:26 +0100
Subject: netfilter: nf_tables: make destruction work queue pernet

The call to flush_work before tearing down a table from the netlink
notifier was supposed to make sure that all earlier updates (e.g. rule
add) that might reference that table have been processed.

Unfortunately, flush_work() waits for the last queued instance.
This could be an instance that is different from the one that we must
wait for.

This is because transactions are protected with a pernet mutex, but the
work item is global, so holding the transaction mutex doesn't prevent
another netns from queueing more work.

Make the work item pernet so that flush_work() will wait for all
transactions queued from this netns.

A welcome side effect is that we no longer need to wait for transaction
objects from foreign netns.

The gc work queue is still global.  This seems to be ok because nft_set
structures are reference counted and each container structure owns a
reference on the net namespace.

The destroy_list is still protected by a global spinlock rather than
pernet one but the hold time is very short anyway.

v2: call cancel_work_sync before reaping the remaining tables (Pablo).

Fixes: 9f6958ba2e90 ("netfilter: nf_tables: unconditionally flush pending work before notifier")
Reported-by: syzbot+5d8c5789c8cb076b2c25@syzkaller.appspotmail.com
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 60d5dcdb289c..803d5f1601f9 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1891,7 +1891,7 @@ void nft_chain_filter_fini(void);
 void __init nft_chain_route_init(void);
 void nft_chain_route_fini(void);
 
-void nf_tables_trans_destroy_flush_work(void);
+void nf_tables_trans_destroy_flush_work(struct net *net);
 
 int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result);
 __be64 nf_jiffies64_to_msecs(u64 input);
@@ -1905,6 +1905,7 @@ static inline int nft_request_module(struct net *net, const char *fmt, ...) { re
 struct nftables_pernet {
 	struct list_head	tables;
 	struct list_head	commit_list;
+	struct list_head	destroy_list;
 	struct list_head	commit_set_list;
 	struct list_head	binding_list;
 	struct list_head	module_list;
@@ -1915,6 +1916,7 @@ struct nftables_pernet {
 	unsigned int		base_seq;
 	unsigned int		gc_seq;
 	u8			validate_state;
+	struct work_struct	destroy_work;
 };
 
 extern unsigned int nf_tables_net_id;
-- 
cgit v1.2.3


From 74d42bdb3a4673b1c10d1f457184e4d3c9cb0196 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 6 Mar 2025 07:30:42 -1000
Subject: fs/pipe: express 'pipe_empty()' in terms of 'pipe_occupancy()'

That's what 'pipe_full()' does, so it's more consistent. But more
importantly it gets the type limits right when the pipe head and tail
are no longer necessarily 'unsigned int'.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pipe_fs_i.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index e572e6fc4f81..4d0a2267e6ef 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -177,23 +177,23 @@ static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe)
 }
 
 /**
- * pipe_empty - Return true if the pipe is empty
+ * pipe_occupancy - Return number of slots used in the pipe
  * @head: The pipe ring head pointer
  * @tail: The pipe ring tail pointer
  */
-static inline bool pipe_empty(unsigned int head, unsigned int tail)
+static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
 {
-	return head == tail;
+	return (pipe_index_t)(head - tail);
 }
 
 /**
- * pipe_occupancy - Return number of slots used in the pipe
+ * pipe_empty - Return true if the pipe is empty
  * @head: The pipe ring head pointer
  * @tail: The pipe ring tail pointer
  */
-static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
+static inline bool pipe_empty(unsigned int head, unsigned int tail)
 {
-	return (pipe_index_t)(head - tail);
+	return !pipe_occupancy(head, tail);
 }
 
 /**
-- 
cgit v1.2.3


From e7112524e5e885181cc5ae4d258f33b9dbe0b907 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Thu, 6 Mar 2025 08:27:51 -0800
Subject: block: Name the RQF flags enum

Commit 5f89154e8e9e3445f9b59 ("block: Use enum to define RQF_x bit
indexes") converted the RQF flags to an anonymous enum, which was
a beneficial change. This patch goes one step further by naming the enum
as "rqf_flags".

This naming enables exporting these flags to BPF clients, eliminating
the need to duplicate these flags in BPF code. Instead, BPF clients can
now access the same kernel-side values through CO:RE (Compile Once, Run
Everywhere), as shown in this example:

    rqf_stats = bpf_core_enum_value(enum rqf_flags, __RQF_STATS)

Suggested-by: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://lore.kernel.org/r/20250306-rqf_flags-v1-1-bbd64918b406@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index fa2a76cc2f73..71f4f0cc3dac 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -28,7 +28,7 @@ typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);
 typedef __u32 __bitwise req_flags_t;
 
 /* Keep rqf_name[] in sync with the definitions below */
-enum {
+enum rqf_flags {
 	/* drive already may have started this one */
 	__RQF_STARTED,
 	/* request for flush sequence */
-- 
cgit v1.2.3


From 00a7d39898c8010bfd5ff62af31ca5db34421b38 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 6 Mar 2025 18:25:35 -1000
Subject: fs/pipe: add simpler helpers for common cases

The fix to atomically read the pipe head and tail state when not holding
the pipe mutex has caused a number of headaches due to the size change
of the involved types.

It turns out that we don't have _that_ many places that access these
fields directly and were affected, but we have more than we strictly
should have, because our low-level helper functions have been designed
to have intimate knowledge of how the pipes work.

And as a result, that random noise of direct 'pipe->head' and
'pipe->tail' accesses makes it harder to pinpoint any actual potential
problem spots remaining.

For example, we didn't have a "is the pipe full" helper function, but
instead had a "given these pipe buffer indexes and this pipe size, is
the pipe full".  That's because some low-level pipe code does actually
want that much more complicated interface.

But most other places literally just want a "is the pipe full" helper,
and not having it meant that those places ended up being unnecessarily
much too aware of this all.

It would have been much better if only the very core pipe code that
cared had been the one aware of this all.

So let's fix it - better late than never.  This just introduces the
trivial wrappers for "is this pipe full or empty" and to get how many
pipe buffers are used, so that instead of writing

        if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))

the places that literally just want to know if a pipe is full can just
say

        if (pipe_is_full(pipe))

instead.  The existing trivial cases were converted with a 'sed' script.

This cuts down on the places that access pipe->head and pipe->tail
directly outside of the pipe code (and core splice code) quite a lot.

The splice code in particular still revels in doing the direct low-level
accesses, and the fuse fuse_dev_splice_write() code also seems a bit
unnecessarily eager to go very low-level, but it's at least a bit better
than it used to be.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pipe_fs_i.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 4d0a2267e6ef..b698758000f8 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -208,6 +208,33 @@ static inline bool pipe_full(unsigned int head, unsigned int tail,
 	return pipe_occupancy(head, tail) >= limit;
 }
 
+/**
+ * pipe_is_full - Return true if the pipe is full
+ * @pipe: the pipe
+ */
+static inline bool pipe_is_full(const struct pipe_inode_info *pipe)
+{
+	return pipe_full(pipe->head, pipe->tail, pipe->max_usage);
+}
+
+/**
+ * pipe_is_empty - Return true if the pipe is empty
+ * @pipe: the pipe
+ */
+static inline bool pipe_is_empty(const struct pipe_inode_info *pipe)
+{
+	return pipe_empty(pipe->head, pipe->tail);
+}
+
+/**
+ * pipe_buf_usage - Return how many pipe buffers are in use
+ * @pipe: the pipe
+ */
+static inline unsigned int pipe_buf_usage(const struct pipe_inode_info *pipe)
+{
+	return pipe_occupancy(pipe->head, pipe->tail);
+}
+
 /**
  * pipe_buf - Return the pipe buffer for the specified slot in the pipe ring
  * @pipe: The pipe to access
-- 
cgit v1.2.3


From ab6ab707a4d060a51c45fc13e3b2228d5f7c0b87 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 4 Mar 2025 10:06:10 -0500
Subject: Revert "Bluetooth: hci_core: Fix sleeping function called from
 invalid context"

This reverts commit 4d94f05558271654670d18c26c912da0c1c15549 which has
problems (see [1]) and is no longer needed since 581dd2dc168f
("Bluetooth: hci_event: Fix using rcu_read_(un)lock while iterating")
has reworked the code where the original bug has been found.

[1] Link: https://lore.kernel.org/linux-bluetooth/877c55ci1r.wl-tiwai@suse.de/T/#t
Fixes: 4d94f0555827 ("Bluetooth: hci_core: Fix sleeping function called from invalid context")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 108 ++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 70 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index f756fac95488..6281063cbd8e 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -804,6 +804,7 @@ struct hci_conn_params {
 extern struct list_head hci_dev_list;
 extern struct list_head hci_cb_list;
 extern rwlock_t hci_dev_list_lock;
+extern struct mutex hci_cb_list_lock;
 
 #define hci_dev_set_flag(hdev, nr)             set_bit((nr), (hdev)->dev_flags)
 #define hci_dev_clear_flag(hdev, nr)           clear_bit((nr), (hdev)->dev_flags)
@@ -2010,47 +2011,24 @@ struct hci_cb {
 
 	char *name;
 
-	bool (*match)		(struct hci_conn *conn);
 	void (*connect_cfm)	(struct hci_conn *conn, __u8 status);
 	void (*disconn_cfm)	(struct hci_conn *conn, __u8 status);
 	void (*security_cfm)	(struct hci_conn *conn, __u8 status,
-				 __u8 encrypt);
+								__u8 encrypt);
 	void (*key_change_cfm)	(struct hci_conn *conn, __u8 status);
 	void (*role_switch_cfm)	(struct hci_conn *conn, __u8 status, __u8 role);
 };
 
-static inline void hci_cb_lookup(struct hci_conn *conn, struct list_head *list)
-{
-	struct hci_cb *cb, *cpy;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(cb, &hci_cb_list, list) {
-		if (cb->match && cb->match(conn)) {
-			cpy = kmalloc(sizeof(*cpy), GFP_ATOMIC);
-			if (!cpy)
-				break;
-
-			*cpy = *cb;
-			INIT_LIST_HEAD(&cpy->list);
-			list_add_rcu(&cpy->list, list);
-		}
-	}
-	rcu_read_unlock();
-}
-
 static inline void hci_connect_cfm(struct hci_conn *conn, __u8 status)
 {
-	struct list_head list;
-	struct hci_cb *cb, *tmp;
-
-	INIT_LIST_HEAD(&list);
-	hci_cb_lookup(conn, &list);
+	struct hci_cb *cb;
 
-	list_for_each_entry_safe(cb, tmp, &list, list) {
+	mutex_lock(&hci_cb_list_lock);
+	list_for_each_entry(cb, &hci_cb_list, list) {
 		if (cb->connect_cfm)
 			cb->connect_cfm(conn, status);
-		kfree(cb);
 	}
+	mutex_unlock(&hci_cb_list_lock);
 
 	if (conn->connect_cfm_cb)
 		conn->connect_cfm_cb(conn, status);
@@ -2058,43 +2036,22 @@ static inline void hci_connect_cfm(struct hci_conn *conn, __u8 status)
 
 static inline void hci_disconn_cfm(struct hci_conn *conn, __u8 reason)
 {
-	struct list_head list;
-	struct hci_cb *cb, *tmp;
-
-	INIT_LIST_HEAD(&list);
-	hci_cb_lookup(conn, &list);
+	struct hci_cb *cb;
 
-	list_for_each_entry_safe(cb, tmp, &list, list) {
+	mutex_lock(&hci_cb_list_lock);
+	list_for_each_entry(cb, &hci_cb_list, list) {
 		if (cb->disconn_cfm)
 			cb->disconn_cfm(conn, reason);
-		kfree(cb);
 	}
+	mutex_unlock(&hci_cb_list_lock);
 
 	if (conn->disconn_cfm_cb)
 		conn->disconn_cfm_cb(conn, reason);
 }
 
-static inline void hci_security_cfm(struct hci_conn *conn, __u8 status,
-				    __u8 encrypt)
-{
-	struct list_head list;
-	struct hci_cb *cb, *tmp;
-
-	INIT_LIST_HEAD(&list);
-	hci_cb_lookup(conn, &list);
-
-	list_for_each_entry_safe(cb, tmp, &list, list) {
-		if (cb->security_cfm)
-			cb->security_cfm(conn, status, encrypt);
-		kfree(cb);
-	}
-
-	if (conn->security_cfm_cb)
-		conn->security_cfm_cb(conn, status);
-}
-
 static inline void hci_auth_cfm(struct hci_conn *conn, __u8 status)
 {
+	struct hci_cb *cb;
 	__u8 encrypt;
 
 	if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags))
@@ -2102,11 +2059,20 @@ static inline void hci_auth_cfm(struct hci_conn *conn, __u8 status)
 
 	encrypt = test_bit(HCI_CONN_ENCRYPT, &conn->flags) ? 0x01 : 0x00;
 
-	hci_security_cfm(conn, status, encrypt);
+	mutex_lock(&hci_cb_list_lock);
+	list_for_each_entry(cb, &hci_cb_list, list) {
+		if (cb->security_cfm)
+			cb->security_cfm(conn, status, encrypt);
+	}
+	mutex_unlock(&hci_cb_list_lock);
+
+	if (conn->security_cfm_cb)
+		conn->security_cfm_cb(conn, status);
 }
 
 static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status)
 {
+	struct hci_cb *cb;
 	__u8 encrypt;
 
 	if (conn->state == BT_CONFIG) {
@@ -2133,38 +2099,40 @@ static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status)
 			conn->sec_level = conn->pending_sec_level;
 	}
 
-	hci_security_cfm(conn, status, encrypt);
+	mutex_lock(&hci_cb_list_lock);
+	list_for_each_entry(cb, &hci_cb_list, list) {
+		if (cb->security_cfm)
+			cb->security_cfm(conn, status, encrypt);
+	}
+	mutex_unlock(&hci_cb_list_lock);
+
+	if (conn->security_cfm_cb)
+		conn->security_cfm_cb(conn, status);
 }
 
 static inline void hci_key_change_cfm(struct hci_conn *conn, __u8 status)
 {
-	struct list_head list;
-	struct hci_cb *cb, *tmp;
-
-	INIT_LIST_HEAD(&list);
-	hci_cb_lookup(conn, &list);
+	struct hci_cb *cb;
 
-	list_for_each_entry_safe(cb, tmp, &list, list) {
+	mutex_lock(&hci_cb_list_lock);
+	list_for_each_entry(cb, &hci_cb_list, list) {
 		if (cb->key_change_cfm)
 			cb->key_change_cfm(conn, status);
-		kfree(cb);
 	}
+	mutex_unlock(&hci_cb_list_lock);
 }
 
 static inline void hci_role_switch_cfm(struct hci_conn *conn, __u8 status,
 								__u8 role)
 {
-	struct list_head list;
-	struct hci_cb *cb, *tmp;
-
-	INIT_LIST_HEAD(&list);
-	hci_cb_lookup(conn, &list);
+	struct hci_cb *cb;
 
-	list_for_each_entry_safe(cb, tmp, &list, list) {
+	mutex_lock(&hci_cb_list_lock);
+	list_for_each_entry(cb, &hci_cb_list, list) {
 		if (cb->role_switch_cfm)
 			cb->role_switch_cfm(conn, status, role);
-		kfree(cb);
 	}
+	mutex_unlock(&hci_cb_list_lock);
 }
 
 static inline bool hci_bdaddr_is_rpa(bdaddr_t *bdaddr, u8 addr_type)
-- 
cgit v1.2.3


From 366fef794bd2b7c2e9df933f6828dd9739bfba84 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@linux.intel.com>
Date: Mon, 10 Mar 2025 14:21:58 +0200
Subject: <linux/cleanup.h>: Allow the passing of both iomem and non-iomem
 pointers to no_free_ptr()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Calling no_free_ptr() for an __iomem pointer results in Sparse
complaining about the types:

  warning: incorrect type in argument 1 (different address spaces)
     expected void const volatile *val
     got void [noderef] __iomem *__val

[ The example is from drivers/platform/x86/intel/pmc/core_ssram.c:283 ]

The problem is caused by the signature of __must_check_fn() added in:

  85be6d842447 ("cleanup: Make no_free_ptr() __must_check")

... to enforce that the return value is always used.

Use __force to allow both iomem and non-iomem pointers to be given for
no_free_ptr().

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20250310122158.20966-1-ilpo.jarvinen@linux.intel.com
Closes: https://lore.kernel.org/oe-kbuild-all/202403050547.qnZtuNlN-lkp@intel.com/
---
 include/linux/cleanup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index ec00e3f7af2b..ee2614adb785 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -212,7 +212,7 @@ const volatile void * __must_check_fn(const volatile void *val)
 { return val; }
 
 #define no_free_ptr(p) \
-	((typeof(p)) __must_check_fn(__get_and_null(p, NULL)))
+	((typeof(p)) __must_check_fn((__force const volatile void *)__get_and_null(p, NULL)))
 
 #define return_ptr(p)	return no_free_ptr(p)
 
-- 
cgit v1.2.3


From 9bce6b5f8987678b9c6c1fe433af6b5fe41feadc Mon Sep 17 00:00:00 2001
From: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Date: Tue, 11 Mar 2025 19:43:59 +0900
Subject: block: change blk_mq_add_to_batch() third argument type to bool

Commit 1f47ed294a2b ("block: cleanup and fix batch completion adding
conditions") modified the evaluation criteria for the third argument,
'ioerror', in the blk_mq_add_to_batch() function. Initially, the
function had checked if 'ioerror' equals zero. Following the commit, it
started checking for negative error values, with the presumption that
such values, for instance -EIO, would be passed in.

However, blk_mq_add_to_batch() callers do not pass negative error
values. Instead, they pass status codes defined in various ways:

- NVMe PCI and Apple drivers pass NVMe status code
- virtio_blk driver passes the virtblk request header status byte
- null_blk driver passes blk_status_t

These codes are either zero or positive, therefore the revised check
fails to function as intended. Specifically, with the NVMe PCI driver,
this modification led to the failure of the blktests test case nvme/039.
In this test scenario, errors are artificially injected to the NVMe
driver, resulting in positive NVMe status codes passed to
blk_mq_add_to_batch(), which unexpectedly processes the failed I/O in a
batch. Hence the failure.

To correct the ioerror check within blk_mq_add_to_batch(), make all
callers to uniformly pass the argument as boolean. Modify the callers to
check their specific status codes and pass the boolean value 'is_error'.
Also describe the arguments of blK_mq_add_to_batch as kerneldoc.

Fixes: 1f47ed294a2b ("block: cleanup and fix batch completion adding conditions")
Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Link: https://lore.kernel.org/r/20250311104359.1767728-3-shinichiro.kawasaki@wdc.com
[axboe: fold in documentation update]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 71f4f0cc3dac..aba9c24486aa 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -852,12 +852,20 @@ static inline bool blk_mq_is_reserved_rq(struct request *rq)
 	return rq->rq_flags & RQF_RESV;
 }
 
-/*
+/**
+ * blk_mq_add_to_batch() - add a request to the completion batch
+ * @req: The request to add to batch
+ * @iob: The batch to add the request
+ * @is_error: Specify true if the request failed with an error
+ * @complete: The completaion handler for the request
+ *
  * Batched completions only work when there is no I/O error and no special
  * ->end_io handler.
+ *
+ * Return: true when the request was added to the batch, otherwise false
  */
 static inline bool blk_mq_add_to_batch(struct request *req,
-				       struct io_comp_batch *iob, int ioerror,
+				       struct io_comp_batch *iob, bool is_error,
 				       void (*complete)(struct io_comp_batch *))
 {
 	/*
@@ -865,7 +873,7 @@ static inline bool blk_mq_add_to_batch(struct request *req,
 	 * 1) No batch container
 	 * 2) Has scheduler data attached
 	 * 3) Not a passthrough request and end_io set
-	 * 4) Not a passthrough request and an ioerror
+	 * 4) Not a passthrough request and failed with an error
 	 */
 	if (!iob)
 		return false;
@@ -874,7 +882,7 @@ static inline bool blk_mq_add_to_batch(struct request *req,
 	if (!blk_rq_is_passthrough(req)) {
 		if (req->end_io)
 			return false;
-		if (ioerror < 0)
+		if (is_error)
 			return false;
 	}
 
-- 
cgit v1.2.3


From 066e053fe208a3b83ee89dc5a192146add688861 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 12 Mar 2025 08:38:47 +0100
Subject: fsnotify: add pre-content hooks on mmap()

Pre-content hooks in page faults introduces potential deadlock of HSM
handler in userspace with filesystem freezing.

The requirement with pre-content event is that for every accessed file
range an event covering at least this range will be generated at least
once before the file data is accesses.

In preparation to disabling pre-content event hooks on page faults,
add pre-content hooks at mmap() variants for the entire mmaped range,
so HSM can fill content when user requests to map a portion of the file.

Note that exec() variant also calls vm_mmap_pgoff() internally to map
code sections, so pre-content hooks are also generated in this case.

Link: https://lore.kernel.org/linux-fsdevel/7ehxrhbvehlrjwvrduoxsao5k3x4aw275patsb3krkwuq573yv@o2hskrfawbnc/
Suggested-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20250312073852.2123409-2-amir73il@gmail.com
---
 include/linux/fsnotify.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 6a33288bd6a1..83d3ac97f826 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -170,6 +170,21 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
 	return fsnotify_path(&file->f_path, FS_ACCESS_PERM);
 }
 
+/*
+ * fsnotify_mmap_perm - permission hook before mmap of file range
+ */
+static inline int fsnotify_mmap_perm(struct file *file, int prot,
+				     const loff_t off, size_t len)
+{
+	/*
+	 * mmap() generates only pre-content events.
+	 */
+	if (!file || likely(!FMODE_FSNOTIFY_HSM(file->f_mode)))
+		return 0;
+
+	return fsnotify_pre_content(&file->f_path, &off, len);
+}
+
 /*
  * fsnotify_truncate_perm - permission hook before file truncate
  */
@@ -223,6 +238,12 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
 	return 0;
 }
 
+static inline int fsnotify_mmap_perm(struct file *file, int prot,
+				     const loff_t off, size_t len)
+{
+	return 0;
+}
+
 static inline int fsnotify_truncate_perm(const struct path *path, loff_t length)
 {
 	return 0;
-- 
cgit v1.2.3


From 955fbe0ef19df4197595a98d0906c94025c4beef Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 12 Mar 2025 08:38:50 +0100
Subject: Revert "fsnotify: generate pre-content permission event on page
 fault"

This reverts commit 8392bc2ff8c8bf7c4c5e6dfa71ccd893a3c046f6.

In the use case of buffered write whose input buffer is mmapped file on a
filesystem with a pre-content mark, the prefaulting of the buffer can
happen under the filesystem freeze protection (obtained in vfs_write())
which breaks assumptions of pre-content hook and introduces potential
deadlock of HSM handler in userspace with filesystem freezing.

Now that we have pre-content hooks at file mmap() time, disable the
pre-content event hooks on page fault to avoid the potential deadlock.

Reported-by: syzbot+7229071b47908b19d5b7@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-fsdevel/7ehxrhbvehlrjwvrduoxsao5k3x4aw275patsb3krkwuq573yv@o2hskrfawbnc/
Fixes: 8392bc2ff8c8 ("fsnotify: generate pre-content permission event on page fault")
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20250312073852.2123409-5-amir73il@gmail.com
---
 include/linux/mm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7b1068ddcbb7..8483e09aeb2c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3420,7 +3420,6 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf);
 extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff);
 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
-extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf);
 
 extern unsigned long stack_guard_gap;
 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
-- 
cgit v1.2.3


From e794dc30be8b69e44646a162db09b43f719525b7 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 13 Feb 2025 16:07:15 +0200
Subject: i2c: Introduce i2c_10bit_addr_*_from_msg() helpers

There are already a lot of drivers that have been using
i2c_8bit_addr_from_msg() for 7-bit addresses, now it's time
to have the similar for 10-bit addresses.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20250213141045.2716943-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Andi Shyti <andi.shyti@kernel.org>
---
 include/linux/i2c.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 2b2af24d2a43..1e35b1e23165 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -952,6 +952,21 @@ static inline u8 i2c_8bit_addr_from_msg(const struct i2c_msg *msg)
 	return (msg->addr << 1) | (msg->flags & I2C_M_RD);
 }
 
+/*
+ * 10-bit address
+ *   addr_1: 5'b11110 | addr[9:8] | (R/nW)
+ *   addr_2: addr[7:0]
+ */
+static inline u8 i2c_10bit_addr_hi_from_msg(const struct i2c_msg *msg)
+{
+	return 0xf0 | ((msg->addr & GENMASK(9, 8)) >> 7) | (msg->flags & I2C_M_RD);
+}
+
+static inline u8 i2c_10bit_addr_lo_from_msg(const struct i2c_msg *msg)
+{
+	return msg->addr & GENMASK(7, 0);
+}
+
 u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold);
 void i2c_put_dma_safe_msg_buf(u8 *buf, struct i2c_msg *msg, bool xferred);
 
-- 
cgit v1.2.3