From 5c563ee90a22d3295bcd6217e3ecd7bf9f4d9d48 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 24 May 2024 11:58:28 -0700
Subject: cpumask: introduce assign_cpu() macro

Now that assign_bit() is a thin macro wrapper around set_bit() and
clear_bit(), we can use it in cpumask API and drop duplicating
implementations of set_cpu_xxx() helpers with no additional overhead.

Bloat-o-meter reports almost 2k less of generated code for allyesconfig,
mostly in kernel/cpu.c:
	add/remove: 2/4 grow/shrink: 3/4 up/down: 498/-2228 (-1730)

Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 40 ++++++----------------------------------
 1 file changed, 6 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 23686bed441d..18410acdbc9e 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1083,44 +1083,16 @@ void init_cpu_present(const struct cpumask *src);
 void init_cpu_possible(const struct cpumask *src);
 void init_cpu_online(const struct cpumask *src);
 
-static inline void
-set_cpu_possible(unsigned int cpu, bool possible)
-{
-	if (possible)
-		cpumask_set_cpu(cpu, &__cpu_possible_mask);
-	else
-		cpumask_clear_cpu(cpu, &__cpu_possible_mask);
-}
+#define assign_cpu(cpu, mask, val)	\
+	assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val))
 
-static inline void
-set_cpu_present(unsigned int cpu, bool present)
-{
-	if (present)
-		cpumask_set_cpu(cpu, &__cpu_present_mask);
-	else
-		cpumask_clear_cpu(cpu, &__cpu_present_mask);
-}
+#define set_cpu_possible(cpu, possible)	assign_cpu((cpu), &__cpu_possible_mask, (possible))
+#define set_cpu_present(cpu, present)	assign_cpu((cpu), &__cpu_present_mask, (present))
+#define set_cpu_active(cpu, active)	assign_cpu((cpu), &__cpu_active_mask, (active))
+#define set_cpu_dying(cpu, dying)	assign_cpu((cpu), &__cpu_dying_mask, (dying))
 
 void set_cpu_online(unsigned int cpu, bool online);
 
-static inline void
-set_cpu_active(unsigned int cpu, bool active)
-{
-	if (active)
-		cpumask_set_cpu(cpu, &__cpu_active_mask);
-	else
-		cpumask_clear_cpu(cpu, &__cpu_active_mask);
-}
-
-static inline void
-set_cpu_dying(unsigned int cpu, bool dying)
-{
-	if (dying)
-		cpumask_set_cpu(cpu, &__cpu_dying_mask);
-	else
-		cpumask_clear_cpu(cpu, &__cpu_dying_mask);
-}
-
 /**
  * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask *
  * @bitmap: the bitmap
-- 
cgit v1.2.3


From e0eeb938adb0367de4b0946125a06142d8de7d37 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Tue, 11 Jun 2024 15:38:12 +0300
Subject: bitops: Add a comment explaining the double underscore macros

Linus Walleij pointed out that a new comer might be confused about the
difference between set_bit() and __set_bit().  Add a comment explaining
the difference.

Link: https://lore.kernel.org/all/CACRpkdZFPG_YLici-BmYfk9HZ36f4WavCN3JNotkk8cPgCODCg@mail.gmail.com/
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitops.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 46d4bdc634c0..ba35bbf07798 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -47,12 +47,17 @@ extern unsigned long __sw_hweight64(__u64 w);
 	  __builtin_constant_p(*(const unsigned long *)(addr))) ?	\
 	 const##op(nr, addr) : op(nr, addr))
 
+/*
+ * The following macros are non-atomic versions of their non-underscored
+ * counterparts.
+ */
 #define __set_bit(nr, addr)		bitop(___set_bit, nr, addr)
 #define __clear_bit(nr, addr)		bitop(___clear_bit, nr, addr)
 #define __change_bit(nr, addr)		bitop(___change_bit, nr, addr)
 #define __test_and_set_bit(nr, addr)	bitop(___test_and_set_bit, nr, addr)
 #define __test_and_clear_bit(nr, addr)	bitop(___test_and_clear_bit, nr, addr)
 #define __test_and_change_bit(nr, addr)	bitop(___test_and_change_bit, nr, addr)
+
 #define test_bit(nr, addr)		bitop(_test_bit, nr, addr)
 #define test_bit_acquire(nr, addr)	bitop(_test_bit_acquire, nr, addr)
 
-- 
cgit v1.2.3


From 675e979db473d08be346a3190c5f0db095a57153 Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fabio.m.de.francesco@linux.intel.com>
Date: Fri, 7 Jun 2024 16:43:58 +0200
Subject: cxl/events: Use a common struct for DRAM and General Media events

cxl_event_common was an unfortunate naming choice and caused confusion with
the existing Common Event Record. Furthermore, its fields didn't map all
the common information between DRAM and General Media Events.

Remove cxl_event_common and introduce cxl_event_media_hdr to record common
information between DRAM and General Media events.

cxl_event_media_hdr, which is embedded in both cxl_event_gen_media and
cxl_event_dram, leverages the commonalities between the two events to
simplify their respective handling.

Suggested-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/20240607144423.48681-1-fabio.m.de.francesco@linux.intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 include/linux/cxl-event.h | 45 +++++++++++++++++++--------------------------
 1 file changed, 19 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 60b25020281f..0bea1afbd747 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -21,6 +21,21 @@ struct cxl_event_record_hdr {
 	u8 reserved[15];
 } __packed;
 
+struct cxl_event_media_hdr {
+	struct cxl_event_record_hdr hdr;
+	__le64 phys_addr;
+	u8 descriptor;
+	u8 type;
+	u8 transaction_type;
+	/*
+	 * The meaning of Validity Flags from bit 2 is
+	 * different across DRAM and General Media records
+	 */
+	u8 validity_flags[2];
+	u8 channel;
+	u8 rank;
+} __packed;
+
 #define CXL_EVENT_RECORD_DATA_LENGTH 0x50
 struct cxl_event_generic {
 	struct cxl_event_record_hdr hdr;
@@ -33,14 +48,7 @@ struct cxl_event_generic {
  */
 #define CXL_EVENT_GEN_MED_COMP_ID_SIZE	0x10
 struct cxl_event_gen_media {
-	struct cxl_event_record_hdr hdr;
-	__le64 phys_addr;
-	u8 descriptor;
-	u8 type;
-	u8 transaction_type;
-	u8 validity_flags[2];
-	u8 channel;
-	u8 rank;
+	struct cxl_event_media_hdr media_hdr;
 	u8 device[3];
 	u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE];
 	u8 reserved[46];
@@ -52,14 +60,7 @@ struct cxl_event_gen_media {
  */
 #define CXL_EVENT_DER_CORRECTION_MASK_SIZE	0x20
 struct cxl_event_dram {
-	struct cxl_event_record_hdr hdr;
-	__le64 phys_addr;
-	u8 descriptor;
-	u8 type;
-	u8 transaction_type;
-	u8 validity_flags[2];
-	u8 channel;
-	u8 rank;
+	struct cxl_event_media_hdr media_hdr;
 	u8 nibble_mask[3];
 	u8 bank_group;
 	u8 bank;
@@ -95,21 +96,13 @@ struct cxl_event_mem_module {
 	u8 reserved[0x3d];
 } __packed;
 
-/*
- * General Media or DRAM Event Common Fields
- * - provides common access to phys_addr
- */
-struct cxl_event_common {
-	struct cxl_event_record_hdr hdr;
-	__le64 phys_addr;
-} __packed;
-
 union cxl_event {
 	struct cxl_event_generic generic;
 	struct cxl_event_gen_media gen_media;
 	struct cxl_event_dram dram;
 	struct cxl_event_mem_module mem_module;
-	struct cxl_event_common common;
+	/* dram & gen_media event header */
+	struct cxl_event_media_hdr media_hdr;
 } __packed;
 
 /*
-- 
cgit v1.2.3


From 1f9a8286bc0c3df7d789ea625d9d9db3d7779f2d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 28 May 2024 14:58:03 +0000
Subject: uaccess: always export _copy_[from|to]_user with CONFIG_RUST

Rust code needs to be able to access _copy_from_user and _copy_to_user
so that it can skip the check_copy_size check in cases where the length
is known at compile-time, mirroring the logic for when C code will skip
check_copy_size. To do this, we ensure that exported versions of these
methods are available when CONFIG_RUST is enabled.

Alice has verified that this patch passes the CONFIG_TEST_USER_COPY test
on x86 using the Android cuttlefish emulator.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Tested-by: Alice Ryhl <aliceryhl@google.com>
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Link: https://lore.kernel.org/r/20240528-alice-mm-v7-2-78222c31b8f4@google.com
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 include/linux/uaccess.h | 46 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 3064314f4832..d8e4105a2f21 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -5,6 +5,7 @@
 #include <linux/fault-inject-usercopy.h>
 #include <linux/instrumented.h>
 #include <linux/minmax.h>
+#include <linux/nospec.h>
 #include <linux/sched.h>
 #include <linux/thread_info.h>
 
@@ -138,13 +139,26 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
 	return raw_copy_to_user(to, from, n);
 }
 
-#ifdef INLINE_COPY_FROM_USER
+/*
+ * Architectures that #define INLINE_COPY_TO_USER use this function
+ * directly in the normal copy_to/from_user(), the other ones go
+ * through an extern _copy_to/from_user(), which expands the same code
+ * here.
+ *
+ * Rust code always uses the extern definition.
+ */
 static inline __must_check unsigned long
-_copy_from_user(void *to, const void __user *from, unsigned long n)
+_inline_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
 	unsigned long res = n;
 	might_fault();
 	if (!should_fail_usercopy() && likely(access_ok(from, n))) {
+		/*
+		 * Ensure that bad access_ok() speculation will not
+		 * lead to nasty side effects *after* the copy is
+		 * finished:
+		 */
+		barrier_nospec();
 		instrument_copy_from_user_before(to, from, n);
 		res = raw_copy_from_user(to, from, n);
 		instrument_copy_from_user_after(to, from, n, res);
@@ -153,14 +167,11 @@ _copy_from_user(void *to, const void __user *from, unsigned long n)
 		memset(to + (n - res), 0, res);
 	return res;
 }
-#else
 extern __must_check unsigned long
 _copy_from_user(void *, const void __user *, unsigned long);
-#endif
 
-#ifdef INLINE_COPY_TO_USER
 static inline __must_check unsigned long
-_copy_to_user(void __user *to, const void *from, unsigned long n)
+_inline_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
 	might_fault();
 	if (should_fail_usercopy())
@@ -171,25 +182,32 @@ _copy_to_user(void __user *to, const void *from, unsigned long n)
 	}
 	return n;
 }
-#else
 extern __must_check unsigned long
 _copy_to_user(void __user *, const void *, unsigned long);
-#endif
 
 static __always_inline unsigned long __must_check
 copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-	if (check_copy_size(to, n, false))
-		n = _copy_from_user(to, from, n);
-	return n;
+	if (!check_copy_size(to, n, false))
+		return n;
+#ifdef INLINE_COPY_FROM_USER
+	return _inline_copy_from_user(to, from, n);
+#else
+	return _copy_from_user(to, from, n);
+#endif
 }
 
 static __always_inline unsigned long __must_check
 copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-	if (check_copy_size(from, n, true))
-		n = _copy_to_user(to, from, n);
-	return n;
+	if (!check_copy_size(from, n, true))
+		return n;
+
+#ifdef INLINE_COPY_TO_USER
+	return _inline_copy_to_user(to, from, n);
+#else
+	return _copy_to_user(to, from, n);
+#endif
 }
 
 #ifndef copy_mc_to_kernel
-- 
cgit v1.2.3


From 415fb383ec2bbe4d4bb1a1b5593cd67eb058f1de Mon Sep 17 00:00:00 2001
From: Francis Pravin <francis.p@samsung.com>
Date: Mon, 15 Jul 2024 06:31:57 +0530
Subject: nvme-core: choose PIF from QPIF if QPIFS supports and PIF is QTYPE

As per TP4141a:
"If the Qualified Protection Information Format Support(QPIFS) bit is
set to 1 and the Protection Information Format(PIF) field is set to 11b
(i.e., Qualified Type), then the pif is as defined in the Qualified
Protection Information Format (QPIF) field."
So, choose PIF from QPIF if QPIFS supports and PIF is QTYPE.

Signed-off-by: Francis Pravin <francis.p@samsung.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 57e27e48c913..5c2290f0d5af 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -483,6 +483,9 @@ enum {
 	NVME_ID_NS_NVM_STS_MASK		= 0x7f,
 	NVME_ID_NS_NVM_GUARD_SHIFT	= 7,
 	NVME_ID_NS_NVM_GUARD_MASK	= 0x3,
+	NVME_ID_NS_NVM_QPIF_SHIFT	= 9,
+	NVME_ID_NS_NVM_QPIF_MASK	= 0xf,
+	NVME_ID_NS_NVM_QPIFS		= 1 << 3,
 };
 
 static inline __u8 nvme_elbaf_sts(__u32 elbaf)
@@ -495,6 +498,11 @@ static inline __u8 nvme_elbaf_guard_type(__u32 elbaf)
 	return (elbaf >> NVME_ID_NS_NVM_GUARD_SHIFT) & NVME_ID_NS_NVM_GUARD_MASK;
 }
 
+static inline __u8 nvme_elbaf_qualified_guard_type(__u32 elbaf)
+{
+	return (elbaf >> NVME_ID_NS_NVM_QPIF_SHIFT) & NVME_ID_NS_NVM_QPIF_MASK;
+}
+
 struct nvme_id_ctrl_nvm {
 	__u8	vsl;
 	__u8	wzsl;
@@ -574,6 +582,7 @@ enum {
 	NVME_NVM_NS_16B_GUARD	= 0,
 	NVME_NVM_NS_32B_GUARD	= 1,
 	NVME_NVM_NS_64B_GUARD	= 2,
+	NVME_NVM_NS_QTYPE_GUARD	= 3,
 };
 
 static inline __u8 nvme_lbaf_index(__u8 flbas)
-- 
cgit v1.2.3


From af22bbe1f4a514c80b89a27252beef033168f4e9 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Tue, 16 Jul 2024 13:35:48 +0200
Subject: virtio: create admin queues alongside other virtqueues

Admin virtqueue is just another virtqueue nothing that special about it.
The current implementation treats it somehow separate though in terms
of creation and deletion. Unify the admin virtqueue creation and
deletion flows to be aligned with the rest of virtqueues, creating
it from vp_find_vqs_*() helpers. Let the admin virtqueue to be deleted
by vp_del_vqs() as the rest.

Call vp_find_one_vq_msix() with slow_path argument being "true" to make
sure that in case of limited interrupt vectors the config vector is used
for admin queue.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Message-Id: <20240716113552.80599-10-jiri@resnulli.us>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index ab4b9a3fef6b..169c7d367fac 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -104,8 +104,6 @@ struct virtqueue_info {
  *	Returns 0 on success or error status
  *	If disable_vq_and_reset is set, then enable_vq_after_reset must also be
  *	set.
- * @create_avq: create admin virtqueue resource.
- * @destroy_avq: destroy admin virtqueue resource.
  */
 struct virtio_config_ops {
 	void (*get)(struct virtio_device *vdev, unsigned offset,
@@ -133,8 +131,6 @@ struct virtio_config_ops {
 			       struct virtio_shm_region *region, u8 id);
 	int (*disable_vq_and_reset)(struct virtqueue *vq);
 	int (*enable_vq_after_reset)(struct virtqueue *vq);
-	int (*create_avq)(struct virtio_device *vdev);
-	void (*destroy_avq)(struct virtio_device *vdev);
 };
 
 /* If driver didn't advertise the feature, it will never appear. */
-- 
cgit v1.2.3


From 4c3b54af907e709609d3d8beca92d65e2f0cfd83 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Tue, 16 Jul 2024 13:35:51 +0200
Subject: virtio_pci_modern: use completion instead of busy loop to wait on
 admin cmd result

Currently, the code waits in a busy loop on every admin virtqueue issued
command to get a reply. That prevents callers from issuing multiple
commands in parallel.

To overcome this limitation, introduce a virtqueue event callback for
admin virtqueue. For every issued command, use completion mechanism
to wait on a reply. In the event callback, trigger the completion
is done for every incoming reply.

Alongside with that, introduce a spin lock to protect the admin
virtqueue operations.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Message-Id: <20240716113552.80599-13-jiri@resnulli.us>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 96fea920873b..999ff5934392 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -10,6 +10,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/gfp.h>
 #include <linux/dma-mapping.h>
+#include <linux/completion.h>
 
 /**
  * struct virtqueue - a queue to register buffers for sending or receiving.
@@ -109,6 +110,8 @@ struct virtio_admin_cmd {
 	__le64 group_member_id;
 	struct scatterlist *data_sg;
 	struct scatterlist *result_sg;
+	struct completion completion;
+	int ret;
 };
 
 /**
-- 
cgit v1.2.3


From a2d6d8aee4a4f694920bfaf6f6be3690e0f8d302 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 19 Jul 2024 11:56:34 +0200
Subject: ALSA: hda: tas2781: mark const variables as __maybe_unused

An earlier patch changed the DECLARE_TLV_DB_SCALE declaration, but
now there are additional static const variables that cause
the same build warnings:

In file included from sound/pci/hda/tas2781_hda_i2c.c:23:
include/sound/tas2781-tlv.h:23:28: error: 'tas2563_dvc_table' defined but not used [-Werror=unused-const-variable=]
   23 | static const unsigned char tas2563_dvc_table[][4] = {
      |                            ^~~~~~~~~~~~~~~~~
In file included from include/sound/tlv.h:10,
                 from sound/pci/hda/tas2781_hda_i2c.c:22:
include/sound/tas2781-tlv.h:20:35: error: 'tas2563_dvc_tlv' defined but not used [-Werror=unused-const-variable=]
   20 | static const DECLARE_TLV_DB_SCALE(tas2563_dvc_tlv, -12150, 50, 1);
      |                                   ^~~~~~~~~~~~~~~

Mark them all as unused as well.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20240719095640.3741247-1-arnd@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/tas2781-tlv.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/sound/tas2781-tlv.h b/include/sound/tas2781-tlv.h
index 99c41bfc7827..00fd4d449ff3 100644
--- a/include/sound/tas2781-tlv.h
+++ b/include/sound/tas2781-tlv.h
@@ -16,11 +16,11 @@
 #define __TAS2781_TLV_H__
 
 static const __maybe_unused DECLARE_TLV_DB_SCALE(dvc_tlv, -10000, 100, 0);
-static const DECLARE_TLV_DB_SCALE(amp_vol_tlv, 1100, 50, 0);
-static const DECLARE_TLV_DB_SCALE(tas2563_dvc_tlv, -12150, 50, 1);
+static const __maybe_unused DECLARE_TLV_DB_SCALE(amp_vol_tlv, 1100, 50, 0);
+static const __maybe_unused DECLARE_TLV_DB_SCALE(tas2563_dvc_tlv, -12150, 50, 1);
 
 /* pow(10, db/20) * pow(2,30) */
-static const unsigned char tas2563_dvc_table[][4] = {
+static const __maybe_unused unsigned char tas2563_dvc_table[][4] = {
 	{ 0X00, 0X00, 0X00, 0X00 }, /* -121.5db */
 	{ 0X00, 0X00, 0X03, 0XBC }, /* -121.0db */
 	{ 0X00, 0X00, 0X03, 0XF5 }, /* -120.5db */
-- 
cgit v1.2.3


From eabd9db64ea8ba64d2a0b1d70da38e1a95dcd08b Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo1.xu@intel.com>
Date: Thu, 13 Jun 2024 16:54:33 +0800
Subject: ACPI: RISCV: Add NUMA support based on SRAT and SLIT

Add acpi_numa.c file to enable parse NUMA information from
ACPI SRAT and SLIT tables. SRAT table provide CPUs(Hart) and
memory nodes to proximity domain mapping, while SLIT table
provide the distance metrics between proximity domains.

Signed-off-by: Haibo Xu <haibo1.xu@intel.com>
Reviewed-by: Sunil V L <sunilvl@ventanamicro.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Link: https://lore.kernel.org/r/65dbad1fda08a32922c44886e4581e49b4a2fecc.1718268003.git.haibo1.xu@intel.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 include/linux/acpi.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 28c3fb2bef0d..5a2b620ccd58 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -264,6 +264,12 @@ static inline void
 acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { }
 #endif
 
+#ifdef CONFIG_RISCV
+void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa);
+#else
+static inline void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa) { }
+#endif
+
 #ifndef PHYS_CPUID_INVALID
 typedef u32 phys_cpuid_t;
 #define PHYS_CPUID_INVALID (phys_cpuid_t)(-1)
-- 
cgit v1.2.3


From 10a0e6f3d3db7dcfe36e578923e5f038f1d2b72a Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Wed, 17 Jul 2024 11:49:40 +0200
Subject: timers/migration: Move hierarchy setup into cpuhotplug prepare
 callback

When a CPU comes online the first time, it is possible that a new top level
group will be created. In general all propagation is done from the bottom
to top. This minimizes complexity and prevents possible races. But when a
new top level group is created, the formely top level group needs to be
connected to the new level. This is the only time, when the direction to
propagate changes is changed: the changes are propagated from top (new top
level group) to bottom (formerly top level group).

This introduces two races (see (A) and (B)) as reported by Frederic:

(A) This race happens, when marking the formely top level group as active,
but the last active CPU of the formerly top level group goes idle. Then
it's likely that formerly group is no longer active, but marked
nevertheless as active in new top level group:

		  [GRP0:0]
	       migrator = 0
	       active   = 0
	       nextevt  = KTIME_MAX
	       /         \
	      0         1 .. 7
	  active         idle

0) Hierarchy has for now only 8 CPUs and CPU 0 is the only active CPU.

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = NONE
			nextevt  = KTIME_MAX
					 \
		 [GRP0:0]                  [GRP0:1]
	      migrator = 0              migrator = TMIGR_NONE
	      active   = 0              active   = NONE
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
		/         \
	      0          1 .. 7                8
	  active         idle                !online

1) CPU 8 is booting and creates a new group in first level GRP0:1 and
   therefore also a new top group GRP1:0. For now the setup code proceeded
   only until the connected between GRP0:1 to the new top group. The
   connection between CPU8 and GRP0:1 is not yet established and CPU 8 is
   still !online.

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = NONE
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = 0              migrator = TMIGR_NONE
	      active   = 0              active   = NONE
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
		/         \
	      0          1 .. 7                8
	  active         idle                !online

2) Setup code now connects GRP0:0 to GRP1:0 and observes while in
   tmigr_connect_child_parent() that GRP0:0 is not TMIGR_NONE. So it
   prepares to call tmigr_active_up() on it. It hasn't done it yet.

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = NONE
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = TMIGR_NONE        migrator = TMIGR_NONE
	      active   = NONE              active   = NONE
	      nextevt  = KTIME_MAX         nextevt  = KTIME_MAX
		/         \
	      0          1 .. 7                8
	    idle         idle                !online

3) CPU 0 goes idle. Since GRP0:0->parent has been updated by CPU 8 with
   GRP0:0->lock held, CPU 0 observes GRP1:0 after calling
   tmigr_update_events() and it propagates the change to the top (no change
   there and no wakeup programmed since there is no timer).

			     [GRP1:0]
			migrator = GRP0:0
			active   = GRP0:0
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = TMIGR_NONE       migrator = TMIGR_NONE
	      active   = NONE             active   = NONE
	      nextevt  = KTIME_MAX        nextevt  = KTIME_MAX
		/         \
	      0          1 .. 7                8
	    idle         idle                !online

4) Now the setup code finally calls tmigr_active_up() to and sets GRP0:0
   active in GRP1:0

			     [GRP1:0]
			migrator = GRP0:0
			active   = GRP0:0, GRP0:1
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = TMIGR_NONE       migrator = 8
	      active   = NONE             active   = 8
	      nextevt  = KTIME_MAX        nextevt  = KTIME_MAX
		/         \                    |
	      0          1 .. 7                8
	    idle         idle                active

5) Now CPU 8 is connected with GRP0:1 and CPU 8 calls tmigr_active_up() out
   of tmigr_cpu_online().

			     [GRP1:0]
			migrator = GRP0:0
			active   = GRP0:0
			nextevt  = T8
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = TMIGR_NONE         migrator = TMIGR_NONE
	      active   = NONE               active   = NONE
	      nextevt  = KTIME_MAX          nextevt  = T8
		/         \                    |
	      0          1 .. 7                8
	    idle         idle                  idle

5) CPU 8 goes idle with a timer T8 and relies on GRP0:0 as the migrator.
   But it's not really active, so T8 gets ignored.

--> The update which is done in third step is not noticed by setup code. So
    a wrong migrator is set to top level group and a timer could get
    ignored.

(B) Reading group->parent and group->childmask when an hierarchy update is
ongoing and reaches the formerly top level group is racy as those values
could be inconsistent. (The notation of migrator and active now slightly
changes in contrast to the above example, as now the childmasks are used.)

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = 0x00
			nextevt  = KTIME_MAX
					 \
		 [GRP0:0]                  [GRP0:1]
	      migrator = TMIGR_NONE     migrator = TMIGR_NONE
	      active   = 0x00           active   = 0x00
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
	      childmask= 0		childmask= 1
	      parent   = NULL		parent   = GRP1:0
		/         \
	      0          1 .. 7                8
	  idle           idle                !online
	  childmask=1

1) Hierarchy has 8 CPUs. CPU 8 is at the moment in the process of onlining
   but did not yet connect GRP0:0 to GRP1:0.

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = 0x00
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = TMIGR_NONE     migrator = TMIGR_NONE
	      active   = 0x00           active   = 0x00
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
	      childmask= 0		childmask= 1
	      parent   = GRP1:0		parent   = GRP1:0
		/         \
	      0          1 .. 7                8
	  idle           idle                !online
	  childmask=1

2) Setup code (running on CPU 8) now connects GRP0:0 to GRP1:0, updates
   parent pointer of GRP0:0 and ...

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = 0x00
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = 0x01           migrator = TMIGR_NONE
	      active   = 0x01           active   = 0x00
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
	      childmask= 0		childmask= 1
	      parent   = GRP1:0		parent   = GRP1:0
		/         \
	      0          1 .. 7                8
	  active          idle                !online
	  childmask=1

	  tmigr_walk.childmask = 0

3) ... CPU 0 comes active in the same time. As migrator in GRP0:0 was
   TMIGR_NONE, childmask of GRP0:0 is stored in update propagation data
   structure tmigr_walk (as update of childmask is not yet
   visible/updated). And now ...

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = 0x00
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = 0x01           migrator = TMIGR_NONE
	      active   = 0x01           active   = 0x00
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
	      childmask= 2		childmask= 1
	      parent   = GRP1:0		parent   = GRP1:0
		/         \
	      0          1 .. 7                8
	  active          idle                !online
	  childmask=1

	  tmigr_walk.childmask = 0

4) ... childmask of GRP0:0 is updated by CPU 8 (still part of setup
   code).

			     [GRP1:0]
			migrator = 0x00
			active   = 0x00
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = 0x01           migrator = TMIGR_NONE
	      active   = 0x01           active   = 0x00
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
	      childmask= 2		childmask= 1
	      parent   = GRP1:0		parent   = GRP1:0
		/         \
	      0          1 .. 7                8
	  active          idle                !online
	  childmask=1

	  tmigr_walk.childmask = 0

5) CPU 0 sees the connection to GRP1:0 and now propagates active state to
   GRP1:0 but with childmask = 0 as stored in propagation data structure.

--> Now GRP1:0 always has a migrator as 0x00 != TMIGR_NONE and for all CPUs
    it looks like GRP1:0 is always active.

To prevent those races, the setup of the hierarchy is moved into the
cpuhotplug prepare callback. The prepare callback is not executed by the
CPU which will come online, it is executed by the CPU which prepares
onlining of the other CPU. This CPU is active while it is connecting the
formerly top level to the new one. This prevents from (A) to happen and it
also prevents from any further walk above the formerly top level until that
active CPU becomes inactive, releasing the new ->parent and ->childmask
updates to be visible by any subsequent walk up above the formerly top
level hierarchy. This prevents from (B) to happen. The direction for the
updates is now forced to look like "from bottom to top".

However if the active CPU prevents from tmigr_cpu_(in)active() to walk up
with the update not-or-half visible, nothing prevents walking up to the new
top with a 0 childmask in tmigr_handle_remote_up() or
tmigr_requires_handle_remote_up() if the active CPU doing the prepare is
not the migrator. But then it looks fine because:

  * tmigr_check_migrator() should just return false
  * The migrator is active and should eventually observe the new childmask
    at some point in a future tick.

Split setup functionality of online callback into the cpuhotplug prepare
callback and setup hotplug state. Change init call into early_initcall() to
make sure an already active CPU prepares everything for newly upcoming
CPUs. Reorder the code, that all prepare related functions are close to
each other and online and offline callbacks are also close together.

Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model")
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240717094940.18687-1-anna-maria@linutronix.de
---
 include/linux/cpuhotplug.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 7a5785f405b6..df59666a2a66 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -122,6 +122,7 @@ enum cpuhp_state {
 	CPUHP_KVM_PPC_BOOK3S_PREPARE,
 	CPUHP_ZCOMP_PREPARE,
 	CPUHP_TIMERS_PREPARE,
+	CPUHP_TMIGR_PREPARE,
 	CPUHP_MIPS_SOC_PREPARE,
 	CPUHP_BP_PREPARE_DYN,
 	CPUHP_BP_PREPARE_DYN_END		= CPUHP_BP_PREPARE_DYN + 20,
-- 
cgit v1.2.3


From 835a9a67f54f01033417a254e53a1391f99db708 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Tue, 16 Jul 2024 16:19:24 +0200
Subject: timers/migration: Rename childmask by groupmask to make naming more
 obvious

childmask in the group reflects the mask that is required to 'reference'
this group in the parent. When reading childmask, this might be confusing,
as this suggests, that this is the mask of the child of the group.

Clarify this by renaming childmask in the tmigr_group and tmc_group by
groupmask.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-6-757baa7803fe@linutronix.de
---
 include/trace/events/timer_migration.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/timer_migration.h b/include/trace/events/timer_migration.h
index 79f19e76a80b..47db5eaf2f9a 100644
--- a/include/trace/events/timer_migration.h
+++ b/include/trace/events/timer_migration.h
@@ -43,7 +43,7 @@ TRACE_EVENT(tmigr_connect_child_parent,
 		__field( unsigned int,	lvl		)
 		__field( unsigned int,	numa_node	)
 		__field( unsigned int,	num_children	)
-		__field( u32,		childmask	)
+		__field( u32,		groupmask	)
 	),
 
 	TP_fast_assign(
@@ -52,11 +52,11 @@ TRACE_EVENT(tmigr_connect_child_parent,
 		__entry->lvl		= child->parent->level;
 		__entry->numa_node	= child->parent->numa_node;
 		__entry->num_children	= child->parent->num_children;
-		__entry->childmask	= child->childmask;
+		__entry->groupmask	= child->groupmask;
 	),
 
-	TP_printk("group=%p childmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
-		  __entry->child,  __entry->childmask, __entry->parent,
+	TP_printk("group=%p groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
+		  __entry->child,  __entry->groupmask, __entry->parent,
 		  __entry->lvl, __entry->numa_node, __entry->num_children)
 );
 
@@ -72,7 +72,7 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
 		__field( unsigned int,	lvl		)
 		__field( unsigned int,	numa_node	)
 		__field( unsigned int,	num_children	)
-		__field( u32,		childmask	)
+		__field( u32,		groupmask	)
 	),
 
 	TP_fast_assign(
@@ -81,11 +81,11 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
 		__entry->lvl		= tmc->tmgroup->level;
 		__entry->numa_node	= tmc->tmgroup->numa_node;
 		__entry->num_children	= tmc->tmgroup->num_children;
-		__entry->childmask	= tmc->childmask;
+		__entry->groupmask	= tmc->groupmask;
 	),
 
-	TP_printk("cpu=%d childmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
-		  __entry->cpu,	 __entry->childmask, __entry->parent,
+	TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
+		  __entry->cpu,	 __entry->groupmask, __entry->parent,
 		  __entry->lvl, __entry->numa_node, __entry->num_children)
 );
 
-- 
cgit v1.2.3


From a2b72b81fb3ba18717fc000949ca9d45a3351130 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 24 Jul 2024 12:16:20 +0100
Subject: io_uring: kill REQ_F_CANCEL_SEQ

We removed the reliance on the flag by the cancellation path and now
it's unused.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/e57afe566bbe4fefeb44daffb08900f2a4756577.1721819383.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 3bb6198d1523..e62aa9f0629f 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -461,7 +461,6 @@ enum {
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
 	REQ_F_POLL_NO_LAZY_BIT,
-	REQ_F_CANCEL_SEQ_BIT,
 	REQ_F_CAN_POLL_BIT,
 	REQ_F_BL_EMPTY_BIT,
 	REQ_F_BL_NO_RECYCLE_BIT,
@@ -536,8 +535,6 @@ enum {
 	REQ_F_HASH_LOCKED	= IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
 	/* don't use lazy poll wake for this request */
 	REQ_F_POLL_NO_LAZY	= IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
-	/* cancel sequence is set and valid */
-	REQ_F_CANCEL_SEQ	= IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT),
 	/* file is pollable */
 	REQ_F_CAN_POLL		= IO_REQ_FLAG(REQ_F_CAN_POLL_BIT),
 	/* buffer list was empty after selection of buffer */
-- 
cgit v1.2.3


From 9722c3b66e21ff08aec570d02a97d331087fd70f Mon Sep 17 00:00:00 2001
From: Luca Ceresoli <luca.ceresoli@bootlin.com>
Date: Wed, 24 Jul 2024 18:33:06 +0200
Subject: of: remove internal arguments from of_property_for_each_u32()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The of_property_for_each_u32() macro needs five parameters, two of which
are primarily meant as internal variables for the macro itself (in the
for() clause). Yet these two parameters are used by a few drivers, and this
can be considered misuse or at least bad practice.

Now that the kernel uses C11 to build, these two parameters can be avoided
by declaring them internally, thus changing this pattern:

  struct property *prop;
  const __be32 *p;
  u32 val;

  of_property_for_each_u32(np, "xyz", prop, p, val) { ... }

to this:

  u32 val;

  of_property_for_each_u32(np, "xyz", val) { ... }

However two variables cannot be declared in the for clause even with C11,
so declare one struct that contain the two variables we actually need. As
the variables inside this struct are not meant to be used by users of this
macro, give the struct instance the noticeable name "_it" so it is visible
during code reviews, helping to avoid new code to use it directly.

Most usages are trivially converted as they do not use those two
parameters, as expected. The non-trivial cases are:

 - drivers/clk/clk.c, of_clk_get_parent_name(): easily doable anyway
 - drivers/clk/clk-si5351.c, si5351_dt_parse(): this is more complex as the
   checks had to be replicated in a different way, making code more verbose
   and somewhat uglier, but I refrained from a full rework to keep as much
   of the original code untouched having no hardware to test my changes

All the changes have been build tested. The few for which I have the
hardware have been runtime-tested too.

Reviewed-by: Andre Przywara <andre.przywara@arm.com> # drivers/clk/sunxi/clk-simple-gates.c, drivers/clk/sunxi/clk-sun8i-bus-gates.c
Acked-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org> # drivers/gpio/gpio-brcmstb.c
Acked-by: Nicolas Ferre <nicolas.ferre@microchip.com> # drivers/irqchip/irq-atmel-aic-common.c
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> # drivers/iio/adc/ti_am335x_adc.c
Acked-by: Uwe Kleine-König <u.kleine-koenig@baylibre.com> # drivers/pwm/pwm-samsung.c
Acked-by: Richard Leitner <richard.leitner@linux.dev> # drivers/usb/misc/usb251xb.c
Acked-by: Mark Brown <broonie@kernel.org> # sound/soc/codecs/arizona.c
Reviewed-by: Richard Fitzgerald <rf@opensource.cirrus.com> # sound/soc/codecs/arizona.c
Acked-by: Michael Ellerman <mpe@ellerman.id.au> # arch/powerpc/sysdev/xive/spapr.c
Acked-by: Stephen Boyd <sboyd@kernel.org> # clk
Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
Acked-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20240724-of_property_for_each_u32-v3-1-bea82ce429e2@bootlin.com
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 include/linux/of.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/of.h b/include/linux/of.h
index 13cf7a43b473..85b60ac9eec5 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -430,11 +430,9 @@ extern int of_detach_node(struct device_node *);
 #define of_match_ptr(_ptr)	(_ptr)
 
 /*
- * struct property *prop;
- * const __be32 *p;
  * u32 u;
  *
- * of_property_for_each_u32(np, "propname", prop, p, u)
+ * of_property_for_each_u32(np, "propname", u)
  *         printk("U32 value: %x\n", u);
  */
 const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur,
@@ -1431,11 +1429,12 @@ static inline int of_property_read_s32(const struct device_node *np,
 	     err == 0;							\
 	     err = of_phandle_iterator_next(it))
 
-#define of_property_for_each_u32(np, propname, prop, p, u)	\
-	for (prop = of_find_property(np, propname, NULL),	\
-		p = of_prop_next_u32(prop, NULL, &u);		\
-		p;						\
-		p = of_prop_next_u32(prop, p, &u))
+#define of_property_for_each_u32(np, propname, u)			\
+	for (struct {struct property *prop; const __be32 *item; } _it =	\
+		{of_find_property(np, propname, NULL),			\
+		 of_prop_next_u32(_it.prop, NULL, &u)};			\
+	     _it.item;							\
+	     _it.item = of_prop_next_u32(_it.prop, _it.item, &u))
 
 #define of_property_for_each_string(np, propname, prop, s)	\
 	for (prop = of_find_property(np, propname, NULL),	\
-- 
cgit v1.2.3


From 63c33ca0969cf4d4574103b69fb46f58a19a182b Mon Sep 17 00:00:00 2001
From: Bhoomik Gupta <bhoomik.gupta@nxp.com>
Date: Mon, 8 Jul 2024 11:08:35 +0530
Subject: i3c: master: Enhance i3c_bus_type visibility for device searching &
 event monitoring

Improve the visibility of i3c_bus_type to facilitate searching for
i3c devices attached to the i3c bus. Enable other drivers to use
bus_register_notifier to monitor i3c bus device events.

Signed-off-by: Bhoomik Gupta <bhoomik.gupta@nxp.com>
Link: https://lore.kernel.org/r/20240708053835.3003986-1-bhoomik.gupta@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/master.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 0ca27dd86956..074f632868d9 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -33,6 +33,7 @@ enum {
 struct i3c_master_controller;
 struct i3c_bus;
 struct i3c_device;
+extern const struct bus_type i3c_bus_type;
 
 /**
  * struct i3c_i2c_dev_desc - Common part of the I3C/I2C device descriptor
-- 
cgit v1.2.3


From 24168c5e6dfbdd5b414f048f47f75d64533296ca Mon Sep 17 00:00:00 2001
From: Carlos Song <carlos.song@nxp.com>
Date: Mon, 15 Jul 2024 18:53:51 -0400
Subject: dt-bindings: i3c: add header for generic I3C flags

Add header file for generic I3C flags to avoid hard code in dts file.

Signed-off-by: Carlos Song <carlos.song@nxp.com>
Reviewed-by: Frank Li <frank.li@nxp.com>
Acked-by: Jason Liu <jason.hui.liu@nxp.com>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240715225351.3237284-1-Frank.Li@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/dt-bindings/i3c/i3c.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 include/dt-bindings/i3c/i3c.h

(limited to 'include')

diff --git a/include/dt-bindings/i3c/i3c.h b/include/dt-bindings/i3c/i3c.h
new file mode 100644
index 000000000000..373439218bba
--- /dev/null
+++ b/include/dt-bindings/i3c/i3c.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */
+/*
+ * Copyright 2024 NXP
+ */
+
+#ifndef _DT_BINDINGS_I3C_I3C_H
+#define _DT_BINDINGS_I3C_I3C_H
+
+#define I2C_FM      (1 << 4)
+#define I2C_FM_PLUS (0 << 4)
+
+#define I2C_FILTER  (0 << 5)
+#define I2C_NO_FILTER_HIGH_FREQUENCY    (1 << 5)
+#define I2C_NO_FILTER_LOW_FREQUENCY     (2 << 5)
+
+#endif
-- 
cgit v1.2.3


From 342b2e395d5f34c9f111a818556e617939f83a8c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 26 Jul 2024 15:24:30 +0100
Subject: io_uring/napi: use ktime in busy polling

It's more natural to use ktime/ns instead of keeping around usec,
especially since we're comparing it against user provided timers,
so convert napi busy poll internal handling to ktime. It's also nicer
since the type (ktime_t vs unsigned long) now tells the unit of measure.

Keep everything as ktime, which we convert to/from micro seconds for
IORING_[UN]REGISTER_NAPI. The net/ busy polling works seems to work with
usec, however it's not real usec as shift by 10 is used to get it from
nsecs, see busy_loop_current_time(), so it's easy to get truncated nsec
back and we get back better precision.

Note, we can further improve it later by removing the truncation and
maybe convincing net/ to use ktime/ns instead.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/95e7ec8d095069a3ed5d40a4bc6f8b586698bc7e.1722003776.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e62aa9f0629f..3315005df117 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -404,7 +404,7 @@ struct io_ring_ctx {
 	spinlock_t		napi_lock;	/* napi_list lock */
 
 	/* napi busy poll default timeout */
-	unsigned int		napi_busy_poll_to;
+	ktime_t			napi_busy_poll_dt;
 	bool			napi_prefer_busy_poll;
 	bool			napi_enabled;
 
-- 
cgit v1.2.3


From d659b715e94ac039803d7601505d3473393fc0be Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan@redhat.com>
Date: Mon, 15 Jul 2024 10:04:23 +1000
Subject: mm/huge_memory: avoid PMD-size page cache if needed

xarray can't support arbitrary page cache size.  the largest and supported
page cache size is defined as MAX_PAGECACHE_ORDER by commit 099d90642a71
("mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray").  However,
it's possible to have 512MB page cache in the huge memory's collapsing
path on ARM64 system whose base page size is 64KB.  512MB page cache is
breaking the limitation and a warning is raised when the xarray entry is
split as shown in the following example.

[root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize
KernelPageSize:       64 kB
[root@dhcp-10-26-1-207 ~]# cat /tmp/test.c
   :
int main(int argc, char **argv)
{
	const char *filename = TEST_XFS_FILENAME;
	int fd = 0;
	void *buf = (void *)-1, *p;
	int pgsize = getpagesize();
	int ret = 0;

	if (pgsize != 0x10000) {
		fprintf(stdout, "System with 64KB base page size is required!\n");
		return -EPERM;
	}

	system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb");
	system("echo 1 > /proc/sys/vm/drop_caches");

	/* Open the xfs file */
	fd = open(filename, O_RDONLY);
	assert(fd > 0);

	/* Create VMA */
	buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0);
	assert(buf != (void *)-1);
	fprintf(stdout, "mapped buffer at 0x%p\n", buf);

	/* Populate VMA */
	ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE);
	assert(ret == 0);
	ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ);
	assert(ret == 0);

	/* Collapse VMA */
	ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE);
	assert(ret == 0);
	ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE);
	if (ret) {
		fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno);
		goto out;
	}

	/* Split xarray entry. Write permission is needed */
	munmap(buf, TEST_MEM_SIZE);
	buf = (void *)-1;
	close(fd);
	fd = open(filename, O_RDWR);
	assert(fd > 0);
	fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
 		  TEST_MEM_SIZE - pgsize, pgsize);
out:
	if (buf != (void *)-1)
		munmap(buf, TEST_MEM_SIZE);
	if (fd > 0)
		close(fd);

	return ret;
}

[root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test
[root@dhcp-10-26-1-207 ~]# /tmp/test
 ------------[ cut here ]------------
 WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
 Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib    \
 nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct      \
 nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4      \
 ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse   \
 xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net  \
 sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio
 CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9
 Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
 pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
 pc : xas_split_alloc+0xf8/0x128
 lr : split_huge_page_to_list_to_order+0x1c4/0x780
 sp : ffff8000ac32f660
 x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0
 x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d
 x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000
 x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000
 x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000
 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
 x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c
 x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8
 x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40
 x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
 Call trace:
  xas_split_alloc+0xf8/0x128
  split_huge_page_to_list_to_order+0x1c4/0x780
  truncate_inode_partial_folio+0xdc/0x160
  truncate_inode_pages_range+0x1b4/0x4a8
  truncate_pagecache_range+0x84/0xa0
  xfs_flush_unmap_range+0x70/0x90 [xfs]
  xfs_file_fallocate+0xfc/0x4d8 [xfs]
  vfs_fallocate+0x124/0x2f0
  ksys_fallocate+0x4c/0xa0
  __arm64_sys_fallocate+0x24/0x38
  invoke_syscall.constprop.0+0x7c/0xd8
  do_el0_svc+0xb4/0xd0
  el0_svc+0x44/0x1d8
  el0t_64_sync_handler+0x134/0x150
  el0t_64_sync+0x17c/0x180

Fix it by correcting the supported page cache orders, different sets for
DAX and other files.  With it corrected, 512MB page cache becomes
disallowed on all non-DAX files on ARM64 system where the base page size
is 64KB.  After this patch is applied, the test program fails with error
-EINVAL returned from __thp_vma_allowable_orders() and the madvise()
system call to collapse the page caches.

Link: https://lkml.kernel.org/r/20240715000423.316491-1-gshan@redhat.com
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Signed-off-by: Gavin Shan <gshan@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Don Dutile <ddutile@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: <stable@vger.kernel.org>	[5.17+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index cff002be83eb..e25d9ebfdf89 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -74,14 +74,20 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
 #define THP_ORDERS_ALL_ANON	((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
 
 /*
- * Mask of all large folio orders supported for file THP.
+ * Mask of all large folio orders supported for file THP. Folios in a DAX
+ * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to
+ * it.
  */
-#define THP_ORDERS_ALL_FILE	(BIT(PMD_ORDER) | BIT(PUD_ORDER))
+#define THP_ORDERS_ALL_FILE_DAX		\
+	(BIT(PMD_ORDER) | BIT(PUD_ORDER))
+#define THP_ORDERS_ALL_FILE_DEFAULT	\
+	((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
 
 /*
  * Mask of all large folio orders supported for THP.
  */
-#define THP_ORDERS_ALL		(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)
+#define THP_ORDERS_ALL	\
+	(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
 
 #define TVA_SMAPS		(1 << 0)	/* Will be used for procfs */
 #define TVA_IN_PF		(1 << 1)	/* Page fault handler */
-- 
cgit v1.2.3


From b3bebe44306e23827397d0d774d206e3fa374041 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 17 Jul 2024 14:28:44 -0700
Subject: alloc_tag: outline and export free_reserved_page()

Outline and export free_reserved_page() because modules use it and it in
turn uses page_ext_{get|put} which should not be exported.  The same
result could be obtained by outlining {get|put}_page_tag_ref() but that
would have higher performance impact as these functions are used in more
performance critical paths.

Link: https://lkml.kernel.org/r/20240717212844.2749975-1-surenb@google.com
Fixes: dcfe378c81f7 ("lib: introduce support for page allocation tagging")
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202407080044.DWMC9N9I-lkp@intel.com/
Suggested-by: Christoph Hellwig <hch@infradead.org>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Sourav Panda <souravpanda@google.com>
Cc: <stable@vger.kernel.org>	[6.10]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4563b560ced7..c4b238a20b76 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3137,21 +3137,7 @@ extern void reserve_bootmem_region(phys_addr_t start,
 				   phys_addr_t end, int nid);
 
 /* Free the reserved page into the buddy system, so it gets managed. */
-static inline void free_reserved_page(struct page *page)
-{
-	if (mem_alloc_profiling_enabled()) {
-		union codetag_ref *ref = get_page_tag_ref(page);
-
-		if (ref) {
-			set_codetag_empty(ref);
-			put_page_tag_ref(ref);
-		}
-	}
-	ClearPageReserved(page);
-	init_page_count(page);
-	__free_page(page);
-	adjust_managed_page_count(page, 1);
-}
+void free_reserved_page(struct page *page);
 #define free_highmem_page(page) free_reserved_page(page)
 
 static inline void mark_page_reserved(struct page *page)
-- 
cgit v1.2.3


From f59adcf5933271ab7247c4c8938c67be8905b725 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Tue, 23 Jul 2024 17:12:44 +0000
Subject: mm: memcg: add cacheline padding after lruvec in mem_cgroup_per_node

Oliver Sand reported a performance regression caused by commit
98c9daf5ae6b ("mm: memcg: guard memcg1-specific members of struct
mem_cgroup_per_node"), which puts some fields of the mem_cgroup_per_node
structure under the CONFIG_MEMCG_V1 config option.  Apparently it causes a
false cache sharing between lruvec and lru_zone_size members of the
structure.  Fix it by adding an explicit padding after the lruvec member.

Even though the padding is not required with CONFIG_MEMCG_V1 set, it seems
like the introduced memory overhead is not significant enough to warrant
another divergence in the mem_cgroup_per_node layout, so the padding is
added unconditionally.

Link: https://lkml.kernel.org/r/20240723171244.747521-1-roman.gushchin@linux.dev
Fixes: 98c9daf5ae6b ("mm: memcg: guard memcg1-specific members of struct mem_cgroup_per_node")
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202407121335.31a10cb6-oliver.sang@intel.com
Tested-by: Oliver Sang <oliver.sang@intel.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7e2eb091049a..0e5bf25d324f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -109,6 +109,7 @@ struct mem_cgroup_per_node {
 
 	/* Fields which get updated often at the end. */
 	struct lruvec		lruvec;
+	CACHELINE_PADDING(_pad2_);
 	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 	struct mem_cgroup_reclaim_iter	iter;
 };
-- 
cgit v1.2.3


From 3a7e02c040b130b5545e4b115aada7bacd80a2b6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 26 Jul 2024 15:32:27 -0700
Subject: minmax: avoid overly complicated constant expressions in VM code

The minmax infrastructure is overkill for simple constants, and can
cause huge expansions because those simple constants are then used by
other things.

For example, 'pageblock_order' is a core VM constant, but because it was
implemented using 'min_t()' and all the type-checking that involves, it
actually expanded to something like 2.5kB of preprocessor noise.

And when that simple constant was then used inside other expansions:

  #define pageblock_nr_pages      (1UL << pageblock_order)
  #define pageblock_start_pfn(pfn)  ALIGN_DOWN((pfn), pageblock_nr_pages)

and we then use that inside a 'max()' macro:

	case ISOLATE_SUCCESS:
		update_cached = false;
		last_migrated_pfn = max(cc->zone->zone_start_pfn,
			pageblock_start_pfn(cc->migrate_pfn - 1));

the end result was that one statement expanding to 253kB in size.

There are probably other cases of this, but this one case certainly
stood out.

I've added 'MIN_T()' and 'MAX_T()' macros for this kind of "core simple
constant with specific type" use.  These macros skip the type checking,
and as such need to be very sparingly used only for obvious cases that
have active issues like this.

Reported-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Link: https://lore.kernel.org/all/36aa2cad-1db1-4abf-8dd2-fb20484aabc3@lucifer.local/
Cc: David Laight <David.Laight@aculab.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/minmax.h          | 7 +++++++
 include/linux/pageblock-flags.h | 4 ++--
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 2ec559284a9f..a7ef65f78933 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -270,4 +270,11 @@ static inline bool in_range32(u32 val, u32 start, u32 len)
 #define swap(a, b) \
 	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
 
+/*
+ * Use these carefully: no type checking, and uses the arguments
+ * multiple times. Use for obvious constants only.
+ */
+#define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b))
+#define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b))
+
 #endif	/* _LINUX_MINMAX_H */
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 547e82cdc89a..fc6b9c87cb0a 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -41,13 +41,13 @@ extern unsigned int pageblock_order;
  * Huge pages are a constant size, but don't exceed the maximum allocation
  * granularity.
  */
-#define pageblock_order		min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER)
+#define pageblock_order		MIN_T(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
 #elif defined(CONFIG_TRANSPARENT_HUGEPAGE)
 
-#define pageblock_order		min_t(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER)
+#define pageblock_order		MIN_T(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER)
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-- 
cgit v1.2.3


From 00e3913b0416fe69d28745c0a2a340e2f76c219c Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Fri, 26 Jul 2024 01:16:48 +0900
Subject: Revert "firewire: Annotate struct fw_iso_packet with __counted_by()"

This reverts commit d3155742db89df3b3c96da383c400e6ff4d23c25.

The header_length field is byte unit, thus it can not express the number of
elements in header field. It seems that the argument for counted_by
attribute can have no arithmetic expression, therefore this commit just
reverts the issued commit.

Suggested-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20240725161648.130404-1-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 include/linux/firewire.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 00abe0e5d602..1cca14cf5652 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -462,9 +462,8 @@ struct fw_iso_packet {
 				/* rx: Sync bit, wait for matching sy	*/
 	u32 tag:2;		/* tx: Tag in packet header		*/
 	u32 sy:4;		/* tx: Sy in packet header		*/
-	u32 header_length:8;	/* Length of immediate header		*/
-				/* tx: Top of 1394 isoch. data_block    */
-	u32 header[] __counted_by(header_length);
+	u32 header_length:8;	/* Size of immediate header		*/
+	u32 header[];		/* tx: Top of 1394 isoch. data_block	*/
 };
 
 #define FW_ISO_CONTEXT_TRANSMIT			0
-- 
cgit v1.2.3


From 017fa3e89187848fd056af757769c9e66ac3e93d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 28 Jul 2024 13:50:01 -0700
Subject: minmax: simplify and clarify min_t()/max_t() implementation

This simplifies the min_t() and max_t() macros by no longer making them
work in the context of a C constant expression.

That means that you can no longer use them for static initializers or
for array sizes in type definitions, but there were only a couple of
such uses, and all of them were converted (famous last words) to use
MIN_T/MAX_T instead.

Cc: David Laight <David.Laight@aculab.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/minmax.h | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index a7ef65f78933..9c2848abc804 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -45,17 +45,20 @@
 
 #define __cmp(op, x, y)	((x) __cmp_op_##op (y) ? (x) : (y))
 
-#define __cmp_once(op, x, y, unique_x, unique_y) ({	\
-	typeof(x) unique_x = (x);			\
-	typeof(y) unique_y = (y);			\
+#define __cmp_once_unique(op, type, x, y, ux, uy) \
+	({ type ux = (x); type uy = (y); __cmp(op, ux, uy); })
+
+#define __cmp_once(op, type, x, y) \
+	__cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))
+
+#define __careful_cmp_once(op, x, y) ({			\
 	static_assert(__types_ok(x, y),			\
 		#op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \
-	__cmp(op, unique_x, unique_y); })
+	__cmp_once(op, __auto_type, x, y); })
 
 #define __careful_cmp(op, x, y)					\
 	__builtin_choose_expr(__is_constexpr((x) - (y)),	\
-		__cmp(op, x, y),				\
-		__cmp_once(op, x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y)))
+		__cmp(op, x, y), __careful_cmp_once(op, x, y))
 
 #define __clamp(val, lo, hi)	\
 	((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val)))
@@ -158,7 +161,7 @@
  * @x: first value
  * @y: second value
  */
-#define min_t(type, x, y)	__careful_cmp(min, (type)(x), (type)(y))
+#define min_t(type, x, y) __cmp_once(min, type, x, y)
 
 /**
  * max_t - return maximum of two values, using the specified type
@@ -166,7 +169,7 @@
  * @x: first value
  * @y: second value
  */
-#define max_t(type, x, y)	__careful_cmp(max, (type)(x), (type)(y))
+#define max_t(type, x, y) __cmp_once(max, type, x, y)
 
 /*
  * Do not check the array parameter using __must_be_array().
-- 
cgit v1.2.3


From 1a251f52cfdc417c84411a056bc142cbd77baef4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 28 Jul 2024 15:49:18 -0700
Subject: minmax: make generic MIN() and MAX() macros available everywhere

This just standardizes the use of MIN() and MAX() macros, with the very
traditional semantics.  The goal is to use these for C constant
expressions and for top-level / static initializers, and so be able to
simplify the min()/max() macros.

These macro names were used by various kernel code - they are very
traditional, after all - and all such users have been fixed up, with a
few different approaches:

 - trivial duplicated macro definitions have been removed

   Note that 'trivial' here means that it's obviously kernel code that
   already included all the major kernel headers, and thus gets the new
   generic MIN/MAX macros automatically.

 - non-trivial duplicated macro definitions are guarded with #ifndef

   This is the "yes, they define their own versions, but no, the include
   situation is not entirely obvious, and maybe they don't get the
   generic version automatically" case.

 - strange use case #1

   A couple of drivers decided that the way they want to describe their
   versioning is with

	#define MAJ 1
	#define MIN 2
	#define DRV_VERSION __stringify(MAJ) "." __stringify(MIN)

   which adds zero value and I just did my Alexander the Great
   impersonation, and rewrote that pointless Gordian knot as

	#define DRV_VERSION "1.2"

   instead.

 - strange use case #2

   A couple of drivers thought that it's a good idea to have a random
   'MIN' or 'MAX' define for a value or index into a table, rather than
   the traditional macro that takes arguments.

   These values were re-written as C enum's instead. The new
   function-line macros only expand when followed by an open
   parenthesis, and thus don't clash with enum use.

Happily, there weren't really all that many of these cases, and a lot of
users already had the pattern of using '#ifndef' guarding (or in one
case just using '#undef MIN') before defining their own private version
that does the same thing. I left such cases alone.

Cc: David Laight <David.Laight@aculab.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/minmax.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 9c2848abc804..fc384714da45 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -277,6 +277,8 @@ static inline bool in_range32(u32 val, u32 start, u32 len)
  * Use these carefully: no type checking, and uses the arguments
  * multiple times. Use for obvious constants only.
  */
+#define MIN(a,b) __cmp(min,a,b)
+#define MAX(a,b) __cmp(max,a,b)
 #define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b))
 #define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b))
 
-- 
cgit v1.2.3


From dc1c8034e31b14a2e5e212104ec508aec44ce1b9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 28 Jul 2024 20:24:12 -0700
Subject: minmax: simplify min()/max()/clamp() implementation

Now that we no longer have any C constant expression contexts (ie array
size declarations or static initializers) that use min() or max(), we
can simpify the implementation by not having to worry about the result
staying as a C constant expression.

So now we can unconditionally just use temporary variables of the right
type, and get rid of the excessive expansion that used to come from the
use of

   __builtin_choose_expr(__is_constexpr(...), ..

to pick the specialized code for constant expressions.

Another expansion simplification is to pass the temporary variables (in
addition to the original expression) to our __types_ok() macro.  That
may superficially look like it complicates the macro, but when we only
want the type of the expression, expanding the temporary variable names
is much simpler and smaller than expanding the potentially complicated
original expression.

As a result, on my machine, doing a

  $ time make drivers/staging/media/atomisp/pci/isp/kernels/ynr/ynr_1.0/ia_css_ynr.host.i

goes from

	real	0m16.621s
	user	0m15.360s
	sys	0m1.221s

to

	real	0m2.532s
	user	0m2.091s
	sys	0m0.452s

because the token expansion goes down dramatically.

In particular, the longest line expansion (which was line 71 of that
'ia_css_ynr.host.c' file) shrinks from 23,338kB (yes, 23MB for one
single line) to "just" 1,444kB (now "only" 1.4MB).

And yes, that line is still the line from hell, because it's doing
multiple levels of "min()/max()" expansion thanks to some of them being
hidden inside the uDIGIT_FITTING() macro.

Lorenzo has a nice cleanup patch that makes that driver use inline
functions instead of macros for sDIGIT_FITTING() and uDIGIT_FITTING(),
which will fix that line once and for all, but the 16-fold reduction in
this case does show why we need to simplify these helpers.

Cc: David Laight <David.Laight@aculab.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/minmax.h | 43 ++++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index fc384714da45..e3e4353df983 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -35,10 +35,10 @@
 #define __is_noneg_int(x)	\
 	(__builtin_choose_expr(__is_constexpr(x) && __is_signed(x), x, -1) >= 0)
 
-#define __types_ok(x, y) 					\
-	(__is_signed(x) == __is_signed(y) ||			\
-		__is_signed((x) + 0) == __is_signed((y) + 0) ||	\
-		__is_noneg_int(x) || __is_noneg_int(y))
+#define __types_ok(x, y, ux, uy) 				\
+	(__is_signed(ux) == __is_signed(uy) ||			\
+	 __is_signed((ux) + 0) == __is_signed((uy) + 0) ||	\
+	 __is_noneg_int(x) || __is_noneg_int(y))
 
 #define __cmp_op_min <
 #define __cmp_op_max >
@@ -51,34 +51,31 @@
 #define __cmp_once(op, type, x, y) \
 	__cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))
 
-#define __careful_cmp_once(op, x, y) ({			\
-	static_assert(__types_ok(x, y),			\
+#define __careful_cmp_once(op, x, y, ux, uy) ({		\
+	__auto_type ux = (x); __auto_type uy = (y);	\
+	static_assert(__types_ok(x, y, ux, uy),		\
 		#op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \
-	__cmp_once(op, __auto_type, x, y); })
+	__cmp(op, ux, uy); })
 
-#define __careful_cmp(op, x, y)					\
-	__builtin_choose_expr(__is_constexpr((x) - (y)),	\
-		__cmp(op, x, y), __careful_cmp_once(op, x, y))
+#define __careful_cmp(op, x, y) \
+	__careful_cmp_once(op, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))
 
 #define __clamp(val, lo, hi)	\
 	((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val)))
 
-#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({		\
-	typeof(val) unique_val = (val);						\
-	typeof(lo) unique_lo = (lo);						\
-	typeof(hi) unique_hi = (hi);						\
+#define __clamp_once(val, lo, hi, uval, ulo, uhi) ({				\
+	__auto_type uval = (val);						\
+	__auto_type ulo = (lo);							\
+	__auto_type uhi = (hi);							\
 	static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), 	\
 			(lo) <= (hi), true),					\
 		"clamp() low limit " #lo " greater than high limit " #hi);	\
-	static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error");	\
-	static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error");	\
-	__clamp(unique_val, unique_lo, unique_hi); })
-
-#define __careful_clamp(val, lo, hi) ({					\
-	__builtin_choose_expr(__is_constexpr((val) - (lo) + (hi)),	\
-		__clamp(val, lo, hi),					\
-		__clamp_once(val, lo, hi, __UNIQUE_ID(__val),		\
-			     __UNIQUE_ID(__lo), __UNIQUE_ID(__hi))); })
+	static_assert(__types_ok(uval, lo, uval, ulo), "clamp() 'lo' signedness error");	\
+	static_assert(__types_ok(uval, hi, uval, uhi), "clamp() 'hi' signedness error");	\
+	__clamp(uval, ulo, uhi); })
+
+#define __careful_clamp(val, lo, hi) \
+	__clamp_once(val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_))
 
 /**
  * min - return minimum of two values of the same or compatible types
-- 
cgit v1.2.3


From 8cd44dd1d17a23d5cc8c443c659ca57aa76e2fa5 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Wed, 15 Feb 2023 09:18:02 +0900
Subject: btrfs: zoned: fix zone_unusable accounting on making block group
 read-write again

When btrfs makes a block group read-only, it adds all free regions in the
block group to space_info->bytes_readonly. That free space excludes
reserved and pinned regions. OTOH, when btrfs makes the block group
read-write again, it moves all the unused regions into the block group's
zone_unusable. That unused region includes reserved and pinned regions.
As a result, it counts too much zone_unusable bytes.

Fortunately (or unfortunately), having erroneous zone_unusable does not
affect the calculation of space_info->bytes_readonly, because free
space (num_bytes in btrfs_dec_block_group_ro) calculation is done based on
the erroneous zone_unusable and it reduces the num_bytes just to cancel the
error.

This behavior can be easily discovered by adding a WARN_ON to check e.g,
"bg->pinned > 0" in btrfs_dec_block_group_ro(), and running fstests test
case like btrfs/282.

Fix it by properly considering pinned and reserved in
btrfs_dec_block_group_ro(). Also, add a WARN_ON and introduce
btrfs_space_info_update_bytes_zone_unusable() to catch a similar mistake.

Fixes: 169e0da91a21 ("btrfs: zoned: track unusable bytes for zones")
CC: stable@vger.kernel.org # 5.15+
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/trace/events/btrfs.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index eeb56975bee7..de55a555d95b 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -2383,6 +2383,14 @@ DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned,
 	TP_ARGS(fs_info, sinfo, old, diff)
 );
 
+DEFINE_EVENT(btrfs__space_info_update, update_bytes_zone_unusable,
+
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_space_info *sinfo, u64 old, s64 diff),
+
+	TP_ARGS(fs_info, sinfo, old, diff)
+);
+
 DECLARE_EVENT_CLASS(btrfs_raid56_bio,
 
 	TP_PROTO(const struct btrfs_raid_bio *rbio,
-- 
cgit v1.2.3


From 2accfdb7eff65f390c4308b0e9cb7c3fe48ad63c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 29 Jul 2024 10:58:28 -0700
Subject: profiling: attempt to remove per-cpu profile flip buffer

This is the really old legacy kernel profiling code, which has long
since been obviated by "real profiling" (ie 'prof' and company), and
mainly remains as a source of syzbot reports.

There are anecdotal reports that people still use it for boot-time
profiling, but it's unlikely that such use would care about the old NUMA
optimizations in this code from 2004 (commit ad02973d42: "profile: 512x
Altix timer interrupt livelock fix" in the BK import archive at [1])

So in order to head off future syzbot reports, let's try to simplify
this code and get rid of the per-cpu profile buffers that are quite a
large portion of the complexity footprint of this thing (including CPU
hotplug callbacks etc).

It's unlikely anybody will actually notice, or possibly, as Thomas put
it: "Only people who indulge in nostalgia will notice :)".

That said, if it turns out that this code is actually actively used by
somebody, we can always revert this removal.  Thus the "attempt" in the
summary line.

[ Note: in a small nod to "the profiling code can cause NUMA problems",
  this also removes the "increment the last entry in the profiling array
  on any unknown hits" logic. That would account any program counter in
  a module to that single counter location, and might exacerbate any
  NUMA cacheline bouncing issues ]

Link: https://lore.kernel.org/all/CAHk-=wgs52BxT4Zjmjz8aNvHWKxf5_ThBY4bYL1Y6CTaNL2dTw@mail.gmail.com/
Link:  https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git [1]
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpuhotplug.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 51ba681b915a..affdd890899e 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -100,7 +100,6 @@ enum cpuhp_state {
 	CPUHP_WORKQUEUE_PREP,
 	CPUHP_POWER_NUMA_PREPARE,
 	CPUHP_HRTIMERS_PREPARE,
-	CPUHP_PROFILE_PREPARE,
 	CPUHP_X2APIC_PREPARE,
 	CPUHP_SMPCFD_PREPARE,
 	CPUHP_RELAY_PREPARE,
-- 
cgit v1.2.3


From b6a66e521a2032f7fcba2af5a9bcbaeaa19b7ca3 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Sat, 27 Jul 2024 12:01:23 +0200
Subject: mptcp: sched: check both directions for backup

The 'mptcp_subflow_context' structure has two items related to the
backup flags:

 - 'backup': the subflow has been marked as backup by the other peer

 - 'request_bkup': the backup flag has been set by the host

Before this patch, the scheduler was only looking at the 'backup' flag.
That can make sense in some cases, but it looks like that's not what we
wanted for the general use, because either the path-manager was setting
both of them when sending an MP_PRIO, or the receiver was duplicating
the 'backup' flag in the subflow request.

Note that the use of these two flags in the path-manager are going to be
fixed in the next commits, but this change here is needed not to modify
the behaviour.

Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests")
Cc: stable@vger.kernel.org
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/trace/events/mptcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h
index 09e72215b9f9..085b749cdd97 100644
--- a/include/trace/events/mptcp.h
+++ b/include/trace/events/mptcp.h
@@ -34,7 +34,7 @@ TRACE_EVENT(mptcp_subflow_get_send,
 		struct sock *ssk;
 
 		__entry->active = mptcp_subflow_active(subflow);
-		__entry->backup = subflow->backup;
+		__entry->backup = subflow->backup || subflow->request_bkup;
 
 		if (subflow->tcp_sock && sk_fullsock(subflow->tcp_sock))
 			__entry->free = sk_stream_memory_free(subflow->tcp_sock);
-- 
cgit v1.2.3


From 22f5468731491e53356ba7c028f0fdea20b18e2c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 30 Jul 2024 10:36:47 -0700
Subject: minmax: improve macro expansion and type checking

This clarifies the rules for min()/max()/clamp() type checking and makes
them a much more efficient macro expansion.

In particular, we now look at the type and range of the inputs to see
whether they work together, generating a mask of acceptable comparisons,
and then just verifying that the inputs have a shared case:

 - an expression with a signed type can be used for
    (1) signed comparisons
    (2) unsigned comparisons if it is statically known to have a
        non-negative value

 - an expression with an unsigned type can be used for
    (3) unsigned comparison
    (4) signed comparisons if the type is smaller than 'int' and thus
        the C integer promotion rules will make it signed anyway

Here rule (1) and (3) are obvious, and rule (2) is important in order to
allow obvious trivial constants to be used together with unsigned
values.

Rule (4) is not necessarily a good idea, but matches what we used to do,
and we have extant cases of this situation in the kernel.  Notably with
bcachefs having an expression like

	min(bch2_bucket_sectors_dirty(a), ca->mi.bucket_size)

where bch2_bucket_sectors_dirty() returns an 's64', and
'ca->mi.bucket_size' is of type 'u16'.

Technically that bcachefs comparison is clearly sensible on a C type
level, because the 'u16' will go through the normal C integer promotion,
and become 'int', and then we're comparing two signed values and
everything looks sane.

However, it's not entirely clear that a 'min(s64,u16)' operation makes a
lot of conceptual sense, and it's possible that we will remove rule (4).
After all, the _reason_ we have these complicated type checks is exactly
that the C type promotion rules are not very intuitive.

But at least for now the rule is in place for backwards compatibility.

Also note that rule (2) existed before, but is hugely relaxed by this
commit.  It used to be true only for the simplest compile-time
non-negative integer constants.  The new macro model will allow cases
where the compiler can trivially see that an expression is non-negative
even if it isn't necessarily a constant.

For example, the amdgpu driver does

	min_t(size_t, sizeof(fru_info->serial), pia[addr] & 0x3F));

because our old 'min()' macro would see that 'pia[addr] & 0x3F' is of
type 'int' and clearly not a C constant expression, so doing a 'min()'
with a 'size_t' is a signedness violation.

Our new 'min()' macro still sees that 'pia[addr] & 0x3F' is of type
'int', but is smart enough to also see that it is clearly non-negative,
and thus would allow that case without any complaints.

Cc: Arnd Bergmann <arnd@kernel.org>
Cc: David Laight <David.Laight@aculab.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler.h |  9 ++++++
 include/linux/minmax.h   | 74 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 68 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 2594553bb30b..2df665fa2964 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -296,6 +296,15 @@ static inline void *offset_to_ptr(const int *off)
 #define is_signed_type(type) (((type)(-1)) < (__force type)1)
 #define is_unsigned_type(type) (!is_signed_type(type))
 
+/*
+ * Useful shorthand for "is this condition known at compile-time?"
+ *
+ * Note that the condition may involve non-constant values,
+ * but the compiler may know enough about the details of the
+ * values to determine that the condition is statically true.
+ */
+#define statically_true(x) (__builtin_constant_p(x) && (x))
+
 /*
  * This is needed in functions which generate the stack canary, see
  * arch/x86/kernel/smpboot.c::start_secondary() for an example.
diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index e3e4353df983..41da6f85a407 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -26,19 +26,63 @@
 #define __typecheck(x, y) \
 	(!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
 
-/* is_signed_type() isn't a constexpr for pointer types */
-#define __is_signed(x) 								\
-	__builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))),	\
-		is_signed_type(typeof(x)), 0)
+/*
+ * __sign_use for integer expressions:
+ *   bit #0 set if ok for unsigned comparisons
+ *   bit #1 set if ok for signed comparisons
+ *
+ * In particular, statically non-negative signed integer
+ * expressions are ok for both.
+ *
+ * NOTE! Unsigned types smaller than 'int' are implicitly
+ * converted to 'int' in expressions, and are accepted for
+ * signed conversions for now. This is debatable.
+ *
+ * Note that 'x' is the original expression, and 'ux' is
+ * the unique variable that contains the value.
+ *
+ * We use 'ux' for pure type checking, and 'x' for when
+ * we need to look at the value (but without evaluating
+ * it for side effects! Careful to only ever evaluate it
+ * with sizeof() or __builtin_constant_p() etc).
+ *
+ * Pointers end up being checked by the normal C type
+ * rules at the actual comparison, and these expressions
+ * only need to be careful to not cause warnings for
+ * pointer use.
+ */
+#define __signed_type_use(x,ux) (2+__is_nonneg(x,ux))
+#define __unsigned_type_use(x,ux) (1+2*(sizeof(ux)<4))
+#define __sign_use(x,ux) (is_signed_type(typeof(ux))? \
+	__signed_type_use(x,ux):__unsigned_type_use(x,ux))
+
+/*
+ * To avoid warnings about casting pointers to integers
+ * of different sizes, we need that special sign type.
+ *
+ * On 64-bit we can just always use 'long', since any
+ * integer or pointer type can just be cast to that.
+ *
+ * This does not work for 128-bit signed integers since
+ * the cast would truncate them, but we do not use s128
+ * types in the kernel (we do use 'u128', but they will
+ * be handled by the !is_signed_type() case).
+ *
+ * NOTE! The cast is there only to avoid any warnings
+ * from when values that aren't signed integer types.
+ */
+#ifdef CONFIG_64BIT
+  #define __signed_type(ux) long
+#else
+  #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux)>4,1LL,1L))
+#endif
+#define __is_nonneg(x,ux) statically_true((__signed_type(ux))(x)>=0)
 
-/* True for a non-negative signed int constant */
-#define __is_noneg_int(x)	\
-	(__builtin_choose_expr(__is_constexpr(x) && __is_signed(x), x, -1) >= 0)
+#define __types_ok(x,y,ux,uy) \
+	(__sign_use(x,ux) & __sign_use(y,uy))
 
-#define __types_ok(x, y, ux, uy) 				\
-	(__is_signed(ux) == __is_signed(uy) ||			\
-	 __is_signed((ux) + 0) == __is_signed((uy) + 0) ||	\
-	 __is_noneg_int(x) || __is_noneg_int(y))
+#define __types_ok3(x,y,z,ux,uy,uz) \
+	(__sign_use(x,ux) & __sign_use(y,uy) & __sign_use(z,uz))
 
 #define __cmp_op_min <
 #define __cmp_op_max >
@@ -53,8 +97,8 @@
 
 #define __careful_cmp_once(op, x, y, ux, uy) ({		\
 	__auto_type ux = (x); __auto_type uy = (y);	\
-	static_assert(__types_ok(x, y, ux, uy),		\
-		#op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \
+	BUILD_BUG_ON_MSG(!__types_ok(x,y,ux,uy),	\
+		#op"("#x", "#y") signedness error");	\
 	__cmp(op, ux, uy); })
 
 #define __careful_cmp(op, x, y) \
@@ -70,8 +114,8 @@
 	static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), 	\
 			(lo) <= (hi), true),					\
 		"clamp() low limit " #lo " greater than high limit " #hi);	\
-	static_assert(__types_ok(uval, lo, uval, ulo), "clamp() 'lo' signedness error");	\
-	static_assert(__types_ok(uval, hi, uval, uhi), "clamp() 'hi' signedness error");	\
+	BUILD_BUG_ON_MSG(!__types_ok3(val,lo,hi,uval,ulo,uhi),			\
+		"clamp("#val", "#lo", "#hi") signedness error");		\
 	__clamp(uval, ulo, uhi); })
 
 #define __careful_clamp(val, lo, hi) \
-- 
cgit v1.2.3


From 89add40066f9ed9abe5f7f886fe5789ff7e0c50e Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 29 Jul 2024 16:10:12 -0400
Subject: net: drop bad gso csum_start and offset in virtio_net_hdr

Tighten csum_start and csum_offset checks in virtio_net_hdr_to_skb
for GSO packets.

The function already checks that a checksum requested with
VIRTIO_NET_HDR_F_NEEDS_CSUM is in skb linear. But for GSO packets
this might not hold for segs after segmentation.

Syzkaller demonstrated to reach this warning in skb_checksum_help

	offset = skb_checksum_start_offset(skb);
	ret = -EINVAL;
	if (WARN_ON_ONCE(offset >= skb_headlen(skb)))

By injecting a TSO packet:

WARNING: CPU: 1 PID: 3539 at net/core/dev.c:3284 skb_checksum_help+0x3d0/0x5b0
 ip_do_fragment+0x209/0x1b20 net/ipv4/ip_output.c:774
 ip_finish_output_gso net/ipv4/ip_output.c:279 [inline]
 __ip_finish_output+0x2bd/0x4b0 net/ipv4/ip_output.c:301
 iptunnel_xmit+0x50c/0x930 net/ipv4/ip_tunnel_core.c:82
 ip_tunnel_xmit+0x2296/0x2c70 net/ipv4/ip_tunnel.c:813
 __gre_xmit net/ipv4/ip_gre.c:469 [inline]
 ipgre_xmit+0x759/0xa60 net/ipv4/ip_gre.c:661
 __netdev_start_xmit include/linux/netdevice.h:4850 [inline]
 netdev_start_xmit include/linux/netdevice.h:4864 [inline]
 xmit_one net/core/dev.c:3595 [inline]
 dev_hard_start_xmit+0x261/0x8c0 net/core/dev.c:3611
 __dev_queue_xmit+0x1b97/0x3c90 net/core/dev.c:4261
 packet_snd net/packet/af_packet.c:3073 [inline]

The geometry of the bad input packet at tcp_gso_segment:

[   52.003050][ T8403] skb len=12202 headroom=244 headlen=12093 tailroom=0
[   52.003050][ T8403] mac=(168,24) mac_len=24 net=(192,52) trans=244
[   52.003050][ T8403] shinfo(txflags=0 nr_frags=1 gso(size=1552 type=3 segs=0))
[   52.003050][ T8403] csum(0x60000c7 start=199 offset=1536
ip_summed=3 complete_sw=0 valid=0 level=0)

Mitigate with stricter input validation.

csum_offset: for GSO packets, deduce the correct value from gso_type.
This is already done for USO. Extend it to TSO. Let UFO be:
udp[46]_ufo_fragment ignores these fields and always computes the
checksum in software.

csum_start: finding the real offset requires parsing to the transport
header. Do not add a parser, use existing segmentation parsing. Thanks
to SKB_GSO_DODGY, that also catches bad packets that are hw offloaded.
Again test both TSO and USO. Do not test UFO for the above reason, and
do not test UDP tunnel offload.

GSO packet are almost always CHECKSUM_PARTIAL. USO packets may be
CHECKSUM_NONE since commit 10154dbded6d6 ("udp: Allow GSO transmit
from devices with no checksum offload"), but then still these fields
are initialized correctly in udp4_hwcsum/udp6_hwcsum_outgoing. So no
need to test for ip_summed == CHECKSUM_PARTIAL first.

This revises an existing fix mentioned in the Fixes tag, which broke
small packets with GSO offload, as detected by kselftests.

Link: https://syzkaller.appspot.com/bug?extid=e1db31216c789f552871
Link: https://lore.kernel.org/netdev/20240723223109.2196886-1-kuba@kernel.org
Fixes: e269d79c7d35 ("net: missing check virtio")
Cc: stable@vger.kernel.org
Signed-off-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20240729201108.1615114-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/virtio_net.h | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index d1d7825318c3..6c395a2600e8 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -56,7 +56,6 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 	unsigned int thlen = 0;
 	unsigned int p_off = 0;
 	unsigned int ip_proto;
-	u64 ret, remainder, gso_size;
 
 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
@@ -99,16 +98,6 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 		u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
 		u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16));
 
-		if (hdr->gso_size) {
-			gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size);
-			ret = div64_u64_rem(skb->len, gso_size, &remainder);
-			if (!(ret && (hdr->gso_size > needed) &&
-						((remainder > needed) || (remainder == 0)))) {
-				return -EINVAL;
-			}
-			skb_shinfo(skb)->tx_flags |= SKBFL_SHARED_FRAG;
-		}
-
 		if (!pskb_may_pull(skb, needed))
 			return -EINVAL;
 
@@ -182,6 +171,11 @@ retry:
 			if (gso_type != SKB_GSO_UDP_L4)
 				return -EINVAL;
 			break;
+		case SKB_GSO_TCPV4:
+		case SKB_GSO_TCPV6:
+			if (skb->csum_offset != offsetof(struct tcphdr, check))
+				return -EINVAL;
+			break;
 		}
 
 		/* Kernel has a special handling for GSO_BY_FRAGS. */
-- 
cgit v1.2.3


From 21b136cc63d2a9ddd60d4699552b69c214b32964 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 30 Jul 2024 15:44:16 -0700
Subject: minmax: fix up min3() and max3() too

David Laight pointed out that we should deal with the min3() and max3()
mess too, which still does excessive expansion.

And our current macros are actually rather broken.

In particular, the macros did this:

  #define min3(x, y, z) min((typeof(x))min(x, y), z)
  #define max3(x, y, z) max((typeof(x))max(x, y), z)

and that not only is a nested expansion of possibly very complex
arguments with all that involves, the typing with that "typeof()" cast
is completely wrong.

For example, imagine what happens in max3() if 'x' happens to be a
'unsigned char', but 'y' and 'z' are 'unsigned long'.  The types are
compatible, and there's no warning - but the result is just random
garbage.

No, I don't think we've ever hit that issue in practice, but since we
now have sane infrastructure for doing this right, let's just use it.
It fixes any excessive expansion, and also avoids these kinds of broken
type issues.

Requested-by: David Laight <David.Laight@aculab.com>
Acked-by: Arnd Bergmann <arnd@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/minmax.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 41da6f85a407..98008dd92153 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -152,13 +152,20 @@
 #define umax(x, y)	\
 	__careful_cmp(max, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull)
 
+#define __careful_op3(op, x, y, z, ux, uy, uz) ({			\
+	__auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\
+	BUILD_BUG_ON_MSG(!__types_ok3(x,y,z,ux,uy,uz),			\
+		#op"3("#x", "#y", "#z") signedness error");		\
+	__cmp(op, ux, __cmp(op, uy, uz)); })
+
 /**
  * min3 - return minimum of three values
  * @x: first value
  * @y: second value
  * @z: third value
  */
-#define min3(x, y, z) min((typeof(x))min(x, y), z)
+#define min3(x, y, z) \
+	__careful_op3(min, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_))
 
 /**
  * max3 - return maximum of three values
@@ -166,7 +173,8 @@
  * @y: second value
  * @z: third value
  */
-#define max3(x, y, z) max((typeof(x))max(x, y), z)
+#define max3(x, y, z) \
+	__careful_op3(max, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_))
 
 /**
  * min_not_zero - return the minimum that is _not_ zero, unless both are zero
-- 
cgit v1.2.3