From 1680ac7a5ad578d7acf819912557ceea4b4a5a88 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 7 Jul 2023 16:18:29 -0700
Subject: Input: gameport - use IS_REACHABLE() instead of open-coding it

Replace an open-coded preprocessor conditional with an equivalent helper.

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/ZKYLLmsdCH0Gp7TO@google.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/gameport.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gameport.h b/include/linux/gameport.h
index 0a221e768ea4..07e370113b2b 100644
--- a/include/linux/gameport.h
+++ b/include/linux/gameport.h
@@ -63,7 +63,7 @@ struct gameport_driver {
 int gameport_open(struct gameport *gameport, struct gameport_driver *drv, int mode);
 void gameport_close(struct gameport *gameport);
 
-#if defined(CONFIG_GAMEPORT) || (defined(MODULE) && defined(CONFIG_GAMEPORT_MODULE))
+#if IS_REACHABLE(CONFIG_GAMEPORT)
 
 void __gameport_register_port(struct gameport *gameport, struct module *owner);
 /* use a define to avoid include chaining to get THIS_MODULE */
-- 
cgit v1.2.3


From 8ce49c2a2aa53afde9a20a8ce02b069d3b262af0 Mon Sep 17 00:00:00 2001
From: Deepak Kumar Singh <quic_deesin@quicinc.com>
Date: Fri, 7 Jul 2023 03:11:36 +0530
Subject: rpmsg: core: Add signal API support

Some transports like Glink support the state notifications between
clients using flow control signals similar to serial protocol signals.
Local glink client drivers can send and receive flow control status
to glink clients running on remote processors.

Add APIs to support sending and receiving of flow control status by
rpmsg clients.

Signed-off-by: Deepak Kumar Singh <quic_deesin@quicinc.com>
Signed-off-by: Sarannya S <quic_sarannya@quicinc.com>
Acked-by: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Link: https://lore.kernel.org/r/1688679698-31274-2-git-send-email-quic_sarannya@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/rpmsg.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index 523c98b96cb4..90d8e4475f80 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -64,12 +64,14 @@ struct rpmsg_device {
 };
 
 typedef int (*rpmsg_rx_cb_t)(struct rpmsg_device *, void *, int, void *, u32);
+typedef int (*rpmsg_flowcontrol_cb_t)(struct rpmsg_device *, void *, bool);
 
 /**
  * struct rpmsg_endpoint - binds a local rpmsg address to its user
  * @rpdev: rpmsg channel device
  * @refcount: when this drops to zero, the ept is deallocated
  * @cb: rx callback handler
+ * @flow_cb: remote flow control callback handler
  * @cb_lock: must be taken before accessing/changing @cb
  * @addr: local rpmsg address
  * @priv: private data for the driver's use
@@ -92,6 +94,7 @@ struct rpmsg_endpoint {
 	struct rpmsg_device *rpdev;
 	struct kref refcount;
 	rpmsg_rx_cb_t cb;
+	rpmsg_flowcontrol_cb_t flow_cb;
 	struct mutex cb_lock;
 	u32 addr;
 	void *priv;
@@ -106,6 +109,7 @@ struct rpmsg_endpoint {
  * @probe: invoked when a matching rpmsg channel (i.e. device) is found
  * @remove: invoked when the rpmsg channel is removed
  * @callback: invoked when an inbound message is received on the channel
+ * @flowcontrol: invoked when remote side flow control request is received
  */
 struct rpmsg_driver {
 	struct device_driver drv;
@@ -113,6 +117,7 @@ struct rpmsg_driver {
 	int (*probe)(struct rpmsg_device *dev);
 	void (*remove)(struct rpmsg_device *dev);
 	int (*callback)(struct rpmsg_device *, void *, int, void *, u32);
+	int (*flowcontrol)(struct rpmsg_device *, void *, bool);
 };
 
 static inline u16 rpmsg16_to_cpu(struct rpmsg_device *rpdev, __rpmsg16 val)
@@ -192,6 +197,8 @@ __poll_t rpmsg_poll(struct rpmsg_endpoint *ept, struct file *filp,
 
 ssize_t rpmsg_get_mtu(struct rpmsg_endpoint *ept);
 
+int rpmsg_set_flow_control(struct rpmsg_endpoint *ept, bool pause, u32 dst);
+
 #else
 
 static inline int rpmsg_register_device_override(struct rpmsg_device *rpdev,
@@ -316,6 +323,14 @@ static inline ssize_t rpmsg_get_mtu(struct rpmsg_endpoint *ept)
 	return -ENXIO;
 }
 
+static inline int rpmsg_set_flow_control(struct rpmsg_endpoint *ept, bool pause, u32 dst)
+{
+	/* This shouldn't be possible */
+	WARN_ON(1);
+
+	return -ENXIO;
+}
+
 #endif /* IS_ENABLED(CONFIG_RPMSG) */
 
 /* use a macro to avoid include chaining to get THIS_MODULE */
-- 
cgit v1.2.3


From f247f08da0ce822de0d6b2feec811dd6d4d599ce Mon Sep 17 00:00:00 2001
From: Siddharth Gupta <sidgup@codeaurora.org>
Date: Fri, 24 Feb 2023 13:17:06 -0800
Subject: remoteproc: core: Export the rproc coredump APIs

The remoteproc coredump APIs are currently only part of the internal
remoteproc header. This prevents the remoteproc platform drivers from
using these APIs when needed. This change moves the rproc_coredump()
and rproc_coredump_cleanup() APIs to the linux header and marks them
as exported symbols.

Signed-off-by: Siddharth Gupta <sidgup@codeaurora.org>
Signed-off-by: Gokul krishna Krishnakumar <quic_gokukris@quicinc.com>
Link: https://lore.kernel.org/r/20230224211707.30916-2-quic_gokukris@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/remoteproc.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index fe8978eb69f1..b4795698d8c2 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -690,6 +690,10 @@ int rproc_detach(struct rproc *rproc);
 int rproc_set_firmware(struct rproc *rproc, const char *fw_name);
 void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type);
 void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem);
+
+/* from remoteproc_coredump.c */
+void rproc_coredump_cleanup(struct rproc *rproc);
+void rproc_coredump(struct rproc *rproc);
 void rproc_coredump_using_sections(struct rproc *rproc);
 int rproc_coredump_add_segment(struct rproc *rproc, dma_addr_t da, size_t size);
 int rproc_coredump_add_custom_segment(struct rproc *rproc,
-- 
cgit v1.2.3


From b4f78ff746ec5274fffa92fa2a4dc531360b5016 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Fri, 14 Jul 2023 22:56:14 +0200
Subject: pwm: Use a consistent name for pwm_chip pointers in the core
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Most variables of type struct pwm_chip * are named "chip", there are
only three outliers called "pc". Change these three to "chip", too, for
consistency.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 04ae1d9073a7..d2f9f690a9c1 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -298,7 +298,7 @@ struct pwm_chip {
 	int base;
 	unsigned int npwm;
 
-	struct pwm_device * (*of_xlate)(struct pwm_chip *pc,
+	struct pwm_device * (*of_xlate)(struct pwm_chip *chip,
 					const struct of_phandle_args *args);
 	unsigned int of_pwm_n_cells;
 
@@ -395,9 +395,9 @@ struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
 					 unsigned int index,
 					 const char *label);
 
-struct pwm_device *of_pwm_xlate_with_flags(struct pwm_chip *pc,
+struct pwm_device *of_pwm_xlate_with_flags(struct pwm_chip *chip,
 		const struct of_phandle_args *args);
-struct pwm_device *of_pwm_single_xlate(struct pwm_chip *pc,
+struct pwm_device *of_pwm_single_xlate(struct pwm_chip *chip,
 				       const struct of_phandle_args *args);
 
 struct pwm_device *pwm_get(struct device *dev, const char *con_id);
-- 
cgit v1.2.3


From 9e70a5e109a4a23367810de09be826c52d27ee2f Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 17 Jul 2023 21:52:06 +0206
Subject: printk: Add per-console suspended state

Currently the global @console_suspended is used to determine if
consoles are in a suspended state. Its primary purpose is to allow
usage of the console_lock when suspended without causing console
printing. It is synchronized by the console_lock.

Rather than relying on the console_lock to determine suspended
state, make it an official per-console state that is set within
console->flags. This allows the state to be queried via SRCU.

Remove @console_suspended. Console printing will still be avoided
when suspended because console_is_usable() returns false when
the new suspended flag is set for that console.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230717194607.145135-7-john.ogness@linutronix.de
---
 include/linux/console.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index d3195664baa5..7de11c763eb3 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -154,6 +154,8 @@ static inline int con_debug_leave(void)
  *			receiving the printk spam for obvious reasons.
  * @CON_EXTENDED:	The console supports the extended output format of
  *			/dev/kmesg which requires a larger output buffer.
+ * @CON_SUSPENDED:	Indicates if a console is suspended. If true, the
+ *			printing callbacks must not be called.
  */
 enum cons_flags {
 	CON_PRINTBUFFER		= BIT(0),
@@ -163,6 +165,7 @@ enum cons_flags {
 	CON_ANYTIME		= BIT(4),
 	CON_BRL			= BIT(5),
 	CON_EXTENDED		= BIT(6),
+	CON_SUSPENDED		= BIT(7),
 };
 
 /**
-- 
cgit v1.2.3


From 481461f5109919babbb393d6f68002936b8e2493 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sun, 16 Jul 2023 19:15:54 +0900
Subject: linux/export.h: make <linux/export.h> independent of CONFIG_MODULES

Currently, all files with EXPORT_SYMBOL() are rebuilt when CONFIG_MODULES
is flipped due to <linux/export.h> depending on CONFIG_MODULES.

Now that modpost can make a final decision about export symbols,
<linux/export.h> does not need to make EXPORT_SYMBOL() no-op.
Instead, modpost can skip emitting KSYMTAB when CONFIG_MODULES is unset.

This commit will reduce the number of recompilation when CONFIG_MODULES
is toggled.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/export.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/export.h b/include/linux/export.h
index beed8387e0a4..9911508a9604 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -50,7 +50,7 @@ extern struct module __this_module;
 		__EXPORT_SYMBOL_REF(sym)	ASM_NL	\
 	.previous
 
-#if !defined(CONFIG_MODULES) || defined(__DISABLE_EXPORTS)
+#if defined(__DISABLE_EXPORTS)
 
 /*
  * Allow symbol exports to be disabled completely so that C code may
@@ -75,7 +75,7 @@ extern struct module __this_module;
 	__ADDRESSABLE(sym)					\
 	asm(__stringify(___EXPORT_SYMBOL(sym, license, ns)))
 
-#endif /* CONFIG_MODULES */
+#endif
 
 #ifdef DEFAULT_SYMBOL_NAMESPACE
 #define _EXPORT_SYMBOL(sym, license)	__EXPORT_SYMBOL(sym, license, __stringify(DEFAULT_SYMBOL_NAMESPACE))
-- 
cgit v1.2.3


From db2d6038c5e795cab4f0a8d3e86b4f7e33338629 Mon Sep 17 00:00:00 2001
From: Benjamin Bara <benjamin.bara@skidata.com>
Date: Sat, 15 Jul 2023 09:53:25 +0200
Subject: kernel/reboot: Add device to sys_off_handler

If the dev is known (e.g. a devm-based sys_off_handler is used), it can
be passed to the handler's callback to have it available there.
Otherwise, cb_data might be set to the dev in most of the cases.

Reviewed-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Benjamin Bara <benjamin.bara@skidata.com>
Link: https://lore.kernel.org/r/20230327-tegra-pmic-reboot-v7-3-18699d5dcd76@skidata.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/reboot.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 2b6bb593be5b..c4cc3b89ced1 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -129,11 +129,14 @@ enum sys_off_mode {
  * @cb_data: User's callback data.
  * @cmd: Command string. Currently used only by the sys-off restart mode,
  *       NULL otherwise.
+ * @dev: Device of the sys-off handler. Only if known (devm_register_*),
+ *       NULL otherwise.
  */
 struct sys_off_data {
 	int mode;
 	void *cb_data;
 	const char *cmd;
+	struct device *dev;
 };
 
 struct sys_off_handler *
-- 
cgit v1.2.3


From 687fe7dfb736b03ab820d172ea5dbfc1ec447135 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Sun, 23 Jul 2023 22:30:18 -0700
Subject: Input: tca6416-keypad - always expect proper IRQ number in i2c client

Remove option having i2c client contain raw gpio number instead of proper
IRQ number. There are no users of this facility in mainline and it will
allow cleaning up the driver code with regard to wakeup handling, etc.

Link: https://lore.kernel.org/r/20230724053024.352054-1-dmitry.torokhov@gmail.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/tca6416_keypad.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tca6416_keypad.h b/include/linux/tca6416_keypad.h
index b0d36a9934cc..5cf6f6f82aa7 100644
--- a/include/linux/tca6416_keypad.h
+++ b/include/linux/tca6416_keypad.h
@@ -25,7 +25,6 @@ struct tca6416_keys_platform_data {
 	unsigned int rep:1;	/* enable input subsystem auto repeat */
 	uint16_t pinmask;
 	uint16_t invert;
-	int irq_is_gpio;
 	int use_polling;	/* use polling if Interrupt is not connected*/
 };
 #endif
-- 
cgit v1.2.3


From 63b93099359eca37c11e4d5db5ea2c3a375f6026 Mon Sep 17 00:00:00 2001
From: Sergey Shtylyov <s.shtylyov@omp.ru>
Date: Sat, 29 Jul 2023 23:17:46 +0300
Subject: ata: libata: fix parameter type of ata_deadline()

ata_deadline() passes its 'unsigned long timeout_msecs'  parameter verbatim
to msecs_to_jiffies() which takes just 'unsigned int' -- eliminate unneeded
implicit cast...

Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 820f7a3a2749..d9edb3d62a42 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1876,7 +1876,7 @@ static inline int ata_check_ready(u8 status)
 }
 
 static inline unsigned long ata_deadline(unsigned long from_jiffies,
-					 unsigned long timeout_msecs)
+					 unsigned int timeout_msecs)
 {
 	return from_jiffies + msecs_to_jiffies(timeout_msecs);
 }
-- 
cgit v1.2.3


From 84abed36d7de7145d393f9d5d05b36717e0cb49d Mon Sep 17 00:00:00 2001
From: Sergey Shtylyov <s.shtylyov@omp.ru>
Date: Sat, 29 Jul 2023 23:17:47 +0300
Subject: ata: libata-core: fix parameter types of ata_wait_register()

ata_wait_register() passes its 'unsigned long {interval|timeout}' params
verbatim to ata_{msleep|deadline}() that just take 'unsigned int' param
for the time intervals in ms -- eliminate unneeded implicit casts...

Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index d9edb3d62a42..4772f64af734 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1116,7 +1116,7 @@ static inline void ata_sas_port_resume(struct ata_port *ap)
 extern int ata_ratelimit(void);
 extern void ata_msleep(struct ata_port *ap, unsigned int msecs);
 extern u32 ata_wait_register(struct ata_port *ap, void __iomem *reg, u32 mask,
-			u32 val, unsigned long interval, unsigned long timeout);
+			     u32 val, unsigned int interval, unsigned int timeout);
 extern int atapi_cmd_type(u8 opcode);
 extern unsigned int ata_pack_xfermask(unsigned int pio_mask,
 				      unsigned int mwdma_mask,
-- 
cgit v1.2.3


From d14d41cc5aaef138face9d5a145b460e2b63697a Mon Sep 17 00:00:00 2001
From: Sergey Shtylyov <s.shtylyov@omp.ru>
Date: Sat, 29 Jul 2023 23:17:49 +0300
Subject: ata: fix debounce timings type

sata_deb_timing_{hotplug|long|normal}[] store 'unsigned long' debounce
timeouts in ms, while sata_link_debounce() eventually uses those timeouts
by calling ata_{deadline|msleep}( which take just 'unsigned int'.  Change
the debounce timeout table element's type to 'unsigned int' -- all these
timeouts happily fit into 'unsigned int'...

Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 4772f64af734..8d510fb00591 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1166,11 +1166,11 @@ extern void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *
  * SATA specific code - drivers/ata/libata-sata.c
  */
 #ifdef CONFIG_SATA_HOST
-extern const unsigned long sata_deb_timing_normal[];
-extern const unsigned long sata_deb_timing_hotplug[];
-extern const unsigned long sata_deb_timing_long[];
+extern const unsigned int sata_deb_timing_normal[];
+extern const unsigned int sata_deb_timing_hotplug[];
+extern const unsigned int sata_deb_timing_long[];
 
-static inline const unsigned long *
+static inline const unsigned int *
 sata_ehc_deb_timing(struct ata_eh_context *ehc)
 {
 	if (ehc->i.flags & ATA_EHI_HOTPLUGGED)
@@ -1185,14 +1185,14 @@ extern int sata_scr_write(struct ata_link *link, int reg, u32 val);
 extern int sata_scr_write_flush(struct ata_link *link, int reg, u32 val);
 extern int sata_set_spd(struct ata_link *link);
 extern int sata_link_hardreset(struct ata_link *link,
-			const unsigned long *timing, unsigned long deadline,
+			const unsigned int *timing, unsigned long deadline,
 			bool *online, int (*check_ready)(struct ata_link *));
-extern int sata_link_resume(struct ata_link *link, const unsigned long *params,
+extern int sata_link_resume(struct ata_link *link, const unsigned int *params,
 			    unsigned long deadline);
 extern int ata_eh_read_sense_success_ncq_log(struct ata_link *link);
 extern void ata_eh_analyze_ncq_error(struct ata_link *link);
 #else
-static inline const unsigned long *
+static inline const unsigned int *
 sata_ehc_deb_timing(struct ata_eh_context *ehc)
 {
 	return NULL;
@@ -1212,7 +1212,7 @@ static inline int sata_scr_write_flush(struct ata_link *link, int reg, u32 val)
 }
 static inline int sata_set_spd(struct ata_link *link) { return -EOPNOTSUPP; }
 static inline int sata_link_hardreset(struct ata_link *link,
-				      const unsigned long *timing,
+				      const unsigned int *timing,
 				      unsigned long deadline,
 				      bool *online,
 				      int (*check_ready)(struct ata_link *))
@@ -1222,7 +1222,7 @@ static inline int sata_link_hardreset(struct ata_link *link,
 	return -EOPNOTSUPP;
 }
 static inline int sata_link_resume(struct ata_link *link,
-				   const unsigned long *params,
+				   const unsigned int *params,
 				   unsigned long deadline)
 {
 	return -EOPNOTSUPP;
@@ -1234,7 +1234,7 @@ static inline int ata_eh_read_sense_success_ncq_log(struct ata_link *link)
 static inline void ata_eh_analyze_ncq_error(struct ata_link *link) { }
 #endif
 extern int sata_link_debounce(struct ata_link *link,
-			const unsigned long *params, unsigned long deadline);
+			      const unsigned int *params, unsigned long deadline);
 extern int sata_link_scr_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 			     bool spm_wakeup);
 extern int ata_slave_link_init(struct ata_port *ap);
-- 
cgit v1.2.3


From ff8072d589dcff7c1f0345a6ec98b5fc1e9ee2a1 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 31 Jul 2023 16:34:12 +0200
Subject: ata: libata: remove references to non-existing error_handler()

With commit 65a15d6560df ("scsi: ipr: Remove SATA support") all
libata drivers now have the error_handler() callback provided,
so we can stop checking for non-existing error_handler callback.

Signed-off-by: Hannes Reinecke <hare@suse.de>
[niklas: fixed review comments, rebased, solved conflicts during rebase,
fixed bug that unconditionally dumped all QCs, removed the now unused
function ata_dump_status(), removed the now unreachable failure paths in
atapi_qc_complete(), removed the non-EH function to request ATAPI sense]
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 8d510fb00591..d61e5465076e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1785,7 +1785,7 @@ static inline struct ata_queued_cmd *ata_qc_from_tag(struct ata_port *ap,
 {
 	struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag);
 
-	if (unlikely(!qc) || !ap->ops->error_handler)
+	if (unlikely(!qc))
 		return qc;
 
 	if ((qc->flags & (ATA_QCFLAG_ACTIVE |
-- 
cgit v1.2.3


From 43aa43351bb551a7732c1b9f6e8ebb9a6f30b063 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 31 Jul 2023 16:34:13 +0200
Subject: ata,scsi: remove ata_sas_port_{start,stop} callbacks

Callbacks are empty now, so remove them.

Also, remove the call to ap->ops->port_start() in ata_sas_port_init(),
as this would otherwise cause a NULL pointer dereference, now when the
callback is gone.

Signed-off-by: Hannes Reinecke <hare@suse.de>
[niklas: remove the call to ap->ops->port_start() in ata_sas_port_init()]
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index d61e5465076e..f8f8b558a8be 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1244,10 +1244,8 @@ extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
 extern void ata_sas_async_probe(struct ata_port *ap);
 extern int ata_sas_sync_probe(struct ata_port *ap);
 extern int ata_sas_port_init(struct ata_port *);
-extern int ata_sas_port_start(struct ata_port *ap);
 extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap);
 extern void ata_sas_tport_delete(struct ata_port *ap);
-extern void ata_sas_port_stop(struct ata_port *ap);
 extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *);
 extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap);
 extern void ata_tf_to_fis(const struct ata_taskfile *tf,
-- 
cgit v1.2.3


From 6c2fe21e08c28bbdb53ada668edea7df1aa9fb1e Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 31 Jul 2023 16:34:14 +0200
Subject: ata,scsi: remove ata_sas_port_destroy()

Is now a wrapper around kfree(), so call it directly.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index f8f8b558a8be..1d1fd16a1492 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1238,7 +1238,6 @@ extern int sata_link_debounce(struct ata_link *link,
 extern int sata_link_scr_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 			     bool spm_wakeup);
 extern int ata_slave_link_init(struct ata_port *ap);
-extern void ata_sas_port_destroy(struct ata_port *);
 extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
 					   struct ata_port_info *, struct Scsi_Host *);
 extern void ata_sas_async_probe(struct ata_port *ap);
-- 
cgit v1.2.3


From 8ac161ea2b3709b789c192de7da31a58aec9d7ee Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 31 Jul 2023 16:34:15 +0200
Subject: ata: libata-sata: remove ata_sas_sync_probe()

Unused.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 1d1fd16a1492..d08dde8c8ad2 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1241,7 +1241,6 @@ extern int ata_slave_link_init(struct ata_port *ap);
 extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
 					   struct ata_port_info *, struct Scsi_Host *);
 extern void ata_sas_async_probe(struct ata_port *ap);
-extern int ata_sas_sync_probe(struct ata_port *ap);
 extern int ata_sas_port_init(struct ata_port *);
 extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap);
 extern void ata_sas_tport_delete(struct ata_port *ap);
-- 
cgit v1.2.3


From a76f1b637ce92c2b8d5da912826079010b11f138 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 31 Jul 2023 16:34:17 +0200
Subject: ata,scsi: cleanup __ata_port_probe()

Rename __ata_port_probe() to ata_port_probe() and drop the wrapper
ata_sas_async_probe().

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index d08dde8c8ad2..7468a330fc77 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1240,7 +1240,7 @@ extern int sata_link_scr_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 extern int ata_slave_link_init(struct ata_port *ap);
 extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
 					   struct ata_port_info *, struct Scsi_Host *);
-extern void ata_sas_async_probe(struct ata_port *ap);
+extern void ata_port_probe(struct ata_port *ap);
 extern int ata_sas_port_init(struct ata_port *);
 extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap);
 extern void ata_sas_tport_delete(struct ata_port *ap);
-- 
cgit v1.2.3


From 541528170a5cb1342c378dfd46b04dfe024dbc7a Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@wdc.com>
Date: Mon, 31 Jul 2023 16:34:18 +0200
Subject: ata,scsi: remove ata_sas_port_init()

ata_sas_port_init() now only contains a single initialization.

Move this single initialization to ata_sas_port_alloc(), since:
1) ata_sas_port_alloc() already initializes some of the struct members.
2) ata_sas_port_alloc() is only used by libsas.

Suggested-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 7468a330fc77..0980992c54c2 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1241,7 +1241,6 @@ extern int ata_slave_link_init(struct ata_port *ap);
 extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
 					   struct ata_port_info *, struct Scsi_Host *);
 extern void ata_port_probe(struct ata_port *ap);
-extern int ata_sas_port_init(struct ata_port *);
 extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap);
 extern void ata_sas_tport_delete(struct ata_port *ap);
 extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *);
-- 
cgit v1.2.3


From 89329c7384ef56c407269157e30e781f55c3c4d2 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@wdc.com>
Date: Mon, 31 Jul 2023 16:34:20 +0200
Subject: ata: libata-core: remove ata_bus_probe()

Remove ata_bus_probe() as it is unused.

Also, remove references to ata_bus_probe and port_disable in
Documentation/driver-api/libata.rst, as neither exist anymore.

Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0980992c54c2..e176d832467d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -344,7 +344,6 @@ enum {
 	ATA_LINK_RESUME_TRIES	= 5,
 
 	/* how hard are we gonna try to probe/recover devices */
-	ATA_PROBE_MAX_TRIES	= 3,
 	ATA_EH_DEV_TRIES	= 3,
 	ATA_EH_PMP_TRIES	= 5,
 	ATA_EH_PMP_LINK_TRIES	= 3,
-- 
cgit v1.2.3


From 6b4f165e0858016f7bb5f360966882a819519f07 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@wdc.com>
Date: Mon, 31 Jul 2023 16:34:21 +0200
Subject: ata: libata: remove deprecated EH callbacks

Now that all libata drivers have migrated to use the error_handler
callback, remove the deprecated phy_reset and eng_timeout callbacks.

Also remove references to non-existent functions sata_phy_reset and
ata_qc_timeout from Documentation/driver-api/libata.rst.

Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index e176d832467d..52d58b13e5ee 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -975,12 +975,6 @@ struct ata_port_operations {
 	ssize_t (*transmit_led_message)(struct ata_port *ap, u32 state,
 					ssize_t size);
 
-	/*
-	 * Obsolete
-	 */
-	void (*phy_reset)(struct ata_port *ap);
-	void (*eng_timeout)(struct ata_port *ap);
-
 	/*
 	 * ->inherits must be the last field and all the preceding
 	 * fields must be pointers.
-- 
cgit v1.2.3


From 54e73cd52250adeba836cd3afef3658b48ae8dc9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 1 Aug 2023 12:58:15 +0200
Subject: virtio: Remove PM #ifdef guards to fix i2c driver

A cleanup in the virtio i2c caused a build failure:

drivers/i2c/busses/i2c-virtio.c:270:10: error: 'struct virtio_driver' has no member named 'freeze'
drivers/i2c/busses/i2c-virtio.c:271:10: error: 'struct virtio_driver' has no member named 'restore'

Change the structure definition to allow this cleanup to
be applied everywhere.

Fixes: 73d546c76235b ("i2c: virtio: Remove #ifdef guards for PM related functions")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Paul Cercueil <paul@crapouillou.net>
Reviewed-by: Andi Shyti <andi.shyti@kernel.org>
Link: https://lore.kernel.org/r/20230801105846.3708252-1-arnd@kernel.org
Signed-off-by: Andi Shyti <andi.shyti@kernel.org>
---
 include/linux/virtio.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index de6041deee37..7ed071b5ef07 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -184,10 +184,8 @@ struct virtio_driver {
 	void (*scan)(struct virtio_device *dev);
 	void (*remove)(struct virtio_device *dev);
 	void (*config_changed)(struct virtio_device *dev);
-#ifdef CONFIG_PM
 	int (*freeze)(struct virtio_device *dev);
 	int (*restore)(struct virtio_device *dev);
-#endif
 };
 
 static inline struct virtio_driver *drv_to_virtio(struct device_driver *drv)
-- 
cgit v1.2.3


From d890cfc25fe9421ffdff3a9ea678172addb36762 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Mon, 7 Aug 2023 09:31:06 +0200
Subject: rtc: ds2404: Convert to GPIO descriptors

This converts the DS2404 to use GPIO descriptors instead of
hard-coded global GPIO numbers.

The platform data can be deleted because there are no in-tree
users and it only contained GPIO numbers which are now
passed using descriptor tables (or device tree or ACPI).

The driver was rewritten to use a state container for the
device driver state (struct ds2404 *chip) and pass that
around instead of using a global singleton storage for the
GPIO handles.

When declaring GPIO descriptor tables or other hardware
descriptions for the RTC driver, implementers should take care
to flag the RESET line as active low, such as by using the
GPIOD_ACTIVE_LOW flag in the descriptor table.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20230807-descriptors-rtc-v1-1-ce0f9187576e@linaro.org
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/platform_data/rtc-ds2404.h | 20 --------------------
 1 file changed, 20 deletions(-)
 delete mode 100644 include/linux/platform_data/rtc-ds2404.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/rtc-ds2404.h b/include/linux/platform_data/rtc-ds2404.h
deleted file mode 100644
index 22c53825528f..000000000000
--- a/include/linux/platform_data/rtc-ds2404.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * ds2404.h - platform data structure for the DS2404 RTC.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2012 Sven Schnelle <svens@stackframe.org>
- */
-
-#ifndef __LINUX_DS2404_H
-#define __LINUX_DS2404_H
-
-struct ds2404_platform_data {
-
-	unsigned int gpio_rst;
-	unsigned int gpio_clk;
-	unsigned int gpio_dq;
-};
-#endif
-- 
cgit v1.2.3


From afb48153220d35f330d0d979792920a31f7d9a81 Mon Sep 17 00:00:00 2001
From: Jean-Jacques Hiblot <jjhiblot@traphandler.com>
Date: Fri, 28 Jul 2023 17:37:28 +0200
Subject: leds: Provide devm_of_led_get_optional()

Add an optional variant of devm_of_led_get(). It behaves the same as
devm_of_led_get() except where the LED doesn't exist. In this case,
instead of returning -ENOENT, the function returns NULL.

Signed-off-by: Jean-Jacques Hiblot <jjhiblot@traphandler.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20230728153731.3742339-2-jjhiblot@traphandler.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/leds.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index 7d428100b42b..8740b4e47f88 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -313,6 +313,8 @@ extern struct led_classdev *of_led_get(struct device_node *np, int index);
 extern void led_put(struct led_classdev *led_cdev);
 struct led_classdev *__must_check devm_of_led_get(struct device *dev,
 						  int index);
+struct led_classdev *__must_check devm_of_led_get_optional(struct device *dev,
+						  int index);
 
 /**
  * led_blink_set - set blinking with software fallback
-- 
cgit v1.2.3


From c7d80059b086c4986cd994a1973ec7a5d75f8eea Mon Sep 17 00:00:00 2001
From: Jean-Jacques Hiblot <jjhiblot@traphandler.com>
Date: Fri, 28 Jul 2023 17:37:29 +0200
Subject: leds: class: Store the color index in struct led_classdev

Store the color of the LED so that it is not lost after the LED's
name has been composed. This color information can then be exposed to
the user space or used by the LED consumer.

Signed-off-by: Jean-Jacques Hiblot <jjhiblot@traphandler.com>
Link: https://lore.kernel.org/r/20230728153731.3742339-3-jjhiblot@traphandler.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/leds.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index 8740b4e47f88..aa16dc2a8230 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -100,6 +100,7 @@ struct led_classdev {
 	const char		*name;
 	unsigned int brightness;
 	unsigned int max_brightness;
+	unsigned int color;
 	int			 flags;
 
 	/* Lower 16 bits reflect status */
-- 
cgit v1.2.3


From a1342c8027288e345cc5fd16c6800f9d4eb788ed Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 11 Aug 2023 04:51:14 +0000
Subject: KVM: Rename kvm_arch_flush_remote_tlb() to
 kvm_arch_flush_remote_tlbs()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename kvm_arch_flush_remote_tlb() and the associated macro
__KVM_HAVE_ARCH_FLUSH_REMOTE_TLB to kvm_arch_flush_remote_tlbs() and
__KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS respectively.

Making the name plural matches kvm_flush_remote_tlbs() and makes it more
clear that this function can affect more than one remote TLB.

No functional change intended.

Signed-off-by: David Matlack <dmatlack@google.com>
Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Acked-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230811045127.3308641-2-rananta@google.com
---
 include/linux/kvm_host.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9d3ac7720da9..e3f968b38ae9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1479,8 +1479,8 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
 }
 #endif
 
-#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
-static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
+static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
 {
 	return -ENOTSUPP;
 }
-- 
cgit v1.2.3


From cfb0c08e80120928dda1e951718be135abd49bae Mon Sep 17 00:00:00 2001
From: Raghavendra Rao Ananta <rananta@google.com>
Date: Fri, 11 Aug 2023 04:51:15 +0000
Subject: KVM: Declare kvm_arch_flush_remote_tlbs() globally

There's no reason for the architectures to declare
kvm_arch_flush_remote_tlbs() in their own headers. Hence to
avoid this duplication, make the declaration global, leaving
the architectures to define only __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
as needed.

Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230811045127.3308641-3-rananta@google.com
---
 include/linux/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e3f968b38ae9..ade5d4500c2c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1484,6 +1484,8 @@ static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
 {
 	return -ENOTSUPP;
 }
+#else
+int kvm_arch_flush_remote_tlbs(struct kvm *kvm);
 #endif
 
 #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA
-- 
cgit v1.2.3


From d4788996051e3c07fadc6d9b214073fcf78810a8 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 11 Aug 2023 04:51:18 +0000
Subject: KVM: Allow range-based TLB invalidation from common code

Make kvm_flush_remote_tlbs_range() visible in common code and create a
default implementation that just invalidates the whole TLB.

This paves the way for several future features/cleanups:

 - Introduction of range-based TLBI on ARM.
 - Eliminating kvm_arch_flush_remote_tlbs_memslot()
 - Moving the KVM/x86 TDP MMU to common code.

No functional change intended.

Signed-off-by: David Matlack <dmatlack@google.com>
Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Acked-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230811045127.3308641-6-rananta@google.com
---
 include/linux/kvm_host.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ade5d4500c2c..89d2614e4b7a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1359,6 +1359,7 @@ int kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode);
 
 void kvm_flush_remote_tlbs(struct kvm *kvm);
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
 
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
@@ -1488,6 +1489,16 @@ static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
 int kvm_arch_flush_remote_tlbs(struct kvm *kvm);
 #endif
 
+#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
+static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
+						    gfn_t gfn, u64 nr_pages)
+{
+	return -EOPNOTSUPP;
+}
+#else
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
+#endif
+
 #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA
 void kvm_arch_register_noncoherent_dma(struct kvm *kvm);
 void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm);
-- 
cgit v1.2.3


From 619b5072443c05cf18c31b2c0320cdb42396d411 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 11 Aug 2023 04:51:19 +0000
Subject: KVM: Move kvm_arch_flush_remote_tlbs_memslot() to common code

Move kvm_arch_flush_remote_tlbs_memslot() to common code and drop
"arch_" from the name. kvm_arch_flush_remote_tlbs_memslot() is just a
range-based TLB invalidation where the range is defined by the memslot.
Now that kvm_flush_remote_tlbs_range() can be called from common code we
can just use that and drop a bunch of duplicate code from the arch
directories.

Note this adds a lockdep assertion for slots_lock being held when
calling kvm_flush_remote_tlbs_memslot(), which was previously only
asserted on x86. MIPS has calls to kvm_flush_remote_tlbs_memslot(),
but they all hold the slots_lock, so the lockdep assertion continues to
hold true.

Also drop the CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT ifdef gating
kvm_flush_remote_tlbs_memslot(), since it is no longer necessary.

Signed-off-by: David Matlack <dmatlack@google.com>
Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Acked-by: Anup Patel <anup@brainfault.org>
Acked-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230811045127.3308641-7-rananta@google.com
---
 include/linux/kvm_host.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 89d2614e4b7a..394db2ce11e2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1360,6 +1360,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode);
 
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
+void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
+				   const struct kvm_memory_slot *memslot);
 
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
@@ -1388,10 +1390,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 					unsigned long mask);
 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);
 
-#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
-void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-					const struct kvm_memory_slot *memslot);
-#else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
+#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);
 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
 		      int *is_dirty, struct kvm_memory_slot **memslot);
-- 
cgit v1.2.3


From 3e1efe2b67d3d38116ec010968dbcd89d29e4561 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 28 Jul 2023 17:41:44 -0700
Subject: KVM: Wrap kvm_{gfn,hva}_range.pte in a per-action union

Wrap kvm_{gfn,hva}_range.pte in a union so that future notifier events can
pass event specific information up and down the stack without needing to
constantly expand and churn the APIs.  Lockless aging of SPTEs will pass
around a bitmap, and support for memory attributes will pass around the
new attributes for the range.

Add a "KVM_NO_ARG" placeholder to simplify handling events without an
argument (creating a dummy union variable is midly annoying).

Opportunstically drop explicit zero-initialization of the "pte" field, as
omitting the field (now a union) has the same effect.

Cc: Yu Zhao <yuzhao@google.com>
Link: https://lore.kernel.org/all/CAOUHufagkd2Jk3_HrVoFFptRXM=hX2CV8f+M-dka-hJU4bP8kw@mail.gmail.com
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Acked-by: Yu Zhao <yuzhao@google.com>
Link: https://lore.kernel.org/r/20230729004144.1054885-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9d3ac7720da9..9125d0ab642d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -256,11 +256,15 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+union kvm_mmu_notifier_arg {
+	pte_t pte;
+};
+
 struct kvm_gfn_range {
 	struct kvm_memory_slot *slot;
 	gfn_t start;
 	gfn_t end;
-	pte_t pte;
+	union kvm_mmu_notifier_arg arg;
 	bool may_block;
 };
 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
-- 
cgit v1.2.3


From 1f8403953f05af591ab72cf749b9b9b837ea9595 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Mon, 14 Aug 2023 22:03:39 +0800
Subject: KVM: Remove unused kvm_device_{get,put}() declarations

Commit 07f0a7bdec5c ("kvm: destroy emulated devices on VM exit") removed the
functions but not these declarations.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230814140339.47732-1-yuehaibing@huawei.com
[sean: split to separate patch]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9125d0ab642d..a973a7cb45e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2152,8 +2152,6 @@ struct kvm_device_ops {
 	int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma);
 };
 
-void kvm_device_get(struct kvm_device *dev);
-void kvm_device_put(struct kvm_device *dev);
 struct kvm_device *kvm_device_from_filp(struct file *filp);
 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type);
 void kvm_unregister_device_ops(u32 type);
-- 
cgit v1.2.3


From 458933d33af2cb3663bd8c0080c1efd1f9483db4 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Mon, 14 Aug 2023 22:03:39 +0800
Subject: KVM: Remove unused kvm_make_cpus_request_mask() declaration

Commit 7ee30bc132c6 ("KVM: x86: deliver KVM IOAPIC scan request to target
vCPUs") declared but never implemented kvm_make_cpus_request_mask() as
kvm_make_vcpus_request_mask() already existed.

Note, KVM's APIs are painfully inconsistent, as the inclusive variant uses
"vcpus", whereas the exclusive/all variants use "cpus", which is likely
what led to the spurious declaration.  The "vcpus" terminology is more
correct, especially since the helpers will kick _physical_ CPUs by calling
kvm_kick_many_cpus().  But that's a cleanup for the future.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230814140339.47732-1-yuehaibing@huawei.com
[sean: split to separate patch, call out inconsistent naming]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a973a7cb45e0..199698a5ffa6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -190,8 +190,6 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
 				      struct kvm_vcpu *except);
-bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req,
-				unsigned long *vcpu_bitmap);
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
-- 
cgit v1.2.3


From 2459f4dfe5529f8b847f452473e2da08a8fc9fe5 Mon Sep 17 00:00:00 2001
From: Yangtao Li <frank.li@vivo.com>
Date: Thu, 6 Jul 2023 19:39:39 +0800
Subject: mfd: hi655x-pmic: Convert to devm_platform_ioremap_resource()

Use devm_platform_ioremap_resource() to simplify code.

Signed-off-by: Yangtao Li <frank.li@vivo.com>
Link: https://lore.kernel.org/r/20230706113939.1178-7-frank.li@vivo.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/hi655x-pmic.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/hi655x-pmic.h b/include/linux/mfd/hi655x-pmic.h
index 6a012784dd1b..194556851ccf 100644
--- a/include/linux/mfd/hi655x-pmic.h
+++ b/include/linux/mfd/hi655x-pmic.h
@@ -52,7 +52,6 @@
 #define OTMP_D1R_INT_MASK               BIT(OTMP_D1R_INT)
 
 struct hi655x_pmic {
-	struct resource *res;
 	struct device *dev;
 	struct regmap *regmap;
 	struct gpio_desc *gpio;
-- 
cgit v1.2.3


From 2dfe293bcde2d302c045d0cd536ccb15f86385d2 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 20 Jul 2023 22:37:38 +0800
Subject: mfd: db8500-prcmu: Remove unused inline functions

Since commit b0e846248de5 ("mfd: db8500-prcmu: Remove dead code for a non-existing config")
these inline helpers also no need any more.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230720143738.13996-1-yuehaibing@huawei.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/dbx500-prcmu.h | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/dbx500-prcmu.h b/include/linux/mfd/dbx500-prcmu.h
index e7a7e70fdb38..dd0fc891b228 100644
--- a/include/linux/mfd/dbx500-prcmu.h
+++ b/include/linux/mfd/dbx500-prcmu.h
@@ -556,16 +556,6 @@ static inline void prcmu_clear(unsigned int reg, u32 bits)
 #define PRCMU_QOS_ARM_OPP 3
 #define PRCMU_QOS_DEFAULT_VALUE -1
 
-static inline unsigned long prcmu_qos_get_cpufreq_opp_delay(void)
-{
-	return 0;
-}
-
-static inline int prcmu_qos_requirement(int prcmu_qos_class)
-{
-	return 0;
-}
-
 static inline int prcmu_qos_add_requirement(int prcmu_qos_class,
 					    char *name, s32 value)
 {
@@ -582,15 +572,4 @@ static inline void prcmu_qos_remove_requirement(int prcmu_qos_class, char *name)
 {
 }
 
-static inline int prcmu_qos_add_notifier(int prcmu_qos_class,
-					 struct notifier_block *notifier)
-{
-	return 0;
-}
-static inline int prcmu_qos_remove_notifier(int prcmu_qos_class,
-					    struct notifier_block *notifier)
-{
-	return 0;
-}
-
 #endif /* __MACH_PRCMU_H */
-- 
cgit v1.2.3


From 10d3340441bd0db857fc7fcb1733a800acf47a3d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 19 Jul 2023 11:02:23 +0200
Subject: mfd: rz-mtu3: Link time dependencies

The new set of drivers for RZ/G2L MTU3a tries to enable compile-testing the
individual client drivers even when the MFD portion is disabled but gets it
wrong, causing a link failure when the core is in a loadable module but the
other drivers are built-in:

x86_64-linux-ld: drivers/pwm/pwm-rz-mtu3.o: in function `rz_mtu3_pwm_apply':
pwm-rz-mtu3.c:(.text+0x4bf): undefined reference to `rz_mtu3_8bit_ch_write'
x86_64-linux-ld: pwm-rz-mtu3.c:(.text+0x509): undefined reference to `rz_mtu3_disable'

arm-linux-gnueabi-ld: drivers/counter/rz-mtu3-cnt.o: in function `rz_mtu3_cascade_counts_enable_get':
rz-mtu3-cnt.c:(.text+0xbec): undefined reference to `rz_mtu3_shared_reg_read'

It seems better not to add the extra complexity here but instead just use
a normal hard dependency, so remove the #else portion in the header along
with the "|| COMPILE_TEST". This could also be fixed by having slightly more
elaborate Kconfig dependencies or using the cursed 'IS_REACHABLE()' helper,
but in practice it's already possible to compile-test all these drivers
by enabling the mtd portion.

Fixes: 254d3a727421c ("pwm: Add Renesas RZ/G2L MTU3a PWM driver")
Fixes: 0be8907359df4 ("counter: Add Renesas RZ/G2L MTU3a counter driver")
Fixes: 654c293e1687b ("mfd: Add Renesas RZ/G2L MTU3a core driver")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Thierry Reding <thierry.reding@gmail.com>
Reviewed-by: Biju Das <biju.das.jz@bp.renesas.com>
Link: https://lore.kernel.org/r/20230719090430.1925182-1-arnd@kernel.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/rz-mtu3.h | 66 ---------------------------------------------
 1 file changed, 66 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/rz-mtu3.h b/include/linux/mfd/rz-mtu3.h
index c5173bc06270..8421d49500bf 100644
--- a/include/linux/mfd/rz-mtu3.h
+++ b/include/linux/mfd/rz-mtu3.h
@@ -151,7 +151,6 @@ struct rz_mtu3 {
 	void *priv_data;
 };
 
-#if IS_ENABLED(CONFIG_RZ_MTU3)
 static inline bool rz_mtu3_request_channel(struct rz_mtu3_channel *ch)
 {
 	mutex_lock(&ch->lock);
@@ -188,70 +187,5 @@ void rz_mtu3_32bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u32 val);
 void rz_mtu3_shared_reg_write(struct rz_mtu3_channel *ch, u16 off, u16 val);
 void rz_mtu3_shared_reg_update_bit(struct rz_mtu3_channel *ch, u16 off,
 				   u16 pos, u8 val);
-#else
-static inline bool rz_mtu3_request_channel(struct rz_mtu3_channel *ch)
-{
-	return false;
-}
-
-static inline void rz_mtu3_release_channel(struct rz_mtu3_channel *ch)
-{
-}
-
-static inline bool rz_mtu3_is_enabled(struct rz_mtu3_channel *ch)
-{
-	return false;
-}
-
-static inline void rz_mtu3_disable(struct rz_mtu3_channel *ch)
-{
-}
-
-static inline int rz_mtu3_enable(struct rz_mtu3_channel *ch)
-{
-	return 0;
-}
-
-static inline u8 rz_mtu3_8bit_ch_read(struct rz_mtu3_channel *ch, u16 off)
-{
-	return 0;
-}
-
-static inline u16 rz_mtu3_16bit_ch_read(struct rz_mtu3_channel *ch, u16 off)
-{
-	return 0;
-}
-
-static inline u32 rz_mtu3_32bit_ch_read(struct rz_mtu3_channel *ch, u16 off)
-{
-	return 0;
-}
-
-static inline u16 rz_mtu3_shared_reg_read(struct rz_mtu3_channel *ch, u16 off)
-{
-	return 0;
-}
-
-static inline void rz_mtu3_8bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u8 val)
-{
-}
-
-static inline void rz_mtu3_16bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u16 val)
-{
-}
-
-static inline void rz_mtu3_32bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u32 val)
-{
-}
-
-static inline void rz_mtu3_shared_reg_write(struct rz_mtu3_channel *ch, u16 off, u16 val)
-{
-}
-
-static inline void rz_mtu3_shared_reg_update_bit(struct rz_mtu3_channel *ch,
-						 u16 off, u16 pos, u8 val)
-{
-}
-#endif
 
 #endif /* __MFD_RZ_MTU3_H__ */
-- 
cgit v1.2.3


From e0d77323824054f15f3137e890f7addc48198a3e Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Fri, 28 Jul 2023 21:27:09 +0800
Subject: mfd: max77686: Remove unused extern declarations

max77686_irq_init() and max77686_irq_exit() are not used since
commit 6f1c1e71d933 ("mfd: max77686: Convert to use regmap_irq").
And max77686_irq_resume() never be implemented since introduced in
commit dae8a969d512 ("mfd: Add Maxim 77686 driver").

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Link: https://lore.kernel.org/r/20230728132709.27052-1-yuehaibing@huawei.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/max77686-private.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/max77686-private.h b/include/linux/mfd/max77686-private.h
index 3acceeedbaba..ea635d12a741 100644
--- a/include/linux/mfd/max77686-private.h
+++ b/include/linux/mfd/max77686-private.h
@@ -441,8 +441,4 @@ enum max77686_types {
 	TYPE_MAX77802,
 };
 
-extern int max77686_irq_init(struct max77686_dev *max77686);
-extern void max77686_irq_exit(struct max77686_dev *max77686);
-extern int max77686_irq_resume(struct max77686_dev *max77686);
-
 #endif /*  __LINUX_MFD_MAX77686_PRIV_H */
-- 
cgit v1.2.3


From 733e2e9a28e6fa109e51e0b77901552f69df0ef1 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Fri, 28 Jul 2023 21:24:39 +0800
Subject: mfd: ab8500: Remove unused extern declarations

commit d28f1db8187d ("mfd: Remove confusing ab8500-i2c file and merge into ab8500-core")
left behind this.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230728132439.31568-1-yuehaibing@huawei.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/abx500/ab8500.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/abx500/ab8500.h b/include/linux/mfd/abx500/ab8500.h
index 302a330c5c84..09fb3c56e7d7 100644
--- a/include/linux/mfd/abx500/ab8500.h
+++ b/include/linux/mfd/abx500/ab8500.h
@@ -382,10 +382,6 @@ struct ab8500_platform_data {
 	struct ab8500_sysctrl_platform_data *sysctrl;
 };
 
-extern int ab8500_init(struct ab8500 *ab8500,
-				 enum ab8500_version version);
-extern int ab8500_exit(struct ab8500 *ab8500);
-
 extern int ab8500_suspend(struct ab8500 *ab8500);
 
 static inline int is_ab8500(struct ab8500 *ab)
-- 
cgit v1.2.3


From 54ab43a957bcb2643c13df5ab71de9dc3f72e5a6 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Fri, 28 Jul 2023 21:28:41 +0800
Subject: mfd: 88pm860x: Remove unused extern declarations

commit 260a127bfbeb ("mfd: 88pm860x-i2c: Purge unused functions")
left behind this.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230728132841.10648-1-yuehaibing@huawei.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/88pm860x.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/88pm860x.h b/include/linux/mfd/88pm860x.h
index 473545a2c425..6fa21791fc85 100644
--- a/include/linux/mfd/88pm860x.h
+++ b/include/linux/mfd/88pm860x.h
@@ -472,13 +472,7 @@ extern int pm860x_bulk_read(struct i2c_client *, int, int, unsigned char *);
 extern int pm860x_bulk_write(struct i2c_client *, int, int, unsigned char *);
 extern int pm860x_set_bits(struct i2c_client *, int, unsigned char,
 			   unsigned char);
-extern int pm860x_page_reg_read(struct i2c_client *, int);
 extern int pm860x_page_reg_write(struct i2c_client *, int, unsigned char);
 extern int pm860x_page_bulk_read(struct i2c_client *, int, int,
 				 unsigned char *);
-extern int pm860x_page_bulk_write(struct i2c_client *, int, int,
-				  unsigned char *);
-extern int pm860x_page_set_bits(struct i2c_client *, int, unsigned char,
-				unsigned char);
-
 #endif /* __LINUX_MFD_88PM860X_H */
-- 
cgit v1.2.3


From 875386b98857822b77ac7f95bdf367b70af5b78c Mon Sep 17 00:00:00 2001
From: Manish Rangankar <mrangankar@marvell.com>
Date: Mon, 21 Aug 2023 18:30:37 +0530
Subject: scsi: qla2xxx: Add Unsolicited LS Request and Response Support for
 NVMe

Introduce infrastructure in the driver to support the processing of
unsolicited LS (Link Service) requests. This will involve the utilization
of a new pass-up of unsolicited FC-NVMe request IOCB interface. Unsolicited
requests will be submitted to the NVMe transport layer through
nvme_fc_rcv_ls_req(). Any received LS responses, which are sent using
xmt_ls_rsp(), will be forwarded to the firmware through the existing
Pass-Through IOCB interface, responsible for sending FC-NVMe Link Service
requests and responses.

Signed-off-by: Manish Rangankar <mrangankar@marvell.com>
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Link: https://lore.kernel.org/r/20230821130045.34850-2-njavali@marvell.com
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/nvme-fc-driver.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index 4109f1bd6128..f6ef8cf5d774 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -53,10 +53,10 @@
 struct nvmefc_ls_req {
 	void			*rqstaddr;
 	dma_addr_t		rqstdma;
-	u32			rqstlen;
+	__le32			rqstlen;
 	void			*rspaddr;
 	dma_addr_t		rspdma;
-	u32			rsplen;
+	__le32			rsplen;
 	u32			timeout;
 
 	void			*private;
@@ -120,7 +120,7 @@ struct nvmefc_ls_req {
 struct nvmefc_ls_rsp {
 	void		*rspbuf;
 	dma_addr_t	rspdma;
-	u16		rsplen;
+	__le32		rsplen;
 
 	void (*done)(struct nvmefc_ls_rsp *rsp);
 	void		*nvme_fc_private;	/* LLDD is not to access !! */
-- 
cgit v1.2.3


From 08b8a0440eeec83f8330349f829908858fd52d31 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 14 Mar 2022 15:47:18 -0400
Subject: libceph: add spinlock around osd->o_requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In a later patch, we're going to need to search for a request in
the rbtree, but taking the o_mutex is inconvenient as we already
hold the con mutex at the point where we need it.

Add a new spinlock that we take when inserting and erasing entries from
the o_requests tree. Search of the rbtree can be done with either the
mutex or the spinlock, but insertion and removal requires both.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index fb6be72104df..92addef18738 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -29,7 +29,12 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
 
 #define CEPH_HOMELESS_OSD	-1
 
-/* a given osd we're communicating with */
+/*
+ * A given osd we're communicating with.
+ *
+ * Note that the o_requests tree can be searched while holding the "lock" mutex
+ * or the "o_requests_lock" spinlock. Insertion or removal requires both!
+ */
 struct ceph_osd {
 	refcount_t o_ref;
 	struct ceph_osd_client *o_osdc;
@@ -37,6 +42,7 @@ struct ceph_osd {
 	int o_incarnation;
 	struct rb_node o_node;
 	struct ceph_connection o_con;
+	spinlock_t o_requests_lock;
 	struct rb_root o_requests;
 	struct rb_root o_linger_requests;
 	struct rb_root o_backoff_mappings;
-- 
cgit v1.2.3


From a679e50f728648f7b2f3b349e082448abd388038 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 16 Mar 2022 15:23:00 -0400
Subject: libceph: define struct ceph_sparse_extent and add some helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the OSD sends back a sparse read reply, it contains an array of
these structures. Define the structure and add a couple of helpers for
dealing with them.

Also add a place in struct ceph_osd_req_op to store the extent buffer,
and code to free it if it's populated when the req is torn down.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 43 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 92addef18738..05da1e755b7b 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -29,6 +29,17 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
 
 #define CEPH_HOMELESS_OSD	-1
 
+/*
+ * A single extent in a SPARSE_READ reply.
+ *
+ * Note that these come from the OSD as little-endian values. On BE arches,
+ * we convert them in-place after receipt.
+ */
+struct ceph_sparse_extent {
+	u64	off;
+	u64	len;
+} __packed;
+
 /*
  * A given osd we're communicating with.
  *
@@ -104,6 +115,8 @@ struct ceph_osd_req_op {
 			u64 offset, length;
 			u64 truncate_size;
 			u32 truncate_seq;
+			int sparse_ext_cnt;
+			struct ceph_sparse_extent *sparse_ext;
 			struct ceph_osd_data osd_data;
 		} extent;
 		struct {
@@ -510,6 +523,20 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 				      u32 truncate_seq, u64 truncate_size,
 				      bool use_mempool);
 
+int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt);
+
+/*
+ * How big an extent array should we preallocate for a sparse read? This is
+ * just a starting value.  If we get more than this back from the OSD, the
+ * receiver will reallocate.
+ */
+#define CEPH_SPARSE_EXT_ARRAY_INITIAL  16
+
+static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op)
+{
+	return __ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL);
+}
+
 extern void ceph_osdc_get_request(struct ceph_osd_request *req);
 extern void ceph_osdc_put_request(struct ceph_osd_request *req);
 
@@ -564,5 +591,19 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
 			    struct ceph_object_locator *oloc,
 			    struct ceph_watch_item **watchers,
 			    u32 *num_watchers);
-#endif
 
+/* Find offset into the buffer of the end of the extent map */
+static inline u64 ceph_sparse_ext_map_end(struct ceph_osd_req_op *op)
+{
+	struct ceph_sparse_extent *ext;
+
+	/* No extents? No data */
+	if (op->extent.sparse_ext_cnt == 0)
+		return 0;
+
+	ext = &op->extent.sparse_ext[op->extent.sparse_ext_cnt - 1];
+
+	return ext->off + ext->len - op->extent.offset;
+}
+
+#endif
-- 
cgit v1.2.3


From ec3bc567eac12c557a2b99bd0b34b5dff12cab23 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 25 Jan 2022 08:26:31 -0500
Subject: libceph: new sparse_read op, support sparse reads on msgr2 crc
 codepath
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for a new sparse_read ceph_connection operation. The idea is
that the client driver can define this operation use it to do special
handling for incoming reads.

The alloc_msg routine will look at the request and determine whether the
reply is expected to be sparse. If it is, then we'll dispatch to a
different set of state machine states that will repeatedly call the
driver's sparse_read op to get length and placement info for reading the
extent map, and the extents themselves.

This necessitates adding some new field to some other structs:

- The msg gets a new bool to track whether it's a sparse_read request.

- A new field is added to the cursor to track the amount remaining in the
current extent. This is used to cap the read from the socket into the
msg_data

- Handing a revoke with all of this is particularly difficult, so I've
added a new data_len_remain field to the v2 connection info, and then
use that to skip that much on a revoke. We may want to expand the use of
that to the normal read path as well, just for consistency's sake.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 99c1726be6ee..8a6938fa324e 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -17,6 +17,7 @@
 
 struct ceph_msg;
 struct ceph_connection;
+struct ceph_msg_data_cursor;
 
 /*
  * Ceph defines these callbacks for handling connection events.
@@ -70,6 +71,30 @@ struct ceph_connection_operations {
 				      int used_proto, int result,
 				      const int *allowed_protos, int proto_cnt,
 				      const int *allowed_modes, int mode_cnt);
+
+	/**
+	 * sparse_read: read sparse data
+	 * @con: connection we're reading from
+	 * @cursor: data cursor for reading extents
+	 * @buf: optional buffer to read into
+	 *
+	 * This should be called more than once, each time setting up to
+	 * receive an extent into the current cursor position, and zeroing
+	 * the holes between them.
+	 *
+	 * Returns amount of data to be read (in bytes), 0 if reading is
+	 * complete, or -errno if there was an error.
+	 *
+	 * If @buf is set on a >0 return, then the data should be read into
+	 * the provided buffer. Otherwise, it should be read into the cursor.
+	 *
+	 * The sparse read operation is expected to initialize the cursor
+	 * with a length covering up to the end of the last extent.
+	 */
+	int (*sparse_read)(struct ceph_connection *con,
+			   struct ceph_msg_data_cursor *cursor,
+			   char **buf);
+
 };
 
 /* use format string %s%lld */
@@ -207,6 +232,7 @@ struct ceph_msg_data_cursor {
 
 	struct ceph_msg_data	*data;		/* current data item */
 	size_t			resid;		/* bytes not yet consumed */
+	int			sr_resid;	/* residual sparse_read len */
 	bool			need_crc;	/* crc update needed */
 	union {
 #ifdef CONFIG_BLOCK
@@ -251,6 +277,7 @@ struct ceph_msg {
 	struct kref kref;
 	bool more_to_follow;
 	bool needs_out_seq;
+	bool sparse_read;
 	int front_alloc_len;
 
 	struct ceph_msgpool *pool;
@@ -395,6 +422,7 @@ struct ceph_connection_v2_info {
 
 	void *conn_bufs[16];
 	int conn_buf_cnt;
+	int data_len_remain;
 
 	struct kvec in_sign_kvecs[8];
 	struct kvec out_sign_kvecs[8];
-- 
cgit v1.2.3


From d396f89db39a2f259e2125ca43b4c31bb65afcad Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Thu, 24 Mar 2022 13:33:06 -0400
Subject: libceph: add sparse read support to msgr1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add 2 new fields to ceph_connection_v1_info to track the necessary info
in sparse reads. Skip initializing the cursor for a sparse read.

Break out read_partial_message_section into a wrapper around a new
read_partial_message_chunk function that doesn't zero out the crc first.

Add new helper functions to drive receiving into the destinations
provided by the sparse_read state machine.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 8a6938fa324e..9fd7255172ad 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -336,6 +336,10 @@ struct ceph_connection_v1_info {
 
 	int in_base_pos;     /* bytes read */
 
+	/* sparse reads */
+	struct kvec in_sr_kvec; /* current location to receive into */
+	u64 in_sr_len;		/* amount of data in this extent */
+
 	/* message in temps */
 	u8 in_tag;           /* protocol control byte */
 	struct ceph_msg_header in_hdr;
-- 
cgit v1.2.3


From f628d799972799023d32c2542bb2639eb8c4f84e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 11 Feb 2022 11:38:02 -0500
Subject: libceph: add sparse read support to OSD client
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Have get_reply check for the presence of sparse read ops in the
request and set the sparse_read boolean in the msg. That will queue the
messenger layer to use the sparse read codepath instead of the normal
data receive.

Add a new sparse_read operation for the OSD client, driven by its own
state machine. The messenger will repeatedly call the sparse_read
operation, and it will pass back the necessary info to set up to read
the next extent of data, while zero-filling the sparse regions.

The state machine will stop at the end of the last extent, and will
attach the extent map buffer to the ceph_osd_req_op so that the caller
can use it.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 05da1e755b7b..bfa4813590da 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -40,6 +40,36 @@ struct ceph_sparse_extent {
 	u64	len;
 } __packed;
 
+/* Sparse read state machine state values */
+enum ceph_sparse_read_state {
+	CEPH_SPARSE_READ_HDR	= 0,
+	CEPH_SPARSE_READ_EXTENTS,
+	CEPH_SPARSE_READ_DATA_LEN,
+	CEPH_SPARSE_READ_DATA,
+};
+
+/*
+ * A SPARSE_READ reply is a 32-bit count of extents, followed by an array of
+ * 64-bit offset/length pairs, and then all of the actual file data
+ * concatenated after it (sans holes).
+ *
+ * Unfortunately, we don't know how long the extent array is until we've
+ * started reading the data section of the reply. The caller should send down
+ * a destination buffer for the array, but we'll alloc one if it's too small
+ * or if the caller doesn't.
+ */
+struct ceph_sparse_read {
+	enum ceph_sparse_read_state	sr_state;    /* state machine state */
+	u64				sr_req_off;  /* orig request offset */
+	u64				sr_req_len;  /* orig request length */
+	u64				sr_pos;      /* current pos in buffer */
+	int				sr_index;    /* current extent index */
+	__le32				sr_datalen;  /* length of actual data */
+	u32				sr_count;    /* extent count in reply */
+	int				sr_ext_len;  /* length of extent array */
+	struct ceph_sparse_extent	*sr_extent;  /* extent array */
+};
+
 /*
  * A given osd we're communicating with.
  *
@@ -48,6 +78,7 @@ struct ceph_sparse_extent {
  */
 struct ceph_osd {
 	refcount_t o_ref;
+	int o_sparse_op_idx;
 	struct ceph_osd_client *o_osdc;
 	int o_osd;
 	int o_incarnation;
@@ -63,6 +94,7 @@ struct ceph_osd {
 	unsigned long lru_ttl;
 	struct list_head o_keepalive_item;
 	struct mutex lock;
+	struct ceph_sparse_read	o_sparse_read;
 };
 
 #define CEPH_OSD_SLAB_OPS	2
-- 
cgit v1.2.3


From dee0c5f834605ce9b384ee8b9c7032ffd8db4eca Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 1 Jul 2022 06:30:12 -0400
Subject: libceph: add new iov_iter-based ceph_msg_data_type and
 ceph_osd_data_type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an iov_iter to the unions in ceph_msg_data and ceph_msg_data_cursor.
Instead of requiring a list of pages or bvecs, we can just use an
iov_iter directly, and avoid extra allocations.

We assume that the pages represented by the iter are pinned such that
they shouldn't incur page faults, which is the case for the iov_iters
created by netfs.

While working on this, Al Viro informed me that he was going to change
iov_iter_get_pages to auto-advance the iterator as that pattern is more
or less required for ITER_PIPE anyway. We emulate that here for now by
advancing in the _next op and tracking that amount in the "lastlen"
field.

In the event that _next is called twice without an intervening
_advance, we revert the iov_iter by the remaining lastlen before
calling iov_iter_get_pages.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h  | 8 ++++++++
 include/linux/ceph/osd_client.h | 4 ++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 9fd7255172ad..2eaaabbe98cb 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -123,6 +123,7 @@ enum ceph_msg_data_type {
 	CEPH_MSG_DATA_BIO,	/* data source/destination is a bio list */
 #endif /* CONFIG_BLOCK */
 	CEPH_MSG_DATA_BVECS,	/* data source/destination is a bio_vec array */
+	CEPH_MSG_DATA_ITER,	/* data source/destination is an iov_iter */
 };
 
 #ifdef CONFIG_BLOCK
@@ -224,6 +225,7 @@ struct ceph_msg_data {
 			bool		own_pages;
 		};
 		struct ceph_pagelist	*pagelist;
+		struct iov_iter		iter;
 	};
 };
 
@@ -248,6 +250,10 @@ struct ceph_msg_data_cursor {
 			struct page	*page;		/* page from list */
 			size_t		offset;		/* bytes from list */
 		};
+		struct {
+			struct iov_iter		iov_iter;
+			unsigned int		lastlen;
+		};
 	};
 };
 
@@ -605,6 +611,8 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
 #endif /* CONFIG_BLOCK */
 void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
 			     struct ceph_bvec_iter *bvec_pos);
+void ceph_msg_data_add_iter(struct ceph_msg *msg,
+			    struct iov_iter *iter);
 
 struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
 			       gfp_t flags, bool can_fail);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index bfa4813590da..8f5d2b5bbba2 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -108,6 +108,7 @@ enum ceph_osd_data_type {
 	CEPH_OSD_DATA_TYPE_BIO,
 #endif /* CONFIG_BLOCK */
 	CEPH_OSD_DATA_TYPE_BVECS,
+	CEPH_OSD_DATA_TYPE_ITER,
 };
 
 struct ceph_osd_data {
@@ -131,6 +132,7 @@ struct ceph_osd_data {
 			struct ceph_bvec_iter	bvec_pos;
 			u32			num_bvecs;
 		};
+		struct iov_iter		iter;
 	};
 };
 
@@ -501,6 +503,8 @@ void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
 void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
 					 unsigned int which,
 					 struct ceph_bvec_iter *bvec_pos);
+void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
+				unsigned int which, struct iov_iter *iter);
 
 extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
 					unsigned int which,
-- 
cgit v1.2.3


From 2d332d5bc424404911540006a8bb450fbb96b178 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 27 Jul 2020 10:16:09 -0400
Subject: ceph: fscrypt_auth handling for ceph
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Most fscrypt-enabled filesystems store the crypto context in an xattr,
but that's problematic for ceph as xatts are governed by the XATTR cap,
but we really want the crypto context as part of the AUTH cap.

Because of this, the MDS has added two new inode metadata fields:
fscrypt_auth and fscrypt_file. The former is used to hold the crypto
context, and the latter is used to track the real file size.

Parse new fscrypt_auth and fscrypt_file fields in inode traces. For now,
we don't use fscrypt_file, but fscrypt_auth is used to hold the fscrypt
context.

Allow the client to use a setattr request for setting the fscrypt_auth
field. Since this is not a standard setattr request from the VFS, we add
a new field to __ceph_setattr that carries ceph-specific inode attrs.

Have the set_context op do a setattr that sets the fscrypt_auth value,
and get_context just return the contents of that field (since it should
always be available).

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 49586ff26152..45f8ce61e103 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -359,14 +359,19 @@ enum {
 
 extern const char *ceph_mds_op_name(int op);
 
-
-#define CEPH_SETATTR_MODE   1
-#define CEPH_SETATTR_UID    2
-#define CEPH_SETATTR_GID    4
-#define CEPH_SETATTR_MTIME  8
-#define CEPH_SETATTR_ATIME 16
-#define CEPH_SETATTR_SIZE  32
-#define CEPH_SETATTR_CTIME 64
+#define CEPH_SETATTR_MODE              (1 << 0)
+#define CEPH_SETATTR_UID               (1 << 1)
+#define CEPH_SETATTR_GID               (1 << 2)
+#define CEPH_SETATTR_MTIME             (1 << 3)
+#define CEPH_SETATTR_ATIME             (1 << 4)
+#define CEPH_SETATTR_SIZE              (1 << 5)
+#define CEPH_SETATTR_CTIME             (1 << 6)
+#define CEPH_SETATTR_MTIME_NOW         (1 << 7)
+#define CEPH_SETATTR_ATIME_NOW         (1 << 8)
+#define CEPH_SETATTR_BTIME             (1 << 9)
+#define CEPH_SETATTR_KILL_SGUID        (1 << 10)
+#define CEPH_SETATTR_FSCRYPT_AUTH      (1 << 11)
+#define CEPH_SETATTR_FSCRYPT_FILE      (1 << 12)
 
 /*
  * Ceph setxattr request flags.
-- 
cgit v1.2.3


From 69dd3b3930f96b624228000921f417fb0919a6ab Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Thu, 25 Aug 2022 09:31:05 -0400
Subject: libceph: add CEPH_OSD_OP_ASSERT_VER support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

...and record the user_version in the reply in a new field in
ceph_osd_request, so we can populate the assert_ver appropriately.
Shuffle the fields a bit too so that the new field fits in an
existing hole on x86_64.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 6 +++++-
 include/linux/ceph/rados.h      | 4 ++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 8f5d2b5bbba2..bf9823956758 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -198,6 +198,9 @@ struct ceph_osd_req_op {
 			u32 src_fadvise_flags;
 			struct ceph_osd_data osd_data;
 		} copy_from;
+		struct {
+			u64 ver;
+		} assert_ver;
 	};
 };
 
@@ -252,6 +255,7 @@ struct ceph_osd_request {
 	struct ceph_osd_client *r_osdc;
 	struct kref       r_kref;
 	bool              r_mempool;
+	bool		  r_linger;           /* don't resend on failure */
 	struct completion r_completion;       /* private to osd_client.c */
 	ceph_osdc_callback_t r_callback;
 
@@ -264,9 +268,9 @@ struct ceph_osd_request {
 	struct ceph_snap_context *r_snapc;    /* for writes */
 	struct timespec64 r_mtime;            /* ditto */
 	u64 r_data_offset;                    /* ditto */
-	bool r_linger;                        /* don't resend on failure */
 
 	/* internal */
+	u64 r_version;			      /* data version sent in reply */
 	unsigned long r_stamp;                /* jiffies, send or check time */
 	unsigned long r_start_stamp;          /* jiffies */
 	ktime_t r_start_latency;              /* ktime_t */
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 43a7a1573b51..73c3efbec36c 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -523,6 +523,10 @@ struct ceph_osd_op {
 		struct {
 			__le64 cookie;
 		} __attribute__ ((packed)) notify;
+		struct {
+			__le64 unused;
+			__le64 ver;
+		} __attribute__ ((packed)) assert_ver;
 		struct {
 			__le64 offset, length;
 			__le64 src_offset;
-- 
cgit v1.2.3


From 781589e40ac5f929f58824c15448e1ba49c3ac32 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Thu, 17 Aug 2023 15:55:31 -0700
Subject: rtc: Add support for limited alarm timer offsets

Some alarm timers are based on time offsets, not on absolute times.
In some situations, the amount of time that can be scheduled in the
future is limited. This may result in a refusal to suspend the system,
causing substantial battery drain.

Some RTC alarm drivers remedy the situation by setting the alarm time
to the maximum supported time if a request for an out-of-range timeout
is made. This is not really desirable since it may result in unexpected
early wakeups.

To reduce the impact of this problem, let RTC drivers report the maximum
supported alarm timer offset. The code setting alarm timers can then
decide if it wants to reject setting alarm timers to a larger value, if it
wants to implement recurring alarms until the actually requested alarm
time is met, or if it wants to accept the limited alarm time.

Only introduce the necessary variable into struct rtc_device.
Code to set and use the variable will follow with subsequent patches.

Cc: Brian Norris <briannorris@chromium.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20230817225537.4053865-2-linux@roeck-us.net
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/rtc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 1fd9c6a21ebe..4c0bcbeb1f00 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -146,6 +146,7 @@ struct rtc_device {
 
 	time64_t range_min;
 	timeu64_t range_max;
+	timeu64_t alarm_offset_max;
 	time64_t start_secs;
 	time64_t offset_secs;
 	bool set_start_time;
-- 
cgit v1.2.3


From 35d8dbbb25add265a880ab0dc48a229f06b08325 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 29 Aug 2023 20:46:31 +0200
Subject: thermal: core: Drop unused .get_trip_*() callbacks

After recent changes in the ACPI thermal driver and in the Intel DTS
IOSF thermal driver, all thermal zone drivers are expected to use trip
tables for initialization and none of them should implement
.get_trip_type(), .get_trip_temp() or .get_trip_hyst() callbacks, so
drop these callbacks entirely from the core.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/thermal.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index b449a46766f5..e6bd3b7c9eda 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -62,11 +62,7 @@ struct thermal_zone_device_ops {
 	int (*set_trips) (struct thermal_zone_device *, int, int);
 	int (*change_mode) (struct thermal_zone_device *,
 		enum thermal_device_mode);
-	int (*get_trip_type) (struct thermal_zone_device *, int,
-		enum thermal_trip_type *);
-	int (*get_trip_temp) (struct thermal_zone_device *, int, int *);
 	int (*set_trip_temp) (struct thermal_zone_device *, int, int);
-	int (*get_trip_hyst) (struct thermal_zone_device *, int, int *);
 	int (*set_trip_hyst) (struct thermal_zone_device *, int, int);
 	int (*get_crit_temp) (struct thermal_zone_device *, int *);
 	int (*set_emul_temp) (struct thermal_zone_device *, int);
-- 
cgit v1.2.3


From 8289d810ea85755a9d22f75785806cb34eecd5e5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 22 Aug 2023 13:40:06 +0200
Subject: thermal: core: Rework .get_trend() thermal zone callback

Passing a struct thermal_trip pointer instead of a trip index to the
.get_trend() thermal zone callback allows one of its 2 implementations,
the thermal_get_trend() function in the ACPI thermal driver, to be
simplified quite a bit, and the other implementation of it in the
ti-soc-thermal driver does not even use the relevant callback argument.

For this reason, change the .get_trend() thermal zone callback
definition and adjust the related code accordingly.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/thermal.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index e6bd3b7c9eda..eb17495c8acc 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -53,6 +53,20 @@ enum thermal_notify_event {
 	THERMAL_EVENT_KEEP_ALIVE, /* Request for user space handler to respond */
 };
 
+/**
+ * struct thermal_trip - representation of a point in temperature domain
+ * @temperature: temperature value in miliCelsius
+ * @hysteresis: relative hysteresis in miliCelsius
+ * @type: trip point type
+ * @priv: pointer to driver data associated with this trip
+ */
+struct thermal_trip {
+	int temperature;
+	int hysteresis;
+	enum thermal_trip_type type;
+	void *priv;
+};
+
 struct thermal_zone_device_ops {
 	int (*bind) (struct thermal_zone_device *,
 		     struct thermal_cooling_device *);
@@ -66,26 +80,12 @@ struct thermal_zone_device_ops {
 	int (*set_trip_hyst) (struct thermal_zone_device *, int, int);
 	int (*get_crit_temp) (struct thermal_zone_device *, int *);
 	int (*set_emul_temp) (struct thermal_zone_device *, int);
-	int (*get_trend) (struct thermal_zone_device *, int,
+	int (*get_trend) (struct thermal_zone_device *, struct thermal_trip *,
 			  enum thermal_trend *);
 	void (*hot)(struct thermal_zone_device *);
 	void (*critical)(struct thermal_zone_device *);
 };
 
-/**
- * struct thermal_trip - representation of a point in temperature domain
- * @temperature: temperature value in miliCelsius
- * @hysteresis: relative hysteresis in miliCelsius
- * @type: trip point type
- * @priv: pointer to driver data associated with this trip
- */
-struct thermal_trip {
-	int temperature;
-	int hysteresis;
-	enum thermal_trip_type type;
-	void *priv;
-};
-
 struct thermal_cooling_device_ops {
 	int (*get_max_state) (struct thermal_cooling_device *, unsigned long *);
 	int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *);
-- 
cgit v1.2.3


From 218a06a79d9a98a96ef46bb003d4d8adb0962056 Mon Sep 17 00:00:00 2001
From: Jie Zhan <zhanjie9@hisilicon.com>
Date: Tue, 22 Aug 2023 20:48:37 +0800
Subject: cpufreq: Support per-policy performance boost

The boost control currently applies to the whole system.  However, users
may prefer to boost a subset of cores in order to provide prioritized
performance to workloads running on the boosted cores.

Enable per-policy boost by adding a 'boost' sysfs interface under each
policy path.  This can be found at:

	/sys/devices/system/cpu/cpufreq/policy<*>/boost

Same to the global boost switch, writing 1/0 to the per-policy 'boost'
enables/disables boost on a cpufreq policy respectively.

The user view of global and per-policy boost controls should be:

1. Enabling global boost initially enables boost on all policies, and
per-policy boost can then be enabled or disabled individually, given that
the platform does support so.

2. Disabling global boost makes the per-policy boost interface illegal.

Signed-off-by: Jie Zhan <zhanjie9@hisilicon.com>
Reviewed-by: Wei Xu <xuwei5@hisilicon.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpufreq.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 43b363a99215..71d186d6933a 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -141,6 +141,9 @@ struct cpufreq_policy {
 	 */
 	bool			dvfs_possible_from_any_cpu;
 
+	/* Per policy boost enabled flag. */
+	bool			boost_enabled;
+
 	 /* Cached frequency lookup from cpufreq_driver_resolve_freq. */
 	unsigned int cached_target_freq;
 	unsigned int cached_resolved_idx;
-- 
cgit v1.2.3


From 7e9be1124dbe7888907e82cab20164578e3f9ab7 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Tue, 29 Aug 2023 19:51:57 +0200
Subject: netfilter: nf_tables: Audit log setelem reset

Since set element reset is not integrated into nf_tables' transaction
logic, an explicit log call is needed, similar to NFT_MSG_GETOBJ_RESET
handling.

For the sake of simplicity, catchall element reset will always generate
a dedicated log entry. This relieves nf_tables_dump_set() from having to
adjust the logged element count depending on whether a catchall element
was found or not.

Fixes: 079cd633219d7 ("netfilter: nf_tables: Introduce NFT_MSG_GETSETELEM_RESET")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/audit.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 6a3a9e122bb5..192bf03aacc5 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -117,6 +117,7 @@ enum audit_nfcfgop {
 	AUDIT_NFT_OP_OBJ_RESET,
 	AUDIT_NFT_OP_FLOWTABLE_REGISTER,
 	AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
+	AUDIT_NFT_OP_SETELEM_RESET,
 	AUDIT_NFT_OP_INVALID,
 };
 
-- 
cgit v1.2.3


From ea078ae9108e25fc881c84369f7c03931d22e555 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Tue, 29 Aug 2023 19:51:58 +0200
Subject: netfilter: nf_tables: Audit log rule reset

Resetting rules' stateful data happens outside of the transaction logic,
so 'get' and 'dump' handlers have to emit audit log entries themselves.

Fixes: 8daa8fde3fc3f ("netfilter: nf_tables: Introduce NFT_MSG_GETRULE_RESET")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/audit.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 192bf03aacc5..51b1b7054a23 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -118,6 +118,7 @@ enum audit_nfcfgop {
 	AUDIT_NFT_OP_FLOWTABLE_REGISTER,
 	AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
 	AUDIT_NFT_OP_SETELEM_RESET,
+	AUDIT_NFT_OP_RULE_RESET,
 	AUDIT_NFT_OP_INVALID,
 };
 
-- 
cgit v1.2.3


From 3af5ae22030cb59fab4fba35f5a2b62f47e14df9 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Tue, 25 Jul 2023 09:44:40 +0800
Subject: ceph: make members in struct ceph_mds_request_args_ext a union

In ceph mainline it will allow to set the btime in the setattr request
and just add a 'btime' member in the union 'ceph_mds_request_args' and
then bump up the header version to 4. That means the total size of union
'ceph_mds_request_args' will increase sizeof(struct ceph_timespec) bytes,
but in kclient it will increase the sizeof(setattr_ext) bytes for each
request.

Since the MDS will always depend on the header's vesion and front_len
members to decode the 'ceph_mds_request_head' struct, at the same time
kclient hasn't supported the 'btime' feature yet in setattr request,
so it's safe to do this change here.

This will save 48 bytes memories for each request.

Fixes: 4f1ddb1ea874 ("ceph: implement updated ceph_mds_request_head structure")
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 45f8ce61e103..ae44812eb6d4 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -467,17 +467,19 @@ union ceph_mds_request_args {
 } __attribute__ ((packed));
 
 union ceph_mds_request_args_ext {
-	union ceph_mds_request_args old;
-	struct {
-		__le32 mode;
-		__le32 uid;
-		__le32 gid;
-		struct ceph_timespec mtime;
-		struct ceph_timespec atime;
-		__le64 size, old_size;       /* old_size needed by truncate */
-		__le32 mask;                 /* CEPH_SETATTR_* */
-		struct ceph_timespec btime;
-	} __attribute__ ((packed)) setattr_ext;
+	union {
+		union ceph_mds_request_args old;
+		struct {
+			__le32 mode;
+			__le32 uid;
+			__le32 gid;
+			struct ceph_timespec mtime;
+			struct ceph_timespec atime;
+			__le64 size, old_size;       /* old_size needed by truncate */
+			__le32 mask;                 /* CEPH_SETATTR_* */
+			struct ceph_timespec btime;
+		} __attribute__ ((packed)) setattr_ext;
+	};
 };
 
 #define CEPH_MDS_FLAG_REPLAY		1 /* this is a replayed op */
-- 
cgit v1.2.3


From ce0d5bd3a6c176f9a3bf867624a07119dd4d0878 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Tue, 25 Jul 2023 17:51:59 +0800
Subject: ceph: make num_fwd and num_retry to __u32

The num_fwd in MClientRequestForward is int32_t, while the num_fwd
in ceph_mds_request_head is __u8. This is buggy when the num_fwd
is larger than 256 it will always be truncate to 0 again. But the
client couldn't recoginize this.

This will make them to __u32 instead. Because the old cephs will
directly copy the raw memories when decoding the reqeust's head,
so we need to make sure this kclient will be compatible with old
cephs. For newer cephs they will decode the requests depending
the version, which will be much simpler and easier to extend new
members.

Link: https://tracker.ceph.com/issues/62145
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index ae44812eb6d4..5f2301ee88bc 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -486,7 +486,7 @@ union ceph_mds_request_args_ext {
 #define CEPH_MDS_FLAG_WANT_DENTRY	2 /* want dentry in reply */
 #define CEPH_MDS_FLAG_ASYNC		4 /* request is asynchronous */
 
-struct ceph_mds_request_head_old {
+struct ceph_mds_request_head_legacy {
 	__le64 oldest_client_tid;
 	__le32 mdsmap_epoch;           /* on client */
 	__le32 flags;                  /* CEPH_MDS_FLAG_* */
@@ -499,9 +499,9 @@ struct ceph_mds_request_head_old {
 	union ceph_mds_request_args args;
 } __attribute__ ((packed));
 
-#define CEPH_MDS_REQUEST_HEAD_VERSION  1
+#define CEPH_MDS_REQUEST_HEAD_VERSION  2
 
-struct ceph_mds_request_head {
+struct ceph_mds_request_head_old {
 	__le16 version;                /* struct version */
 	__le64 oldest_client_tid;
 	__le32 mdsmap_epoch;           /* on client */
@@ -515,6 +515,23 @@ struct ceph_mds_request_head {
 	union ceph_mds_request_args_ext args;
 } __attribute__ ((packed));
 
+struct ceph_mds_request_head {
+	__le16 version;                /* struct version */
+	__le64 oldest_client_tid;
+	__le32 mdsmap_epoch;           /* on client */
+	__le32 flags;                  /* CEPH_MDS_FLAG_* */
+	__u8 num_retry, num_fwd;       /* legacy count retry and fwd attempts */
+	__le16 num_releases;           /* # include cap/lease release records */
+	__le32 op;                     /* mds op code */
+	__le32 caller_uid, caller_gid;
+	__le64 ino;                    /* use this ino for openc, mkdir, mknod,
+					  etc. (if replaying) */
+	union ceph_mds_request_args_ext args;
+
+	__le32 ext_num_retry;          /* new count retry attempts */
+	__le32 ext_num_fwd;            /* new count fwd attempts */
+} __attribute__ ((packed));
+
 /* cap/lease release record */
 struct ceph_mds_request_release {
 	__le64 ino, cap_id;            /* ino and unique cap id */
-- 
cgit v1.2.3


From 52e322eda3d475614210efbc0f2793a1da9d367a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 28 Jul 2023 17:47:22 -0700
Subject: KVM: x86/mmu: BUG() in rmap helpers iff
 CONFIG_BUG_ON_DATA_CORRUPTION=y

Introduce KVM_BUG_ON_DATA_CORRUPTION() and use it in the low-level rmap
helpers to convert the existing BUG()s to WARN_ON_ONCE() when the kernel
is built with CONFIG_BUG_ON_DATA_CORRUPTION=n, i.e. does NOT want to BUG()
on corruption of host kernel data structures.  Environments that don't
have infrastructure to automatically capture crash dumps, i.e. aren't
likely to enable CONFIG_BUG_ON_DATA_CORRUPTION=y, are typically better
served overall by WARN-and-continue behavior (for the kernel, the VM is
dead regardless), as a BUG() while holding mmu_lock all but guarantees
the _best_ case scenario is a panic().

Make the BUG()s conditional instead of removing/replacing them entirely as
there's a non-zero chance (though by no means a guarantee) that the damage
isn't contained to the target VM, e.g. if no rmap is found for a SPTE then
KVM may be double-zapping the SPTE, i.e. has already freed the memory the
SPTE pointed at and thus KVM is reading/writing memory that KVM no longer
owns.

Link: https://lore.kernel.org/all/20221129191237.31447-1-mizhang@google.com
Suggested-by: Mingwei Zhang <mizhang@google.com>
Cc: David Matlack <dmatlack@google.com>
Cc: Jim Mattson <jmattson@google.com>
Reviewed-by: Mingwei Zhang <mizhang@google.com>
Link: https://lore.kernel.org/r/20230729004722.1056172-13-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1b583f35547e..fb6c6109fdca 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -867,6 +867,25 @@ static inline void kvm_vm_bugged(struct kvm *kvm)
 	unlikely(__ret);					\
 })
 
+/*
+ * Note, "data corruption" refers to corruption of host kernel data structures,
+ * not guest data.  Guest data corruption, suspected or confirmed, that is tied
+ * and contained to a single VM should *never* BUG() and potentially panic the
+ * host, i.e. use this variant of KVM_BUG() if and only if a KVM data structure
+ * is corrupted and that corruption can have a cascading effect to other parts
+ * of the hosts and/or to other VMs.
+ */
+#define KVM_BUG_ON_DATA_CORRUPTION(cond, kvm)			\
+({								\
+	bool __ret = !!(cond);					\
+								\
+	if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION))		\
+		BUG_ON(__ret);					\
+	else if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged))	\
+		kvm_vm_bugged(kvm);				\
+	unlikely(__ret);					\
+})
+
 static inline void kvm_vcpu_srcu_read_lock(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_PROVE_RCU
-- 
cgit v1.2.3


From 6a86b5b5cd76d2734304a0173f5f01aa8aa2025e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 29 Aug 2023 22:53:52 +0200
Subject: bpf: Annotate bpf_long_memcpy with data_race

syzbot reported a data race splat between two processes trying to
update the same BPF map value via syscall on different CPUs:

  BUG: KCSAN: data-race in bpf_percpu_array_update / bpf_percpu_array_update

  write to 0xffffe8fffe7425d8 of 8 bytes by task 8257 on cpu 1:
   bpf_long_memcpy include/linux/bpf.h:428 [inline]
   bpf_obj_memcpy include/linux/bpf.h:441 [inline]
   copy_map_value_long include/linux/bpf.h:464 [inline]
   bpf_percpu_array_update+0x3bb/0x500 kernel/bpf/arraymap.c:380
   bpf_map_update_value+0x190/0x370 kernel/bpf/syscall.c:175
   generic_map_update_batch+0x3ae/0x4f0 kernel/bpf/syscall.c:1749
   bpf_map_do_batch+0x2df/0x3d0 kernel/bpf/syscall.c:4648
   __sys_bpf+0x28a/0x780
   __do_sys_bpf kernel/bpf/syscall.c:5241 [inline]
   __se_sys_bpf kernel/bpf/syscall.c:5239 [inline]
   __x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5239
   do_syscall_x64 arch/x86/entry/common.c:50 [inline]
   do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
   entry_SYSCALL_64_after_hwframe+0x63/0xcd

  write to 0xffffe8fffe7425d8 of 8 bytes by task 8268 on cpu 0:
   bpf_long_memcpy include/linux/bpf.h:428 [inline]
   bpf_obj_memcpy include/linux/bpf.h:441 [inline]
   copy_map_value_long include/linux/bpf.h:464 [inline]
   bpf_percpu_array_update+0x3bb/0x500 kernel/bpf/arraymap.c:380
   bpf_map_update_value+0x190/0x370 kernel/bpf/syscall.c:175
   generic_map_update_batch+0x3ae/0x4f0 kernel/bpf/syscall.c:1749
   bpf_map_do_batch+0x2df/0x3d0 kernel/bpf/syscall.c:4648
   __sys_bpf+0x28a/0x780
   __do_sys_bpf kernel/bpf/syscall.c:5241 [inline]
   __se_sys_bpf kernel/bpf/syscall.c:5239 [inline]
   __x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5239
   do_syscall_x64 arch/x86/entry/common.c:50 [inline]
   do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
   entry_SYSCALL_64_after_hwframe+0x63/0xcd

  value changed: 0x0000000000000000 -> 0xfffffff000002788

The bpf_long_memcpy is used with 8-byte aligned pointers, power-of-8 size
and forced to use long read/writes to try to atomically copy long counters.
It is best-effort only and no barriers are here since it _will_ race with
concurrent updates from BPF programs. The bpf_long_memcpy() is called from
bpf(2) syscall. Marco suggested that the best way to make this known to
KCSAN would be to use data_race() annotation.

Reported-by: syzbot+97522333291430dd277f@syzkaller.appspotmail.com
Suggested-by: Marco Elver <elver@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Marco Elver <elver@google.com>
Link: https://lore.kernel.org/bpf/000000000000d87a7f06040c970c@google.com
Link: https://lore.kernel.org/bpf/57628f7a15e20d502247c3b55fceb1cb2b31f266.1693342186.git.daniel@iogearbox.net
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 12596af59c00..024e8b28c34b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -438,7 +438,7 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
 
 	size /= sizeof(long);
 	while (size--)
-		*ldst++ = *lsrc++;
+		data_race(*ldst++ = *lsrc++);
 }
 
 /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */
-- 
cgit v1.2.3


From 8423be8926aa82cd2e28bba5cc96ccb72c7ce6be Mon Sep 17 00:00:00 2001
From: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
Date: Thu, 31 Aug 2023 10:03:31 +0200
Subject: ipv6: ignore dst hint for multipath routes

Route hints when the nexthop is part of a multipath group causes packets
in the same receive batch to be sent to the same nexthop irrespective of
the multipath hash of the packet. So, do not extract route hint for
packets whose destination is part of a multipath group.

A new SKB flag IP6SKB_MULTIPATH is introduced for this purpose, set the
flag when route is looked up in fib6_select_path() and use it in
ip6_can_use_hint() to check for the existence of the flag.

Fixes: 197dbf24e360 ("ipv6: introduce and uses route look hints for list input.")
Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 5883551b1ee8..af8a771a053c 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -147,6 +147,7 @@ struct inet6_skb_parm {
 #define IP6SKB_JUMBOGRAM      128
 #define IP6SKB_SEG6	      256
 #define IP6SKB_FAKEJUMBO      512
+#define IP6SKB_MULTIPATH      1024
 };
 
 #if defined(CONFIG_NET_L3_MASTER_DEV)
-- 
cgit v1.2.3


From 6ad11bc6ed37c6371e9e13619862709b6529b43a Mon Sep 17 00:00:00 2001
From: Baruch Siach <baruch@tkos.co.il>
Date: Thu, 24 Aug 2023 14:38:08 +0300
Subject: rmap: remove anon_vma_link() nommu stub

anon_vma_link() is unused since commit 5beb49305251 ("mm: change anon_vma
linking to fix multi-process server scalability issue").

Link: https://lkml.kernel.org/r/cdce9b00c9ab15f6d02eddf40dcad537d3e9676f.1692877089.git.baruch@tkos.co.il
Signed-off-by: Baruch Siach <baruch@tkos.co.il>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index a3825ce81102..51cc21ebb568 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -479,7 +479,6 @@ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
 
 #define anon_vma_init()		do {} while (0)
 #define anon_vma_prepare(vma)	(0)
-#define anon_vma_link(vma)	do {} while (0)
 
 static inline int folio_referenced(struct folio *folio, int is_locked,
 				  struct mem_cgroup *memcg,
-- 
cgit v1.2.3


From 719c5e37e99d2fd588d1c994284d17650a66354c Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Fri, 1 Sep 2023 06:53:23 +0200
Subject: net: phy: micrel: Correct bit assignments for phy_device flags

Previously, the defines for phy_device flags in the Micrel driver were
ambiguous in their representation. They were intended to be bit masks
but were mistakenly defined as bit positions. This led to the following
issues:

- MICREL_KSZ8_P1_ERRATA, designated for KSZ88xx switches, overlapped
  with MICREL_PHY_FXEN and MICREL_PHY_50MHZ_CLK.
- Due to this overlap, the code path for MICREL_PHY_FXEN, tailored for
  the KSZ8041 PHY, was not executed for KSZ88xx PHYs.
- Similarly, the code associated with MICREL_PHY_50MHZ_CLK wasn't
  triggered for KSZ88xx.

To rectify this, all three flags have now been explicitly converted to
use the `BIT()` macro, ensuring they are defined as bit masks and
preventing potential overlaps in the future.

Fixes: 49011e0c1555 ("net: phy: micrel: ksz886x/ksz8081: add cabletest support")
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/micrel_phy.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 8bef1ab62bba..322d87255984 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -41,9 +41,9 @@
 #define	PHY_ID_KSZ9477		0x00221631
 
 /* struct phy_device dev_flags definitions */
-#define MICREL_PHY_50MHZ_CLK	0x00000001
-#define MICREL_PHY_FXEN		0x00000002
-#define MICREL_KSZ8_P1_ERRATA	0x00000003
+#define MICREL_PHY_50MHZ_CLK	BIT(0)
+#define MICREL_PHY_FXEN		BIT(1)
+#define MICREL_KSZ8_P1_ERRATA	BIT(2)
 
 #define MICREL_KSZ9021_EXTREG_CTRL	0xB
 #define MICREL_KSZ9021_EXTREG_DATA_WRITE	0xC
-- 
cgit v1.2.3


From 0be7592885d7b4c20595c388adc13930b653b847 Mon Sep 17 00:00:00 2001
From: Nilesh Javali <njavali@marvell.com>
Date: Thu, 31 Aug 2023 16:51:45 +0530
Subject: scsi: qla2xxx: Correct endianness for rqstlen and rsplen

rqstlen and rsplen were changed to __le32 to fix sparse warnings:

drivers/scsi/qla2xxx/qla_nvme.c:402:30: warning: incorrect type in assignment (different base types)
drivers/scsi/qla2xxx/qla_nvme.c:402:30:    expected restricted __le32 [usertype] cmd_len
drivers/scsi/qla2xxx/qla_nvme.c:402:30:    got unsigned short [usertype] rsplen
drivers/scsi/qla2xxx/qla_nvme.c:507:30: warning: incorrect type in assignment (different base types)
drivers/scsi/qla2xxx/qla_nvme.c:507:30:    expected restricted __le32 [usertype] cmd_len
drivers/scsi/qla2xxx/qla_nvme.c:507:30:    got unsigned int [usertype] rqstlen
drivers/scsi/qla2xxx/qla_nvme.c:508:30: warning: incorrect type in assignment (different base types)
drivers/scsi/qla2xxx/qla_nvme.c:508:30:    expected restricted __le32 [usertype] rsp_len
drivers/scsi/qla2xxx/qla_nvme.c:508:30:    got unsigned int [usertype] rsplen

Correct the endianness in qla2xxx driver thus avoiding changes in
nvme-fc-driver.h.

Fixes: 875386b98857 ("scsi: qla2xxx: Add Unsolicited LS Request and Response Support for NVMe")
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Link: https://lore.kernel.org/r/20230831112146.32595-1-njavali@marvell.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/nvme-fc-driver.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index f6ef8cf5d774..4109f1bd6128 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -53,10 +53,10 @@
 struct nvmefc_ls_req {
 	void			*rqstaddr;
 	dma_addr_t		rqstdma;
-	__le32			rqstlen;
+	u32			rqstlen;
 	void			*rspaddr;
 	dma_addr_t		rspdma;
-	__le32			rsplen;
+	u32			rsplen;
 	u32			timeout;
 
 	void			*private;
@@ -120,7 +120,7 @@ struct nvmefc_ls_req {
 struct nvmefc_ls_rsp {
 	void		*rspbuf;
 	dma_addr_t	rspdma;
-	__le32		rsplen;
+	u16		rsplen;
 
 	void (*done)(struct nvmefc_ls_rsp *rsp);
 	void		*nvme_fc_private;	/* LLDD is not to access !! */
-- 
cgit v1.2.3


From cf60ce92358da29f3b1e24005f7b748584b82753 Mon Sep 17 00:00:00 2001
From: Pavel Pisa <pisa@cmp.felk.cvut.cz>
Date: Mon, 4 Sep 2023 12:00:02 +0200
Subject: of: overlay: Fix of_overlay_fdt_apply prototype when
 !CONFIG_OF_OVERLAY

The of_overlay_fdt_apply has been changed but when CONFIG_OF_OVERLAY
support is not configured then old stub prototype is declared
by of.h header.

Signed-off-by: Pavel Pisa <pisa@cmp.felk.cvut.cz>
Fixes: 47284862bfc7 ("of: overlay: Extend of_overlay_fdt_apply() to specify the target node")
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Link: https://lore.kernel.org/r/20230904100002.7913-1-pisa@cmp.felk.cvut.cz
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index ed679819c279..6a9ddf20e79a 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -1676,8 +1676,8 @@ int of_overlay_notifier_unregister(struct notifier_block *nb);
 
 #else
 
-static inline int of_overlay_fdt_apply(void *overlay_fdt, u32 overlay_fdt_size,
-				       int *ovcs_id)
+static inline int of_overlay_fdt_apply(const void *overlay_fdt, u32 overlay_fdt_size,
+				       int *ovcs_id, struct device_node *target_base)
 {
 	return -ENOTSUPP;
 }
-- 
cgit v1.2.3


From 9ffa7b92bc768609243d79fb0c0125b9704c7061 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 30 Aug 2023 18:11:38 +0200
Subject: thermal: core: Clean up headers of thermal zone registration
 functions

For consistency, add a missing thermal_zone_device_register_with_trips()
stub for the CONFIG_THERMAL unset case, specify argument names in all of
the thermal zone registration and unregistration function headers and
make all of them use white space consistently.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/thermal.h | 53 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index eb17495c8acc..a30643104f3f 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -300,16 +300,24 @@ int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp);
 #endif
 
 #ifdef CONFIG_THERMAL
-struct thermal_zone_device *thermal_zone_device_register(const char *, int, int,
-		void *, struct thermal_zone_device_ops *,
-		const struct thermal_zone_params *, int, int);
-
-void thermal_zone_device_unregister(struct thermal_zone_device *);
-
-struct thermal_zone_device *
-thermal_zone_device_register_with_trips(const char *, struct thermal_trip *, int, int,
-					void *, struct thermal_zone_device_ops *,
-					const struct thermal_zone_params *, int, int);
+struct thermal_zone_device *thermal_zone_device_register(
+					const char *type,
+					int num_trips, int mask,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp,
+					int passive_delay, int polling_delay);
+
+struct thermal_zone_device *thermal_zone_device_register_with_trips(
+					const char *type,
+					struct thermal_trip *trips,
+					int num_trips, int mask,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp,
+					int passive_delay, int polling_delay);
+
+void thermal_zone_device_unregister(struct thermal_zone_device *tz);
 
 void *thermal_zone_device_priv(struct thermal_zone_device *tzd);
 const char *thermal_zone_device_type(struct thermal_zone_device *tzd);
@@ -351,14 +359,27 @@ int thermal_zone_device_disable(struct thermal_zone_device *tz);
 void thermal_zone_device_critical(struct thermal_zone_device *tz);
 #else
 static inline struct thermal_zone_device *thermal_zone_device_register(
-	const char *type, int trips, int mask, void *devdata,
-	struct thermal_zone_device_ops *ops,
-	const struct thermal_zone_params *tzp,
-	int passive_delay, int polling_delay)
+					const char *type,
+					int num_trips, int mask,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp,
+					int passive_delay, int polling_delay)
+{ return ERR_PTR(-ENODEV); }
+
+static inline struct thermal_zone_device *thermal_zone_device_register_with_trips(
+					const char *type,
+					struct thermal_trip *trips,
+					int num_trips, int mask,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp,
+					int passive_delay, int polling_delay)
 { return ERR_PTR(-ENODEV); }
-static inline void thermal_zone_device_unregister(
-	struct thermal_zone_device *tz)
+
+static inline void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 { }
+
 static inline struct thermal_cooling_device *
 thermal_cooling_device_register(const char *type, void *devdata,
 	const struct thermal_cooling_device_ops *ops)
-- 
cgit v1.2.3


From d332db8fc1a2dfb4738281b1d6d4ed20115dd9d3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 30 Aug 2023 18:13:35 +0200
Subject: thermal: core: Add function for registering tripless thermal zones

Multiple callers of thermal_zone_device_register() don't pass any trips
to it and they might use a shortened argument list for that, so add
a special function with fewer arguments for this purpose.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/thermal.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index a30643104f3f..1a1433eb9b59 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -317,6 +317,12 @@ struct thermal_zone_device *thermal_zone_device_register_with_trips(
 					const struct thermal_zone_params *tzp,
 					int passive_delay, int polling_delay);
 
+struct thermal_zone_device *thermal_tripless_zone_device_register(
+					const char *type,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp);
+
 void thermal_zone_device_unregister(struct thermal_zone_device *tz);
 
 void *thermal_zone_device_priv(struct thermal_zone_device *tzd);
@@ -377,6 +383,13 @@ static inline struct thermal_zone_device *thermal_zone_device_register_with_trip
 					int passive_delay, int polling_delay)
 { return ERR_PTR(-ENODEV); }
 
+static inline struct thermal_zone_device *thermal_tripless_zone_device_register(
+					const char *type,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp)
+{ return ERR_PTR(-ENODEV); }
+
 static inline void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 { }
 
-- 
cgit v1.2.3


From edd220b33f479cf9dcda0bfefb2cb8c5902e9885 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 30 Aug 2023 18:16:29 +0200
Subject: thermal: core: Drop thermal_zone_device_register()

There are no more users of thermal_zone_device_register(), so drop it
from the core.

Note that thermal_zone_device_register_with_trips() may be renamed to
thermal_zone_device_register() in the future, but only after a grace
period allowing all of the possible work in progress that may be using
the latter to adjust.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/thermal.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 1a1433eb9b59..c99440aac1a1 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -300,14 +300,6 @@ int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp);
 #endif
 
 #ifdef CONFIG_THERMAL
-struct thermal_zone_device *thermal_zone_device_register(
-					const char *type,
-					int num_trips, int mask,
-					void *devdata,
-					struct thermal_zone_device_ops *ops,
-					const struct thermal_zone_params *tzp,
-					int passive_delay, int polling_delay);
-
 struct thermal_zone_device *thermal_zone_device_register_with_trips(
 					const char *type,
 					struct thermal_trip *trips,
@@ -364,15 +356,6 @@ int thermal_zone_device_enable(struct thermal_zone_device *tz);
 int thermal_zone_device_disable(struct thermal_zone_device *tz);
 void thermal_zone_device_critical(struct thermal_zone_device *tz);
 #else
-static inline struct thermal_zone_device *thermal_zone_device_register(
-					const char *type,
-					int num_trips, int mask,
-					void *devdata,
-					struct thermal_zone_device_ops *ops,
-					const struct thermal_zone_params *tzp,
-					int passive_delay, int polling_delay)
-{ return ERR_PTR(-ENODEV); }
-
 static inline struct thermal_zone_device *thermal_zone_device_register_with_trips(
 					const char *type,
 					struct thermal_trip *trips,
-- 
cgit v1.2.3


From e7716c74e3882405f9eca16faa6cb1bf19995399 Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Mon, 21 Aug 2023 10:21:29 +0200
Subject: xarray: Document necessary flag in alloc functions

Adds a new line to the docstrings of functions wrapping __xa_alloc() and
__xa_alloc_cyclic(), informing about the necessity of flag XA_FLAGS_ALLOC
being set previously.

The documentation so far says that functions wrapping __xa_alloc() and
__xa_alloc_cyclic() are supposed to return either -ENOMEM or -EBUSY in
case of an error. If the xarray has been initialized without the flag
XA_FLAGS_ALLOC, however, they fail with a different, undocumented error
code.

As hinted at in Documentation/core-api/xarray.rst, wrappers around these
functions should only be invoked when the flag has been set. The
functions' documentation should reflect that as well.

Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/xarray.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 741703b45f61..cb571dfcf4b1 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -856,6 +856,9 @@ static inline int __must_check xa_insert_irq(struct xarray *xa,
  * stores the index into the @id pointer, then stores the entry at
  * that index.  A concurrent lookup will not see an uninitialised @id.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Any context.  Takes and releases the xa_lock.  May sleep if
  * the @gfp flags permit.
  * Return: 0 on success, -ENOMEM if memory could not be allocated or
@@ -886,6 +889,9 @@ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id,
  * stores the index into the @id pointer, then stores the entry at
  * that index.  A concurrent lookup will not see an uninitialised @id.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Any context.  Takes and releases the xa_lock while
  * disabling softirqs.  May sleep if the @gfp flags permit.
  * Return: 0 on success, -ENOMEM if memory could not be allocated or
@@ -916,6 +922,9 @@ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id,
  * stores the index into the @id pointer, then stores the entry at
  * that index.  A concurrent lookup will not see an uninitialised @id.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Process context.  Takes and releases the xa_lock while
  * disabling interrupts.  May sleep if the @gfp flags permit.
  * Return: 0 on success, -ENOMEM if memory could not be allocated or
@@ -949,6 +958,9 @@ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
  * The search for an empty entry will start at @next and will wrap
  * around if necessary.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Any context.  Takes and releases the xa_lock.  May sleep if
  * the @gfp flags permit.
  * Return: 0 if the allocation succeeded without wrapping.  1 if the
@@ -983,6 +995,9 @@ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
  * The search for an empty entry will start at @next and will wrap
  * around if necessary.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Any context.  Takes and releases the xa_lock while
  * disabling softirqs.  May sleep if the @gfp flags permit.
  * Return: 0 if the allocation succeeded without wrapping.  1 if the
@@ -1017,6 +1032,9 @@ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
  * The search for an empty entry will start at @next and will wrap
  * around if necessary.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Process context.  Takes and releases the xa_lock while
  * disabling interrupts.  May sleep if the @gfp flags permit.
  * Return: 0 if the allocation succeeded without wrapping.  1 if the
-- 
cgit v1.2.3


From 39285e124edbc752331e98ace37cc141a6a3747a Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Tue, 5 Sep 2023 08:46:10 +0000
Subject: net: team: do not use dynamic lockdep key

team interface has used a dynamic lockdep key to avoid false-positive
lockdep deadlock detection. Virtual interfaces such as team usually
have their own lock for protecting private data.
These interfaces can be nested.
team0
  |
team1

Each interface's lock is actually different(team0->lock and team1->lock).
So,
mutex_lock(&team0->lock);
mutex_lock(&team1->lock);
mutex_unlock(&team1->lock);
mutex_unlock(&team0->lock);
The above case is absolutely safe. But lockdep warns about deadlock.
Because the lockdep understands these two locks are same. This is a
false-positive lockdep warning.

So, in order to avoid this problem, the team interfaces started to use
dynamic lockdep key. The false-positive problem was fixed, but it
introduced a new problem.

When the new team virtual interface is created, it registers a dynamic
lockdep key(creates dynamic lockdep key) and uses it. But there is the
limitation of the number of lockdep keys.
So, If so many team interfaces are created, it consumes all lockdep keys.
Then, the lockdep stops to work and warns about it.

In order to fix this problem, team interfaces use the subclass instead
of the dynamic key. So, when a new team interface is created, it doesn't
register(create) a new lockdep, but uses existed subclass key instead.
It is already used by the bonding interface for a similar case.

As the bonding interface does, the subclass variable is the same as
the 'dev->nested_level'. This variable indicates the depth in the stacked
interface graph.

The 'dev->nested_level' is protected by RTNL and RCU.
So, 'mutex_lock_nested()' for 'team->lock' requires RTNL or RCU.
In the current code, 'team->lock' is usually acquired under RTNL, there is
no problem with using 'dev->nested_level'.

The 'team_nl_team_get()' and The 'lb_stats_refresh()' functions acquire
'team->lock' without RTNL.
But these don't iterate their own ports nested so they don't need nested
lock.

Reproducer:
   for i in {0..1000}
   do
           ip link add team$i type team
           ip link add dummy$i master team$i type dummy
           ip link set dummy$i up
           ip link set team$i up
   done

Splat looks like:
   BUG: MAX_LOCKDEP_ENTRIES too low!
   turning off the locking correctness validator.
   Please attach the output of /proc/lock_stat to the bug report
   CPU: 0 PID: 4104 Comm: ip Not tainted 6.5.0-rc7+ #45
   Call Trace:
    <TASK>
   dump_stack_lvl+0x64/0xb0
   add_lock_to_list+0x30d/0x5e0
   check_prev_add+0x73a/0x23a0
   ...
   sock_def_readable+0xfe/0x4f0
   netlink_broadcast+0x76b/0xac0
   nlmsg_notify+0x69/0x1d0
   dev_open+0xed/0x130
   ...

Reported-by: syzbot+9bbbacfbf1e04d5221f7@syzkaller.appspotmail.com
Fixes: 369f61bee0f5 ("team: fix nested locking lockdep warning")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_team.h | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index 1b9b15a492fa..12d4447fc8ab 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -221,10 +221,38 @@ struct team {
 		atomic_t count_pending;
 		struct delayed_work dw;
 	} mcast_rejoin;
-	struct lock_class_key team_lock_key;
 	long mode_priv[TEAM_MODE_PRIV_LONGS];
 };
 
+static inline void __team_lock(struct team *team)
+{
+	mutex_lock(&team->lock);
+}
+
+static inline int team_trylock(struct team *team)
+{
+	return mutex_trylock(&team->lock);
+}
+
+#ifdef CONFIG_LOCKDEP
+static inline void team_lock(struct team *team)
+{
+	ASSERT_RTNL();
+	mutex_lock_nested(&team->lock, team->dev->nested_level);
+}
+
+#else
+static inline void team_lock(struct team *team)
+{
+	__team_lock(team);
+}
+#endif
+
+static inline void team_unlock(struct team *team)
+{
+	mutex_unlock(&team->lock);
+}
+
 static inline int team_dev_queue_xmit(struct team *team, struct team_port *port,
 				      struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From 1a961e74d5abbea049588a3d74b759955b4ed9d5 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 5 Sep 2023 16:42:02 -0700
Subject: net: phylink: fix sphinx complaint about invalid literal

sphinx complains about the use of "%PHYLINK_PCS_NEG_*":

Documentation/networking/kapi:144: ./include/linux/phylink.h:601: WARNING: Inline literal start-string without end-string.
Documentation/networking/kapi:144: ./include/linux/phylink.h:633: WARNING: Inline literal start-string without end-string.

These are not valid symbols so drop the '%' prefix.

Alternatively we could use %PHYLINK_PCS_NEG_\* (escape the *)
or use normal literal ``PHYLINK_PCS_NEG_*`` but there is already
a handful of un-adorned DEFINE_* in this file.

Fixes: f99d471afa03 ("net: phylink: add PCS negotiation mode")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/all/20230626162908.2f149f98@canb.auug.org.au/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 7d07f8736431..2b886ea654bb 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -600,7 +600,7 @@ void pcs_get_state(struct phylink_pcs *pcs,
  *
  * The %neg_mode argument should be tested via the phylink_mode_*() family of
  * functions, or for PCS that set pcs->neg_mode true, should be tested
- * against the %PHYLINK_PCS_NEG_* definitions.
+ * against the PHYLINK_PCS_NEG_* definitions.
  */
 int pcs_config(struct phylink_pcs *pcs, unsigned int neg_mode,
 	       phy_interface_t interface, const unsigned long *advertising,
@@ -630,7 +630,7 @@ void pcs_an_restart(struct phylink_pcs *pcs);
  *
  * The %mode argument should be tested via the phylink_mode_*() family of
  * functions, or for PCS that set pcs->neg_mode true, should be tested
- * against the %PHYLINK_PCS_NEG_* definitions.
+ * against the PHYLINK_PCS_NEG_* definitions.
  */
 void pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
 		 phy_interface_t interface, int speed, int duplex);
-- 
cgit v1.2.3


From 8f3f06dfd6873135068ccf1a0b386308e8c4da38 Mon Sep 17 00:00:00 2001
From: WANG Xuerui <git@xen0n.name>
Date: Wed, 6 Sep 2023 22:53:55 +0800
Subject: raid6: Add LoongArch SIMD syndrome calculation

The algorithms work on 64 bytes at a time, which is the L1 cache line
size of all current and future LoongArch cores (that we care about), as
confirmed by Huacai. The code is based on the generic int.uc algorithm,
unrolled 4 times for LSX and 2 times for LASX. Further unrolling does
not meaningfully improve the performance according to experiments.

Performance numbers measured during system boot on a 3A5000 @ 2.5GHz:

> raid6: lasx     gen() 12726 MB/s
> raid6: lsx      gen() 10001 MB/s
> raid6: int64x8  gen()  2876 MB/s
> raid6: int64x4  gen()  3867 MB/s
> raid6: int64x2  gen()  2531 MB/s
> raid6: int64x1  gen()  1945 MB/s

Comparison of xor() speeds (from different boots but meaningful anyway):

> lasx:    11226 MB/s
> lsx:     6395 MB/s
> int64x4: 2147 MB/s

Performance as measured by raid6test:

> raid6: lasx     gen() 25109 MB/s
> raid6: lsx      gen() 13233 MB/s
> raid6: int64x8  gen()  4164 MB/s
> raid6: int64x4  gen()  6005 MB/s
> raid6: int64x2  gen()  5781 MB/s
> raid6: int64x1  gen()  4119 MB/s
> raid6: using algorithm lasx gen() 25109 MB/s
> raid6: .... xor() 14439 MB/s, rmw enabled

Acked-by: Song Liu <song@kernel.org>
Signed-off-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 include/linux/raid/pq.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index f29aaaf2eb21..874447485848 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -108,6 +108,8 @@ extern const struct raid6_calls raid6_vpermxor1;
 extern const struct raid6_calls raid6_vpermxor2;
 extern const struct raid6_calls raid6_vpermxor4;
 extern const struct raid6_calls raid6_vpermxor8;
+extern const struct raid6_calls raid6_lsx;
+extern const struct raid6_calls raid6_lasx;
 
 struct raid6_recov_calls {
 	void (*data2)(int, size_t, int, int, void **);
-- 
cgit v1.2.3


From f2091321044d9fbcadb93dfc1c9cf23e563ea40c Mon Sep 17 00:00:00 2001
From: WANG Xuerui <git@xen0n.name>
Date: Wed, 6 Sep 2023 22:53:55 +0800
Subject: raid6: Add LoongArch SIMD recovery implementation

Similar to the syndrome calculation, the recovery algorithms also work
on 64 bytes at a time to align with the L1 cache line size of current
and future LoongArch cores (that we care about). Which means
unrolled-by-4 LSX and unrolled-by-2 LASX code.

The assembly is originally based on the x86 SSSE3/AVX2 ports, but
register allocation has been redone to take advantage of LSX/LASX's 32
vector registers, and instruction sequence has been optimized to suit
(e.g. LoongArch can perform per-byte srl and andi on vectors, but x86
cannot).

Performance numbers measured by instrumenting the raid6test code, on a
3A5000 system clocked at 2.5GHz:

> lasx  2data: 354.987 MiB/s
> lasx  datap: 350.430 MiB/s
> lsx   2data: 340.026 MiB/s
> lsx   datap: 337.318 MiB/s
> intx1 2data: 164.280 MiB/s
> intx1 datap: 187.966 MiB/s

Because recovery algorithms are chosen solely based on priority and
availability, lasx is marked as priority 2 and lsx priority 1. At least
for the current generation of LoongArch micro-architectures, LASX should
always be faster than LSX whenever supported, and have similar power
consumption characteristics (because the only known LASX-capable uarch,
the LA464, always compute the full 256-bit result for vector ops).

Acked-by: Song Liu <song@kernel.org>
Signed-off-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 include/linux/raid/pq.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 874447485848..006e18decfad 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -125,6 +125,8 @@ extern const struct raid6_recov_calls raid6_recov_avx2;
 extern const struct raid6_recov_calls raid6_recov_avx512;
 extern const struct raid6_recov_calls raid6_recov_s390xc;
 extern const struct raid6_recov_calls raid6_recov_neon;
+extern const struct raid6_recov_calls raid6_recov_lsx;
+extern const struct raid6_recov_calls raid6_recov_lasx;
 
 extern const struct raid6_calls raid6_neonx1;
 extern const struct raid6_calls raid6_neonx2;
-- 
cgit v1.2.3


From 9b04c764af18a1dab6d48ca0671f70cdcccf90a2 Mon Sep 17 00:00:00 2001
From: Qing Zhang <zhangqing@loongson.cn>
Date: Wed, 6 Sep 2023 22:54:16 +0800
Subject: kasan: Add __HAVE_ARCH_SHADOW_MAP to support arch specific mapping

MIPS, LoongArch and some other architectures have many holes between
different segments and the valid address space (256T available) is
insufficient to map all these segments to kasan shadow memory with the
common formula provided by kasan core. So we need architecture specific
mapping formulas to ensure different segments are mapped individually,
and only limited space lengths of those specific segments are mapped to
shadow.

Therefore, when the incoming address is converted to a shadow, we need
to add a condition to determine whether it is valid.

Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Signed-off-by: Qing Zhang <zhangqing@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 include/linux/kasan.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 819b6bc8ac08..3df5499f7936 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -54,11 +54,13 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
 int kasan_populate_early_shadow(const void *shadow_start,
 				const void *shadow_end);
 
+#ifndef __HAVE_ARCH_SHADOW_MAP
 static inline void *kasan_mem_to_shadow(const void *addr)
 {
 	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
 		+ KASAN_SHADOW_OFFSET;
 }
+#endif
 
 int kasan_add_zero_shadow(void *start, unsigned long size);
 void kasan_remove_zero_shadow(void *start, unsigned long size);
-- 
cgit v1.2.3


From 08c6d8bae48c2c28f7017d7b61b5d5a1518ceb39 Mon Sep 17 00:00:00 2001
From: Lukasz Majewski <lukma@denx.de>
Date: Tue, 5 Sep 2023 11:33:15 +0200
Subject: net: phy: Provide Module 4 KSZ9477 errata (DS80000754C)

The KSZ9477 errata points out (in 'Module 4') the link up/down problems
when EEE (Energy Efficient Ethernet) is enabled in the device to which
the KSZ9477 tries to auto negotiate.

The suggested workaround is to clear advertisement of EEE for PHYs in
this chip driver.

To avoid regressions with other switch ICs the new MICREL_NO_EEE flag
has been introduced.

Moreover, the in-register disablement of MMD_DEVICE_ID_EEE_ADV.MMD_EEE_ADV
MMD register is removed, as this code is both; now executed too late
(after previous rework of the PHY and DSA for KSZ switches) and not
required as setting all members of eee_broken_modes bit field prevents
the KSZ9477 from advertising EEE.

Fixes: 69d3b36ca045 ("net: dsa: microchip: enable EEE support") # for KSZ9477
Signed-off-by: Lukasz Majewski <lukma@denx.de>
Tested-by: Oleksij Rempel <o.rempel@pengutronix.de> # Confirmed disabled EEE with oscilloscope.
Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/20230905093315.784052-1-lukma@denx.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/micrel_phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 322d87255984..4e27ca7c49de 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -44,6 +44,7 @@
 #define MICREL_PHY_50MHZ_CLK	BIT(0)
 #define MICREL_PHY_FXEN		BIT(1)
 #define MICREL_KSZ8_P1_ERRATA	BIT(2)
+#define MICREL_NO_EEE		BIT(3)
 
 #define MICREL_KSZ9021_EXTREG_CTRL	0xB
 #define MICREL_KSZ9021_EXTREG_DATA_WRITE	0xC
-- 
cgit v1.2.3


From 6afcf0fb92701487421aa73c692855aa70fbc796 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 7 Sep 2023 11:01:04 -0700
Subject: Revert "net: team: do not use dynamic lockdep key"

This reverts commit 39285e124edbc752331e98ace37cc141a6a3747a.

Looks like the change has unintended consequences in exposing
objects before they are initialized. Let's drop this patch
and try again in net-next.

Reported-by: syzbot+44ae022028805f4600fc@syzkaller.appspotmail.com
Fixes: 39285e124edb ("net: team: do not use dynamic lockdep key")
Link: https://lore.kernel.org/all/20230907103124.6adb7256@kernel.org/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/if_team.h | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index 12d4447fc8ab..1b9b15a492fa 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -221,38 +221,10 @@ struct team {
 		atomic_t count_pending;
 		struct delayed_work dw;
 	} mcast_rejoin;
+	struct lock_class_key team_lock_key;
 	long mode_priv[TEAM_MODE_PRIV_LONGS];
 };
 
-static inline void __team_lock(struct team *team)
-{
-	mutex_lock(&team->lock);
-}
-
-static inline int team_trylock(struct team *team)
-{
-	return mutex_trylock(&team->lock);
-}
-
-#ifdef CONFIG_LOCKDEP
-static inline void team_lock(struct team *team)
-{
-	ASSERT_RTNL();
-	mutex_lock_nested(&team->lock, team->dev->nested_level);
-}
-
-#else
-static inline void team_lock(struct team *team)
-{
-	__team_lock(team);
-}
-#endif
-
-static inline void team_unlock(struct team *team)
-{
-	mutex_unlock(&team->lock);
-}
-
 static inline int team_dev_queue_xmit(struct team *team, struct team_port *port,
 				      struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From f94cf2206b066bd6d761d3347fd35f77b828c376 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 7 Sep 2023 09:40:07 -0400
Subject: buffer: Make bh_offset() work for compound pages

If the buffer pointed to by the buffer_head is part of a compound page,
bh_offset() assumes that b_page is the precise page that contains
the data.  A recent change to jbd2 inadvertently violated that assumption.

By using page_size(), we support both b_page being set to the head page
(as page_size() will return the size of the entire folio) and the precise
page (as it will return PAGE_SIZE for a tail page).

Fixes: 8147c4c4546f ("jbd2: use a folio in jbd2_journal_write_metadata_buffer()")
Reported-by: Zorro Lang <zlang@kernel.org>
Tested-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/buffer_head.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 6cb3e9af78c9..4ba242073adc 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -173,7 +173,10 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh)
 	return test_bit_acquire(BH_Uptodate, &bh->b_state);
 }
 
-#define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
+static inline unsigned long bh_offset(const struct buffer_head *bh)
+{
+	return (unsigned long)(bh)->b_data & (page_size(bh->b_page) - 1);
+}
 
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)					\
-- 
cgit v1.2.3


From 41a5db8d8161457b121a03fde999ff6e00090ee2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:34 -0700
Subject: bpf: Add support for non-fix-size percpu mem allocation

This is needed for later percpu mem allocation when the
allocation is done by bpf program. For such cases, a global
bpf_global_percpu_ma is added where a flexible allocation
size is needed.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152734.1995725-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 024e8b28c34b..440dd1f59a1c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -55,8 +55,8 @@ struct cgroup;
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 extern struct kobject *btf_kobj;
-extern struct bpf_mem_alloc bpf_global_ma;
-extern bool bpf_global_ma_set;
+extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
+extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
 
 typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
-- 
cgit v1.2.3


From 55db92f42fe4a4ef7b4c2b4960c6212c8512dd53 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:39 -0700
Subject: bpf: Add BPF_KPTR_PERCPU as a field type

BPF_KPTR_PERCPU represents a percpu field type like below

  struct val_t {
    ... fields ...
  };
  struct t {
    ...
    struct val_t __percpu_kptr *percpu_data_ptr;
    ...
  };

where
  #define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr")))

While BPF_KPTR_REF points to a trusted kernel object or a trusted
local object, BPF_KPTR_PERCPU points to a trusted local
percpu object.

This patch added basic support for BPF_KPTR_PERCPU
related to percpu_kptr field parsing, recording and free operations.
BPF_KPTR_PERCPU also supports the same map types
as BPF_KPTR_REF does.

Note that unlike a local kptr, it is possible that
a BPF_KTPR_PERCPU struct may not contain any
special fields like other kptr, bpf_spin_lock, bpf_list_head, etc.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152739.1996391-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 440dd1f59a1c..87eeb3a46a1d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -180,14 +180,15 @@ enum btf_field_type {
 	BPF_TIMER      = (1 << 1),
 	BPF_KPTR_UNREF = (1 << 2),
 	BPF_KPTR_REF   = (1 << 3),
-	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF,
-	BPF_LIST_HEAD  = (1 << 4),
-	BPF_LIST_NODE  = (1 << 5),
-	BPF_RB_ROOT    = (1 << 6),
-	BPF_RB_NODE    = (1 << 7),
+	BPF_KPTR_PERCPU = (1 << 4),
+	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_PERCPU,
+	BPF_LIST_HEAD  = (1 << 5),
+	BPF_LIST_NODE  = (1 << 6),
+	BPF_RB_ROOT    = (1 << 7),
+	BPF_RB_NODE    = (1 << 8),
 	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
 				 BPF_RB_NODE | BPF_RB_ROOT,
-	BPF_REFCOUNT   = (1 << 8),
+	BPF_REFCOUNT   = (1 << 9),
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
@@ -300,6 +301,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 		return "kptr";
+	case BPF_KPTR_PERCPU:
+		return "percpu_kptr";
 	case BPF_LIST_HEAD:
 		return "bpf_list_head";
 	case BPF_LIST_NODE:
@@ -325,6 +328,7 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_timer);
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_PERCPU:
 		return sizeof(u64);
 	case BPF_LIST_HEAD:
 		return sizeof(struct bpf_list_head);
@@ -351,6 +355,7 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_timer);
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_PERCPU:
 		return __alignof__(u64);
 	case BPF_LIST_HEAD:
 		return __alignof__(struct bpf_list_head);
@@ -389,6 +394,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
 	case BPF_TIMER:
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_PERCPU:
 		break;
 	default:
 		WARN_ON_ONCE(1);
-- 
cgit v1.2.3


From 01cc55af93884f1ff5a883426e1924378dfcc62a Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:49 -0700
Subject: bpf: Add bpf_this_cpu_ptr/bpf_per_cpu_ptr support for allocated
 percpu obj

The bpf helpers bpf_this_cpu_ptr() and bpf_per_cpu_ptr() are re-purposed
for allocated percpu objects. For an allocated percpu obj,
the reg type is 'PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU'.

The return type for these two re-purposed helpera is
'PTR_TO_MEM | MEM_RCU | MEM_ALLOC'.
The MEM_ALLOC allows that the per-cpu data can be read and written.

Since the memory allocator bpf_mem_alloc() returns
a ptr to a percpu ptr for percpu data, the first argument
of bpf_this_cpu_ptr() and bpf_per_cpu_ptr() is patched
with a dereference before passing to the helper func.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152749.1997202-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b6e58dab8e27..a3236651ec64 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -480,6 +480,7 @@ struct bpf_insn_aux_data {
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
 	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
+	bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
 	u8 alu_state; /* used in combination with alu_limit */
 
 	/* below fields are initialized once */
-- 
cgit v1.2.3


From 6fdac58c560e4d164eb8161987bee045147cabe4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 7 Sep 2023 22:19:12 -0400
Subject: tracing: Remove unused trace_event_file dir field

Now that eventfs structure is used to create the events directory via the
eventfs dynamically allocate code, the "dir" field of the trace_event_file
structure is no longer used. Remove it.

Link: https://lkml.kernel.org/r/20230908022001.580400115@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index eb5c3add939b..12f875e9e69a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -650,7 +650,6 @@ struct trace_event_file {
 	struct trace_event_call		*event_call;
 	struct event_filter __rcu	*filter;
 	struct eventfs_file             *ef;
-	struct dentry			*dir;
 	struct trace_array		*tr;
 	struct trace_subsystem_dir	*system;
 	struct list_head		triggers;
-- 
cgit v1.2.3


From 5d153cd128251aaedc8e9657f0a949ec94952055 Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Fri, 8 Sep 2023 16:34:59 -0500
Subject: spnego: add missing OID to oid registry

Add missing OID to the registry. Some servers and clients (including
Windows) now request "NEGOEX - SPNEGEO Extended Negotiation Security")

See https://datatracker.ietf.org/doc/html/draft-zhu-negoex-02

Reviewed-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 include/linux/oid_registry.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 0f4a8903922a..f86a08ba0207 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -67,6 +67,7 @@ enum OID {
 	OID_msOutlookExpress,		/* 1.3.6.1.4.1.311.16.4 */
 
 	OID_ntlmssp,			/* 1.3.6.1.4.1.311.2.2.10 */
+	OID_negoex,			/* 1.3.6.1.4.1.311.2.2.30 */
 
 	OID_spnego,			/* 1.3.6.1.5.5.2 */
 
-- 
cgit v1.2.3


From 24e0e61db3cb86a66824531989f1df80e0939f26 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@wdc.com>
Date: Mon, 4 Sep 2023 22:42:56 +0200
Subject: ata: libata: disallow dev-initiated LPM transitions to unsupported
 states
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In AHCI 1.3.1, the register description for CAP.SSC:
"When cleared to ‘0’, software must not allow the HBA to initiate
transitions to the Slumber state via agressive link power management nor
the PxCMD.ICC field in each port, and the PxSCTL.IPM field in each port
must be programmed to disallow device initiated Slumber requests."

In AHCI 1.3.1, the register description for CAP.PSC:
"When cleared to ‘0’, software must not allow the HBA to initiate
transitions to the Partial state via agressive link power management nor
the PxCMD.ICC field in each port, and the PxSCTL.IPM field in each port
must be programmed to disallow device initiated Partial requests."

Ensure that we always set the corresponding bits in PxSCTL.IPM, such that
a device is not allowed to initiate transitions to power states which are
unsupported by the HBA.

DevSleep is always initiated by the HBA, however, for completeness, set the
corresponding bit in PxSCTL.IPM such that agressive link power management
cannot transition to DevSleep if DevSleep is not supported.

sata_link_scr_lpm() is used by libahci, ata_piix and libata-pmp.
However, only libahci has the ability to read the CAP/CAP2 register to see
if these features are supported. Therefore, in order to not introduce any
regressions on ata_piix or libata-pmp, create flags that indicate that the
respective feature is NOT supported. This way, the behavior for ata_piix
and libata-pmp should remain unchanged.

This change is based on a patch originally submitted by Runa Guo-oc.

Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Fixes: 1152b2617a6e ("libata: implement sata_link_scr_lpm() and make ata_dev_set_feature() global")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 52d58b13e5ee..bf4913f4d7ac 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -222,6 +222,10 @@ enum {
 	ATA_HOST_PARALLEL_SCAN	= (1 << 2),	/* Ports on this host can be scanned in parallel */
 	ATA_HOST_IGNORE_ATA	= (1 << 3),	/* Ignore ATA devices on this host. */
 
+	ATA_HOST_NO_PART	= (1 << 4), /* Host does not support partial */
+	ATA_HOST_NO_SSC		= (1 << 5), /* Host does not support slumber */
+	ATA_HOST_NO_DEVSLP	= (1 << 6), /* Host does not support devslp */
+
 	/* bits 24:31 of host->flags are reserved for LLD specific flags */
 
 	/* various lengths of time */
-- 
cgit v1.2.3


From 2f4d3e293392571e02b106c8b431b638bd029276 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 1 Sep 2023 16:40:32 +0300
Subject: gpio: pca953x: Drop unused fields in struct pca953x_platform_data

New code should solely use firmware nodes for the specifics and
not any callbacks.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/platform_data/pca953x.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/pca953x.h b/include/linux/platform_data/pca953x.h
index 96c1a14ab365..3c3787c4d96c 100644
--- a/include/linux/platform_data/pca953x.h
+++ b/include/linux/platform_data/pca953x.h
@@ -11,21 +11,8 @@ struct pca953x_platform_data {
 	/* number of the first GPIO */
 	unsigned	gpio_base;
 
-	/* initial polarity inversion setting */
-	u32		invert;
-
 	/* interrupt base */
 	int		irq_base;
-
-	void		*context;	/* param to setup/teardown */
-
-	int		(*setup)(struct i2c_client *client,
-				unsigned gpio, unsigned ngpio,
-				void *context);
-	void		(*teardown)(struct i2c_client *client,
-				unsigned gpio, unsigned ngpio,
-				void *context);
-	const char	*const *names;
 };
 
 #endif /* _LINUX_PCA953X_H */
-- 
cgit v1.2.3


From cf8e8658100d4eae80ce9b21f7a81cb024dd5057 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Thu, 20 Oct 2022 15:54:33 +0200
Subject: arch: Remove Itanium (IA-64) architecture

The Itanium architecture is obsolete, and an informal survey [0] reveals
that any residual use of Itanium hardware in production is mostly HP-UX
or OpenVMS based. The use of Linux on Itanium appears to be limited to
enthusiasts that occasionally boot a fresh Linux kernel to see whether
things are still working as intended, and perhaps to churn out some
distro packages that are rarely used in practice.

None of the original companies behind Itanium still produce or support
any hardware or software for the architecture, and it is listed as
'Orphaned' in the MAINTAINERS file, as apparently, none of the engineers
that contributed on behalf of those companies (nor anyone else, for that
matter) have been willing to support or maintain the architecture
upstream or even be responsible for applying the odd fix. The Intel
firmware team removed all IA-64 support from the Tianocore/EDK2
reference implementation of EFI in 2018. (Itanium is the original
architecture for which EFI was developed, and the way Linux supports it
deviates significantly from other architectures.) Some distros, such as
Debian and Gentoo, still maintain [unofficial] ia64 ports, but many have
dropped support years ago.

While the argument is being made [1] that there is a 'for the common
good' angle to being able to build and run existing projects such as the
Grid Community Toolkit [2] on Itanium for interoperability testing, the
fact remains that none of those projects are known to be deployed on
Linux/ia64, and very few people actually have access to such a system in
the first place. Even if there were ways imaginable in which Linux/ia64
could be put to good use today, what matters is whether anyone is
actually doing that, and this does not appear to be the case.

There are no emulators widely available, and so boot testing Itanium is
generally infeasible for ordinary contributors. GCC still supports IA-64
but its compile farm [3] no longer has any IA-64 machines. GLIBC would
like to get rid of IA-64 [4] too because it would permit some overdue
code cleanups. In summary, the benefits to the ecosystem of having IA-64
be part of it are mostly theoretical, whereas the maintenance overhead
of keeping it supported is real.

So let's rip off the band aid, and remove the IA-64 arch code entirely.
This follows the timeline proposed by the Debian/ia64 maintainer [5],
which removes support in a controlled manner, leaving IA-64 in a known
good state in the most recent LTS release. Other projects will follow
once the kernel support is removed.

[0] https://lore.kernel.org/all/CAMj1kXFCMh_578jniKpUtx_j8ByHnt=s7S+yQ+vGbKt9ud7+kQ@mail.gmail.com/
[1] https://lore.kernel.org/all/0075883c-7c51-00f5-2c2d-5119c1820410@web.de/
[2] https://gridcf.org/gct-docs/latest/index.html
[3] https://cfarm.tetaneutral.net/machines/list/
[4] https://lore.kernel.org/all/87bkiilpc4.fsf@mid.deneb.enyo.de/
[5] https://lore.kernel.org/all/ff58a3e76e5102c94bb5946d99187b358def688a.camel@physik.fu-berlin.de/

Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/acpi.h        | 9 +--------
 include/linux/efi.h         | 7 -------
 include/linux/mm.h          | 2 --
 include/linux/moduleparam.h | 2 +-
 4 files changed, 2 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a73246c3c35e..9bcf5641a7cf 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -259,7 +259,7 @@ void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);
 /* the following numa functions are architecture-dependent */
 void acpi_numa_slit_init (struct acpi_table_slit *slit);
 
-#if defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_LOONGARCH)
+#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
 void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa);
 #else
 static inline void
@@ -1114,15 +1114,8 @@ static inline int acpi_get_lps0_constraint(struct device *dev)
 	return ACPI_STATE_UNKNOWN;
 }
 #endif /* CONFIG_SUSPEND && CONFIG_X86 */
-#ifndef CONFIG_IA64
 void arch_reserve_mem_area(acpi_physical_address addr, size_t size);
 #else
-static inline void arch_reserve_mem_area(acpi_physical_address addr,
-					  size_t size)
-{
-}
-#endif /* CONFIG_X86 */
-#else
 #define acpi_os_set_prepare_sleep(func, pm1a_ctrl, pm1b_ctrl) do { } while (0)
 #endif
 
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 80b21d1c6eaf..9cc5bf32f6f2 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -358,13 +358,10 @@ void efi_native_runtime_setup(void);
  * where the UEFI SPEC breaks the line.
  */
 #define NULL_GUID				EFI_GUID(0x00000000, 0x0000, 0x0000,  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00)
-#define MPS_TABLE_GUID				EFI_GUID(0xeb9d2d2f, 0x2d88, 0x11d3,  0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
 #define ACPI_TABLE_GUID				EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3,  0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
 #define ACPI_20_TABLE_GUID			EFI_GUID(0x8868e871, 0xe4f1, 0x11d3,  0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81)
 #define SMBIOS_TABLE_GUID			EFI_GUID(0xeb9d2d31, 0x2d88, 0x11d3,  0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
 #define SMBIOS3_TABLE_GUID			EFI_GUID(0xf2fd1544, 0x9794, 0x4a2c,  0x99, 0x2e, 0xe5, 0xbb, 0xcf, 0x20, 0xe3, 0x94)
-#define SAL_SYSTEM_TABLE_GUID			EFI_GUID(0xeb9d2d32, 0x2d88, 0x11d3,  0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
-#define HCDP_TABLE_GUID				EFI_GUID(0xf951938d, 0x620b, 0x42ef,  0x82, 0x79, 0xa8, 0x4b, 0x79, 0x61, 0x78, 0x98)
 #define UGA_IO_PROTOCOL_GUID			EFI_GUID(0x61a4d49e, 0x6f68, 0x4f1b,  0xb9, 0x22, 0xa8, 0x6e, 0xed, 0x0b, 0x07, 0xa2)
 #define EFI_GLOBAL_VARIABLE_GUID		EFI_GUID(0x8be4df61, 0x93ca, 0x11d2,  0xaa, 0x0d, 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c)
 #define UV_SYSTEM_TABLE_GUID			EFI_GUID(0x3b13a7d4, 0x633e, 0x11dd,  0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93)
@@ -851,10 +848,6 @@ static inline int efi_range_is_wc(unsigned long start, unsigned long len)
 	return 1;
 }
 
-#ifdef CONFIG_EFI_PCDP
-extern int __init efi_setup_pcdp_console(char *);
-#endif
-
 /*
  * We play games with efi_enabled so that the compiler will, if
  * possible, remove EFI-related code altogether.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..17d530e12f70 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -362,8 +362,6 @@ extern unsigned int kobjsize(const void *objp);
 # define VM_SAO		VM_ARCH_1	/* Strong Access Ordering (powerpc) */
 #elif defined(CONFIG_PARISC)
 # define VM_GROWSUP	VM_ARCH_1
-#elif defined(CONFIG_IA64)
-# define VM_GROWSUP	VM_ARCH_1
 #elif defined(CONFIG_SPARC64)
 # define VM_SPARC_ADI	VM_ARCH_1	/* Uses ADI tag for access control */
 # define VM_ARCH_CLEAR	VM_SPARC_ADI
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 962cd41a2cb5..99e4f1f718c7 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -276,7 +276,7 @@ struct kparam_array
    read-only sections (which is part of respective UNIX ABI on these
    platforms). So 'const' makes no sense and even causes compile failures
    with some compilers. */
-#if defined(CONFIG_ALPHA) || defined(CONFIG_IA64) || defined(CONFIG_PPC64)
+#if defined(CONFIG_ALPHA) || defined(CONFIG_PPC64)
 #define __moduleparam_const
 #else
 #define __moduleparam_const const
-- 
cgit v1.2.3


From f5e836884d8e55b416dfad55c29481ec1b65c1f0 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 13 Jan 2023 17:57:47 +0100
Subject: kernel: Drop IA64 support from sig_fault handlers

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/sched/signal.h | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..155332977239 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -303,20 +303,11 @@ static inline void kernel_signal_stop(void)
 
 	schedule();
 }
-#ifdef __ia64__
-# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3
-#else
-# define ___ARCH_SI_IA64(_a1, _a2, _a3)
-#endif
 
-int force_sig_fault_to_task(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-	, struct task_struct *t);
-int force_sig_fault(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr));
-int send_sig_fault(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-	, struct task_struct *t);
+int force_sig_fault_to_task(int sig, int code, void __user *addr,
+			    struct task_struct *t);
+int force_sig_fault(int sig, int code, void __user *addr);
+int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t);
 
 int force_sig_mceerr(int code, void __user *, short);
 int send_sig_mceerr(int code, void __user *, short, struct task_struct *);
-- 
cgit v1.2.3


From b089ea3cc30de85ea7e20aa66500feb4082dfbf7 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 13 Jan 2023 18:08:32 +0100
Subject: lib/raid6: Drop IA64 support

Drop Itanium support from the RAID6 code, and along with it, the 16x and
32x unrolled versions, which were only used by IA64.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/raid/pq.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 006e18decfad..98030accf641 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -84,8 +84,6 @@ extern const struct raid6_calls raid6_intx1;
 extern const struct raid6_calls raid6_intx2;
 extern const struct raid6_calls raid6_intx4;
 extern const struct raid6_calls raid6_intx8;
-extern const struct raid6_calls raid6_intx16;
-extern const struct raid6_calls raid6_intx32;
 extern const struct raid6_calls raid6_mmxx1;
 extern const struct raid6_calls raid6_mmxx2;
 extern const struct raid6_calls raid6_sse1x1;
-- 
cgit v1.2.3


From f42dafe3da0cd887c9d2aaa59576f2a92ee4d876 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Sun, 3 Sep 2023 21:06:57 +0200
Subject: gpiolib: unexport gpiod_set_transitory()

There are no and never have been any users of gpiod_set_transitory()
outside the core GPIOLIB code. Make it private.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/consumer.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 1c4385a00f88..6cc345440a5b 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -159,7 +159,6 @@ int gpiod_set_raw_array_value_cansleep(unsigned int array_size,
 
 int gpiod_set_config(struct gpio_desc *desc, unsigned long config);
 int gpiod_set_debounce(struct gpio_desc *desc, unsigned int debounce);
-int gpiod_set_transitory(struct gpio_desc *desc, bool transitory);
 void gpiod_toggle_active_low(struct gpio_desc *desc);
 
 int gpiod_is_active_low(const struct gpio_desc *desc);
@@ -494,13 +493,6 @@ static inline int gpiod_set_debounce(struct gpio_desc *desc, unsigned int deboun
 	return -ENOSYS;
 }
 
-static inline int gpiod_set_transitory(struct gpio_desc *desc, bool transitory)
-{
-	/* GPIO can never have been requested */
-	WARN_ON(desc);
-	return -ENOSYS;
-}
-
 static inline void gpiod_toggle_active_low(struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-- 
cgit v1.2.3


From 8de54392b849a612f337044d81d9859ee95ab871 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Fri, 1 Sep 2023 13:35:20 +0200
Subject: gpiolib: remove stray newline in gpio/driver.h

Fix a double newline in the GPIO provider header.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio/driver.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 4f0c5d62c8f3..1571cfca65e7 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -764,7 +764,6 @@ void gpiochip_free_own_desc(struct gpio_desc *desc);
 int gpiochip_lock_as_irq(struct gpio_chip *gc, unsigned int offset);
 void gpiochip_unlock_as_irq(struct gpio_chip *gc, unsigned int offset);
 
-
 struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc);
 
 #else /* CONFIG_GPIOLIB */
-- 
cgit v1.2.3


From 37d42ab3924919652858f836a80ab49ec7d11f1e Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Fri, 1 Sep 2023 13:34:58 +0200
Subject: gpiolib: remove unnecessary extern specifiers from the driver header

'extern' doesn't do anything for function declarations. Remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 1571cfca65e7..b721422f4bfa 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -529,8 +529,7 @@ struct gpio_chip {
 #endif /* CONFIG_OF_GPIO */
 };
 
-extern const char *gpiochip_is_requested(struct gpio_chip *gc,
-			unsigned int offset);
+const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset);
 
 /**
  * for_each_requested_gpio_in_range - iterates over requested GPIOs in a given range
@@ -549,9 +548,9 @@ extern const char *gpiochip_is_requested(struct gpio_chip *gc,
 	for_each_requested_gpio_in_range(chip, i, 0, chip->ngpio, label)
 
 /* add/remove chips */
-extern int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
-				      struct lock_class_key *lock_key,
-				      struct lock_class_key *request_key);
+int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
+			       struct lock_class_key *lock_key,
+			       struct lock_class_key *request_key);
 
 /**
  * gpiochip_add_data() - register a gpio_chip
@@ -599,13 +598,13 @@ static inline int gpiochip_add(struct gpio_chip *gc)
 {
 	return gpiochip_add_data(gc, NULL);
 }
-extern void gpiochip_remove(struct gpio_chip *gc);
-extern int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc, void *data,
-					   struct lock_class_key *lock_key,
-					   struct lock_class_key *request_key);
+void gpiochip_remove(struct gpio_chip *gc);
+int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc,
+				    void *data, struct lock_class_key *lock_key,
+				    struct lock_class_key *request_key);
 
-extern struct gpio_chip *gpiochip_find(void *data,
-			      int (*match)(struct gpio_chip *gc, void *data));
+struct gpio_chip *gpiochip_find(void *data,
+				int (*match)(struct gpio_chip *gc, void *data));
 
 bool gpiochip_line_is_irq(struct gpio_chip *gc, unsigned int offset);
 int gpiochip_reqres_irq(struct gpio_chip *gc, unsigned int offset);
-- 
cgit v1.2.3


From 2ae5c9248e06dac2c2360be26b4e25f673238337 Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Thu, 31 Aug 2023 11:22:58 -0700
Subject: wifi: mac80211: Use flexible array in struct ieee80211_tim_ie

Currently struct ieee80211_tim_ie defines:
	u8 virtual_map[1];

Per the guidance in [1] change this to be a flexible array.

Per the discussion in [2] wrap the virtual_map in a union with a u8
item in order to preserve the existing expectation that the
virtual_map must contain at least one octet (at least when used in a
non-S1G PPDU). This means that no driver changes are required.

[1] https://docs.kernel.org/process/deprecated.html#zero-length-and-one-element-arrays
[2] https://lore.kernel.org/linux-wireless/202308301529.AC90A9EF98@keescook/

Suggested-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20230831-ieee80211_tim_ie-v3-2-e10ff584ab5d@quicinc.com
[add wifi prefix]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index bd2f6e19c357..340d7e0f6bf7 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -951,17 +951,24 @@ struct ieee80211_wide_bw_chansw_ie {
  * @dtim_count: DTIM Count
  * @dtim_period: DTIM Period
  * @bitmap_ctrl: Bitmap Control
+ * @required_octet: "Syntatic sugar" to force the struct size to the
+ *                  minimum valid size when carried in a non-S1G PPDU
  * @virtual_map: Partial Virtual Bitmap
  *
  * This structure represents the payload of the "TIM element" as
- * described in IEEE Std 802.11-2020 section 9.4.2.5.
+ * described in IEEE Std 802.11-2020 section 9.4.2.5. Note that this
+ * definition is only applicable when the element is carried in a
+ * non-S1G PPDU. When the TIM is carried in an S1G PPDU, the Bitmap
+ * Control and Partial Virtual Bitmap may not be present.
  */
 struct ieee80211_tim_ie {
 	u8 dtim_count;
 	u8 dtim_period;
 	u8 bitmap_ctrl;
-	/* variable size: 1 - 251 bytes */
-	u8 virtual_map[1];
+	union {
+		u8 required_octet;
+		DECLARE_FLEX_ARRAY(u8, virtual_map);
+	};
 } __packed;
 
 /**
-- 
cgit v1.2.3


From 0d423c4a78984dd02f6596d6fd9dd40446eec517 Mon Sep 17 00:00:00 2001
From: Alexey Romanov <avromanov@salutedevices.com>
Date: Wed, 30 Aug 2023 17:08:50 +0300
Subject: drivers: meson: sm: correct meson_sm_* API retval handling

1. Following the ARM SMC32 calling convention, the return value
from secure monitor is a 32-bit signed integer. This patch changes
the type of the return value of the function meson_sm_call().

2. Now, when meson_sm_call() returns a 32-bit signed integer, we need
to ensure that this value is not negative. It is important to check
that the return value is not negative in both the meson_sm_call_read()
and meson_sm_call_write() functions.

3. Add a comment explaining why it is necessary to check if the SMC
return value is equal to 0 in the function meson_sm_call_read().
It is not obvious when reading this code.

Signed-off-by: Alexey Romanov <avromanov@salutedevices.com>
Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
Link: https://lore.kernel.org/r/20230830140850.17130-1-avromanov@salutedevices.com
Signed-off-by: Neil Armstrong <neil.armstrong@linaro.org>
---
 include/linux/firmware/meson/meson_sm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/meson/meson_sm.h b/include/linux/firmware/meson/meson_sm.h
index 95b0da2326a9..8eaf8922ab02 100644
--- a/include/linux/firmware/meson/meson_sm.h
+++ b/include/linux/firmware/meson/meson_sm.h
@@ -19,7 +19,7 @@ enum {
 struct meson_sm_firmware;
 
 int meson_sm_call(struct meson_sm_firmware *fw, unsigned int cmd_index,
-		  u32 *ret, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
+		  s32 *ret, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
 int meson_sm_call_write(struct meson_sm_firmware *fw, void *buffer,
 			unsigned int b_size, unsigned int cmd_index, u32 arg0,
 			u32 arg1, u32 arg2, u32 arg3, u32 arg4);
-- 
cgit v1.2.3


From 3e15dcf77b23b8e9b9b7f3c0d4def8fe9c12c534 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 8 Sep 2023 16:28:59 +0300
Subject: fs: rename __mnt_{want,drop}_write*() helpers

Before exporting these helpers to modules, make their names more
meaningful.

The names mnt_{get,put)_write_access*() were chosen, because they rhyme
with the inode {get,put)_write_access() helpers, which have a very close
meaning for the inode object.

Suggested-by: Christian Brauner <brauner@kernel.org>
Link: https://lore.kernel.org/r/20230817-anfechtbar-ruhelosigkeit-8c6cca8443fc@brauner/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Message-Id: <20230908132900.2983519-2-amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/mount.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mount.h b/include/linux/mount.h
index 4f40b40306d0..ac3dd2876197 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -92,8 +92,8 @@ extern bool __mnt_is_readonly(struct vfsmount *mnt);
 extern bool mnt_may_suid(struct vfsmount *mnt);
 
 extern struct vfsmount *clone_private_mount(const struct path *path);
-extern int __mnt_want_write(struct vfsmount *);
-extern void __mnt_drop_write(struct vfsmount *);
+int mnt_get_write_access(struct vfsmount *mnt);
+void mnt_put_write_access(struct vfsmount *mnt);
 
 extern struct vfsmount *fc_mount(struct fs_context *fc);
 extern struct vfsmount *vfs_create_mount(struct fs_context *fc);
-- 
cgit v1.2.3


From fa671e4f1556e2c18e5443f777a75ae041290068 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Thu, 7 Sep 2023 10:52:03 +0200
Subject: fbdev/core: Unexport logo helpers

The interfaces for the fbdev logo are not used outside of the fbdev
module. Hence declare the fbdev logo functions in the internal header
file and remove their symbol exports. Only build the functions if
CONFIG_LOGO has been selected.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Acked-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230907085408.9354-5-tzimmermann@suse.de
---
 include/linux/fb.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fb.h b/include/linux/fb.h
index 16c3e6d6c55d..d110676c9c83 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -591,8 +591,6 @@ extern ssize_t fb_sys_write(struct fb_info *info, const char __user *buf,
 /* drivers/video/fbmem.c */
 extern int register_framebuffer(struct fb_info *fb_info);
 extern void unregister_framebuffer(struct fb_info *fb_info);
-extern int fb_prepare_logo(struct fb_info *fb_info, int rotate);
-extern int fb_show_logo(struct fb_info *fb_info, int rotate);
 extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size);
 extern void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx,
 				u32 height, u32 shift_high, u32 shift_low, u32 mod);
@@ -603,9 +601,6 @@ extern int fb_get_color_depth(struct fb_var_screeninfo *var,
 extern int fb_get_options(const char *name, char **option);
 extern int fb_new_modelist(struct fb_info *info);
 
-extern bool fb_center_logo;
-extern int fb_logo_count;
-
 static inline void lock_fb_info(struct fb_info *info)
 {
 	mutex_lock(&info->lock);
-- 
cgit v1.2.3


From ebc7abb35b258152d4a424f89d7c03db1d7ce61c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 7 Sep 2023 20:18:56 +0200
Subject: thermal: Constify the trip argument of the .get_trend() zone callback

Add 'const' to the definition of the 'trip' argument of the
.get_trend() thermal zone callback to indicate that the trip point
passed to it should not be modified by it and adjust the
callback functions implementing it, thermal_get_trend() in the
ACPI thermal driver and __ti_thermal_get_trend(), accordingly.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Michal Wilczynski <michal.wilczynski@intel.com>
---
 include/linux/thermal.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index c99440aac1a1..a5ae4af955ff 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -80,8 +80,8 @@ struct thermal_zone_device_ops {
 	int (*set_trip_hyst) (struct thermal_zone_device *, int, int);
 	int (*get_crit_temp) (struct thermal_zone_device *, int *);
 	int (*set_emul_temp) (struct thermal_zone_device *, int);
-	int (*get_trend) (struct thermal_zone_device *, struct thermal_trip *,
-			  enum thermal_trend *);
+	int (*get_trend) (struct thermal_zone_device *,
+			  const struct thermal_trip *, enum thermal_trend *);
 	void (*hot)(struct thermal_zone_device *);
 	void (*critical)(struct thermal_zone_device *);
 };
-- 
cgit v1.2.3


From ef7d9593390a050c50eba5fc02d2cb65a1104434 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Mon, 11 Sep 2023 08:39:04 -0700
Subject: xfs: remove CPU hotplug infrastructure

There are no users of the cpu hotplug hooks in xfs now, so remove it.
This reverts f1653c2e2831e ("xfs: introduce CPU hotplug
infrastructure").

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 include/linux/cpuhotplug.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 06dda85f0424..068f7738be22 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -90,7 +90,6 @@ enum cpuhp_state {
 	CPUHP_FS_BUFF_DEAD,
 	CPUHP_PRINTK_DEAD,
 	CPUHP_MM_MEMCQ_DEAD,
-	CPUHP_XFS_DEAD,
 	CPUHP_PERCPU_CNT_DEAD,
 	CPUHP_RADIX_DEAD,
 	CPUHP_PAGE_ALLOC,
-- 
cgit v1.2.3


From 5b404fdabacf4bee92d8c66013402a85f18a6a10 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 15 Aug 2023 15:46:32 -0700
Subject: rcu: Add RCU CPU stall notifier

It is sometimes helpful to have a way for the subsystem causing
the stall to dump its state when an RCU CPU stall occurs.  This
commit therefore bases rcu_stall_chain_notifier_register() and
rcu_stall_chain_notifier_unregister() on atomic notifiers in order to
provide this functionality.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/rcu_notifier.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 include/linux/rcu_notifier.h

(limited to 'include/linux')

diff --git a/include/linux/rcu_notifier.h b/include/linux/rcu_notifier.h
new file mode 100644
index 000000000000..ebf371364581
--- /dev/null
+++ b/include/linux/rcu_notifier.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Read-Copy Update notifiers, initially RCU CPU stall notifier.
+ * Separate from rcupdate.h to avoid #include loops.
+ *
+ * Copyright (C) 2023 Paul E. McKenney.
+ */
+
+#ifndef __LINUX_RCU_NOTIFIER_H
+#define __LINUX_RCU_NOTIFIER_H
+
+// Actions for RCU CPU stall notifier calls.
+#define RCU_STALL_NOTIFY_NORM	1
+#define RCU_STALL_NOTIFY_EXP	2
+
+#ifdef CONFIG_RCU_STALL_COMMON
+
+#include <linux/notifier.h>
+#include <linux/types.h>
+
+int rcu_stall_chain_notifier_register(struct notifier_block *n);
+int rcu_stall_chain_notifier_unregister(struct notifier_block *n);
+
+#else // #ifdef CONFIG_RCU_STALL_COMMON
+
+// No RCU CPU stall warnings in Tiny RCU.
+static inline int rcu_stall_chain_notifier_register(struct notifier_block *n) { return -EEXIST; }
+static inline int rcu_stall_chain_notifier_unregister(struct notifier_block *n) { return -ENOENT; }
+
+#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON
+
+#endif /* __LINUX_RCU_NOTIFIER_H */
-- 
cgit v1.2.3


From fc52a64416b010c8324e2cb50070faae868521c1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 8 Sep 2023 16:39:29 -0400
Subject: tracing/synthetic: Fix order of struct trace_dynamic_info

To make handling BIG and LITTLE endian better the offset/len of dynamic
fields of the synthetic events was changed into a structure of:

 struct trace_dynamic_info {
 #ifdef CONFIG_CPU_BIG_ENDIAN
	u16	offset;
	u16	len;
 #else
	u16	len;
	u16	offset;
 #endif
 };

to replace the manual changes of:

 data_offset = offset & 0xffff;
 data_offest = len << 16;

But if you look closely, the above is:

  <len> << 16 | offset

Which in little endian would be in memory:

 offset_lo offset_hi len_lo len_hi

and in big endian:

 len_hi len_lo offset_hi offset_lo

Which if broken into a structure would be:

 struct trace_dynamic_info {
 #ifdef CONFIG_CPU_BIG_ENDIAN
	u16	len;
	u16	offset;
 #else
	u16	offset;
	u16	len;
 #endif
 };

Which is the opposite of what was defined.

Fix this and just to be safe also add "__packed".

Link: https://lore.kernel.org/all/20230908154417.5172e343@gandalf.local.home/
Link: https://lore.kernel.org/linux-trace-kernel/20230908163929.2c25f3dc@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Mark Rutland <mark.rutland@arm.com>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Fixes: ddeea494a16f3 ("tracing/synthetic: Use union instead of casts")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 12f875e9e69a..21ae37e49319 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -62,13 +62,13 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...);
 /* Used to find the offset and length of dynamic fields in trace events */
 struct trace_dynamic_info {
 #ifdef CONFIG_CPU_BIG_ENDIAN
-	u16	offset;
 	u16	len;
+	u16	offset;
 #else
-	u16	len;
 	u16	offset;
+	u16	len;
 #endif
-};
+} __packed;
 
 /*
  * The trace entry - the most basic unit of tracing. This is what
-- 
cgit v1.2.3


From 49f776724e64c27dd861e7ac8da9d42f01d9d172 Mon Sep 17 00:00:00 2001
From: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Date: Wed, 23 Aug 2023 23:43:04 +0000
Subject: PCI/AER: Export pcie_aer_is_native()

Export and move the declaration of pcie_aer_is_native() to a common header
file to be reused by cxl/pci module.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Robert Richter <rrichter@amd.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/20230823234305.27333-3-Smita.KoralahalliChannabasappa@amd.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/aer.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/aer.h b/include/linux/aer.h
index 2dd175f5debd..29cc10220952 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -42,11 +42,13 @@ struct aer_capability_regs {
 
 #if defined(CONFIG_PCIEAER)
 int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
+int pcie_aer_is_native(struct pci_dev *dev);
 #else
 static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
 {
 	return -EINVAL;
 }
+static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
 #endif
 
 void cper_print_aer(struct pci_dev *dev, int aer_severity,
-- 
cgit v1.2.3


From 2b69987be575b92adb6c177679f3c559134f0d8f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Wed, 16 Oct 2019 15:03:50 -0400
Subject: sched: Add task_struct->faults_disabled_mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There has been a long standing page cache coherence bug with direct IO.
This provides part of a mechanism to fix it, currently just used by
bcachefs but potentially worth promoting to the VFS.

Direct IO evicts the range of the pagecache being read or written to.

For reads, we need dirty pages to be written to disk, so that the read
doesn't return stale data. For writes, we need to evict that range of
the pagecache so that it's not stale after the write completes.

However, without a locking mechanism to prevent those pages from being
re-added to the pagecache - by a buffered read or page fault - page
cache inconsistency is still possible.

This isn't necessarily just an issue for userspace when they're playing
games; filesystems may hang arbitrary state off the pagecache, and so
page cache inconsistency may cause real filesystem bugs, depending on
the filesystem. This is less of an issue for iomap based filesystems,
but e.g. buffer heads caches disk block mappings (!) and attaches them
to the pagecache, and bcachefs attaches disk reservations to pagecache
pages.

This issue has been hard to fix, because
 - we need to add a lock (henceforth called pagecache_add_lock), which
   would be held for the duration of the direct IO
 - page faults add pages to the page cache, thus need to take the same
   lock
 - dio -> gup -> page fault thus can deadlock

And we cannot enforce a lock ordering with this lock, since userspace
will be controlling the lock ordering (via the fd and buffer arguments
to direct IOs), so we need a different method of deadlock avoidance.

We need to tell the page fault handler that we're already holding a
pagecache_add_lock, and since plumbing it through the entire gup() path
would be highly impractical this adds a field to task_struct.

Then the full method is:
 - in the dio path, when we first take the pagecache_add_lock, note the
   mapping in the current task_struct
 - in the page fault handler, if faults_disabled_mapping is set, we
   check if it's the same mapping as the one we're taking a page fault
   for, and if so return an error.

   Then we check lock ordering: if there's a lock ordering violation and
   trylock fails, we'll have to cycle the locks and return an error that
   tells the DIO path to retry: faults_disabled_mapping is also used for
   signalling "locks were dropped, please retry".

Also relevant to this patch: mapping->invalidate_lock.
mapping->invalidate_lock provides most of the required semantics - it's
used by truncate/fallocate to block pages being added to the pagecache.
However, since it's a rwsem, direct IOs would need to take the write
side in order to block page cache adds, and would then be exclusive with
each other - we'll need a new type of lock to pair with this approach.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Jan Kara <jack@suse.cz>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: linux-fsdevel@vger.kernel.org
Cc: Andreas Grünbacher <andreas.gruenbacher@gmail.com>
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..d5951e99706a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -875,6 +875,7 @@ struct task_struct {
 
 	struct mm_struct		*mm;
 	struct mm_struct		*active_mm;
+	struct address_space		*faults_disabled_mapping;
 
 	int				exit_state;
 	int				exit_code;
-- 
cgit v1.2.3


From 771eb4fe8b420bb8563863e242861e635c742bc2 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Mon, 9 Jul 2018 23:27:33 -0400
Subject: fs: factor out d_mark_tmpfile()

New helper for bcachefs - bcachefs doesn't want the
inode_dec_link_count() call that d_tmpfile does, it handles i_nlink on
its own atomically with other btree updates

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/dcache.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 6b351e009f59..3da2f0545d5d 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -251,6 +251,7 @@ extern struct dentry * d_make_root(struct inode *);
 /* <clickety>-<click> the ramfs-type tree */
 extern void d_genocide(struct dentry *);
 
+extern void d_mark_tmpfile(struct file *, struct inode *);
 extern void d_tmpfile(struct file *, struct inode *);
 
 extern struct dentry *d_find_alias(struct inode *);
-- 
cgit v1.2.3


From 83feeb195592b533541ff6399c8084e5b7c59677 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Mon, 25 Apr 2022 15:26:28 -0400
Subject: lib/string_helpers: string_get_size() now returns characters wrote

printbuf now needs to know the number of characters that would have been
written if the buffer was too small, like snprintf(); this changes
string_get_size() to return the the return value of snprintf().

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/string_helpers.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 9d1f5bb74dd5..58fb1f90eda5 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -24,8 +24,8 @@ enum string_size_units {
 	STRING_UNITS_2,		/* use binary powers of 2^10 */
 };
 
-void string_get_size(u64 size, u64 blk_size, enum string_size_units units,
-		     char *buf, int len);
+int string_get_size(u64 size, u64 blk_size, enum string_size_units units,
+		    char *buf, int len);
 
 int parse_int_array_user(const char __user *from, size_t count, int **array);
 
-- 
cgit v1.2.3


From fe4fa2e4f7d0722c179fffa25911ea35cafadce2 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Fri, 1 Sep 2023 13:29:25 +0200
Subject: gpiolib: make gpiochip_get_desc() public

It makes sense for a GPIO driver to want to get its own descriptor
without requesting it. After all, the driver knows that it'll still be
valid. Let's move this helper to linux/gpio/driver.h.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Andy Shevchenko <andy@kernel.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index b721422f4bfa..8f0859ba7065 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -757,6 +757,8 @@ struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *gc,
 					    enum gpiod_flags dflags);
 void gpiochip_free_own_desc(struct gpio_desc *desc);
 
+struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum);
+
 #ifdef CONFIG_GPIOLIB
 
 /* lock/unlock as IRQ */
-- 
cgit v1.2.3


From 2c97d3e55b70edf33b6e7f211bab8a748a0a2bcc Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Wed, 30 Aug 2023 15:22:37 +1200
Subject: platform/x86: asus-wmi: add support for ASUS screenpad

Add support for the WMI methods used to turn off and adjust the
brightness of the secondary "screenpad" device found on some high-end
ASUS laptops like the GX650P series and others.

There are some small quirks with this device when considering only the
raw WMI methods:
1. The Off method can only switch the device off
2. Changing the brightness turns the device back on
3. To turn the device back on the brightness must be > 1
4. When the device is off the brightness can't be changed (so it is
   stored by the driver if device is off).
5. Booting with a value of 0 brightness (retained by bios) means the bios
   will set a value of >0 <15
6. When the device is off it is "unplugged"

asus_wmi sets the minimum brightness as 20 in general use, and 60 for
booting with values <= min.

The ACPI methods are used in a new backlight device named asus_screenpad.

Signed-off-by: Luke D. Jones <luke@ljones.dev>
Link: https://lore.kernel.org/r/20230830032237.42987-2-luke@ljones.dev
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/x86/asus-wmi.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 16e99a1c37fc..63e630276499 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -58,6 +58,10 @@
 #define ASUS_WMI_DEVID_KBD_BACKLIGHT	0x00050021
 #define ASUS_WMI_DEVID_LIGHT_SENSOR	0x00050022 /* ?? */
 #define ASUS_WMI_DEVID_LIGHTBAR		0x00050025
+/* This can only be used to disable the screen, not re-enable */
+#define ASUS_WMI_DEVID_SCREENPAD_POWER	0x00050031
+/* Writing a brightness re-enables the screen if disabled */
+#define ASUS_WMI_DEVID_SCREENPAD_LIGHT	0x00050032
 #define ASUS_WMI_DEVID_FAN_BOOST_MODE	0x00110018
 #define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY 0x00120075
 
-- 
cgit v1.2.3


From 4eaf928622abc5dabc9b42a0de4dafbe29ddf491 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Fri, 11 Aug 2023 17:57:01 +0800
Subject: iio: Remove unused declarations

Commit 0f3a8c3f34f7 ("iio: Add support for creating IIO devices via configfs")
declared but never implemented iio_sw_device_type_configfs_{un}register().
Commit b662f809d410 ("iio: core: Introduce IIO software triggers") declared but
never implemented iio_sw_trigger_type_configfs_{un}register().
Commit a3e0b51884ee ("iio: accel: add support for FXLS8962AF/FXLS8964AF accelerometers")
declared but never implemented fxls8962af_core_remove().
Commit 8dedcc3eee3a ("iio: core: centralize ioctl() calls to the main chardev")
declared but never implemented iio_device_ioctl().

Commit d430f3c36ca6 ("iio: imu: inv_mpu6050: Use regmap instead of i2c specific functions")
removed inv_mpu6050_write_reg() but not its declaration.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230811095701.35372-1-yuehaibing@huawei.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/sw_device.h  | 3 ---
 include/linux/iio/sw_trigger.h | 3 ---
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/sw_device.h b/include/linux/iio/sw_device.h
index eff1e6b2595c..0f7fe7b522e3 100644
--- a/include/linux/iio/sw_device.h
+++ b/include/linux/iio/sw_device.h
@@ -51,9 +51,6 @@ void iio_unregister_sw_device_type(struct iio_sw_device_type *dt);
 struct iio_sw_device *iio_sw_device_create(const char *, const char *);
 void iio_sw_device_destroy(struct iio_sw_device *);
 
-int iio_sw_device_type_configfs_register(struct iio_sw_device_type *dt);
-void iio_sw_device_type_configfs_unregister(struct iio_sw_device_type *dt);
-
 static inline
 void iio_swd_group_init_type_name(struct iio_sw_device *d,
 				  const char *name,
diff --git a/include/linux/iio/sw_trigger.h b/include/linux/iio/sw_trigger.h
index 47de2443e984..bc77f88df303 100644
--- a/include/linux/iio/sw_trigger.h
+++ b/include/linux/iio/sw_trigger.h
@@ -51,9 +51,6 @@ void iio_unregister_sw_trigger_type(struct iio_sw_trigger_type *tt);
 struct iio_sw_trigger *iio_sw_trigger_create(const char *, const char *);
 void iio_sw_trigger_destroy(struct iio_sw_trigger *);
 
-int iio_sw_trigger_type_configfs_register(struct iio_sw_trigger_type *tt);
-void iio_sw_trigger_type_configfs_unregister(struct iio_sw_trigger_type *tt);
-
 static inline
 void iio_swt_group_init_type_name(struct iio_sw_trigger *t,
 				  const char *name,
-- 
cgit v1.2.3


From 373beef00f7d781a000b12c31fb17a5a9c25969c Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Mon, 11 Sep 2023 15:52:57 +0100
Subject: KVM: arm64: nvhe: Ignore SVE hint in SMCCC function ID

When SVE is enabled, the host may set bit 16 in SMCCC function IDs, a
hint that indicates an unused SVE state. At the moment NVHE doesn't
account for this bit when inspecting the function ID, and rejects most
calls. Clear the hint bit before comparing function IDs.

About version compatibility: the host's PSCI driver initially probes the
firmware for a SMCCC version number. If the firmware implements a
protocol recent enough (1.3), subsequent SMCCC calls have the hint bit
set. Since the hint bit was reserved in earlier versions of the
protocol, clearing it is fine regardless of the version in use.

When a new hint is added to the protocol in the future, it will be added
to ARM_SMCCC_CALL_HINTS and NVHE will handle it straight away. This
patch only clears known hints and leaves reserved bits as is, because
future SMCCC versions could use reserved bits as modifiers for the
function ID, rather than hints.

Fixes: cfa7ff959a78 ("arm64: smccc: Support SMCCC v1.3 SVE register saving hint")
Reported-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230911145254.934414-4-jean-philippe@linaro.org
---
 include/linux/arm-smccc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 7c67c17321d4..083f85653716 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -67,6 +67,8 @@
 #define ARM_SMCCC_VERSION_1_3		0x10003
 
 #define ARM_SMCCC_1_3_SVE_HINT		0x10000
+#define ARM_SMCCC_CALL_HINTS		ARM_SMCCC_1_3_SVE_HINT
+
 
 #define ARM_SMCCC_VERSION_FUNC_ID					\
 	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
-- 
cgit v1.2.3


From 08700ec705043eb0cee01b35cf5b9d63f0230d12 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Wed, 6 Sep 2023 03:46:57 +0900
Subject: linux/export: fix reference to exported functions for parisc64

John David Anglin reported parisc has been broken since commit
ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost").

Like ia64, parisc64 uses a function descriptor. The function
references must be prefixed with P%.

Also, symbols prefixed $$ from the library have the symbol type
STT_LOPROC instead of STT_FUNC. They should be handled as functions
too.

Fixes: ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost")
Reported-by: John David Anglin <dave.anglin@bell.net>
Tested-by: John David Anglin <dave.anglin@bell.net>
Tested-by: Helge Deller <deller@gmx.de>
Closes: https://lore.kernel.org/linux-parisc/1901598a-e11d-f7dd-a5d9-9a69d06e6b6e@bell.net/T/#u
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 include/linux/export-internal.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h
index 1c849db953a5..45fca09b2319 100644
--- a/include/linux/export-internal.h
+++ b/include/linux/export-internal.h
@@ -52,6 +52,8 @@
 
 #ifdef CONFIG_IA64
 #define KSYM_FUNC(name)		@fptr(name)
+#elif defined(CONFIG_PARISC) && defined(CONFIG_64BIT)
+#define KSYM_FUNC(name)		P%name
 #else
 #define KSYM_FUNC(name)		name
 #endif
-- 
cgit v1.2.3


From 25e73b7e3f72a25aa30cbb2eecb49036e0acf066 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 2 Aug 2023 12:55:46 +0200
Subject: x86/ibt: Suppress spurious ENDBR

It was reported that under certain circumstances GCC emits ENDBR
instructions for _THIS_IP_ usage. Specifically, when it appears at the
start of a basic block -- but not elsewhere.

Since _THIS_IP_ is never used for control flow, these ENDBR
instructions are completely superfluous. Override the _THIS_IP_
definition for x86_64 to avoid this.

Less ENDBR instructions is better.

Fixes: 156ff4a544ae ("x86/ibt: Base IBT bits")
Reported-by: David Kaplan <David.Kaplan@amd.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230802110323.016197440@infradead.org
---
 include/linux/instruction_pointer.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/instruction_pointer.h b/include/linux/instruction_pointer.h
index cda1f706eaeb..aa0b3ffea935 100644
--- a/include/linux/instruction_pointer.h
+++ b/include/linux/instruction_pointer.h
@@ -2,7 +2,12 @@
 #ifndef _LINUX_INSTRUCTION_POINTER_H
 #define _LINUX_INSTRUCTION_POINTER_H
 
+#include <asm/linkage.h>
+
 #define _RET_IP_		(unsigned long)__builtin_return_address(0)
+
+#ifndef _THIS_IP_
 #define _THIS_IP_  ({ __label__ __here; __here: (unsigned long)&&__here; })
+#endif
 
 #endif /* _LINUX_INSTRUCTION_POINTER_H */
-- 
cgit v1.2.3


From 133c4c0d37175f510a10fa9bed51e223936073fc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 11 Sep 2023 17:05:31 +0000
Subject: tcp: defer regular ACK while processing socket backlog

This idea came after a particular workload requested
the quickack attribute set on routes, and a performance
drop was noticed for large bulk transfers.

For high throughput flows, it is best to use one cpu
running the user thread issuing socket system calls,
and a separate cpu to process incoming packets from BH context.
(With TSO/GRO, bottleneck is usually the 'user' cpu)

Problem is the user thread can spend a lot of time while holding
the socket lock, forcing BH handler to queue most of incoming
packets in the socket backlog.

Whenever the user thread releases the socket lock, it must first
process all accumulated packets in the backlog, potentially
adding latency spikes. Due to flood mitigation, having too many
packets in the backlog increases chance of unexpected drops.

Backlog processing unfortunately shifts a fair amount of cpu cycles
from the BH cpu to the 'user' cpu, thus reducing max throughput.

This patch takes advantage of the backlog processing,
and the fact that ACK are mostly cumulative.

The idea is to detect we are in the backlog processing
and defer all eligible ACK into a single one,
sent from tcp_release_cb().

This saves cpu cycles on both sides, and network resources.

Performance of a single TCP flow on a 200Gbit NIC:

- Throughput is increased by 20% (100Gbit -> 120Gbit).
- Number of generated ACK per second shrinks from 240,000 to 40,000.
- Number of backlog drops per second shrinks from 230 to 0.

Benchmark context:
 - Regular netperf TCP_STREAM (no zerocopy)
 - Intel(R) Xeon(R) Platinum 8481C (Saphire Rapids)
 - MAX_SKB_FRAGS = 17 (~60KB per GRO packet)

This feature is guarded by a new sysctl, and enabled by default:
 /proc/sys/net/ipv4/tcp_backlog_ack_defer

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/tcp.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3c5efeeb024f..44d946161d4a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -463,15 +463,17 @@ enum tsq_enum {
 	TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
 				    * tcp_v{4|6}_mtu_reduced()
 				    */
+	TCP_ACK_DEFERRED,	   /* TX pure ack is deferred */
 };
 
 enum tsq_flags {
-	TSQF_THROTTLED			= (1UL << TSQ_THROTTLED),
-	TSQF_QUEUED			= (1UL << TSQ_QUEUED),
-	TCPF_TSQ_DEFERRED		= (1UL << TCP_TSQ_DEFERRED),
-	TCPF_WRITE_TIMER_DEFERRED	= (1UL << TCP_WRITE_TIMER_DEFERRED),
-	TCPF_DELACK_TIMER_DEFERRED	= (1UL << TCP_DELACK_TIMER_DEFERRED),
-	TCPF_MTU_REDUCED_DEFERRED	= (1UL << TCP_MTU_REDUCED_DEFERRED),
+	TSQF_THROTTLED			= BIT(TSQ_THROTTLED),
+	TSQF_QUEUED			= BIT(TSQ_QUEUED),
+	TCPF_TSQ_DEFERRED		= BIT(TCP_TSQ_DEFERRED),
+	TCPF_WRITE_TIMER_DEFERRED	= BIT(TCP_WRITE_TIMER_DEFERRED),
+	TCPF_DELACK_TIMER_DEFERRED	= BIT(TCP_DELACK_TIMER_DEFERRED),
+	TCPF_MTU_REDUCED_DEFERRED	= BIT(TCP_MTU_REDUCED_DEFERRED),
+	TCPF_ACK_DEFERRED		= BIT(TCP_ACK_DEFERRED),
 };
 
 #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
-- 
cgit v1.2.3


From 2b5dcb31a19a2e0acd869b12c9db9b2d696ef544 Mon Sep 17 00:00:00 2001
From: Leon Hwang <hffilwlqm@gmail.com>
Date: Tue, 12 Sep 2023 23:04:41 +0800
Subject: bpf, x64: Fix tailcall infinite loop

From commit ebf7d1f508a73871 ("bpf, x64: rework pro/epilogue and tailcall
handling in JIT"), the tailcall on x64 works better than before.

From commit e411901c0b775a3a ("bpf: allow for tailcalls in BPF subprograms
for x64 JIT"), tailcall is able to run in BPF subprograms on x64.

From commit 5b92a28aae4dd0f8 ("bpf: Support attaching tracing BPF program
to other BPF programs"), BPF program is able to trace other BPF programs.

How about combining them all together?

1. FENTRY/FEXIT on a BPF subprogram.
2. A tailcall runs in the BPF subprogram.
3. The tailcall calls the subprogram's caller.

As a result, a tailcall infinite loop comes up. And the loop would halt
the machine.

As we know, in tail call context, the tail_call_cnt propagates by stack
and rax register between BPF subprograms. So do in trampolines.

Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT")
Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT")
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Leon Hwang <hffilwlqm@gmail.com>
Link: https://lore.kernel.org/r/20230912150442.2009-3-hffilwlqm@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 87eeb3a46a1d..b9e573159432 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1035,6 +1035,11 @@ struct btf_func_model {
  */
 #define BPF_TRAMP_F_SHARE_IPMODIFY	BIT(6)
 
+/* Indicate that current trampoline is in a tail call context. Then, it has to
+ * cache and restore tail_call_cnt to avoid infinite tail call loop.
+ */
+#define BPF_TRAMP_F_TAIL_CALL_CTX	BIT(7)
+
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.
  */
-- 
cgit v1.2.3


From 8f012db27c9516be1a7aca93ea4a6ca9c75056c9 Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Mon, 10 Jul 2023 13:02:58 -0700
Subject: x86/numa: Introduce numa_fill_memblks()

numa_fill_memblks() fills in the gaps in numa_meminfo memblks
over an physical address range.

The ACPI driver will use numa_fill_memblks() to implement a new Linux
policy that prescribes extending proximity domains in a portion of a
CFMWS window to the entire window.

Dan Williams offered this explanation of the policy:
A CFWMS is an ACPI data structure that indicates *potential* locations
where CXL memory can be placed. It is the playground where the CXL
driver has free reign to establish regions. That space can be populated
by BIOS created regions, or driver created regions, after hotplug or
other reconfiguration.

When BIOS creates a region in a CXL Window it additionally describes
that subset of the Window range in the other typical ACPI tables SRAT,
SLIT, and HMAT. The rationale for BIOS not pre-describing the entire
CXL Window in SRAT, SLIT, and HMAT is that it can not predict the
future. I.e. there is nothing stopping higher or lower performance
devices being placed in the same Window. Compare that to ACPI memory
hotplug that just onlines additional capacity in the proximity domain
with little freedom for dynamic performance differentiation.

That leaves the OS with a choice, should unpopulated window capacity
match the proximity domain of an existing region, or should it allocate
a new one? This patch takes the simple position of minimizing proximity
domain proliferation by reusing any proximity domain intersection for
the entire Window. If the Window has no intersections then allocate a
new proximity domain. Note that SRAT, SLIT and HMAT information can be
enumerated dynamically in a standard way from device provided data.
Think of CXL as the end of ACPI needing to describe memory attributes,
CXL offers a standard discovery model for performance attributes, but
Linux still needs to interoperate with the old regime.

Reported-by: Derick Marks <derick.w.marks@intel.com>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Derick Marks <derick.w.marks@intel.com>
Link: https://lore.kernel.org/all/ef078a6f056ca974e5af85997013c0fda9e3326d.1689018477.git.alison.schofield%40intel.com
---
 include/linux/numa.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/numa.h b/include/linux/numa.h
index 59df211d051f..0f512c0aba54 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -12,6 +12,7 @@
 #define MAX_NUMNODES    (1 << NODES_SHIFT)
 
 #define	NUMA_NO_NODE	(-1)
+#define	NUMA_NO_MEMBLK	(-1)
 
 /* optionally keep NUMA memory info available post init */
 #ifdef CONFIG_NUMA_KEEP_MEMINFO
@@ -43,6 +44,12 @@ static inline int phys_to_target_node(u64 start)
 	return 0;
 }
 #endif
+#ifndef numa_fill_memblks
+static inline int __init numa_fill_memblks(u64 start, u64 end)
+{
+	return NUMA_NO_MEMBLK;
+}
+#endif
 #else /* !CONFIG_NUMA */
 static inline int numa_map_to_online_node(int node)
 {
-- 
cgit v1.2.3


From 5eb1e6e459cfa025f79c43014f66ff62a55542f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Tue, 5 Sep 2023 21:42:53 +0200
Subject: i2c: Drop legacy callback .probe_new()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that all drivers are converted to the (new) .probe() callback, the
temporary .probe_new() can go away. \o/

Link: https://lore.kernel.org/linux-i2c/20230626094548.559542-1-u.kleine-koenig@pengutronix.de
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Reviewed-by: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 include/linux/i2c.h | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 3430cc2b05a6..0dae9db27538 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -237,7 +237,6 @@ enum i2c_driver_flags {
  * struct i2c_driver - represent an I2C device driver
  * @class: What kind of i2c device we instantiate (for detect)
  * @probe: Callback for device binding
- * @probe_new: Transitional callback for device binding - do not use
  * @remove: Callback for device unbinding
  * @shutdown: Callback for device shutdown
  * @alert: Alert callback, for example for the SMBus alert protocol
@@ -272,16 +271,8 @@ enum i2c_driver_flags {
 struct i2c_driver {
 	unsigned int class;
 
-	union {
 	/* Standard driver model interfaces */
-		int (*probe)(struct i2c_client *client);
-		/*
-		 * Legacy callback that was part of a conversion of .probe().
-		 * Today it has the same semantic as .probe(). Don't use for new
-		 * code.
-		 */
-		int (*probe_new)(struct i2c_client *client);
-	};
+	int (*probe)(struct i2c_client *client);
 	void (*remove)(struct i2c_client *client);
 
 
-- 
cgit v1.2.3


From bbaa6ffa5b6c9609d3b3c431c389b407eea5441f Mon Sep 17 00:00:00 2001
From: Kai-Heng Feng <kai.heng.feng@canonical.com>
Date: Wed, 13 Sep 2023 11:32:33 +0800
Subject: power: supply: core: Use blocking_notifier_call_chain to avoid RCU
 complaint

AMD PMF driver can cause the following warning:
[  196.159546] ------------[ cut here ]------------
[  196.159556] Voluntary context switch within RCU read-side critical section!
[  196.159571] WARNING: CPU: 0 PID: 9 at kernel/rcu/tree_plugin.h:320 rcu_note_context_switch+0x43d/0x560
[  196.159604] Modules linked in: nvme_fabrics ccm rfcomm snd_hda_scodec_cs35l41_spi cmac algif_hash algif_skcipher af_alg bnep joydev btusb btrtl uvcvideo btintel btbcm videobuf2_vmalloc intel_rapl_msr btmtk videobuf2_memops uvc videobuf2_v4l2 intel_rapl_common binfmt_misc hid_sensor_als snd_sof_amd_vangogh hid_sensor_trigger bluetooth industrialio_triggered_buffer videodev snd_sof_amd_rembrandt hid_sensor_iio_common amdgpu ecdh_generic kfifo_buf videobuf2_common hp_wmi kvm_amd sparse_keymap snd_sof_amd_renoir wmi_bmof industrialio ecc mc nls_iso8859_1 kvm snd_sof_amd_acp irqbypass snd_sof_xtensa_dsp crct10dif_pclmul crc32_pclmul mt7921e snd_sof_pci snd_ctl_led polyval_clmulni mt7921_common polyval_generic snd_sof ghash_clmulni_intel mt792x_lib mt76_connac_lib sha512_ssse3 snd_sof_utils aesni_intel snd_hda_codec_realtek crypto_simd mt76 snd_hda_codec_generic cryptd snd_soc_core snd_hda_codec_hdmi rapl ledtrig_audio input_leds snd_compress i2c_algo_bit drm_ttm_helper mac80211 snd_pci_ps hid_multitouch ttm drm_exec
[  196.159970]  drm_suballoc_helper snd_rpl_pci_acp6x amdxcp drm_buddy snd_hda_intel snd_acp_pci snd_hda_scodec_cs35l41_i2c serio_raw gpu_sched snd_hda_scodec_cs35l41 snd_acp_legacy_common snd_intel_dspcfg snd_hda_cs_dsp_ctls snd_hda_codec libarc4 drm_display_helper snd_pci_acp6x cs_dsp snd_hwdep snd_soc_cs35l41_lib video k10temp snd_pci_acp5x thunderbolt snd_hda_core drm_kms_helper cfg80211 snd_seq snd_rn_pci_acp3x snd_pcm snd_acp_config cec snd_soc_acpi snd_seq_device rc_core ccp snd_pci_acp3x snd_timer snd soundcore wmi amd_pmf platform_profile amd_pmc mac_hid serial_multi_instantiate wireless_hotkey hid_sensor_hub sch_fq_codel msr parport_pc ppdev lp parport efi_pstore ip_tables x_tables autofs4 btrfs blake2b_generic raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx libcrc32c xor raid6_pq raid1 raid0 multipath linear dm_mirror dm_region_hash dm_log cdc_ether usbnet r8152 mii hid_generic nvme i2c_hid_acpi i2c_hid nvme_core i2c_piix4 xhci_pci amd_sfh drm xhci_pci_renesas nvme_common hid
[  196.160382] CPU: 0 PID: 9 Comm: kworker/0:1 Not tainted 6.6.0-rc1 #4
[  196.160397] Hardware name: HP HP EliteBook 845 14 inch G10 Notebook PC/8B6E, BIOS V82 Ver. 01.02.00 08/24/2023
[  196.160405] Workqueue: events power_supply_changed_work
[  196.160426] RIP: 0010:rcu_note_context_switch+0x43d/0x560
[  196.160440] Code: 00 48 89 be 40 08 00 00 48 89 86 48 08 00 00 48 89 10 e9 63 fe ff ff 48 c7 c7 10 e7 b0 9e c6 05 e8 d8 20 02 01 e8 13 0f f3 ff <0f> 0b e9 27 fc ff ff a9 ff ff ff 7f 0f 84 cf fc ff ff 65 48 8b 3c
[  196.160450] RSP: 0018:ffffc900001878f0 EFLAGS: 00010046
[  196.160462] RAX: 0000000000000000 RBX: ffff88885e834040 RCX: 0000000000000000
[  196.160470] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[  196.160476] RBP: ffffc90000187910 R08: 0000000000000000 R09: 0000000000000000
[  196.160482] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[  196.160488] R13: 0000000000000000 R14: ffff888100990000 R15: ffff888100990000
[  196.160495] FS:  0000000000000000(0000) GS:ffff88885e800000(0000) knlGS:0000000000000000
[  196.160504] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  196.160512] CR2: 000055cb053c8246 CR3: 000000013443a000 CR4: 0000000000750ef0
[  196.160520] PKRU: 55555554
[  196.160526] Call Trace:
[  196.160532]  <TASK>
[  196.160548]  ? show_regs+0x72/0x90
[  196.160570]  ? rcu_note_context_switch+0x43d/0x560
[  196.160580]  ? __warn+0x8d/0x160
[  196.160600]  ? rcu_note_context_switch+0x43d/0x560
[  196.160613]  ? report_bug+0x1bb/0x1d0
[  196.160637]  ? handle_bug+0x46/0x90
[  196.160658]  ? exc_invalid_op+0x19/0x80
[  196.160675]  ? asm_exc_invalid_op+0x1b/0x20
[  196.160709]  ? rcu_note_context_switch+0x43d/0x560
[  196.160727]  __schedule+0xb9/0x15f0
[  196.160746]  ? srso_alias_return_thunk+0x5/0x7f
[  196.160765]  ? srso_alias_return_thunk+0x5/0x7f
[  196.160778]  ? acpi_ns_search_one_scope+0xbe/0x270
[  196.160806]  schedule+0x68/0x110
[  196.160820]  schedule_timeout+0x151/0x160
[  196.160829]  ? srso_alias_return_thunk+0x5/0x7f
[  196.160842]  ? srso_alias_return_thunk+0x5/0x7f
[  196.160855]  ? acpi_ns_lookup+0x3c5/0xa90
[  196.160878]  __down_common+0xff/0x220
[  196.160905]  __down_timeout+0x16/0x30
[  196.160920]  down_timeout+0x64/0x70
[  196.160938]  acpi_os_wait_semaphore+0x85/0x200
[  196.160959]  acpi_ut_acquire_mutex+0x9e/0x280
[  196.160979]  acpi_ex_enter_interpreter+0x2d/0xb0
[  196.160992]  acpi_ns_evaluate+0x2f0/0x5f0
[  196.161005]  acpi_evaluate_object+0x172/0x490
[  196.161018]  ? acpi_os_signal_semaphore+0x8a/0xd0
[  196.161038]  acpi_evaluate_integer+0x52/0xe0
[  196.161055]  ? kfree+0x79/0x120
[  196.161071]  ? srso_alias_return_thunk+0x5/0x7f
[  196.161089]  acpi_ac_get_state.part.0+0x27/0x80
[  196.161110]  get_ac_property+0x5c/0x70
[  196.161127]  ? __pfx___power_supply_is_system_supplied+0x10/0x10
[  196.161146]  __power_supply_is_system_supplied+0x44/0xb0
[  196.161166]  class_for_each_device+0x124/0x160
[  196.161184]  ? acpi_ac_get_state.part.0+0x27/0x80
[  196.161203]  ? srso_alias_return_thunk+0x5/0x7f
[  196.161223]  power_supply_is_system_supplied+0x3c/0x70
[  196.161243]  amd_pmf_get_power_source+0xe/0x20 [amd_pmf]
[  196.161276]  amd_pmf_power_slider_update_event+0x49/0x90 [amd_pmf]
[  196.161310]  amd_pmf_pwr_src_notify_call+0xe7/0x100 [amd_pmf]
[  196.161340]  notifier_call_chain+0x5f/0xe0
[  196.161362]  atomic_notifier_call_chain+0x33/0x60
[  196.161378]  power_supply_changed_work+0x84/0x110
[  196.161394]  process_one_work+0x178/0x360
[  196.161412]  ? __pfx_worker_thread+0x10/0x10
[  196.161424]  worker_thread+0x307/0x430
[  196.161440]  ? __pfx_worker_thread+0x10/0x10
[  196.161451]  kthread+0xf4/0x130
[  196.161467]  ? __pfx_kthread+0x10/0x10
[  196.161486]  ret_from_fork+0x43/0x70
[  196.161502]  ? __pfx_kthread+0x10/0x10
[  196.161518]  ret_from_fork_asm+0x1b/0x30
[  196.161558]  </TASK>
[  196.161562] ---[ end trace 0000000000000000 ]---

Since there's no guarantee that all the callbacks can work in atomic
context, switch to use blocking_notifier_call_chain to relax the
constraint.

Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Reported-by: Allen Zhong <allen@atr.me>
Fixes: 4c71ae414474 ("platform/x86/amd/pmf: Add support SPS PMF feature")
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217571
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Link: https://lore.kernel.org/r/20230913033233.602986-1-kai.heng.feng@canonical.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/linux/power_supply.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index a427f13c757f..85b86768c0b9 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -767,7 +767,7 @@ struct power_supply_battery_info {
 	int bti_resistance_tolerance;
 };
 
-extern struct atomic_notifier_head power_supply_notifier;
+extern struct blocking_notifier_head power_supply_notifier;
 extern int power_supply_reg_notifier(struct notifier_block *nb);
 extern void power_supply_unreg_notifier(struct notifier_block *nb);
 #if IS_ENABLED(CONFIG_POWER_SUPPLY)
-- 
cgit v1.2.3


From d090ec0df81e56556af3a2bf04a7e89347ae5784 Mon Sep 17 00:00:00 2001
From: Leonardo Bras <leobras@redhat.com>
Date: Thu, 31 Aug 2023 03:31:28 -0300
Subject: smp: Change function signatures to use call_single_data_t

call_single_data_t is a size-aligned typedef of struct __call_single_data.

This alignment is desirable in order to have smp_call_function*() avoid
bouncing an extra cacheline in case of an unaligned csd, given this
would hurt performance.

Since the removal of struct request->csd in commit 660e802c76c8
("blk-mq: use percpu csd to remote complete instead of per-rq csd") there
are no current users of smp_call_function*() with unaligned csd.

Change every 'struct __call_single_data' function parameter to
'call_single_data_t', so we have warnings if any new code tries to
introduce an smp_call_function*() call with unaligned csd.

Signed-off-by: Leonardo Bras <leobras@redhat.com>
Reviewed-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230831063129.335425-1-leobras@redhat.com
---
 include/linux/smp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 91ea4a67f8ca..e87520dc2959 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -53,7 +53,7 @@ int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 			   void *info, bool wait, const struct cpumask *mask);
 
-int smp_call_function_single_async(int cpu, struct __call_single_data *csd);
+int smp_call_function_single_async(int cpu, call_single_data_t *csd);
 
 /*
  * Cpus stopping functions in panic. All have default weak definitions.
-- 
cgit v1.2.3


From 85be6d842447067ce76047a14d4258c96fd33b7b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 15 Aug 2023 12:52:04 +0200
Subject: cleanup: Make no_free_ptr() __must_check

recent discussion brought about the realization that it makes sense for
no_free_ptr() to have __must_check semantics in order to avoid leaking
the resource.

Additionally, add a few comments to clarify why/how things work.

All credit to Linus on how to combine __must_check and the
stmt-expression.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20230816103102.GF980931@hirez.programming.kicks-ass.net
---
 include/linux/cleanup.h | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 53f1a7a932b0..9f1a9c455b68 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -7,8 +7,9 @@
 /*
  * DEFINE_FREE(name, type, free):
  *	simple helper macro that defines the required wrapper for a __free()
- *	based cleanup function. @free is an expression using '_T' to access
- *	the variable.
+ *	based cleanup function. @free is an expression using '_T' to access the
+ *	variable. @free should typically include a NULL test before calling a
+ *	function, see the example below.
  *
  * __free(name):
  *	variable attribute to add a scoped based cleanup to the variable.
@@ -17,6 +18,9 @@
  *	like a non-atomic xchg(var, NULL), such that the cleanup function will
  *	be inhibited -- provided it sanely deals with a NULL value.
  *
+ *	NOTE: this has __must_check semantics so that it is harder to accidentally
+ *	leak the resource.
+ *
  * return_ptr(p):
  *	returns p while inhibiting the __free().
  *
@@ -24,6 +28,8 @@
  *
  * DEFINE_FREE(kfree, void *, if (_T) kfree(_T))
  *
+ * void *alloc_obj(...)
+ * {
  *	struct obj *p __free(kfree) = kmalloc(...);
  *	if (!p)
  *		return NULL;
@@ -32,6 +38,24 @@
  *		return NULL;
  *
  *	return_ptr(p);
+ * }
+ *
+ * NOTE: the DEFINE_FREE()'s @free expression includes a NULL test even though
+ * kfree() is fine to be called with a NULL value. This is on purpose. This way
+ * the compiler sees the end of our alloc_obj() function as:
+ *
+ *	tmp = p;
+ *	p = NULL;
+ *	if (p)
+ *		kfree(p);
+ *	return tmp;
+ *
+ * And through the magic of value-propagation and dead-code-elimination, it
+ * eliminates the actual cleanup call and compiles into:
+ *
+ *	return p;
+ *
+ * Without the NULL test it turns into a mess and the compiler can't help us.
  */
 
 #define DEFINE_FREE(_name, _type, _free) \
@@ -39,8 +63,17 @@
 
 #define __free(_name)	__cleanup(__free_##_name)
 
+#define __get_and_null_ptr(p) \
+	({ __auto_type __ptr = &(p); \
+	   __auto_type __val = *__ptr; \
+	   *__ptr = NULL;  __val; })
+
+static inline __must_check
+const volatile void * __must_check_fn(const volatile void *val)
+{ return val; }
+
 #define no_free_ptr(p) \
-	({ __auto_type __ptr = (p); (p) = NULL; __ptr; })
+	((typeof(p)) __must_check_fn(__get_and_null_ptr(p)))
 
 #define return_ptr(p)	return no_free_ptr(p)
 
-- 
cgit v1.2.3


From 6fb45460615358157a6d3c990e74f9c1395247e2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 20:45:16 +0200
Subject: sched: Simplify tg_set_cfs_bandwidth()

Use guards to reduce gotos and simplify control flow.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpu.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 0abd60a7987b..f19f56501809 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -153,6 +153,8 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
 static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
 #endif	/* !CONFIG_HOTPLUG_CPU */
 
+DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock())
+
 #ifdef CONFIG_PM_SLEEP_SMP
 extern int freeze_secondary_cpus(int primary);
 extern void thaw_secondary_cpus(void);
-- 
cgit v1.2.3


From 89de9921dfa77e43b985bde99a6031ab66511020 Mon Sep 17 00:00:00 2001
From: Paul M Stillwell Jr <paul.m.stillwell.jr@intel.com>
Date: Wed, 6 Sep 2023 13:57:01 -0600
Subject: virtchnl: Add CRC stripping capability

Some VFs may want to disable CRC stripping on incoming packets so create
an offload for that. The VF already sends information about configuring
its RX queues so use that structure to indicate that the CRC stripping
should be enabled or not.

Signed-off-by: Paul M Stillwell Jr <paul.m.stillwell.jr@intel.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index d0807ad43f93..dd71d3009771 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -240,6 +240,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES		BIT(6)
 /* used to negotiate communicating link speeds in Mbps */
 #define VIRTCHNL_VF_CAP_ADV_LINK_SPEED		BIT(7)
+#define  VIRTCHNL_VF_OFFLOAD_CRC		BIT(10)
 #define VIRTCHNL_VF_OFFLOAD_VLAN_V2		BIT(15)
 #define VIRTCHNL_VF_OFFLOAD_VLAN		BIT(16)
 #define VIRTCHNL_VF_OFFLOAD_RX_POLLING		BIT(17)
@@ -295,7 +296,13 @@ VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_txq_info);
 /* VIRTCHNL_OP_CONFIG_RX_QUEUE
  * VF sends this message to set up parameters for one RX queue.
  * External data buffer contains one instance of virtchnl_rxq_info.
- * PF configures requested queue and returns a status code.
+ * PF configures requested queue and returns a status code. The
+ * crc_disable flag disables CRC stripping on the VF. Setting
+ * the crc_disable flag to 1 will disable CRC stripping for each
+ * queue in the VF where the flag is set. The VIRTCHNL_VF_OFFLOAD_CRC
+ * offload must have been set prior to sending this info or the PF
+ * will ignore the request. This flag should be set the same for
+ * all of the queues for a VF.
  */
 
 /* Rx queue config info */
@@ -307,7 +314,7 @@ struct virtchnl_rxq_info {
 	u16 splithdr_enabled; /* deprecated with AVF 1.0 */
 	u32 databuffer_size;
 	u32 max_pkt_size;
-	u8 pad0;
+	u8 crc_disable;
 	u8 rxdid;
 	u8 pad1[2];
 	u64 dma_ring_addr;
-- 
cgit v1.2.3


From b193a78ddb5ee7dba074d3f28dc050069ba083c0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 4 Sep 2023 12:34:40 -0400
Subject: NFS: Use the correct commit info in nfs_join_page_group()

Ensure that nfs_clear_request_commit() updates the correct counters when
it removes them from the commit list.

Fixes: ed5d588fe47f ("NFS: Try to join page groups before an O_DIRECT retransmission")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/nfs_page.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index aa9f4c6ebe26..1c315f854ea8 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -157,7 +157,9 @@ extern	void nfs_unlock_request(struct nfs_page *req);
 extern	void nfs_unlock_and_release_request(struct nfs_page *);
 extern	struct nfs_page *nfs_page_group_lock_head(struct nfs_page *req);
 extern	int nfs_page_group_lock_subrequests(struct nfs_page *head);
-extern	void nfs_join_page_group(struct nfs_page *head, struct inode *inode);
+extern void nfs_join_page_group(struct nfs_page *head,
+				struct nfs_commit_info *cinfo,
+				struct inode *inode);
 extern int nfs_page_group_lock(struct nfs_page *);
 extern void nfs_page_group_unlock(struct nfs_page *);
 extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
-- 
cgit v1.2.3


From 806a3bc421a115fbb287c1efce63a48c54ee804b Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Wed, 30 Aug 2023 15:29:34 -0400
Subject: NFSv4.1: fix pnfs MDS=DS session trunking

Currently, when GETDEVICEINFO returns multiple locations where each
is a different IP but the server's identity is same as MDS, then
nfs4_set_ds_client() finds the existing nfs_client structure which
has the MDS's max_connect value (and if it's 1), then the 1st IP
on the DS's list will get dropped due to MDS trunking rules. Other
IPs would be added as they fall under the pnfs trunking rules.

For the list of IPs the 1st goes thru calling nfs4_set_ds_client()
which will eventually call nfs4_add_trunk() and call into
rpc_clnt_test_and_add_xprt() which has the check for MDS trunking.
The other IPs (after the 1st one), would call rpc_clnt_add_xprt()
which doesn't go thru that check.

nfs4_add_trunk() is called when MDS trunking is happening and it
needs to enforce the usage of max_connect mount option of the
1st mount. However, this shouldn't be applied to pnfs flow.

Instead, this patch proposed to treat MDS=DS as DS trunking and
make sure that MDS's max_connect limit does not apply to the
1st IP returned in the GETDEVICEINFO list. It does so by
marking the newly created client with a new flag NFS_CS_PNFS
which then used to pass max_connect value to use into the
rpc_clnt_test_and_add_xprt() instead of the existing rpc
client's max_connect value set by the MDS connection.

For example, mount was done without max_connect value set
so MDS's rpc client has cl_max_connect=1. Upon calling into
rpc_clnt_test_and_add_xprt() and using rpc client's value,
the caller passes in max_connect value which is previously
been set in the pnfs path (as a part of handling
GETDEVICEINFO list of IPs) in nfs4_set_ds_client().

However, when NFS_CS_PNFS flag is not set and we know we
are doing MDS trunking, comparing a new IP of the same
server, we then set the max_connect value to the
existing MDS's value and pass that into
rpc_clnt_test_and_add_xprt().

Fixes: dc48e0abee24 ("SUNRPC enforce creation of no more than max_connect xprts")
Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/nfs_fs_sb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 20eeba8b009d..cd628c4b011e 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -48,6 +48,7 @@ struct nfs_client {
 #define NFS_CS_NOPING		6		/* - don't ping on connect */
 #define NFS_CS_DS		7		/* - Server is a DS */
 #define NFS_CS_REUSEPORT	8		/* - reuse src port on reconnect */
+#define NFS_CS_PNFS		9		/* - Server used for pnfs */
 	struct sockaddr_storage	cl_addr;	/* server identifier */
 	size_t			cl_addrlen;
 	char *			cl_hostname;	/* hostname of server */
-- 
cgit v1.2.3


From e4c89f9380017b6b2e63836e2de1af8eb4535384 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Sun, 27 Aug 2023 23:14:04 +0200
Subject: lib/ucs2_string: Add UCS-2 strscpy function

Add a ucs2_strscpy() function for UCS-2 strings. The behavior is
equivalent to the standard strscpy() function, just for 16-bit character
UCS-2 strings.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Reviewed-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230827211408.689076-2-luzmaximilian@gmail.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/ucs2_string.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ucs2_string.h b/include/linux/ucs2_string.h
index cf3ada3e820e..c499ae809c7d 100644
--- a/include/linux/ucs2_string.h
+++ b/include/linux/ucs2_string.h
@@ -10,6 +10,7 @@ typedef u16 ucs2_char_t;
 unsigned long ucs2_strnlen(const ucs2_char_t *s, size_t maxlength);
 unsigned long ucs2_strlen(const ucs2_char_t *s);
 unsigned long ucs2_strsize(const ucs2_char_t *data, unsigned long maxlength);
+ssize_t ucs2_strscpy(ucs2_char_t *dst, const ucs2_char_t *src, size_t count);
 int ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len);
 
 unsigned long ucs2_utf8size(const ucs2_char_t *src);
-- 
cgit v1.2.3


From 00b1248606ba3979ccae30ed11df8cdc1a84245a Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Sun, 27 Aug 2023 23:14:05 +0200
Subject: firmware: qcom_scm: Add support for Qualcomm Secure Execution
 Environment SCM interface

Add support for SCM calls to Secure OS and the Secure Execution
Environment (SEE) residing in the TrustZone (TZ) via the QSEECOM
interface. This allows communication with Secure/TZ applications, for
example 'uefisecapp' managing access to UEFI variables.

For better separation, make qcom_scm spin up a dedicated child
(platform) device in case QSEECOM support has been detected. The
corresponding driver for this device is then responsible for managing
any QSEECOM clients. Specifically, this driver attempts to automatically
detect known and supported applications, creating a client (auxiliary)
device for each one. The respective client/auxiliary driver is then
responsible for managing and communicating with the application.

While this patch introduces only a very basic interface without the more
advanced features (such as re-entrant and blocking SCM calls and
listeners/callbacks), this is enough to talk to the aforementioned
'uefisecapp'.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Reviewed-by: Johan Hovold <johan+linaro@kernel.org>
Link: https://lore.kernel.org/r/20230827211408.689076-3-luzmaximilian@gmail.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/firmware/qcom/qcom_qseecom.h | 46 ++++++++++++++++++++++++++++++
 include/linux/firmware/qcom/qcom_scm.h     | 22 ++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 include/linux/firmware/qcom/qcom_qseecom.h

(limited to 'include/linux')

diff --git a/include/linux/firmware/qcom/qcom_qseecom.h b/include/linux/firmware/qcom/qcom_qseecom.h
new file mode 100644
index 000000000000..b531547e1dc9
--- /dev/null
+++ b/include/linux/firmware/qcom/qcom_qseecom.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Driver for Qualcomm Secure Execution Environment (SEE) interface (QSEECOM).
+ * Responsible for setting up and managing QSEECOM client devices.
+ *
+ * Copyright (C) 2023 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+#include <linux/auxiliary_bus.h>
+#include <linux/types.h>
+
+#include <linux/firmware/qcom/qcom_scm.h>
+
+/**
+ * struct qseecom_client - QSEECOM client device.
+ * @aux_dev: Underlying auxiliary device.
+ * @app_id: ID of the loaded application.
+ */
+struct qseecom_client {
+	struct auxiliary_device aux_dev;
+	u32 app_id;
+};
+
+/**
+ * qcom_qseecom_app_send() - Send to and receive data from a given QSEE app.
+ * @client:   The QSEECOM client associated with the target app.
+ * @req:      Request buffer sent to the app (must be DMA-mappable).
+ * @req_size: Size of the request buffer.
+ * @rsp:      Response buffer, written to by the app (must be DMA-mappable).
+ * @rsp_size: Size of the response buffer.
+ *
+ * Sends a request to the QSEE app associated with the given client and read
+ * back its response. The caller must provide two DMA memory regions, one for
+ * the request and one for the response, and fill out the @req region with the
+ * respective (app-specific) request data. The QSEE app reads this and returns
+ * its response in the @rsp region.
+ *
+ * Note: This is a convenience wrapper around qcom_scm_qseecom_app_send().
+ * Clients should prefer to use this wrapper.
+ *
+ * Return: Zero on success, nonzero on failure.
+ */
+static inline int qcom_qseecom_app_send(struct qseecom_client *client, void *req, size_t req_size,
+					void *rsp, size_t rsp_size)
+{
+	return qcom_scm_qseecom_app_send(client->app_id, req, req_size, rsp, rsp_size);
+}
diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h
index 0c091a3f6d49..e1ea2eb56d04 100644
--- a/include/linux/firmware/qcom/qcom_scm.h
+++ b/include/linux/firmware/qcom/qcom_scm.h
@@ -122,4 +122,26 @@ extern int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val,
 extern int qcom_scm_lmh_profile_change(u32 profile_id);
 extern bool qcom_scm_lmh_dcvsh_available(void);
 
+#ifdef CONFIG_QCOM_QSEECOM
+
+int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id);
+int qcom_scm_qseecom_app_send(u32 app_id, void *req, size_t req_size, void *rsp,
+			      size_t rsp_size);
+
+#else /* CONFIG_QCOM_QSEECOM */
+
+static inline int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id)
+{
+	return -EINVAL;
+}
+
+static inline int qcom_scm_qseecom_app_send(u32 app_id, void *req,
+					    size_t req_size, void *rsp,
+					    size_t rsp_size)
+{
+	return -EINVAL;
+}
+
+#endif /* CONFIG_QCOM_QSEECOM */
+
 #endif
-- 
cgit v1.2.3


From b93c5fe16e4aa177cd072c1c4652cbe1b19a7812 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Thu, 3 Aug 2023 21:49:19 +0800
Subject: rcu: Remove unused function declaration rcu_eqs_special_set()

Commit a86baa69c2b7 ("rcu: Remove special bit at the bottom of the ->dynticks counter")
left behind this, remove it.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/rcutree.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 126f6b418f6a..153cfc7bbffd 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -37,7 +37,6 @@ void synchronize_rcu_expedited(void);
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
 
 void rcu_barrier(void);
-bool rcu_eqs_special_set(int cpu);
 void rcu_momentary_dyntick_idle(void);
 void kfree_rcu_scheduler_running(void);
 bool rcu_gp_might_be_stalled(void);
-- 
cgit v1.2.3


From 6e284c55fc0bef7d25fd34d29db11f483da60ea4 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Sat, 5 Aug 2023 11:17:25 +0800
Subject: mm: Remove kmem_valid_obj()

Function kmem_dump_obj() will splat if passed a pointer to a non-slab
object. So nothing calls it directly, instead calling kmem_valid_obj()
first to determine whether the passed pointer to a valid slab object. This
means that merging kmem_valid_obj() into kmem_dump_obj() will make the
code more concise. Therefore, convert kmem_dump_obj() to work the same
way as vmalloc_dump_obj(), removing the need for the kmem_dump_obj()
caller to check kmem_valid_obj().  After this, there are no remaining
calls to kmem_valid_obj() anymore, and it can be safely removed.

Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/slab.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 8228d1276a2f..ff56ab804bf6 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -245,8 +245,9 @@ DEFINE_FREE(kfree, void *, if (_T) kfree(_T))
 size_t ksize(const void *objp);
 
 #ifdef CONFIG_PRINTK
-bool kmem_valid_obj(void *object);
-void kmem_dump_obj(void *object);
+bool kmem_dump_obj(void *object);
+#else
+static inline bool kmem_dump_obj(void *object) { return false; }
 #endif
 
 /*
-- 
cgit v1.2.3


From 25cc71d1527b55c880d333e7cc1dc37aeef9843f Mon Sep 17 00:00:00 2001
From: Khadija Kamran <kamrankhadijadj@gmail.com>
Date: Wed, 23 Aug 2023 11:44:41 +0500
Subject: lsm: constify 'sb' parameter in security_quotactl()

SELinux registers the implementation for the "quotactl" hook. Looking at
the function implementation we observe that the parameter "sb" is not
changing.

Mark the "sb" parameter of LSM hook security_quotactl() as "const" since
it will not be changing in the LSM hook.

Signed-off-by: Khadija Kamran <kamrankhadijadj@gmail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 include/linux/security.h      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ac962c4cb44b..b464f9c1894f 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -43,7 +43,7 @@ LSM_HOOK(int, 0, capset, struct cred *new, const struct cred *old,
 	 const kernel_cap_t *permitted)
 LSM_HOOK(int, 0, capable, const struct cred *cred, struct user_namespace *ns,
 	 int cap, unsigned int opts)
-LSM_HOOK(int, 0, quotactl, int cmds, int type, int id, struct super_block *sb)
+LSM_HOOK(int, 0, quotactl, int cmds, int type, int id, const struct super_block *sb)
 LSM_HOOK(int, 0, quota_on, struct dentry *dentry)
 LSM_HOOK(int, 0, syslog, int type)
 LSM_HOOK(int, 0, settime, const struct timespec64 *ts,
diff --git a/include/linux/security.h b/include/linux/security.h
index 5f16eecde00b..1a02e67e682f 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -284,7 +284,7 @@ int security_capable(const struct cred *cred,
 		       struct user_namespace *ns,
 		       int cap,
 		       unsigned int opts);
-int security_quotactl(int cmds, int type, int id, struct super_block *sb);
+int security_quotactl(int cmds, int type, int id, const struct super_block *sb);
 int security_quota_on(struct dentry *dentry);
 int security_syslog(int type);
 int security_settime64(const struct timespec64 *ts, const struct timezone *tz);
@@ -581,7 +581,7 @@ static inline int security_capable(const struct cred *cred,
 }
 
 static inline int security_quotactl(int cmds, int type, int id,
-				     struct super_block *sb)
+				     const struct super_block *sb)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 4a00c673068e72c12d243f5c31000246d6984e44 Mon Sep 17 00:00:00 2001
From: Khadija Kamran <kamrankhadijadj@gmail.com>
Date: Wed, 23 Aug 2023 12:17:29 +0500
Subject: lsm: constify 'file' parameter in security_bprm_creds_from_file()

The 'bprm_creds_from_file' hook has implementation registered in
commoncap. Looking at the function implementation we observe that the
'file' parameter is not changing.

Mark the 'file' parameter of LSM hook security_bprm_creds_from_file() as
'const' since it will not be changing in the LSM hook.

Signed-off-by: Khadija Kamran <kamrankhadijadj@gmail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/fs.h            | 2 +-
 include/linux/lsm_hook_defs.h | 2 +-
 include/linux/security.h      | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4aeb3fa11927..f69d085e531f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2450,7 +2450,7 @@ struct filename {
 };
 static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
 
-static inline struct mnt_idmap *file_mnt_idmap(struct file *file)
+static inline struct mnt_idmap *file_mnt_idmap(const struct file *file)
 {
 	return mnt_idmap(file->f_path.mnt);
 }
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index b464f9c1894f..5dfe67d69aba 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -50,7 +50,7 @@ LSM_HOOK(int, 0, settime, const struct timespec64 *ts,
 	 const struct timezone *tz)
 LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages)
 LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm)
-LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, struct file *file)
+LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file)
 LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
 LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm)
 LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm)
diff --git a/include/linux/security.h b/include/linux/security.h
index 1a02e67e682f..edbea3e0a13f 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -151,7 +151,7 @@ extern int cap_capset(struct cred *new, const struct cred *old,
 		      const kernel_cap_t *effective,
 		      const kernel_cap_t *inheritable,
 		      const kernel_cap_t *permitted);
-extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file);
+extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file);
 int cap_inode_setxattr(struct dentry *dentry, const char *name,
 		       const void *value, size_t size, int flags);
 int cap_inode_removexattr(struct mnt_idmap *idmap,
@@ -290,7 +290,7 @@ int security_syslog(int type);
 int security_settime64(const struct timespec64 *ts, const struct timezone *tz);
 int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
 int security_bprm_creds_for_exec(struct linux_binprm *bprm);
-int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file);
+int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file);
 int security_bprm_check(struct linux_binprm *bprm);
 void security_bprm_committing_creds(struct linux_binprm *bprm);
 void security_bprm_committed_creds(struct linux_binprm *bprm);
@@ -613,7 +613,7 @@ static inline int security_bprm_creds_for_exec(struct linux_binprm *bprm)
 }
 
 static inline int security_bprm_creds_from_file(struct linux_binprm *bprm,
-						struct file *file)
+						const struct file *file)
 {
 	return cap_bprm_creds_from_file(bprm, file);
 }
-- 
cgit v1.2.3


From 64fc9526147c7fc14535134d8ea79b9c8dc549a7 Mon Sep 17 00:00:00 2001
From: Khadija Kamran <kamrankhadijadj@gmail.com>
Date: Wed, 23 Aug 2023 12:47:56 +0500
Subject: lsm: constify 'bprm' parameter in security_bprm_committing_creds()

The 'bprm_committing_creds' hook has implementations registered in
SELinux and Apparmor. Looking at the function implementations we observe
that the 'bprm' parameter is not changing.

Mark the 'bprm' parameter of LSM hook security_bprm_committing_creds()
as 'const' since it will not be changing in the LSM hook.

Signed-off-by: Khadija Kamran <kamrankhadijadj@gmail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 include/linux/security.h      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 5dfe67d69aba..f6acc3ed66a3 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -52,7 +52,7 @@ LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages)
 LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm)
 LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file)
 LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
-LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm)
+LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, const struct linux_binprm *bprm)
 LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm)
 LSM_HOOK(int, 0, fs_context_submount, struct fs_context *fc, struct super_block *reference)
 LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
diff --git a/include/linux/security.h b/include/linux/security.h
index edbea3e0a13f..885053f81019 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -292,7 +292,7 @@ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
 int security_bprm_creds_for_exec(struct linux_binprm *bprm);
 int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file);
 int security_bprm_check(struct linux_binprm *bprm);
-void security_bprm_committing_creds(struct linux_binprm *bprm);
+void security_bprm_committing_creds(const struct linux_binprm *bprm);
 void security_bprm_committed_creds(struct linux_binprm *bprm);
 int security_fs_context_submount(struct fs_context *fc, struct super_block *reference);
 int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc);
@@ -623,7 +623,7 @@ static inline int security_bprm_check(struct linux_binprm *bprm)
 	return 0;
 }
 
-static inline void security_bprm_committing_creds(struct linux_binprm *bprm)
+static inline void security_bprm_committing_creds(const struct linux_binprm *bprm)
 {
 }
 
-- 
cgit v1.2.3


From 81b36803ac139827538ac5ce4028e750a3c53f53 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:21 +0000
Subject: udp: introduce udp->udp_flags

According to syzbot, it is time to use proper atomic flags
for various UDP flags.

Add udp_flags field, and convert udp->corkflag to first
bit in it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 43c1fb2d2c21..23f0693e0d9c 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -32,14 +32,20 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
 	return (num + net_hash_mix(net)) & mask;
 }
 
+enum {
+	UDP_FLAGS_CORK,		/* Cork is required */
+};
+
 struct udp_sock {
 	/* inet_sock has to be the first member */
 	struct inet_sock inet;
 #define udp_port_hash		inet.sk.__sk_common.skc_u16hashes[0]
 #define udp_portaddr_hash	inet.sk.__sk_common.skc_u16hashes[1]
 #define udp_portaddr_node	inet.sk.__sk_common.skc_portaddr_node
+
+	unsigned long	 udp_flags;
+
 	int		 pending;	/* Any pending frames ? */
-	unsigned int	 corkflag;	/* Cork is required */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 	unsigned char	 no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
 			 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
@@ -51,6 +57,11 @@ struct udp_sock {
 			 gro_enabled:1,	/* Request GRO aggregation */
 			 accept_udp_l4:1,
 			 accept_udp_fraglist:1;
+/* indicator bits used by pcflag: */
+#define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
+#define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
+#define UDPLITE_RECV_CC  0x4		/* set via udplite setsocktopt        */
+	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
@@ -62,12 +73,6 @@ struct udp_sock {
 	 */
 	__u16		 pcslen;
 	__u16		 pcrlen;
-/* indicator bits used by pcflag: */
-#define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
-#define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
-#define UDPLITE_RECV_CC  0x4		/* set via udplite setsocktopt        */
-	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
-	__u8		 unused[3];
 	/*
 	 * For encapsulation sockets.
 	 */
@@ -95,6 +100,15 @@ struct udp_sock {
 	int		forward_threshold;
 };
 
+#define udp_test_bit(nr, sk)			\
+	test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+#define udp_set_bit(nr, sk)			\
+	set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+#define udp_clear_bit(nr, sk)			\
+	clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+#define udp_assign_bit(nr, sk, val)		\
+	assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val)
+
 #define UDP_MAX_SEGMENTS	(1 << 6UL)
 
 #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk)
-- 
cgit v1.2.3


From a0002127cd746fcaa182ad3386ef6931c37f3bda Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:22 +0000
Subject: udp: move udp->no_check6_tx to udp->udp_flags

syzbot reported that udp->no_check6_tx can be read locklessly.
Use one atomic bit from udp->udp_flags

Fixes: 1c19448c9ba6 ("net: Make enabling of zero UDP6 csums more restrictive")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 23f0693e0d9c..e3f2a6c7ac1d 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -34,6 +34,7 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
 
 enum {
 	UDP_FLAGS_CORK,		/* Cork is required */
+	UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
 };
 
 struct udp_sock {
@@ -47,8 +48,7 @@ struct udp_sock {
 
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
-	unsigned char	 no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
-			 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
+	unsigned char	 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
 			 encap_enabled:1, /* This socket enabled encap
 					   * processing; UDP tunnels and
 					   * different encapsulation layer set
@@ -115,7 +115,7 @@ struct udp_sock {
 
 static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
 {
-	udp_sk(sk)->no_check6_tx = val;
+	udp_assign_bit(NO_CHECK6_TX, sk, val);
 }
 
 static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
@@ -123,9 +123,9 @@ static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
 	udp_sk(sk)->no_check6_rx = val;
 }
 
-static inline bool udp_get_no_check6_tx(struct sock *sk)
+static inline bool udp_get_no_check6_tx(const struct sock *sk)
 {
-	return udp_sk(sk)->no_check6_tx;
+	return udp_test_bit(NO_CHECK6_TX, sk);
 }
 
 static inline bool udp_get_no_check6_rx(struct sock *sk)
-- 
cgit v1.2.3


From bcbc1b1de884647aa0318bf74eb7f293d72a1e40 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:23 +0000
Subject: udp: move udp->no_check6_rx to udp->udp_flags

syzbot reported that udp->no_check6_rx can be read locklessly.
Use one atomic bit from udp->udp_flags.

Fixes: 1c19448c9ba6 ("net: Make enabling of zero UDP6 csums more restrictive")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index e3f2a6c7ac1d..8d4c3835b1b2 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -35,6 +35,7 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
 enum {
 	UDP_FLAGS_CORK,		/* Cork is required */
 	UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
+	UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */
 };
 
 struct udp_sock {
@@ -48,8 +49,7 @@ struct udp_sock {
 
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
-	unsigned char	 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
-			 encap_enabled:1, /* This socket enabled encap
+	unsigned char	 encap_enabled:1, /* This socket enabled encap
 					   * processing; UDP tunnels and
 					   * different encapsulation layer set
 					   * this
@@ -120,7 +120,7 @@ static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
 
 static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
 {
-	udp_sk(sk)->no_check6_rx = val;
+	udp_assign_bit(NO_CHECK6_RX, sk, val);
 }
 
 static inline bool udp_get_no_check6_tx(const struct sock *sk)
@@ -128,9 +128,9 @@ static inline bool udp_get_no_check6_tx(const struct sock *sk)
 	return udp_test_bit(NO_CHECK6_TX, sk);
 }
 
-static inline bool udp_get_no_check6_rx(struct sock *sk)
+static inline bool udp_get_no_check6_rx(const struct sock *sk)
 {
-	return udp_sk(sk)->no_check6_rx;
+	return udp_test_bit(NO_CHECK6_RX, sk);
 }
 
 static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
-- 
cgit v1.2.3


From e1dc0615c6b08ef36414f08c011965b8fb56198b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:24 +0000
Subject: udp: move udp->gro_enabled to udp->udp_flags

syzbot reported that udp->gro_enabled can be read locklessly.
Use one atomic bit from udp->udp_flags.

Fixes: e20cf8d3f1f7 ("udp: implement GRO for plain UDP sockets.")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 8d4c3835b1b2..b344bd2e41fc 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -36,6 +36,7 @@ enum {
 	UDP_FLAGS_CORK,		/* Cork is required */
 	UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
 	UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */
+	UDP_FLAGS_GRO_ENABLED,	/* Request GRO aggregation */
 };
 
 struct udp_sock {
@@ -54,7 +55,6 @@ struct udp_sock {
 					   * different encapsulation layer set
 					   * this
 					   */
-			 gro_enabled:1,	/* Request GRO aggregation */
 			 accept_udp_l4:1,
 			 accept_udp_fraglist:1;
 /* indicator bits used by pcflag: */
-- 
cgit v1.2.3


From f5f52f0884a595ff99ab1a608643fe4025fca2d5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:26 +0000
Subject: udp: move udp->accept_udp_{l4|fraglist} to udp->udp_flags

These are read locklessly, move them to udp_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index b344bd2e41fc..bb2b87adfbea 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -37,6 +37,8 @@ enum {
 	UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
 	UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */
 	UDP_FLAGS_GRO_ENABLED,	/* Request GRO aggregation */
+	UDP_FLAGS_ACCEPT_FRAGLIST,
+	UDP_FLAGS_ACCEPT_L4,
 };
 
 struct udp_sock {
@@ -50,13 +52,11 @@ struct udp_sock {
 
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
-	unsigned char	 encap_enabled:1, /* This socket enabled encap
+	unsigned char	 encap_enabled:1; /* This socket enabled encap
 					   * processing; UDP tunnels and
 					   * different encapsulation layer set
 					   * this
 					   */
-			 accept_udp_l4:1,
-			 accept_udp_fraglist:1;
 /* indicator bits used by pcflag: */
 #define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
 #define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
@@ -149,10 +149,12 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
 	if (!skb_is_gso(skb))
 		return false;
 
-	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4)
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
+	    !udp_test_bit(ACCEPT_L4, sk))
 		return true;
 
-	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist)
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST &&
+	    !udp_test_bit(ACCEPT_FRAGLIST, sk))
 		return true;
 
 	return false;
@@ -160,8 +162,8 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
 
 static inline void udp_allow_gso(struct sock *sk)
 {
-	udp_sk(sk)->accept_udp_l4 = 1;
-	udp_sk(sk)->accept_udp_fraglist = 1;
+	udp_set_bit(ACCEPT_L4, sk);
+	udp_set_bit(ACCEPT_FRAGLIST, sk);
 }
 
 #define udp_portaddr_for_each_entry(__sk, list) \
-- 
cgit v1.2.3


From ac9a7f4ce5dda1472e8f44096f33066c6ec1a3b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:27 +0000
Subject: udp: lockless UDP_ENCAP_L2TPINUDP / UDP_GRO

Move udp->encap_enabled to udp->udp_flags.

Add udp_test_and_set_bit() helper to allow lockless
udp_tunnel_encap_enable() implementation.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index bb2b87adfbea..0cf83270a4a2 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -39,6 +39,7 @@ enum {
 	UDP_FLAGS_GRO_ENABLED,	/* Request GRO aggregation */
 	UDP_FLAGS_ACCEPT_FRAGLIST,
 	UDP_FLAGS_ACCEPT_L4,
+	UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */
 };
 
 struct udp_sock {
@@ -52,11 +53,7 @@ struct udp_sock {
 
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
-	unsigned char	 encap_enabled:1; /* This socket enabled encap
-					   * processing; UDP tunnels and
-					   * different encapsulation layer set
-					   * this
-					   */
+
 /* indicator bits used by pcflag: */
 #define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
 #define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
@@ -104,6 +101,8 @@ struct udp_sock {
 	test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
 #define udp_set_bit(nr, sk)			\
 	set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+#define udp_test_and_set_bit(nr, sk)		\
+	test_and_set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
 #define udp_clear_bit(nr, sk)			\
 	clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
 #define udp_assign_bit(nr, sk, val)		\
-- 
cgit v1.2.3


From 729549aa350c56a777bb342941ed4d69b6585769 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:29 +0000
Subject: udplite: remove UDPLITE_BIT

This flag is set but never read, we can remove it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 0cf83270a4a2..58156edec009 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -55,9 +55,8 @@ struct udp_sock {
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 
 /* indicator bits used by pcflag: */
-#define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
-#define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
-#define UDPLITE_RECV_CC  0x4		/* set via udplite setsocktopt        */
+#define UDPLITE_SEND_CC  0x1  		/* set via udplite setsockopt         */
+#define UDPLITE_RECV_CC  0x2		/* set via udplite setsocktopt        */
 	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
 	/*
 	 * Following member retains the information to create a UDP header
-- 
cgit v1.2.3


From 882af43a0fc37e26d85fb0df0c9edd3bed928de4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:30 +0000
Subject: udplite: fix various data-races

udp->pcflag, udp->pcslen and udp->pcrlen reads/writes are racy.

Move udp->pcflag to udp->udp_flags for atomicity,
and add READ_ONCE()/WRITE_ONCE() annotations for pcslen and pcrlen.

Fixes: ba4e58eca8aa ("[NET]: Supporting UDP-Lite (RFC 3828) in Linux")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 58156edec009..d04188714dca 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -40,6 +40,8 @@ enum {
 	UDP_FLAGS_ACCEPT_FRAGLIST,
 	UDP_FLAGS_ACCEPT_L4,
 	UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */
+	UDP_FLAGS_UDPLITE_SEND_CC, /* set via udplite setsockopt */
+	UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */
 };
 
 struct udp_sock {
@@ -54,10 +56,6 @@ struct udp_sock {
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 
-/* indicator bits used by pcflag: */
-#define UDPLITE_SEND_CC  0x1  		/* set via udplite setsockopt         */
-#define UDPLITE_RECV_CC  0x2		/* set via udplite setsocktopt        */
-	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
-- 
cgit v1.2.3


From 2758ac3a11d78af56e6969af04dec611806a62de Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 13 Sep 2023 21:28:25 +0200
Subject: firmware: qcom-scm: drop unneeded 'extern' specifiers

The 'extern' specifier in front of a function declaration has no effect.
Remove all of them from the qcom-scm header.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230913192826.36187-1-bartosz.golaszewski@linaro.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/firmware/qcom/qcom_scm.h | 101 +++++++++++++++------------------
 1 file changed, 47 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h
index e1ea2eb56d04..ccaf28846054 100644
--- a/include/linux/firmware/qcom/qcom_scm.h
+++ b/include/linux/firmware/qcom/qcom_scm.h
@@ -59,12 +59,12 @@ enum qcom_scm_ice_cipher {
 #define QCOM_SCM_PERM_RW (QCOM_SCM_PERM_READ | QCOM_SCM_PERM_WRITE)
 #define QCOM_SCM_PERM_RWX (QCOM_SCM_PERM_RW | QCOM_SCM_PERM_EXEC)
 
-extern bool qcom_scm_is_available(void);
+bool qcom_scm_is_available(void);
 
-extern int qcom_scm_set_cold_boot_addr(void *entry);
-extern int qcom_scm_set_warm_boot_addr(void *entry);
-extern void qcom_scm_cpu_power_down(u32 flags);
-extern int qcom_scm_set_remote_state(u32 state, u32 id);
+int qcom_scm_set_cold_boot_addr(void *entry);
+int qcom_scm_set_warm_boot_addr(void *entry);
+void qcom_scm_cpu_power_down(u32 flags);
+int qcom_scm_set_remote_state(u32 state, u32 id);
 
 struct qcom_scm_pas_metadata {
 	void *ptr;
@@ -72,55 +72,48 @@ struct qcom_scm_pas_metadata {
 	ssize_t size;
 };
 
-extern int qcom_scm_pas_init_image(u32 peripheral, const void *metadata,
-				   size_t size,
-				   struct qcom_scm_pas_metadata *ctx);
-extern void qcom_scm_pas_metadata_release(struct qcom_scm_pas_metadata *ctx);
-extern int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr,
-				  phys_addr_t size);
-extern int qcom_scm_pas_auth_and_reset(u32 peripheral);
-extern int qcom_scm_pas_shutdown(u32 peripheral);
-extern bool qcom_scm_pas_supported(u32 peripheral);
-
-extern int qcom_scm_io_readl(phys_addr_t addr, unsigned int *val);
-extern int qcom_scm_io_writel(phys_addr_t addr, unsigned int val);
-
-extern bool qcom_scm_restore_sec_cfg_available(void);
-extern int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare);
-extern int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size);
-extern int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare);
-extern int qcom_scm_iommu_set_cp_pool_size(u32 spare, u32 size);
-extern int qcom_scm_mem_protect_video_var(u32 cp_start, u32 cp_size,
-					  u32 cp_nonpixel_start,
-					  u32 cp_nonpixel_size);
-extern int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz,
-			       u64 *src,
-			       const struct qcom_scm_vmperm *newvm,
-			       unsigned int dest_cnt);
-
-extern bool qcom_scm_ocmem_lock_available(void);
-extern int qcom_scm_ocmem_lock(enum qcom_scm_ocmem_client id, u32 offset,
-			       u32 size, u32 mode);
-extern int qcom_scm_ocmem_unlock(enum qcom_scm_ocmem_client id, u32 offset,
-				 u32 size);
-
-extern bool qcom_scm_ice_available(void);
-extern int qcom_scm_ice_invalidate_key(u32 index);
-extern int qcom_scm_ice_set_key(u32 index, const u8 *key, u32 key_size,
-				enum qcom_scm_ice_cipher cipher,
-				u32 data_unit_size);
-
-extern bool qcom_scm_hdcp_available(void);
-extern int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt,
-			     u32 *resp);
-
-extern int qcom_scm_iommu_set_pt_format(u32 sec_id, u32 ctx_num, u32 pt_fmt);
-extern int qcom_scm_qsmmu500_wait_safe_toggle(bool en);
-
-extern int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val,
-			      u64 limit_node, u32 node_id, u64 version);
-extern int qcom_scm_lmh_profile_change(u32 profile_id);
-extern bool qcom_scm_lmh_dcvsh_available(void);
+int qcom_scm_pas_init_image(u32 peripheral, const void *metadata, size_t size,
+			    struct qcom_scm_pas_metadata *ctx);
+void qcom_scm_pas_metadata_release(struct qcom_scm_pas_metadata *ctx);
+int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr, phys_addr_t size);
+int qcom_scm_pas_auth_and_reset(u32 peripheral);
+int qcom_scm_pas_shutdown(u32 peripheral);
+bool qcom_scm_pas_supported(u32 peripheral);
+
+int qcom_scm_io_readl(phys_addr_t addr, unsigned int *val);
+int qcom_scm_io_writel(phys_addr_t addr, unsigned int val);
+
+bool qcom_scm_restore_sec_cfg_available(void);
+int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare);
+int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size);
+int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare);
+int qcom_scm_iommu_set_cp_pool_size(u32 spare, u32 size);
+int qcom_scm_mem_protect_video_var(u32 cp_start, u32 cp_size,
+				   u32 cp_nonpixel_start, u32 cp_nonpixel_size);
+int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz, u64 *src,
+			const struct qcom_scm_vmperm *newvm,
+			unsigned int dest_cnt);
+
+bool qcom_scm_ocmem_lock_available(void);
+int qcom_scm_ocmem_lock(enum qcom_scm_ocmem_client id, u32 offset, u32 size,
+			u32 mode);
+int qcom_scm_ocmem_unlock(enum qcom_scm_ocmem_client id, u32 offset, u32 size);
+
+bool qcom_scm_ice_available(void);
+int qcom_scm_ice_invalidate_key(u32 index);
+int qcom_scm_ice_set_key(u32 index, const u8 *key, u32 key_size,
+			 enum qcom_scm_ice_cipher cipher, u32 data_unit_size);
+
+bool qcom_scm_hdcp_available(void);
+int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, u32 *resp);
+
+int qcom_scm_iommu_set_pt_format(u32 sec_id, u32 ctx_num, u32 pt_fmt);
+int qcom_scm_qsmmu500_wait_safe_toggle(bool en);
+
+int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val,
+		       u64 limit_node, u32 node_id, u64 version);
+int qcom_scm_lmh_profile_change(u32 profile_id);
+bool qcom_scm_lmh_dcvsh_available(void);
 
 #ifdef CONFIG_QCOM_QSEECOM
 
-- 
cgit v1.2.3


From a721f7b8c3548e943e514a957f2a37f4763b9888 Mon Sep 17 00:00:00 2001
From: Khadija Kamran <kamrankhadijadj@gmail.com>
Date: Wed, 23 Aug 2023 13:16:40 +0500
Subject: lsm: constify 'bprm' parameter in security_bprm_committed_creds()

Three LSMs register the implementations for the 'bprm_committed_creds()'
hook: AppArmor, SELinux and tomoyo. Looking at the function
implementations we may observe that the 'bprm' parameter is not changing.

Mark the 'bprm' parameter of LSM hook security_bprm_committed_creds() as
'const' since it will not be changing in the LSM hook.

Signed-off-by: Khadija Kamran <kamrankhadijadj@gmail.com>
[PM: minor merge fuzzing due to other constification patches]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 include/linux/security.h      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index f6acc3ed66a3..3b0f5cfca464 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -53,7 +53,7 @@ LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm)
 LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file)
 LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
 LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, const struct linux_binprm *bprm)
-LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm)
+LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, const struct linux_binprm *bprm)
 LSM_HOOK(int, 0, fs_context_submount, struct fs_context *fc, struct super_block *reference)
 LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
 	 struct fs_context *src_sc)
diff --git a/include/linux/security.h b/include/linux/security.h
index 885053f81019..3148103123fb 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -293,7 +293,7 @@ int security_bprm_creds_for_exec(struct linux_binprm *bprm);
 int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file);
 int security_bprm_check(struct linux_binprm *bprm);
 void security_bprm_committing_creds(const struct linux_binprm *bprm);
-void security_bprm_committed_creds(struct linux_binprm *bprm);
+void security_bprm_committed_creds(const struct linux_binprm *bprm);
 int security_fs_context_submount(struct fs_context *fc, struct super_block *reference);
 int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc);
 int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param);
@@ -627,7 +627,7 @@ static inline void security_bprm_committing_creds(const struct linux_binprm *bpr
 {
 }
 
-static inline void security_bprm_committed_creds(struct linux_binprm *bprm)
+static inline void security_bprm_committed_creds(const struct linux_binprm *bprm)
 {
 }
 
-- 
cgit v1.2.3


From 20a2aa47097aae7016209c4dbe392b3b25e0d883 Mon Sep 17 00:00:00 2001
From: Khadija Kamran <kamrankhadijadj@gmail.com>
Date: Wed, 23 Aug 2023 14:01:28 +0500
Subject: lsm: constify 'sb' parameter in security_sb_kern_mount()

The "sb_kern_mount" hook has implementation registered in SELinux.
Looking at the function implementation we observe that the "sb"
parameter is not changing.

Mark the "sb" parameter of LSM hook security_sb_kern_mount() as "const"
since it will not be changing in the LSM hook.

Signed-off-by: Khadija Kamran <kamrankhadijadj@gmail.com>
[PM: minor merge fuzzing due to other constification patches]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 include/linux/security.h      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 3b0f5cfca464..99b8176c3738 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -66,7 +66,7 @@ LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
 LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts)
 LSM_HOOK(int, 0, sb_mnt_opts_compat, struct super_block *sb, void *mnt_opts)
 LSM_HOOK(int, 0, sb_remount, struct super_block *sb, void *mnt_opts)
-LSM_HOOK(int, 0, sb_kern_mount, struct super_block *sb)
+LSM_HOOK(int, 0, sb_kern_mount, const struct super_block *sb)
 LSM_HOOK(int, 0, sb_show_options, struct seq_file *m, struct super_block *sb)
 LSM_HOOK(int, 0, sb_statfs, struct dentry *dentry)
 LSM_HOOK(int, 0, sb_mount, const char *dev_name, const struct path *path,
diff --git a/include/linux/security.h b/include/linux/security.h
index 3148103123fb..1d1df326c881 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -304,7 +304,7 @@ void security_free_mnt_opts(void **mnt_opts);
 int security_sb_eat_lsm_opts(char *options, void **mnt_opts);
 int security_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts);
 int security_sb_remount(struct super_block *sb, void *mnt_opts);
-int security_sb_kern_mount(struct super_block *sb);
+int security_sb_kern_mount(const struct super_block *sb);
 int security_sb_show_options(struct seq_file *m, struct super_block *sb);
 int security_sb_statfs(struct dentry *dentry);
 int security_sb_mount(const char *dev_name, const struct path *path,
-- 
cgit v1.2.3


From 4de7b17fd05d03fa919e8c47fc66122bd24d7b6c Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 21 Aug 2023 14:44:28 +0100
Subject: sched: Assert for_each_thread() is properly locked

list_for_each_entry_rcu() takes an optional fourth argument which
allows RCU to assert that the correct lock is held.  Several callers
of for_each_thread() rely on their caller to be holding the appropriate
lock, so this is a useful assertion to include.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Link: https://lore.kernel.org/r/20230821134428.2504912-1-willy@infradead.org
---
 include/linux/sched/signal.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..9610bad018a3 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -656,7 +656,8 @@ extern bool current_is_single_threaded(void);
 	while ((t = next_thread(t)) != g)
 
 #define __for_each_thread(signal, t)	\
-	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
+		lockdep_is_held(&tasklist_lock))
 
 #define for_each_thread(p, t)		\
 	__for_each_thread((p)->signal, t)
-- 
cgit v1.2.3


From b0adfba7ee770fef20b1b6d86706c28f7fccfb07 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:01:59 +0000
Subject: ipv6: lockless IPV6_UNICAST_HOPS implementation

Some np->hop_limit accesses are racy, when socket lock is not held.

Add missing annotations and switch to full lockless implementation.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index af8a771a053c..c2e087071384 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -213,17 +213,7 @@ struct ipv6_pinfo {
 	__be32			flow_label;
 	__u32			frag_size;
 
-	/*
-	 * Packed in 16bits.
-	 * Omit one shift by putting the signed field at MSB.
-	 */
-#if defined(__BIG_ENDIAN_BITFIELD)
-	__s16			hop_limit:9;
-	__u16			__unused_1:7;
-#else
-	__u16			__unused_1:7;
-	__s16			hop_limit:9;
-#endif
+	s16			hop_limit;
 
 #if defined(__BIG_ENDIAN_BITFIELD)
 	/* Packed in 16bits. */
-- 
cgit v1.2.3


From d986f52124e062753e33b6fe303be5904a997eac Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:00 +0000
Subject: ipv6: lockless IPV6_MULTICAST_LOOP implementation

Add inet6_{test|set|clear|assign}_bit() helpers.

Note that I am using bits from inet->inet_flags,
this might change in the future if we need more flags.

While solving data-races accessing np->mc_loop,
this patch also allows to implement lockless accesses
to np->mcast_hops in the following patch.

Also constify sk_mc_loop() argument.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index c2e087071384..68cf1ca94914 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -218,11 +218,9 @@ struct ipv6_pinfo {
 #if defined(__BIG_ENDIAN_BITFIELD)
 	/* Packed in 16bits. */
 	__s16			mcast_hops:9;
-	__u16			__unused_2:6,
-				mc_loop:1;
+	__u16			__unused_2:7,
 #else
-	__u16			mc_loop:1,
-				__unused_2:6;
+	__u16			__unused_2:7;
 	__s16			mcast_hops:9;
 #endif
 	int			ucast_oif;
@@ -283,6 +281,18 @@ struct ipv6_pinfo {
 	struct inet6_cork	cork;
 };
 
+/* We currently use available bits from inet_sk(sk)->inet_flags,
+ * this could change in the future.
+ */
+#define inet6_test_bit(nr, sk)			\
+	test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
+#define inet6_set_bit(nr, sk)			\
+	set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
+#define inet6_clear_bit(nr, sk)			\
+	clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
+#define inet6_assign_bit(nr, sk, val)		\
+	assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)
+
 /* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */
 struct raw6_sock {
 	/* inet_sock has to be the first member of raw6_sock */
-- 
cgit v1.2.3


From 2da23eb07c91241d962f3ff05565065484cd8929 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:01 +0000
Subject: ipv6: lockless IPV6_MULTICAST_HOPS implementation

This fixes data-races around np->mcast_hops,
and make IPV6_MULTICAST_HOPS lockless.

Note that np->mcast_hops is never negative,
thus can fit an u8 field instead of s16.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 68cf1ca94914..9cc278b5e4f4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -214,15 +214,8 @@ struct ipv6_pinfo {
 	__u32			frag_size;
 
 	s16			hop_limit;
+	u8			mcast_hops;
 
-#if defined(__BIG_ENDIAN_BITFIELD)
-	/* Packed in 16bits. */
-	__s16			mcast_hops:9;
-	__u16			__unused_2:7,
-#else
-	__u16			__unused_2:7;
-	__s16			mcast_hops:9;
-#endif
 	int			ucast_oif;
 	int			mcast_oif;
 
-- 
cgit v1.2.3


From dcae74622c051b219ee628669a31716473efda2c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:04 +0000
Subject: ipv6: lockless IPV6_RECVERR_RFC4884 implementation

Move np->recverr_rfc4884 to an atomic flag to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 9cc278b5e4f4..0d2b0a1b2dae 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -256,7 +256,6 @@ struct ipv6_pinfo {
 				autoflowlabel:1,
 				autoflowlabel_set:1,
 				mc_all:1,
-				recverr_rfc4884:1,
 				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
-- 
cgit v1.2.3


From 6559c0ff3bc27d7e4d447d31c1d7e8eae0e959f5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:05 +0000
Subject: ipv6: lockless IPV6_MULTICAST_ALL implementation

Move np->mc_all to an atomic flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 0d2b0a1b2dae..d88e91b7f0a3 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -255,7 +255,6 @@ struct ipv6_pinfo {
 				dontfrag:1,
 				autoflowlabel:1,
 				autoflowlabel_set:1,
-				mc_all:1,
 				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
-- 
cgit v1.2.3


From 5121516b0c4736b7977d977b239e36d23ec64401 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:06 +0000
Subject: ipv6: lockless IPV6_AUTOFLOWLABEL implementation

Move np->autoflowlabel and np->autoflowlabel_set in inet->inet_flags,
to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index d88e91b7f0a3..e3be5dc21b7d 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -253,8 +253,6 @@ struct ipv6_pinfo {
 						 * 100: prefer care-of address
 						 */
 				dontfrag:1,
-				autoflowlabel:1,
-				autoflowlabel_set:1,
 				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
-- 
cgit v1.2.3


From 1086ca7cce292bb498d7f8f85f4593c9ef4902b7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:07 +0000
Subject: ipv6: lockless IPV6_DONTFRAG implementation

Move np->dontfrag flag to inet->inet_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e3be5dc21b7d..57d563f1d4b1 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -252,7 +252,6 @@ struct ipv6_pinfo {
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
-				dontfrag:1,
 				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
-- 
cgit v1.2.3


From 3fa29971c69519629370b119b0b618ee88ade6b9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:08 +0000
Subject: ipv6: lockless IPV6_RECVERR implemetation

np->recverr is moved to inet->inet_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 57d563f1d4b1..53f4f1b97a78 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -243,8 +243,7 @@ struct ipv6_pinfo {
 	} rxopt;
 
 	/* sockopt flags */
-	__u16			recverr:1,
-	                        sndflow:1,
+	__u16			sndflow:1,
 				repflow:1,
 				pmtudisc:3,
 				padding:1,	/* 1 bit hole */
-- 
cgit v1.2.3


From 3cccda8db2cf2f2a224d55d5b6e2251d478c58ca Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:09 +0000
Subject: ipv6: move np->repflow to atomic flags

Move np->repflow to inet->inet_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 53f4f1b97a78..e62413371ea4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -244,7 +244,6 @@ struct ipv6_pinfo {
 
 	/* sockopt flags */
 	__u16			sndflow:1,
-				repflow:1,
 				pmtudisc:3,
 				padding:1,	/* 1 bit hole */
 				srcprefs:3,	/* 001: prefer temporary address
-- 
cgit v1.2.3


From 83cd5eb654b320c1972254f243531f3f3cebcccf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:10 +0000
Subject: ipv6: lockless IPV6_ROUTER_ALERT_ISOLATE implementation

Reads from np->rtalert_isolate are racy.

Move this flag to inet->inet_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e62413371ea4..f288a35f157f 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -246,11 +246,10 @@ struct ipv6_pinfo {
 	__u16			sndflow:1,
 				pmtudisc:3,
 				padding:1,	/* 1 bit hole */
-				srcprefs:3,	/* 001: prefer temporary address
+				srcprefs:3;	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
-				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
 	__be32			rcv_flowinfo;
-- 
cgit v1.2.3


From 6b724bc4300b431443f3b99520994a5aece347cd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:11 +0000
Subject: ipv6: lockless IPV6_MTU_DISCOVER implementation

Most np->pmtudisc reads are racy.

Move this 3bit field on a full byte, add annotations
and make IPV6_MTU_DISCOVER setsockopt() lockless.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index f288a35f157f..10f521a6a9c8 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -243,13 +243,12 @@ struct ipv6_pinfo {
 	} rxopt;
 
 	/* sockopt flags */
-	__u16			sndflow:1,
-				pmtudisc:3,
-				padding:1,	/* 1 bit hole */
+	__u8			sndflow:1,
 				srcprefs:3;	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
+	__u8			pmtudisc;
 	__u8			min_hopcount;
 	__u8			tclass;
 	__be32			rcv_flowinfo;
-- 
cgit v1.2.3


From 859f8b265fc2a11af0fb0c52b4087e0409250592 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:12 +0000
Subject: ipv6: lockless IPV6_FLOWINFO_SEND implementation

np->sndflow reads are racy.

Use one bit ftom atomic inet->inet_flags instead,
IPV6_FLOWINFO_SEND setsockopt() can be lockless.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 10f521a6a9c8..09253825c99c 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -243,8 +243,7 @@ struct ipv6_pinfo {
 	} rxopt;
 
 	/* sockopt flags */
-	__u8			sndflow:1,
-				srcprefs:3;	/* 001: prefer temporary address
+	__u8			srcprefs:3;	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
-- 
cgit v1.2.3


From 0a596b0682a7ce37e26c36629816f105c6459d06 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 26 Aug 2023 16:36:41 +0800
Subject: KEYS: Include linux/errno.h in linux/verification.h

Add inclusion of linux/errno.h as otherwise the reference to EINVAL
may be invalid.

Fixes: f3cf4134c5c6 ("bpf: Add bpf_lookup_*_key() and bpf_key_put() kfuncs")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202308261414.HKw1Mrip-lkp@intel.com/
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/verification.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/verification.h b/include/linux/verification.h
index f34e50ebcf60..cb2d47f28091 100644
--- a/include/linux/verification.h
+++ b/include/linux/verification.h
@@ -8,6 +8,7 @@
 #ifndef _LINUX_VERIFICATION_H
 #define _LINUX_VERIFICATION_H
 
+#include <linux/errno.h>
 #include <linux/types.h>
 
 /*
-- 
cgit v1.2.3


From b58a36008bfa1aadf55f516bcbfae40c779eb54b Mon Sep 17 00:00:00 2001
From: Stefan Wahren <wahrenst@gmx.net>
Date: Wed, 6 Sep 2023 01:27:57 +0200
Subject: hwrng: bcm2835 - Fix hwrng throughput regression

The last RCU stall fix caused a massive throughput regression of the
hwrng on Raspberry Pi 0 - 3. hwrng_msleep doesn't sleep precisely enough
and usleep_range doesn't allow scheduling. So try to restore the
best possible throughput by introducing hwrng_yield which interruptable
sleeps for one jiffy.

Some performance measurements on Raspberry Pi 3B+ (arm64/defconfig):

sudo dd if=/dev/hwrng of=/dev/null count=1 bs=10000

cpu_relax              ~138025 Bytes / sec
hwrng_msleep(1000)         ~13 Bytes / sec
hwrng_yield              ~2510 Bytes / sec

Fixes: 96cb9d055445 ("hwrng: bcm2835 - use hwrng_msleep() instead of cpu_relax()")
Link: https://lore.kernel.org/linux-arm-kernel/bc97ece5-44a3-4c4e-77da-2db3eb66b128@gmx.net/
Signed-off-by: Stefan Wahren <wahrenst@gmx.net>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/hw_random.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index 8a3115516a1b..136e9842120e 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -63,5 +63,6 @@ extern void hwrng_unregister(struct hwrng *rng);
 extern void devm_hwrng_unregister(struct device *dve, struct hwrng *rng);
 
 extern long hwrng_msleep(struct hwrng *rng, unsigned int msecs);
+extern long hwrng_yield(struct hwrng *rng);
 
 #endif /* LINUX_HWRANDOM_H_ */
-- 
cgit v1.2.3


From b1f099b1cf51d553c510c6c8141c27d9ba7ea1fe Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 19 Aug 2023 07:12:33 -0700
Subject: numa: Generalize numa_map_to_online_node()

The function in fact searches the nearest node for a given one,
based on a N_ONLINE state. This is a common pattern to search
for a nearest node.

This patch converts numa_map_to_online_node() to numa_nearest_node()
so that others won't need to opencode the logic.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20230819141239.287290-2-yury.norov@gmail.com
---
 include/linux/numa.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/numa.h b/include/linux/numa.h
index 59df211d051f..fb30a42f0700 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -25,7 +25,7 @@
 #include <asm/sparsemem.h>
 
 /* Generic implementation available */
-int numa_map_to_online_node(int node);
+int numa_nearest_node(int node, unsigned int state);
 
 #ifndef memory_add_physaddr_to_nid
 static inline int memory_add_physaddr_to_nid(u64 start)
@@ -44,10 +44,11 @@ static inline int phys_to_target_node(u64 start)
 }
 #endif
 #else /* !CONFIG_NUMA */
-static inline int numa_map_to_online_node(int node)
+static inline int numa_nearest_node(int node, unsigned int state)
 {
 	return NUMA_NO_NODE;
 }
+
 static inline int memory_add_physaddr_to_nid(u64 start)
 {
 	return 0;
@@ -58,6 +59,8 @@ static inline int phys_to_target_node(u64 start)
 }
 #endif
 
+#define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE)
+
 #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
 extern const struct attribute_group arch_node_dev_group;
 #endif
-- 
cgit v1.2.3


From 8ab63d418d4339d996f80d02a00dbce0aa3ff972 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 19 Aug 2023 07:12:36 -0700
Subject: sched/topology: Fix sched_numa_find_nth_cpu() in non-NUMA case

When CONFIG_NUMA is enabled, sched_numa_find_nth_cpu() searches for a
CPU in sched_domains_numa_masks. The masks includes only online CPUs,
so effectively offline CPUs are skipped.

When CONFIG_NUMA is disabled, the fallback function should be consistent.

Fixes: cd7f55359c90 ("sched: add sched_numa_find_nth_cpu()")
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20230819141239.287290-5-yury.norov@gmail.com
---
 include/linux/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index fea32377f7c7..52f5850730b3 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -251,7 +251,7 @@ extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int
 #else
 static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 {
-	return cpumask_nth(cpu, cpus);
+	return cpumask_nth_and(cpu, cpus, cpu_online_mask);
 }
 
 static inline const struct cpumask *
-- 
cgit v1.2.3


From 8f908db77782630c45ba29dac35c434b5ce0b730 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 15 Sep 2023 10:34:27 -0700
Subject: bpf: Fix BTF_ID symbol generation collision

Marcus and Satya reported an issue where BTF_ID macro generates same
symbol in separate objects and that breaks final vmlinux link.

ld.lld: error: ld-temp.o <inline asm>:14577:1: symbol
'__BTF_ID__struct__cgroup__624' is already defined

This can be triggered under specific configs when __COUNTER__ happens to
be the same for the same symbol in two different translation units,
which is already quite unlikely to happen.

Add __LINE__ number suffix to make BTF_ID symbol more unique, which is
not a complete fix, but it would help for now and meanwhile we can work
on better solution as suggested by Andrii.

Cc: stable@vger.kernel.org
Reported-by: Satya Durga Srinivasu Prabhala <quic_satyap@quicinc.com>
Reported-by: Marcus Seyfarth <m.seyfarth@gmail.com>
Closes: https://github.com/ClangBuiltLinux/linux/issues/1913
Debugged-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/bpf/CAEf4Bzb5KQ2_LmhN769ifMeSJaWfebccUasQOfQKaOd0nQ51tw@mail.gmail.com/
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/r/20230915-bpf_collision-v3-1-263fc519c21f@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf_ids.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index a3462a9b8e18..a9cb10b0e2e9 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -49,7 +49,7 @@ word							\
 	____BTF_ID(symbol, word)
 
 #define __ID(prefix) \
-	__PASTE(prefix, __COUNTER__)
+	__PASTE(__PASTE(prefix, __COUNTER__), __LINE__)
 
 /*
  * The BTF_ID defines unique symbol for each ID pointing
-- 
cgit v1.2.3


From 32e4fa37fa667fdf53499b9de92737dc75199d8e Mon Sep 17 00:00:00 2001
From: Olaf Hering <olaf@aepfle.de>
Date: Mon, 4 Sep 2023 14:13:50 +0200
Subject: cpu/hotplug: Remove unused cpuhp_state CPUHP_AP_X86_VDSO_VMA_ONLINE

Commit b2e2ba578e01 ("x86/vdso: Initialize the CPU/node NR segment
descriptor earlier") removed the single user of this constant.

Remove it to reduce the size of cpuhp_hp_states[].

Signed-off-by: Olaf Hering <olaf@aepfle.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230904121350.18055-1-olaf@aepfle.de
---
 include/linux/cpuhotplug.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 06dda85f0424..cd8bd6ed04f9 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -205,7 +205,6 @@ enum cpuhp_state {
 	CPUHP_AP_KVM_ONLINE,
 	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
-	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,
 	CPUHP_AP_BLK_MQ_ONLINE,
 	CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS,
-- 
cgit v1.2.3


From c656a4d5484ad99e97de549a9affc12a91d94963 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 15 Sep 2023 15:46:08 -0400
Subject: Revert "SUNRPC: clean up integer overflow check"

This reverts commit e87cf8a28e7592bd19064e8181324ae26bc02932.

This commit was added to silence a tautological comparison warning, but
removing the 'len' value check before calling xdr_inline_decode() is
really not what we want.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xdr.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 5b4fb3c791bc..896a6d2a9cf0 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -779,7 +779,9 @@ xdr_stream_decode_uint32_array(struct xdr_stream *xdr,
 
 	if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0))
 		return -EBADMSG;
-	p = xdr_inline_decode(xdr, size_mul(len, sizeof(*p)));
+	if (len > SIZE_MAX / sizeof(*p))
+		return -EBADMSG;
+	p = xdr_inline_decode(xdr, len * sizeof(*p));
 	if (unlikely(!p))
 		return -EBADMSG;
 	if (array == NULL)
-- 
cgit v1.2.3


From 993b5662f302628db4eb358d69b2720c88cbfaf0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 13 Sep 2023 16:12:33 -0400
Subject: SUNRPC: Silence compiler complaints about tautological comparisons

On 64-bit systems, the compiler will complain that the comparison
between SIZE_MAX and the 32-bit unsigned int 'len' is unnecessary.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xdr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 896a6d2a9cf0..2f8dc47f1eb0 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -779,7 +779,7 @@ xdr_stream_decode_uint32_array(struct xdr_stream *xdr,
 
 	if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0))
 		return -EBADMSG;
-	if (len > SIZE_MAX / sizeof(*p))
+	if (U32_MAX >= SIZE_MAX / sizeof(*p) && len > SIZE_MAX / sizeof(*p))
 		return -EBADMSG;
 	p = xdr_inline_decode(xdr, len * sizeof(*p));
 	if (unlikely(!p))
-- 
cgit v1.2.3


From daabb2b098e04753fa3d1b1feed13e5a61bef61c Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay12@gmail.com>
Date: Thu, 7 Sep 2023 23:05:49 +0000
Subject: bpf/tests: add tests for cpuv4 instructions

The BPF JITs now support cpuv4 instructions. Add tests for these new
instructions to the test suite:

1. Sign extended Load
2. Sign extended Mov
3. Unconditional byte swap
4. Unconditional jump with 32-bit offset
5. Signed division and modulo

Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
Link: https://lore.kernel.org/r/20230907230550.1417590-9-puranjay12@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 761af6b3cf2b..0138832ad571 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -117,21 +117,25 @@ struct ctl_table_header;
 
 /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
 
-#define BPF_ALU64_IMM(OP, DST, IMM)				\
+#define BPF_ALU64_IMM_OFF(OP, DST, IMM, OFF)			\
 	((struct bpf_insn) {					\
 		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,	\
 		.dst_reg = DST,					\
 		.src_reg = 0,					\
-		.off   = 0,					\
+		.off   = OFF,					\
 		.imm   = IMM })
+#define BPF_ALU64_IMM(OP, DST, IMM)				\
+	BPF_ALU64_IMM_OFF(OP, DST, IMM, 0)
 
-#define BPF_ALU32_IMM(OP, DST, IMM)				\
+#define BPF_ALU32_IMM_OFF(OP, DST, IMM, OFF)			\
 	((struct bpf_insn) {					\
 		.code  = BPF_ALU | BPF_OP(OP) | BPF_K,		\
 		.dst_reg = DST,					\
 		.src_reg = 0,					\
-		.off   = 0,					\
+		.off   = OFF,					\
 		.imm   = IMM })
+#define BPF_ALU32_IMM(OP, DST, IMM)				\
+	BPF_ALU32_IMM_OFF(OP, DST, IMM, 0)
 
 /* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */
 
@@ -143,6 +147,16 @@ struct ctl_table_header;
 		.off   = 0,					\
 		.imm   = LEN })
 
+/* Byte Swap, bswap16/32/64 */
+
+#define BPF_BSWAP(DST, LEN)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_END | BPF_SRC(BPF_TO_LE),	\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = LEN })
+
 /* Short form of mov, dst_reg = src_reg */
 
 #define BPF_MOV64_REG(DST, SRC)					\
@@ -179,6 +193,24 @@ struct ctl_table_header;
 		.off   = 0,					\
 		.imm   = IMM })
 
+/* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */
+
+#define BPF_MOVSX64_REG(DST, SRC, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+#define BPF_MOVSX32_REG(DST, SRC, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
 /* Special form of mov32, used for doing explicit zero extension on dst. */
 #define BPF_ZEXT_REG(DST)					\
 	((struct bpf_insn) {					\
@@ -263,6 +295,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
 		.off   = OFF,					\
 		.imm   = 0 })
 
+/* Memory load, dst_reg = *(signed size *) (src_reg + off16) */
+
+#define BPF_LDX_MEMSX(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEMSX,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
 /* Memory store, *(uint *) (dst_reg + off16) = src_reg */
 
 #define BPF_STX_MEM(SIZE, DST, SRC, OFF)			\
-- 
cgit v1.2.3


From 3868ab0f192581eff978501a05f3dc2e01541d77 Mon Sep 17 00:00:00 2001
From: Aananth V <aananthv@google.com>
Date: Thu, 14 Sep 2023 14:36:21 +0000
Subject: tcp: new TCP_INFO stats for RTO events

The 2023 SIGCOMM paper "Improving Network Availability with Protective
ReRoute" has indicated Linux TCP's RTO-triggered txhash rehashing can
effectively reduce application disruption during outages. To better
measure the efficacy of this feature, this patch adds three more
detailed stats during RTO recovery and exports via TCP_INFO.
Applications and monitoring systems can leverage this data to measure
the network path diversity and end-to-end repair latency during network
outages to improve their network infrastructure.

The following counters are added to tcp_sock in order to track RTO
events over the lifetime of a TCP socket.

1. u16 total_rto - Counts the total number of RTO timeouts.
2. u16 total_rto_recoveries - Counts the total number of RTO recoveries.
3. u32 total_rto_time - Counts the total time spent (ms) in RTO
                        recoveries. (time spent in CA_Loss and
                        CA_Recovery states)

To compute total_rto_time, we add a new u32 rto_stamp field to
tcp_sock. rto_stamp records the start timestamp (ms) of the last RTO
recovery (CA_Loss).

Corresponding fields are also added to the tcp_info struct.

Signed-off-by: Aananth V <aananthv@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 44d946161d4a..e15452df9804 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -377,6 +377,14 @@ struct tcp_sock {
 				 * Total data bytes retransmitted
 				 */
 	u32	total_retrans;	/* Total retransmits for entire connection */
+	u32	rto_stamp;	/* Start time (ms) of last CA_Loss recovery */
+	u16	total_rto;	/* Total number of RTO timeouts, including
+				 * SYN/SYN-ACK and recurring timeouts.
+				 */
+	u16	total_rto_recoveries;	/* Total number of RTO recoveries,
+					 * including any unfinished recovery.
+					 */
+	u32	total_rto_time;	/* ms spent in (completed) RTO recoveries. */
 
 	u32	urg_seq;	/* Seq of received urgent pointer */
 	unsigned int		keepalive_time;	  /* time before keep alive takes place */
-- 
cgit v1.2.3


From 9af27da6313c8f8c6a26c7ea3fe23d6b9664a3a8 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:31:58 +0200
Subject: bpf: Use bpf_is_subprog to check for subprogs

We would like to know whether a bpf_prog corresponds to the main prog or
one of the subprogs. The current JIT implementations simply check this
using the func_idx in bpf_prog->aux->func_idx. When the index is 0, it
belongs to the main program, otherwise it corresponds to some
subprogram.

This will also be necessary to halt exception propagation while walking
the stack when an exception is thrown, so we add a simple helper
function to check this, named bpf_is_subprog, and convert existing JIT
implementations to also make use of it.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b9e573159432..9171b0b6a590 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3194,4 +3194,9 @@ static inline gfp_t bpf_memcg_flags(gfp_t flags)
 	return flags;
 }
 
+static inline bool bpf_is_subprog(const struct bpf_prog *prog)
+{
+	return prog->aux->func_idx != 0;
+}
+
 #endif /* _LINUX_BPF_H */
-- 
cgit v1.2.3


From fd5d27b70188379bb441d404c29a0afb111e1753 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:31:59 +0200
Subject: arch/x86: Implement arch_bpf_stack_walk

The plumbing for offline unwinding when we throw an exception in
programs would require walking the stack, hence introduce a new
arch_bpf_stack_walk function. This is provided when the JIT supports
exceptions, i.e. bpf_jit_supports_exceptions is true. The arch-specific
code is really minimal, hence it should be straightforward to extend
this support to other architectures as well, as it reuses the logic of
arch_stack_walk, but allowing access to unwind_state data.

Once the stack pointer and frame pointer are known for the main subprog
during the unwinding, we know the stack layout and location of any
callee-saved registers which must be restored before we return back to
the kernel. This handling will be added in the subsequent patches.

Note that while we primarily unwind through BPF frames, which are
effectively CONFIG_UNWINDER_FRAME_POINTER, we still need one of this or
CONFIG_UNWINDER_ORC to be able to unwind through the bpf_throw frame
from which we begin walking the stack. We also require both sp and bp
(stack and frame pointers) from the unwind_state structure, which are
only available when one of these two options are enabled.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0138832ad571..88874de974cb 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -954,6 +954,8 @@ bool bpf_jit_needs_zext(void);
 bool bpf_jit_supports_subprog_tailcalls(void);
 bool bpf_jit_supports_kfunc_call(void);
 bool bpf_jit_supports_far_kfunc_call(void);
+bool bpf_jit_supports_exceptions(void);
+void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
 
 static inline bool bpf_dump_raw_ok(const struct cred *cred)
-- 
cgit v1.2.3


From 335d1c5b545284d75ef96ee42e461eacefe865bb Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:00 +0200
Subject: bpf: Implement support for adding hidden subprogs

Introduce support in the verifier for generating a subprogram and
include it as part of a BPF program dynamically after the do_check phase
is complete. The first user will be the next patch which generates
default exception callbacks if none are set for the program. The phase
of invocation will be do_misc_fixups. Note that this is an internal
verifier function, and should be used with instruction blocks which
uphold the invariants stated in check_subprogs.

Since these subprogs are always appended to the end of the instruction
sequence of the program, it becomes relatively inexpensive to do the
related adjustments to the subprog_info of the program. Only the fake
exit subprogram is shifted forward, making room for our new subprog.

This is useful to insert a new subprogram, get it JITed, and obtain its
function pointer. The next patch will use this functionality to insert a
default exception callback which will be invoked after unwinding the
stack.

Note that these added subprograms are invisible to userspace, and never
reported in BPF_OBJ_GET_INFO_BY_ID etc. For now, only a single
subprogram is supported, but more can be easily supported in the future.

To this end, two function counts are introduced now, the existing
func_cnt, and real_func_cnt, the latter including hidden programs. This
allows us to conver the JIT code to use the real_func_cnt for management
of resources while syscall path continues working with existing
func_cnt.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          | 1 +
 include/linux/bpf_verifier.h | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9171b0b6a590..c3667e95af59 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1389,6 +1389,7 @@ struct bpf_prog_aux {
 	u32 stack_depth;
 	u32 id;
 	u32 func_cnt; /* used by non-func prog as the number of func progs */
+	u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */
 	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
 	u32 attach_btf_id; /* in-kernel BTF type id to attach to */
 	u32 ctx_arg_info_size;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index a3236651ec64..3c2a8636ab29 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -588,6 +588,7 @@ struct bpf_verifier_env {
 	u32 used_map_cnt;		/* number of used maps */
 	u32 used_btf_cnt;		/* number of used BTF objects */
 	u32 id_gen;			/* used to generate unique reg IDs */
+	u32 hidden_subprog_cnt;		/* number of hidden subprogs */
 	bool explore_alu_limits;
 	bool allow_ptr_leaks;
 	bool allow_uninit_stack;
@@ -598,7 +599,7 @@ struct bpf_verifier_env {
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
-	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
+	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 2]; /* max + 2 for the fake and exception subprogs */
 	union {
 		struct bpf_idmap idmap_scratch;
 		struct bpf_idset idset_scratch;
-- 
cgit v1.2.3


From f18b03fabaa9b7c80e80b72a621f481f0d706ae0 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:01 +0200
Subject: bpf: Implement BPF exceptions

This patch implements BPF exceptions, and introduces a bpf_throw kfunc
to allow programs to throw exceptions during their execution at runtime.
A bpf_throw invocation is treated as an immediate termination of the
program, returning back to its caller within the kernel, unwinding all
stack frames.

This allows the program to simplify its implementation, by testing for
runtime conditions which the verifier has no visibility into, and assert
that they are true. In case they are not, the program can simply throw
an exception from the other branch.

BPF exceptions are explicitly *NOT* an unlikely slowpath error handling
primitive, and this objective has guided design choices of the
implementation of the them within the kernel (with the bulk of the cost
for unwinding the stack offloaded to the bpf_throw kfunc).

The implementation of this mechanism requires use of add_hidden_subprog
mechanism introduced in the previous patch, which generates a couple of
instructions to move R1 to R0 and exit. The JIT then rewrites the
prologue of this subprog to take the stack pointer and frame pointer as
inputs and reset the stack frame, popping all callee-saved registers
saved by the main subprog. The bpf_throw function then walks the stack
at runtime, and invokes this exception subprog with the stack and frame
pointers as parameters.

Reviewers must take note that currently the main program is made to save
all callee-saved registers on x86_64 during entry into the program. This
is because we must do an equivalent of a lightweight context switch when
unwinding the stack, therefore we need the callee-saved registers of the
caller of the BPF program to be able to return with a sane state.

Note that we have to additionally handle r12, even though it is not used
by the program, because when throwing the exception the program makes an
entry into the kernel which could clobber r12 after saving it on the
stack. To be able to preserve the value we received on program entry, we
push r12 and restore it from the generated subprogram when unwinding the
stack.

For now, bpf_throw invocation fails when lingering resources or locks
exist in that path of the program. In a future followup, bpf_throw will
be extended to perform frame-by-frame unwinding to release lingering
resources for each stack frame, removing this limitation.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-5-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          | 3 +++
 include/linux/bpf_verifier.h | 4 ++++
 include/linux/filter.h       | 6 ++++++
 3 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3667e95af59..16740ee82082 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1410,6 +1410,8 @@ struct bpf_prog_aux {
 	bool sleepable;
 	bool tail_call_reachable;
 	bool xdp_has_frags;
+	bool exception_cb;
+	bool exception_boundary;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
 	/* function name for valid attach_btf_id */
@@ -1432,6 +1434,7 @@ struct bpf_prog_aux {
 	int cgroup_atype; /* enum cgroup_bpf_attach_type */
 	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
+	unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp);
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3c2a8636ab29..da21a3ec5027 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -541,7 +541,9 @@ struct bpf_subprog_info {
 	bool has_tail_call;
 	bool tail_call_reachable;
 	bool has_ld_abs;
+	bool is_cb;
 	bool is_async_cb;
+	bool is_exception_cb;
 };
 
 struct bpf_verifier_env;
@@ -589,6 +591,7 @@ struct bpf_verifier_env {
 	u32 used_btf_cnt;		/* number of used BTF objects */
 	u32 id_gen;			/* used to generate unique reg IDs */
 	u32 hidden_subprog_cnt;		/* number of hidden subprogs */
+	int exception_callback_subprog;
 	bool explore_alu_limits;
 	bool allow_ptr_leaks;
 	bool allow_uninit_stack;
@@ -596,6 +599,7 @@ struct bpf_verifier_env {
 	bool bypass_spec_v1;
 	bool bypass_spec_v4;
 	bool seen_direct_write;
+	bool seen_exception;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 88874de974cb..27406aee2d40 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1171,6 +1171,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
 bool is_bpf_text_address(unsigned long addr);
 int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 		    char *sym);
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr);
 
 static inline const char *
 bpf_address_lookup(unsigned long addr, unsigned long *size,
@@ -1238,6 +1239,11 @@ static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
 	return -ERANGE;
 }
 
+static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+{
+	return NULL;
+}
+
 static inline const char *
 bpf_address_lookup(unsigned long addr, unsigned long *size,
 		   unsigned long *off, char **modname, char *sym)
-- 
cgit v1.2.3


From b9ae0c9dd0aca79bffc17be51c2dc148d1f72708 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:03 +0200
Subject: bpf: Add support for custom exception callbacks

By default, the subprog generated by the verifier to handle a thrown
exception hardcodes a return value of 0. To allow user-defined logic
and modification of the return value when an exception is thrown,
introduce the 'exception_callback:' declaration tag, which marks a
callback as the default exception handler for the program.

The format of the declaration tag is 'exception_callback:<value>', where
<value> is the name of the exception callback. Each main program can be
tagged using this BTF declaratiion tag to associate it with an exception
callback. In case the tag is absent, the default callback is used.

As such, the exception callback cannot be modified at runtime, only set
during verification.

Allowing modification of the callback for the current program execution
at runtime leads to issues when the programs begin to nest, as any
per-CPU state maintaing this information will have to be saved and
restored. We don't want it to stay in bpf_prog_aux as this takes a
global effect for all programs. An alternative solution is spilling
the callback pointer at a known location on the program stack on entry,
and then passing this location to bpf_throw as a parameter.

However, since exceptions are geared more towards a use case where they
are ideally never invoked, optimizing for this use case and adding to
the complexity has diminishing returns.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-7-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          | 4 +++-
 include/linux/bpf_verifier.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 16740ee82082..30063a760b5a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2422,9 +2422,11 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *reg);
+			  struct bpf_reg_state *reg, bool is_ex_cb);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+				    int comp_idx, const char *tag_key);
 
 struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index da21a3ec5027..94ec766432f5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -300,6 +300,7 @@ struct bpf_func_state {
 	bool in_callback_fn;
 	struct tnum callback_ret_range;
 	bool in_async_callback_fn;
+	bool in_exception_callback_fn;
 
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
-- 
cgit v1.2.3


From 7ccb84f04cda1dd6f64f352e9795db308e9cdc0c Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:06 +0200
Subject: mm: kasan: Declare kasan_unpoison_task_stack_below in kasan.h

We require access to this kasan helper in BPF code in the next patch
where we have to unpoison the task stack when we unwind and reset the
stack frame from bpf_throw, and it never really unpoisons the poisoned
stack slots on entry when compiler instrumentation is generated by
CONFIG_KASAN_STACK and inline instrumentation is supported.

Also, remove the declaration from mm/kasan/kasan.h as we put it in the
header file kasan.h.

Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Suggested-by: Andrey Konovalov <andreyknvl@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-10-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/kasan.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 819b6bc8ac08..7a463f814db2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -283,8 +283,10 @@ static inline bool kasan_check_byte(const void *address)
 
 #if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
 void kasan_unpoison_task_stack(struct task_struct *task);
+asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
 #else
 static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
+static inline void kasan_unpoison_task_stack_below(const void *watermark) {}
 #endif
 
 #ifdef CONFIG_KASAN_GENERIC
-- 
cgit v1.2.3


From 12e94aee074ce1c5ffdb8f2246a8c4a095b6aa8a Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sebastian.reichel@collabora.com>
Date: Wed, 13 Sep 2023 15:39:00 +0200
Subject: power: supply: core: Don't export power_supply_notifier

power_supply_notifier can be internal, since all users are going
through power_supply_reg_notifier()/power_supply_unreg_notifier().

Link: https://lore.kernel.org/r/20230913133900.591637-1-sebastian.reichel@collabora.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/linux/power_supply.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 85b86768c0b9..c0992a77feea 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -767,7 +767,6 @@ struct power_supply_battery_info {
 	int bti_resistance_tolerance;
 };
 
-extern struct blocking_notifier_head power_supply_notifier;
 extern int power_supply_reg_notifier(struct notifier_block *nb);
 extern void power_supply_unreg_notifier(struct notifier_block *nb);
 #if IS_ENABLED(CONFIG_POWER_SUPPLY)
-- 
cgit v1.2.3


From 9431063ad323ac864750aeba4d304389bc42ca4e Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Date: Wed, 13 Sep 2023 21:49:37 +0100
Subject: dpll: core: Add DPLL framework base functions

DPLL framework is used to represent and configure DPLL devices
in systems. Each device that has DPLL and can configure inputs
and outputs can use this framework.

Implement core framework functions for further interactions
with device drivers implementing dpll subsystem, as well as for
interactions of DPLL netlink framework part with the subsystem
itself.

Co-developed-by: Milena Olech <milena.olech@intel.com>
Signed-off-by: Milena Olech <milena.olech@intel.com>
Co-developed-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Co-developed-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dpll.h | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 include/linux/dpll.h

(limited to 'include/linux')

diff --git a/include/linux/dpll.h b/include/linux/dpll.h
new file mode 100644
index 000000000000..b47c3560b937
--- /dev/null
+++ b/include/linux/dpll.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Copyright (c) 2023 Meta Platforms, Inc. and affiliates
+ *  Copyright (c) 2023 Intel and affiliates
+ */
+
+#ifndef __DPLL_H__
+#define __DPLL_H__
+
+#include <uapi/linux/dpll.h>
+#include <linux/device.h>
+#include <linux/netlink.h>
+
+struct dpll_device;
+struct dpll_pin;
+
+struct dpll_device_ops {
+	int (*mode_get)(const struct dpll_device *dpll, void *dpll_priv,
+			enum dpll_mode *mode, struct netlink_ext_ack *extack);
+	bool (*mode_supported)(const struct dpll_device *dpll, void *dpll_priv,
+			       const enum dpll_mode mode,
+			       struct netlink_ext_ack *extack);
+	int (*lock_status_get)(const struct dpll_device *dpll, void *dpll_priv,
+			       enum dpll_lock_status *status,
+			       struct netlink_ext_ack *extack);
+	int (*temp_get)(const struct dpll_device *dpll, void *dpll_priv,
+			s32 *temp, struct netlink_ext_ack *extack);
+};
+
+struct dpll_pin_ops {
+	int (*frequency_set)(const struct dpll_pin *pin, void *pin_priv,
+			     const struct dpll_device *dpll, void *dpll_priv,
+			     const u64 frequency,
+			     struct netlink_ext_ack *extack);
+	int (*frequency_get)(const struct dpll_pin *pin, void *pin_priv,
+			     const struct dpll_device *dpll, void *dpll_priv,
+			     u64 *frequency, struct netlink_ext_ack *extack);
+	int (*direction_set)(const struct dpll_pin *pin, void *pin_priv,
+			     const struct dpll_device *dpll, void *dpll_priv,
+			     const enum dpll_pin_direction direction,
+			     struct netlink_ext_ack *extack);
+	int (*direction_get)(const struct dpll_pin *pin, void *pin_priv,
+			     const struct dpll_device *dpll, void *dpll_priv,
+			     enum dpll_pin_direction *direction,
+			     struct netlink_ext_ack *extack);
+	int (*state_on_pin_get)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_pin *parent_pin,
+				void *parent_pin_priv,
+				enum dpll_pin_state *state,
+				struct netlink_ext_ack *extack);
+	int (*state_on_dpll_get)(const struct dpll_pin *pin, void *pin_priv,
+				 const struct dpll_device *dpll,
+				 void *dpll_priv, enum dpll_pin_state *state,
+				 struct netlink_ext_ack *extack);
+	int (*state_on_pin_set)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_pin *parent_pin,
+				void *parent_pin_priv,
+				const enum dpll_pin_state state,
+				struct netlink_ext_ack *extack);
+	int (*state_on_dpll_set)(const struct dpll_pin *pin, void *pin_priv,
+				 const struct dpll_device *dpll,
+				 void *dpll_priv,
+				 const enum dpll_pin_state state,
+				 struct netlink_ext_ack *extack);
+	int (*prio_get)(const struct dpll_pin *pin,  void *pin_priv,
+			const struct dpll_device *dpll,  void *dpll_priv,
+			u32 *prio, struct netlink_ext_ack *extack);
+	int (*prio_set)(const struct dpll_pin *pin, void *pin_priv,
+			const struct dpll_device *dpll, void *dpll_priv,
+			const u32 prio, struct netlink_ext_ack *extack);
+};
+
+struct dpll_pin_frequency {
+	u64 min;
+	u64 max;
+};
+
+#define DPLL_PIN_FREQUENCY_RANGE(_min, _max)	\
+	{					\
+		.min = _min,			\
+		.max = _max,			\
+	}
+
+#define DPLL_PIN_FREQUENCY(_val) DPLL_PIN_FREQUENCY_RANGE(_val, _val)
+#define DPLL_PIN_FREQUENCY_1PPS \
+	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_1_HZ)
+#define DPLL_PIN_FREQUENCY_10MHZ \
+	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_10_MHZ)
+#define DPLL_PIN_FREQUENCY_IRIG_B \
+	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_10_KHZ)
+#define DPLL_PIN_FREQUENCY_DCF77 \
+	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_77_5_KHZ)
+
+struct dpll_pin_properties {
+	const char *board_label;
+	const char *panel_label;
+	const char *package_label;
+	enum dpll_pin_type type;
+	unsigned long capabilities;
+	u32 freq_supported_num;
+	struct dpll_pin_frequency *freq_supported;
+};
+
+struct dpll_device *
+dpll_device_get(u64 clock_id, u32 dev_driver_id, struct module *module);
+
+void dpll_device_put(struct dpll_device *dpll);
+
+int dpll_device_register(struct dpll_device *dpll, enum dpll_type type,
+			 const struct dpll_device_ops *ops, void *priv);
+
+void dpll_device_unregister(struct dpll_device *dpll,
+			    const struct dpll_device_ops *ops, void *priv);
+
+struct dpll_pin *
+dpll_pin_get(u64 clock_id, u32 dev_driver_id, struct module *module,
+	     const struct dpll_pin_properties *prop);
+
+int dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
+		      const struct dpll_pin_ops *ops, void *priv);
+
+void dpll_pin_unregister(struct dpll_device *dpll, struct dpll_pin *pin,
+			 const struct dpll_pin_ops *ops, void *priv);
+
+void dpll_pin_put(struct dpll_pin *pin);
+
+int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
+			     const struct dpll_pin_ops *ops, void *priv);
+
+void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin,
+				const struct dpll_pin_ops *ops, void *priv);
+
+#endif
-- 
cgit v1.2.3


From 9d71b54b65b1fb6c0d3a6c5c88ba9b915c783fbc Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Date: Wed, 13 Sep 2023 21:49:38 +0100
Subject: dpll: netlink: Add DPLL framework base functions

DPLL framework is used to represent and configure DPLL devices
in systems. Each device that has DPLL and can configure inputs
and outputs can use this framework.

Implement dpll netlink framework functions for enablement of dpll
subsystem netlink family.

Co-developed-by: Milena Olech <milena.olech@intel.com>
Signed-off-by: Milena Olech <milena.olech@intel.com>
Co-developed-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Co-developed-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dpll.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index b47c3560b937..2202310c10cd 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -130,4 +130,8 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
 void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin,
 				const struct dpll_pin_ops *ops, void *priv);
 
+int dpll_device_change_ntf(struct dpll_device *dpll);
+
+int dpll_pin_change_ntf(struct dpll_pin *pin);
+
 #endif
-- 
cgit v1.2.3


From 5f18426928800c59fb0f9bc8fb0c182bb6f5ee24 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Sep 2023 21:49:39 +0100
Subject: netdev: expose DPLL pin handle for netdevice

In case netdevice represents a SyncE port, the user needs to understand
the connection between netdevice and associated DPLL pin. There might me
multiple netdevices pointing to the same pin, in case of VF/SF
implementation.

Add a IFLA Netlink attribute to nest the DPLL pin handle, similar to
how it is implemented for devlink port. Add a struct dpll_pin pointer
to netdev and protect access to it by RTNL. Expose netdev_dpll_pin_set()
and netdev_dpll_pin_clear() helpers to the drivers so they can set/clear
the DPLL pin relationship to netdev.

Note that during the lifetime of struct dpll_pin the pin handle does not
change. Therefore it is save to access it lockless. It is drivers
responsibility to call netdev_dpll_pin_clear() before dpll_pin_put().

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dpll.h      | 15 +++++++++++++++
 include/linux/netdevice.h | 21 +++++++++++++++++++++
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 2202310c10cd..bbc480cd2932 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -101,6 +101,21 @@ struct dpll_pin_properties {
 	struct dpll_pin_frequency *freq_supported;
 };
 
+#if IS_ENABLED(CONFIG_DPLL)
+size_t dpll_msg_pin_handle_size(struct dpll_pin *pin);
+int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin);
+#else
+static inline size_t dpll_msg_pin_handle_size(struct dpll_pin *pin)
+{
+	return 0;
+}
+
+static inline int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin)
+{
+	return 0;
+}
+#endif
+
 struct dpll_device *
 dpll_device_get(u64 clock_id, u32 dev_driver_id, struct module *module);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0896aaa91dd7..db3d8429d50d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -79,6 +79,8 @@ struct xdp_buff;
 struct xdp_frame;
 struct xdp_metadata_ops;
 struct xdp_md;
+/* DPLL specific */
+struct dpll_pin;
 
 typedef u32 xdp_features_t;
 
@@ -2049,6 +2051,9 @@ enum netdev_ml_priv_type {
  *			SET_NETDEV_DEVLINK_PORT macro. This pointer is static
  *			during the time netdevice is registered.
  *
+ *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
+ *		   where the clock is recovered.
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -2405,6 +2410,10 @@ struct net_device {
 	struct rtnl_hw_stats64	*offload_xstats_l3;
 
 	struct devlink_port	*devlink_port;
+
+#if IS_ENABLED(CONFIG_DPLL)
+	struct dpll_pin		*dpll_pin;
+#endif
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -3940,6 +3949,18 @@ int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
 int dev_get_port_parent_id(struct net_device *dev,
 			   struct netdev_phys_item_id *ppid, bool recurse);
 bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);
+void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin);
+void netdev_dpll_pin_clear(struct net_device *dev);
+
+static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev)
+{
+#if IS_ENABLED(CONFIG_DPLL)
+	return dev->dpll_pin;
+#else
+	return NULL;
+#endif
+}
+
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
-- 
cgit v1.2.3


From 496fd0a26bbf73b6b12407ee4fbe5ff49d659a6d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Sep 2023 21:49:43 +0100
Subject: mlx5: Implement SyncE support using DPLL infrastructure

Implement SyncE support using newly introduced DPLL support.
Make sure that each PFs/VFs/SFs probed with appropriate capability
will spawn a dpll auxiliary device and register appropriate dpll device
and pin instances.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/driver.h   |  2 ++
 include/linux/mlx5/mlx5_ifc.h | 59 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3033bbaeac81..92434814c855 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -155,6 +155,8 @@ enum {
 	MLX5_REG_MCC		 = 0x9062,
 	MLX5_REG_MCDA		 = 0x9063,
 	MLX5_REG_MCAM		 = 0x907f,
+	MLX5_REG_MSECQ		 = 0x9155,
+	MLX5_REG_MSEES		 = 0x9156,
 	MLX5_REG_MIRC		 = 0x9162,
 	MLX5_REG_SBCAM		 = 0xB01F,
 	MLX5_REG_RESOURCE_DUMP   = 0xC000,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index fc3db401f8a2..dd8421d021cf 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10176,7 +10176,9 @@ struct mlx5_ifc_mcam_access_reg_bits2 {
 	u8         mirc[0x1];
 	u8         regs_97_to_96[0x2];
 
-	u8         regs_95_to_64[0x20];
+	u8         regs_95_to_87[0x09];
+	u8         synce_registers[0x2];
+	u8         regs_84_to_64[0x15];
 
 	u8         regs_63_to_32[0x20];
 
@@ -12549,4 +12551,59 @@ struct mlx5_ifc_modify_page_track_obj_in_bits {
 	struct mlx5_ifc_page_track_bits obj_context;
 };
 
+struct mlx5_ifc_msecq_reg_bits {
+	u8         reserved_at_0[0x20];
+
+	u8         reserved_at_20[0x12];
+	u8         network_option[0x2];
+	u8         local_ssm_code[0x4];
+	u8         local_enhanced_ssm_code[0x8];
+
+	u8         local_clock_identity[0x40];
+
+	u8         reserved_at_80[0x180];
+};
+
+enum {
+	MLX5_MSEES_FIELD_SELECT_ENABLE			= BIT(0),
+	MLX5_MSEES_FIELD_SELECT_ADMIN_STATUS		= BIT(1),
+	MLX5_MSEES_FIELD_SELECT_ADMIN_FREQ_MEASURE	= BIT(2),
+};
+
+enum mlx5_msees_admin_status {
+	MLX5_MSEES_ADMIN_STATUS_FREE_RUNNING		= 0x0,
+	MLX5_MSEES_ADMIN_STATUS_TRACK			= 0x1,
+};
+
+enum mlx5_msees_oper_status {
+	MLX5_MSEES_OPER_STATUS_FREE_RUNNING		= 0x0,
+	MLX5_MSEES_OPER_STATUS_SELF_TRACK		= 0x1,
+	MLX5_MSEES_OPER_STATUS_OTHER_TRACK		= 0x2,
+	MLX5_MSEES_OPER_STATUS_HOLDOVER			= 0x3,
+	MLX5_MSEES_OPER_STATUS_FAIL_HOLDOVER		= 0x4,
+	MLX5_MSEES_OPER_STATUS_FAIL_FREE_RUNNING	= 0x5,
+};
+
+struct mlx5_ifc_msees_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         local_port[0x8];
+	u8         pnat[0x2];
+	u8         lp_msb[0x2];
+	u8         reserved_at_14[0xc];
+
+	u8         field_select[0x20];
+
+	u8         admin_status[0x4];
+	u8         oper_status[0x4];
+	u8         ho_acq[0x1];
+	u8         reserved_at_49[0xc];
+	u8         admin_freq_measure[0x1];
+	u8         oper_freq_measure[0x1];
+	u8         failure_reason[0x9];
+
+	u8         frequency_diff[0x20];
+
+	u8         reserved_at_80[0x180];
+};
+
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3


From ac5f395685bd16ca30c1c834dcbf8b555605ccae Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Sep 2023 09:12:40 +0200
Subject: net/mlx5: SF, Implement peer devlink set for SF representor devlink
 port

Benefit from the existence of internal mlx5 notifier and extend it by
event MLX5_DRIVER_EVENT_SF_PEER_DEVLINK. Use this event from SF
auxiliary device probe/remove functions to pass the registered SF
devlink instance to the SF representor.

Process the new event in SF representor code and call
devl_port_fn_devlink_set() to do the assignments. Implement this in work
to avoid possible deadlock when probe/remove function of SF may be
called with devlink instance lock held during devlink reload.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/device.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 4d5be378fa8c..8fbe22de16ef 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -366,6 +366,7 @@ enum mlx5_driver_event {
 	MLX5_DRIVER_EVENT_UPLINK_NETDEV,
 	MLX5_DRIVER_EVENT_MACSEC_SA_ADDED,
 	MLX5_DRIVER_EVENT_MACSEC_SA_DELETED,
+	MLX5_DRIVER_EVENT_SF_PEER_DEVLINK,
 };
 
 enum {
-- 
cgit v1.2.3


From 86328b338c3996b814417dd68e3f899a1a649059 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Tue, 12 Sep 2023 13:59:49 +0530
Subject: vmcore: remove dependency with is_kdump_kernel() for exporting vmcore

Currently, is_kdump_kernel() returns true when elfcorehdr_addr is set.
While elfcorehdr_addr is set for kexec based kernel dump mechanism,
alternate dump capturing methods like fadump [1] also set it to export
the vmcore. Since, is_kdump_kernel() is used to restrict resources in
crash dump capture kernel and such restrictions may not be desirable
for fadump, allow is_kdump_kernel() to be defined differently for such
scenarios. With this, is_kdump_kernel() could be false while vmcore is
usable. So, remove unnecessary dependency with is_kdump_kernel(), for
exporting vmcore.

[1] https://docs.kernel.org/powerpc/firmware-assisted-dump.html

Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Acked-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20230912082950.856977-1-hbathini@linux.ibm.com
---
 include/linux/crash_dump.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 0f3a656293b0..acc55626afdc 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -50,6 +50,7 @@ void vmcore_cleanup(void);
 #define vmcore_elf64_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x))
 #endif
 
+#ifndef is_kdump_kernel
 /*
  * is_kdump_kernel() checks whether this kernel is booting after a panic of
  * previous kernel or not. This is determined by checking if previous kernel
@@ -64,6 +65,7 @@ static inline bool is_kdump_kernel(void)
 {
 	return elfcorehdr_addr != ELFCORE_ADDR_MAX;
 }
+#endif
 
 /* is_vmcore_usable() checks if the kernel is booting after a panic and
  * the vmcore region is usable.
@@ -75,7 +77,8 @@ static inline bool is_kdump_kernel(void)
 
 static inline int is_vmcore_usable(void)
 {
-	return is_kdump_kernel() && elfcorehdr_addr != ELFCORE_ADDR_ERR ? 1 : 0;
+	return elfcorehdr_addr != ELFCORE_ADDR_ERR &&
+		elfcorehdr_addr != ELFCORE_ADDR_MAX ? 1 : 0;
 }
 
 /* vmcore_unusable() marks the vmcore as unusable,
@@ -84,8 +87,7 @@ static inline int is_vmcore_usable(void)
 
 static inline void vmcore_unusable(void)
 {
-	if (is_kdump_kernel())
-		elfcorehdr_addr = ELFCORE_ADDR_ERR;
+	elfcorehdr_addr = ELFCORE_ADDR_ERR;
 }
 
 /**
-- 
cgit v1.2.3


From fbaa6a181a4b1886cbf4214abdf9a2df68471510 Mon Sep 17 00:00:00 2001
From: Elliot Berman <quic_eberman@quicinc.com>
Date: Fri, 8 Sep 2023 15:49:15 -0700
Subject: sched/core: Remove ifdeffery for saved_state

In preparation for freezer to also use saved_state, remove the
CONFIG_PREEMPT_RT compilation guard around saved_state.

On the arm64 platform I tested which did not have CONFIG_PREEMPT_RT,
there was no statistically significant deviation by applying this patch.

Test methodology:

perf bench sched message -g 40 -l 40

Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..dc37ae787e33 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -750,10 +750,8 @@ struct task_struct {
 #endif
 	unsigned int			__state;
 
-#ifdef CONFIG_PREEMPT_RT
 	/* saved state for "spinlock sleepers" */
 	unsigned int			saved_state;
-#endif
 
 	/*
 	 * This begins the randomizable portion of task_struct. Only
-- 
cgit v1.2.3


From f7b5bd725b737de3f2c4a836e07c82ba156d75df Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Thu, 14 Sep 2023 15:31:57 -0700
Subject: pds_core: check health in devcmd wait

Similar to what we do in the AdminQ, check for devcmd health
while waiting for an answer.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Reviewed-by: Brett Creeley <brett.creeley@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pds/pds_core_if.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pds/pds_core_if.h b/include/linux/pds/pds_core_if.h
index e838a2b90440..17a87c1a55d7 100644
--- a/include/linux/pds/pds_core_if.h
+++ b/include/linux/pds/pds_core_if.h
@@ -79,6 +79,7 @@ enum pds_core_status_code {
 	PDS_RC_EVFID	= 31,	/* VF ID does not exist */
 	PDS_RC_BAD_FW	= 32,	/* FW file is invalid or corrupted */
 	PDS_RC_ECLIENT	= 33,   /* No such client id */
+	PDS_RC_BAD_PCI	= 255,  /* Broken PCI when reading status */
 };
 
 /**
-- 
cgit v1.2.3


From b0af4bcb49464c221ad5f95d40f2b1b252ceedcc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 14 Sep 2023 20:43:18 +0206
Subject: serial: core: Provide port lock wrappers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a serial port is used for kernel console output, then all
modifications to the UART registers which are done from other contexts,
e.g. getty, termios, are interference points for the kernel console.

So far this has been ignored and the printk output is based on the
principle of hope. The rework of the console infrastructure which aims to
support threaded and atomic consoles, requires to mark sections which
modify the UART registers as unsafe. This allows the atomic write function
to make informed decisions and eventually to restore operational state. It
also allows to prevent the regular UART code from modifying UART registers
while printk output is in progress.

All modifications of UART registers are guarded by the UART port lock,
which provides an obvious synchronization point with the console
infrastructure.

Provide wrapper functions for spin_[un]lock*(port->lock) invocations so
that the console mechanics can be applied later on at a single place and
does not require to copy the same logic all over the drivers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Link: https://lore.kernel.org/r/20230914183831.587273-2-john.ogness@linutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_core.h | 79 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index bb6f073bc159..f1d5c0d1568c 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -588,6 +588,85 @@ struct uart_port {
 	void			*private_data;		/* generic platform data pointer */
 };
 
+/**
+ * uart_port_lock - Lock the UART port
+ * @up:		Pointer to UART port structure
+ */
+static inline void uart_port_lock(struct uart_port *up)
+{
+	spin_lock(&up->lock);
+}
+
+/**
+ * uart_port_lock_irq - Lock the UART port and disable interrupts
+ * @up:		Pointer to UART port structure
+ */
+static inline void uart_port_lock_irq(struct uart_port *up)
+{
+	spin_lock_irq(&up->lock);
+}
+
+/**
+ * uart_port_lock_irqsave - Lock the UART port, save and disable interrupts
+ * @up:		Pointer to UART port structure
+ * @flags:	Pointer to interrupt flags storage
+ */
+static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags)
+{
+	spin_lock_irqsave(&up->lock, *flags);
+}
+
+/**
+ * uart_port_trylock - Try to lock the UART port
+ * @up:		Pointer to UART port structure
+ *
+ * Returns: True if lock was acquired, false otherwise
+ */
+static inline bool uart_port_trylock(struct uart_port *up)
+{
+	return spin_trylock(&up->lock);
+}
+
+/**
+ * uart_port_trylock_irqsave - Try to lock the UART port, save and disable interrupts
+ * @up:		Pointer to UART port structure
+ * @flags:	Pointer to interrupt flags storage
+ *
+ * Returns: True if lock was acquired, false otherwise
+ */
+static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags)
+{
+	return spin_trylock_irqsave(&up->lock, *flags);
+}
+
+/**
+ * uart_port_unlock - Unlock the UART port
+ * @up:		Pointer to UART port structure
+ */
+static inline void uart_port_unlock(struct uart_port *up)
+{
+	spin_unlock(&up->lock);
+}
+
+/**
+ * uart_port_unlock_irq - Unlock the UART port and re-enable interrupts
+ * @up:		Pointer to UART port structure
+ */
+static inline void uart_port_unlock_irq(struct uart_port *up)
+{
+	spin_unlock_irq(&up->lock);
+}
+
+/**
+ * uart_port_lock_irqrestore - Unlock the UART port, restore interrupts
+ * @up:		Pointer to UART port structure
+ * @flags:	The saved interrupt flags for restore
+ */
+static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags)
+{
+	spin_unlock_irqrestore(&up->lock, flags);
+}
+
 static inline int serial_port_in(struct uart_port *up, int offset)
 {
 	return up->serial_in(up, offset);
-- 
cgit v1.2.3


From c5cbdb76e8e33ce90fec2946e8eee7d71d68e57a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 14 Sep 2023 20:43:19 +0206
Subject: serial: core: Use lock wrappers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a serial port is used for kernel console output, then all
modifications to the UART registers which are done from other contexts,
e.g. getty, termios, are interference points for the kernel console.

So far this has been ignored and the printk output is based on the
principle of hope. The rework of the console infrastructure which aims to
support threaded and atomic consoles, requires to mark sections which
modify the UART registers as unsafe. This allows the atomic write function
to make informed decisions and eventually to restore operational state. It
also allows to prevent the regular UART code from modifying UART registers
while printk output is in progress.

All modifications of UART registers are guarded by the UART port lock,
which provides an obvious synchronization point with the console
infrastructure.

To avoid adding this functionality to all UART drivers, wrap the
spin_[un]lock*() invocations for uart_port::lock into helper functions
which just contain the spin_[un]lock*() invocations for now. In a
subsequent step these helpers will gain the console synchronization
mechanisms.

Converted with coccinelle. No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Link: https://lore.kernel.org/r/20230914183831.587273-3-john.ogness@linutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_core.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index f1d5c0d1568c..3091c62ec37b 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -1035,14 +1035,14 @@ static inline void uart_unlock_and_check_sysrq(struct uart_port *port)
 	u8 sysrq_ch;
 
 	if (!port->has_sysrq) {
-		spin_unlock(&port->lock);
+		uart_port_unlock(port);
 		return;
 	}
 
 	sysrq_ch = port->sysrq_ch;
 	port->sysrq_ch = 0;
 
-	spin_unlock(&port->lock);
+	uart_port_unlock(port);
 
 	if (sysrq_ch)
 		handle_sysrq(sysrq_ch);
@@ -1054,14 +1054,14 @@ static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port
 	u8 sysrq_ch;
 
 	if (!port->has_sysrq) {
-		spin_unlock_irqrestore(&port->lock, flags);
+		uart_port_unlock_irqrestore(port, flags);
 		return;
 	}
 
 	sysrq_ch = port->sysrq_ch;
 	port->sysrq_ch = 0;
 
-	spin_unlock_irqrestore(&port->lock, flags);
+	uart_port_unlock_irqrestore(port, flags);
 
 	if (sysrq_ch)
 		handle_sysrq(sysrq_ch);
@@ -1077,12 +1077,12 @@ static inline int uart_prepare_sysrq_char(struct uart_port *port, u8 ch)
 }
 static inline void uart_unlock_and_check_sysrq(struct uart_port *port)
 {
-	spin_unlock(&port->lock);
+	uart_port_unlock(port);
 }
 static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port,
 		unsigned long flags)
 {
-	spin_unlock_irqrestore(&port->lock, flags);
+	uart_port_unlock_irqrestore(port, flags);
 }
 #endif	/* CONFIG_MAGIC_SYSRQ_SERIAL */
 
-- 
cgit v1.2.3


From 1cb6422ecac8804ebe0b71f4b3440674955fec73 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 15 Sep 2023 13:15:10 -0700
Subject: ceph: Annotate struct ceph_monmap with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ceph_monmap.
Additionally, since the element count member must be set before accessing
the annotated flexible array member, move its initialization earlier.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Xiubo Li <xiubli@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: ceph-devel@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ceph/mon_client.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index b658961156a0..7a9a40163c0f 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -19,7 +19,7 @@ struct ceph_monmap {
 	struct ceph_fsid fsid;
 	u32 epoch;
 	u32 num_mon;
-	struct ceph_entity_inst mon_inst[];
+	struct ceph_entity_inst mon_inst[] __counted_by(num_mon);
 };
 
 struct ceph_mon_client;
-- 
cgit v1.2.3


From d57125b55a292a8e74a1fb17182576a3b2b2e795 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 18 Sep 2023 10:44:08 +0200
Subject: Revert "ceph: make members in struct ceph_mds_request_args_ext a
 union"

This reverts commit 3af5ae22030cb59fab4fba35f5a2b62f47e14df9.

ceph_mds_request_args_ext was already (and remains to be) a union.  An
additional anonymous union inside is bogus:

    union ceph_mds_request_args_ext {
        union {
            union ceph_mds_request_args old;
            struct { ... } __attribute__ ((packed)) setattr_ext;
        };
    }

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
---
 include/linux/ceph/ceph_fs.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 5f2301ee88bc..f3b3593254b9 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -467,19 +467,17 @@ union ceph_mds_request_args {
 } __attribute__ ((packed));
 
 union ceph_mds_request_args_ext {
-	union {
-		union ceph_mds_request_args old;
-		struct {
-			__le32 mode;
-			__le32 uid;
-			__le32 gid;
-			struct ceph_timespec mtime;
-			struct ceph_timespec atime;
-			__le64 size, old_size;       /* old_size needed by truncate */
-			__le32 mask;                 /* CEPH_SETATTR_* */
-			struct ceph_timespec btime;
-		} __attribute__ ((packed)) setattr_ext;
-	};
+	union ceph_mds_request_args old;
+	struct {
+		__le32 mode;
+		__le32 uid;
+		__le32 gid;
+		struct ceph_timespec mtime;
+		struct ceph_timespec atime;
+		__le64 size, old_size;       /* old_size needed by truncate */
+		__le32 mask;                 /* CEPH_SETATTR_* */
+		struct ceph_timespec btime;
+	} __attribute__ ((packed)) setattr_ext;
 };
 
 #define CEPH_MDS_FLAG_REPLAY		1 /* this is a replayed op */
-- 
cgit v1.2.3


From 8452a05b2c633b708dbe3e742f71b24bf21fe42d Mon Sep 17 00:00:00 2001
From: Rohan G Thomas <rohan.g.thomas@intel.com>
Date: Sat, 16 Sep 2023 14:33:12 +0800
Subject: net: stmmac: Tx coe sw fallback

Add sw fallback of tx checksum calculation for those tx queues that
don't support tx checksum offloading. DW xGMAC IP can be synthesized
such that it can support tx checksum offloading only for a few
initial tx queues. Also as Serge pointed out, for the DW QoS IP, tx
coe can be individually configured for each tx queue.

So when tx coe is enabled, for any tx queue that doesn't support
tx coe with 'coe-unsupported' flag set will have a sw fallback
happen in the driver for tx checksum calculation when any packets to
be transmitted on these tx queues.

Signed-off-by: Rohan G Thomas <rohan.g.thomas@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/stmmac.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index ce89cc3e4913..c0079a7574ae 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -139,6 +139,7 @@ struct stmmac_rxq_cfg {
 
 struct stmmac_txq_cfg {
 	u32 weight;
+	bool coe_unsupported;
 	u8 mode_to_use;
 	/* Credit Base Shaper parameters */
 	u32 send_slope;
-- 
cgit v1.2.3


From 6b93bb41f6eaa1cc5c5f30ec3b687b380f116cd0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:00 +0206
Subject: printk: Add non-BKL (nbcon) console basic infrastructure

The current console/printk subsystem is protected by a Big Kernel Lock,
(aka console_lock) which has ill defined semantics and is more or less
stateless. This puts severe limitations on the console subsystem and
makes forced takeover and output in emergency and panic situations a
fragile endeavour that is based on try and pray.

The goal of non-BKL (nbcon) consoles is to break out of the console lock
jail and to provide a new infrastructure that avoids the pitfalls and
also allows console drivers to be gradually converted over.

The proposed infrastructure aims for the following properties:

  - Per console locking instead of global locking
  - Per console state that allows to make informed decisions
  - Stateful handover and takeover

As a first step, state is added to struct console. The per console state
is an atomic_t using a 32bit bit field.

Reserve state bits, which will be populated later in the series. Wire
it up into the console register/unregister functionality.

It was decided to use a bitfield because using a plain u32 with
mask/shift operations resulted in uncomprehensible code.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-2-john.ogness@linutronix.de
---
 include/linux/console.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 7de11c763eb3..a2d37a7a98a8 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -156,6 +156,8 @@ static inline int con_debug_leave(void)
  *			/dev/kmesg which requires a larger output buffer.
  * @CON_SUSPENDED:	Indicates if a console is suspended. If true, the
  *			printing callbacks must not be called.
+ * @CON_NBCON:		Console can operate outside of the legacy style console_lock
+ *			constraints.
  */
 enum cons_flags {
 	CON_PRINTBUFFER		= BIT(0),
@@ -166,8 +168,32 @@ enum cons_flags {
 	CON_BRL			= BIT(5),
 	CON_EXTENDED		= BIT(6),
 	CON_SUSPENDED		= BIT(7),
+	CON_NBCON		= BIT(8),
 };
 
+/**
+ * struct nbcon_state - console state for nbcon consoles
+ * @atom:	Compound of the state fields for atomic operations
+ *
+ * To be used for reading and preparing of the value stored in the nbcon
+ * state variable @console::nbcon_state.
+ */
+struct nbcon_state {
+	union {
+		unsigned int	atom;
+		struct {
+		};
+	};
+};
+
+/*
+ * The nbcon_state struct is used to easily create and interpret values that
+ * are stored in the @console::nbcon_state variable. Ensure this struct stays
+ * within the size boundaries of the atomic variable's underlying type in
+ * order to avoid any accidental truncation.
+ */
+static_assert(sizeof(struct nbcon_state) <= sizeof(int));
+
 /**
  * struct console - The console descriptor structure
  * @name:		The name of the console driver
@@ -187,6 +213,8 @@ enum cons_flags {
  * @dropped:		Number of unreported dropped ringbuffer records
  * @data:		Driver private data
  * @node:		hlist node for the console list
+ *
+ * @nbcon_state:	State for nbcon consoles
  */
 struct console {
 	char			name[16];
@@ -206,6 +234,9 @@ struct console {
 	unsigned long		dropped;
 	void			*data;
 	struct hlist_node	node;
+
+	/* nbcon console specific members */
+	atomic_t		__private nbcon_state;
 };
 
 #ifdef CONFIG_LOCKDEP
-- 
cgit v1.2.3


From 3a5bb25162b880da749f7cdf281c78dbade4164b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:01 +0206
Subject: printk: nbcon: Add acquire/release logic

Add per console acquire/release functionality.

The state of the console is maintained in the "nbcon_state" atomic
variable.

The console is locked when:

  - The 'prio' field contains the priority of the context that owns the
    console. Only higher priority contexts are allowed to take over the
    lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked.

  - The 'cpu' field denotes on which CPU the console is locked. It is used
    to prevent busy waiting on the same CPU. Also it informs the lock owner
    that it has lost the lock in a more complex scenario when the lock was
    taken over by a higher priority context, released, and taken on another
    CPU with the same priority as the interrupted owner.

The acquire mechanism uses a few more fields:

  - The 'req_prio' field is used by the handover approach to make the
    current owner aware that there is a context with a higher priority
    waiting for the friendly handover.

  - The 'unsafe' field allows to take over the console in a safe way in the
    middle of emitting a message. The field is set only when accessing some
    shared resources or when the console device is manipulated. It can be
    cleared, for example, after emitting one character when the console
    device is in a consistent state.

  - The 'unsafe_takeover' field is set when a hostile takeover took the
    console in an unsafe state. The console will stay in the unsafe state
    until re-initialized.

The acquire mechanism uses three approaches:

  1) Direct acquire when the console is not owned or is owned by a lower
     priority context and is in a safe state.

  2) Friendly handover mechanism uses a request/grant handshake. It is used
     when the current owner has lower priority and the console is in an
     unsafe state.

     The requesting context:

       a) Sets its priority into the 'req_prio' field.

       b) Waits (with a timeout) for the owning context to unlock the
          console.

       c) Takes the lock and clears the 'req_prio' field.

     The owning context:

       a) Observes the 'req_prio' field set on exit from the unsafe
          console state.

       b) Gives up console ownership by clearing the 'prio' field.

  3) Unsafe hostile takeover allows to take over the lock even when the
     console is an unsafe state. It is used only in panic() by the final
     attempt to flush consoles in a try and hope mode.

     Note that separate record buffers are used in panic(). As a result,
     the messages can be read and formatted without any risk even after
     using the hostile takeover in unsafe state.

The release function simply clears the 'prio' field.

All operations on @console::nbcon_state are atomic cmpxchg based to
handle concurrency.

The acquire/release functions implement only minimal policies:

  - Preference for higher priority contexts.
  - Protection of the panic CPU.

All other policy decisions must be made at the call sites:

  - What is marked as an unsafe section.
  - Whether to spin-wait if there is already an owner and the console is
    in an unsafe state.
  - Whether to attempt an unsafe hostile takeover.

The design allows to implement the well known:

    acquire()
    output_one_printk_record()
    release()

The output of one printk record might be interrupted with a higher priority
context. The new owner is supposed to reprint the entire interrupted record
from scratch.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-3-john.ogness@linutronix.de
---
 include/linux/console.h | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index a2d37a7a98a8..98210fd01f18 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -175,13 +175,29 @@ enum cons_flags {
  * struct nbcon_state - console state for nbcon consoles
  * @atom:	Compound of the state fields for atomic operations
  *
+ * @req_prio:		The priority of a handover request
+ * @prio:		The priority of the current owner
+ * @unsafe:		Console is busy in a non takeover region
+ * @unsafe_takeover:	A hostile takeover in an unsafe state happened in the
+ *			past. The console cannot be safe until re-initialized.
+ * @cpu:		The CPU on which the owner runs
+ *
  * To be used for reading and preparing of the value stored in the nbcon
  * state variable @console::nbcon_state.
+ *
+ * The @prio and @req_prio fields are particularly important to allow
+ * spin-waiting to timeout and give up without the risk of a waiter being
+ * assigned the lock after giving up.
  */
 struct nbcon_state {
 	union {
 		unsigned int	atom;
 		struct {
+			unsigned int prio		:  2;
+			unsigned int req_prio		:  2;
+			unsigned int unsafe		:  1;
+			unsigned int unsafe_takeover	:  1;
+			unsigned int cpu		: 24;
 		};
 	};
 };
@@ -194,6 +210,46 @@ struct nbcon_state {
  */
 static_assert(sizeof(struct nbcon_state) <= sizeof(int));
 
+/**
+ * nbcon_prio - console owner priority for nbcon consoles
+ * @NBCON_PRIO_NONE:		Unused
+ * @NBCON_PRIO_NORMAL:		Normal (non-emergency) usage
+ * @NBCON_PRIO_EMERGENCY:	Emergency output (WARN/OOPS...)
+ * @NBCON_PRIO_PANIC:		Panic output
+ * @NBCON_PRIO_MAX:		The number of priority levels
+ *
+ * A higher priority context can takeover the console when it is
+ * in the safe state. The final attempt to flush consoles in panic()
+ * can be allowed to do so even in an unsafe state (Hope and pray).
+ */
+enum nbcon_prio {
+	NBCON_PRIO_NONE = 0,
+	NBCON_PRIO_NORMAL,
+	NBCON_PRIO_EMERGENCY,
+	NBCON_PRIO_PANIC,
+	NBCON_PRIO_MAX,
+};
+
+struct console;
+
+/**
+ * struct nbcon_context - Context for console acquire/release
+ * @console:			The associated console
+ * @spinwait_max_us:		Limit for spin-wait acquire
+ * @prio:			Priority of the context
+ * @allow_unsafe_takeover:	Allow performing takeover even if unsafe. Can
+ *				be used only with NBCON_PRIO_PANIC @prio. It
+ *				might cause a system freeze when the console
+ *				is used later.
+ */
+struct nbcon_context {
+	/* members set by caller */
+	struct console		*console;
+	unsigned int		spinwait_max_us;
+	enum nbcon_prio		prio;
+	unsigned int		allow_unsafe_takeover	: 1;
+};
+
 /**
  * struct console - The console descriptor structure
  * @name:		The name of the console driver
-- 
cgit v1.2.3


From 5634c90fd8553de7cbafecd048d0273690a2e84e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:03 +0206
Subject: printk: nbcon: Add buffer management

In case of hostile takeovers it must be ensured that the previous
owner cannot scribble over the output buffer of the emergency/panic
context. This is achieved by:

 - Adding a global output buffer instance for the panic context.
   This is the only situation where hostile takeovers can occur and
   there is always at most 1 panic context.

 - Allocating an output buffer per non-boot console upon console
   registration. This buffer is used by the console owner when not
   in panic context. (For boot consoles, the existing shared global
   legacy output buffer is used instead. Boot console printing will
   be synchronized with legacy console printing.)

 - Choosing the appropriate buffer is handled in the acquire/release
   functions.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-5-john.ogness@linutronix.de
---
 include/linux/console.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 98210fd01f18..ca1ef8700e55 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -231,6 +231,7 @@ enum nbcon_prio {
 };
 
 struct console;
+struct printk_buffers;
 
 /**
  * struct nbcon_context - Context for console acquire/release
@@ -241,6 +242,7 @@ struct console;
  *				be used only with NBCON_PRIO_PANIC @prio. It
  *				might cause a system freeze when the console
  *				is used later.
+ * @pbufs:			Pointer to the text buffer for this context
  */
 struct nbcon_context {
 	/* members set by caller */
@@ -248,6 +250,9 @@ struct nbcon_context {
 	unsigned int		spinwait_max_us;
 	enum nbcon_prio		prio;
 	unsigned int		allow_unsafe_takeover	: 1;
+
+	/* members set by acquire */
+	struct printk_buffers	*pbufs;
 };
 
 /**
@@ -271,6 +276,7 @@ struct nbcon_context {
  * @node:		hlist node for the console list
  *
  * @nbcon_state:	State for nbcon consoles
+ * @pbufs:		Pointer to nbcon private buffer
  */
 struct console {
 	char			name[16];
@@ -293,6 +299,7 @@ struct console {
 
 	/* nbcon console specific members */
 	atomic_t		__private nbcon_state;
+	struct printk_buffers	*pbufs;
 };
 
 #ifdef CONFIG_LOCKDEP
-- 
cgit v1.2.3


From ad56ebd1d79b216dc147474fac89a11daf6b10df Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:05 +0206
Subject: printk: nbcon: Add sequence handling

Add an atomic_long_t field @nbcon_seq to the console struct to
store the sequence number for nbcon consoles. For nbcon consoles
this will be used instead of the non-atomic @seq field. The new
field allows for safe atomic sequence number updates without
requiring any locking.

On 64bit systems the new field stores the full sequence number.
On 32bit systems the new field stores the lower 32 bits of the
sequence number, which are expanded to 64bit as needed by
folding the values based on the sequence numbers available in
the ringbuffer.

For 32bit systems, having a 32bit representation in the console
is sufficient. If a console ever gets more than 2^31 records
behind the ringbuffer then this is the least of the problems.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-7-john.ogness@linutronix.de
---
 include/linux/console.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index ca1ef8700e55..20cd486b76ad 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -243,6 +243,7 @@ struct printk_buffers;
  *				might cause a system freeze when the console
  *				is used later.
  * @pbufs:			Pointer to the text buffer for this context
+ * @seq:			The sequence number to print for this context
  */
 struct nbcon_context {
 	/* members set by caller */
@@ -253,6 +254,7 @@ struct nbcon_context {
 
 	/* members set by acquire */
 	struct printk_buffers	*pbufs;
+	u64			seq;
 };
 
 /**
@@ -276,6 +278,7 @@ struct nbcon_context {
  * @node:		hlist node for the console list
  *
  * @nbcon_state:	State for nbcon consoles
+ * @nbcon_seq:		Sequence number of the next record for nbcon to print
  * @pbufs:		Pointer to nbcon private buffer
  */
 struct console {
@@ -299,6 +302,7 @@ struct console {
 
 	/* nbcon console specific members */
 	atomic_t		__private nbcon_state;
+	atomic_long_t		__private nbcon_seq;
 	struct printk_buffers	*pbufs;
 };
 
-- 
cgit v1.2.3


From 06653d57ff283be627a2c769139d73ecc487810f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:06 +0206
Subject: printk: nbcon: Add emit function and callback function for atomic
 printing

Implement an emit function for nbcon consoles to output printk
messages. It utilizes the lockless printk_get_next_message() and
console_prepend_dropped() functions to retrieve/build the output
message. The emit function includes the required safety points to
check for handover/takeover and calls a new write_atomic callback
of the console driver to output the message. It also includes
proper handling for updating the nbcon console sequence number.

A new nbcon_write_context struct is introduced. This is provided
to the write_atomic callback and includes only the information
necessary for performing atomic writes.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-8-john.ogness@linutronix.de
---
 include/linux/console.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 20cd486b76ad..14563dcb34b1 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -242,6 +242,7 @@ struct printk_buffers;
  *				be used only with NBCON_PRIO_PANIC @prio. It
  *				might cause a system freeze when the console
  *				is used later.
+ * @backlog:			Ringbuffer has pending records
  * @pbufs:			Pointer to the text buffer for this context
  * @seq:			The sequence number to print for this context
  */
@@ -252,11 +253,28 @@ struct nbcon_context {
 	enum nbcon_prio		prio;
 	unsigned int		allow_unsafe_takeover	: 1;
 
+	/* members set by emit */
+	unsigned int		backlog			: 1;
+
 	/* members set by acquire */
 	struct printk_buffers	*pbufs;
 	u64			seq;
 };
 
+/**
+ * struct nbcon_write_context - Context handed to the nbcon write callbacks
+ * @ctxt:		The core console context
+ * @outbuf:		Pointer to the text buffer for output
+ * @len:		Length to write
+ * @unsafe_takeover:	If a hostile takeover in an unsafe state has occurred
+ */
+struct nbcon_write_context {
+	struct nbcon_context	__private ctxt;
+	char			*outbuf;
+	unsigned int		len;
+	bool			unsafe_takeover;
+};
+
 /**
  * struct console - The console descriptor structure
  * @name:		The name of the console driver
@@ -277,6 +295,7 @@ struct nbcon_context {
  * @data:		Driver private data
  * @node:		hlist node for the console list
  *
+ * @write_atomic:	Write callback for atomic context
  * @nbcon_state:	State for nbcon consoles
  * @nbcon_seq:		Sequence number of the next record for nbcon to print
  * @pbufs:		Pointer to nbcon private buffer
@@ -301,6 +320,8 @@ struct console {
 	struct hlist_node	node;
 
 	/* nbcon console specific members */
+	bool			(*write_atomic)(struct console *con,
+						struct nbcon_write_context *wctxt);
 	atomic_t		__private nbcon_state;
 	atomic_long_t		__private nbcon_seq;
 	struct printk_buffers	*pbufs;
-- 
cgit v1.2.3


From 9757acd0a700ba4a0d16dde4ba820eb052aba1a7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:07 +0206
Subject: printk: nbcon: Allow drivers to mark unsafe regions and check state

For the write_atomic callback, the console driver may have unsafe
regions that need to be appropriately marked. Provide functions
that accept the nbcon_write_context struct to allow for the driver
to enter and exit unsafe regions.

Also provide a function for drivers to check if they are still the
owner of the console.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-9-john.ogness@linutronix.de
---
 include/linux/console.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 14563dcb34b1..e4fc6f7c1496 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -451,6 +451,16 @@ static inline bool console_is_registered(const struct console *con)
 	lockdep_assert_console_list_lock_held();			\
 	hlist_for_each_entry(con, &console_list, node)
 
+#ifdef CONFIG_PRINTK
+extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt);
+extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
+extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
+#else
+static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; }
+static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; }
+static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; }
+#endif
+
 extern int console_set_on_cmdline;
 extern struct console *early_console;
 
-- 
cgit v1.2.3


From 492032760127251e5540a5716a70996bacf2a3fd Mon Sep 17 00:00:00 2001
From: Ziyang Xuan <william.xuanziyang@huawei.com>
Date: Mon, 18 Sep 2023 20:30:11 +0800
Subject: team: fix null-ptr-deref when team device type is changed

Get a null-ptr-deref bug as follows with reproducer [1].

BUG: kernel NULL pointer dereference, address: 0000000000000228
...
RIP: 0010:vlan_dev_hard_header+0x35/0x140 [8021q]
...
Call Trace:
 <TASK>
 ? __die+0x24/0x70
 ? page_fault_oops+0x82/0x150
 ? exc_page_fault+0x69/0x150
 ? asm_exc_page_fault+0x26/0x30
 ? vlan_dev_hard_header+0x35/0x140 [8021q]
 ? vlan_dev_hard_header+0x8e/0x140 [8021q]
 neigh_connected_output+0xb2/0x100
 ip6_finish_output2+0x1cb/0x520
 ? nf_hook_slow+0x43/0xc0
 ? ip6_mtu+0x46/0x80
 ip6_finish_output+0x2a/0xb0
 mld_sendpack+0x18f/0x250
 mld_ifc_work+0x39/0x160
 process_one_work+0x1e6/0x3f0
 worker_thread+0x4d/0x2f0
 ? __pfx_worker_thread+0x10/0x10
 kthread+0xe5/0x120
 ? __pfx_kthread+0x10/0x10
 ret_from_fork+0x34/0x50
 ? __pfx_kthread+0x10/0x10
 ret_from_fork_asm+0x1b/0x30

[1]
$ teamd -t team0 -d -c '{"runner": {"name": "loadbalance"}}'
$ ip link add name t-dummy type dummy
$ ip link add link t-dummy name t-dummy.100 type vlan id 100
$ ip link add name t-nlmon type nlmon
$ ip link set t-nlmon master team0
$ ip link set t-nlmon nomaster
$ ip link set t-dummy up
$ ip link set team0 up
$ ip link set t-dummy.100 down
$ ip link set t-dummy.100 master team0

When enslave a vlan device to team device and team device type is changed
from non-ether to ether, header_ops of team device is changed to
vlan_header_ops. That is incorrect and will trigger null-ptr-deref
for vlan->real_dev in vlan_dev_hard_header() because team device is not
a vlan device.

Cache eth_header_ops in team_setup(), then assign cached header_ops to
header_ops of team net device when its type is changed from non-ether
to ether to fix the bug.

Fixes: 1d76efe1577b ("team: add support for non-ethernet devices")
Suggested-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20230918123011.1884401-1-william.xuanziyang@huawei.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/if_team.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index 1b9b15a492fa..cdc684e04a2f 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -189,6 +189,8 @@ struct team {
 	struct net_device *dev; /* associated netdevice */
 	struct team_pcpu_stats __percpu *pcpu_stats;
 
+	const struct header_ops *header_ops_cache;
+
 	struct mutex lock; /* used for overall locking, e.g. port lists write */
 
 	/*
-- 
cgit v1.2.3


From 6a23c555f7eb436d6799533675ffa179db3d5834 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 18 Sep 2023 14:25:36 +0100
Subject: net: phy: fix regression with AX88772A PHY driver

Marek reports that a deadlock occurs with the AX88772A PHY used on the
ASIX USB network driver:

asix 1-1.4:1.0 (unnamed net_device) (uninitialized): PHY [usb-001:003:10] driver [Asix Electronics AX88772A] (irq=POLL)
Asix Electronics AX88772A usb-001:003:10: attached PHY driver(mii_bus:phy_addr=usb-001:003:10, irq=POLL)
asix 1-1.4:1.0 eth0: register 'asix' at usb-12110000.usb-1.4, ASIX AX88772 USB 2.0 Ethernet, a2:99:b6:cd:11:eb
asix 1-1.4:1.0 eth0: configuring for phy/internal link mode

============================================
WARNING: possible recursive locking detected
6.6.0-rc1-00239-g8da77df649c4-dirty #13949 Not tainted
--------------------------------------------
kworker/3:3/71 is trying to acquire lock:
c6c704cc (&dev->lock){+.+.}-{3:3}, at: phy_start_aneg+0x1c/0x38

but task is already holding lock:
c6c704cc (&dev->lock){+.+.}-{3:3}, at: phy_state_machine+0x100/0x2b8

This is because we now consistently call phy_process_state_change()
while holding phydev->lock, but the AX88772A PHY driver then goes on
to call phy_start_aneg() which tries to grab the same lock - causing
deadlock.

Fix this by exporting the unlocked version, and use this in the PHY
driver instead.

Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Fixes: ef113a60d0a9 ("net: phy: call phy_error_precise() while holding the lock")
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/E1qiEFs-007g7b-Lq@rmk-PC.armlinux.org.uk
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 1351b802ffcf..3cc52826f18e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1736,6 +1736,7 @@ void phy_detach(struct phy_device *phydev);
 void phy_start(struct phy_device *phydev);
 void phy_stop(struct phy_device *phydev);
 int phy_config_aneg(struct phy_device *phydev);
+int _phy_start_aneg(struct phy_device *phydev);
 int phy_start_aneg(struct phy_device *phydev);
 int phy_aneg_done(struct phy_device *phydev);
 int phy_speed_down(struct phy_device *phydev, bool sync);
-- 
cgit v1.2.3


From fa17a6d8a5bd0cd7565b613cb804242cd0f6b7ab Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 18 Sep 2023 14:23:21 +0000
Subject: ipv6: lockless IPV6_ADDR_PREFERENCES implementation

We have data-races while reading np->srcprefs

Switch the field to a plain byte, add READ_ONCE()
and WRITE_ONCE() annotations where needed,
and IPV6_ADDR_PREFERENCES setsockopt() can now be lockless.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20230918142321.1794107-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/ipv6.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 09253825c99c..e400ff757f13 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -243,7 +243,7 @@ struct ipv6_pinfo {
 	} rxopt;
 
 	/* sockopt flags */
-	__u8			srcprefs:3;	/* 001: prefer temporary address
+	__u8			srcprefs;	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
-- 
cgit v1.2.3


From bafd764a8baa87e19e6beeaa58eb85fcbbdd6b20 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 18 Sep 2023 12:29:07 +0200
Subject: net: ethernet: mtk_wed: rename mtk_rxbm_desc in mtk_wed_bm_desc

Rename mtk_rxbm_desc structure in mtk_wed_bm_desc since it will be used
even on tx side by MT7988 SoC.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/soc/mediatek/mtk_wed.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index b2b28180dff7..c6512c216b27 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -45,7 +45,7 @@ enum mtk_wed_wo_cmd {
 	MTK_WED_WO_CMD_WED_END
 };
 
-struct mtk_rxbm_desc {
+struct mtk_wed_bm_desc {
 	__le32 buf0;
 	__le32 token;
 } __packed __aligned(4);
@@ -104,7 +104,7 @@ struct mtk_wed_device {
 
 	struct {
 		int size;
-		struct mtk_rxbm_desc *desc;
+		struct mtk_wed_bm_desc *desc;
 		dma_addr_t desc_phys;
 	} rx_buf_ring;
 
-- 
cgit v1.2.3


From ff0ea57fa30e860d3373acd1383e9d9599144b58 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 18 Sep 2023 12:29:08 +0200
Subject: net: ethernet: mtk_wed: introduce mtk_wed_buf structure

Introduce mtk_wed_buf structure to store both virtual and physical
addresses allocated in mtk_wed_tx_buffer_alloc() routine. This is a
preliminary patch to add WED support for MT7988 SoC since it relies on a
different dma descriptor layout not storing page dma addresses.

Co-developed-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/soc/mediatek/mtk_wed.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index c6512c216b27..5f00dc26582b 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -76,6 +76,11 @@ struct mtk_wed_wo_rx_stats {
 	__le32 rx_drop_cnt;
 };
 
+struct mtk_wed_buf {
+	void *p;
+	dma_addr_t phy_addr;
+};
+
 struct mtk_wed_device {
 #ifdef CONFIG_NET_MEDIATEK_SOC_WED
 	const struct mtk_wed_ops *ops;
@@ -97,7 +102,7 @@ struct mtk_wed_device {
 
 	struct {
 		int size;
-		void **pages;
+		struct mtk_wed_buf *pages;
 		struct mtk_wdma_desc *desc;
 		dma_addr_t desc_phys;
 	} tx_buf_ring;
-- 
cgit v1.2.3


From e2f64db13aa1d08e32621067e4fe16bbc114b375 Mon Sep 17 00:00:00 2001
From: Sujuan Chen <sujuan.chen@mediatek.com>
Date: Mon, 18 Sep 2023 12:29:13 +0200
Subject: net: ethernet: mtk_wed: introduce WED support for MT7988

Similar to MT7986 and MT7622, enable Wireless Ethernet Ditpatcher for
MT7988 in order to offload traffic forwarded from LAN/WLAN to WLAN/LAN

Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/soc/mediatek/mtk_wed.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index 5f00dc26582b..5b096f9f1975 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -138,6 +138,8 @@ struct mtk_wed_device {
 		u32 wpdma_rx;
 
 		bool wcid_512;
+		bool hw_rro;
+		bool msi;
 
 		u16 token_start;
 		unsigned int nbuf;
@@ -211,10 +213,12 @@ mtk_wed_device_attach(struct mtk_wed_device *dev)
 	return ret;
 }
 
-static inline bool
-mtk_wed_get_rx_capa(struct mtk_wed_device *dev)
+static inline bool mtk_wed_get_rx_capa(struct mtk_wed_device *dev)
 {
 #ifdef CONFIG_NET_MEDIATEK_SOC_WED
+	if (dev->version == 3)
+		return dev->wlan.hw_rro;
+
 	return dev->version != 1;
 #else
 	return false;
-- 
cgit v1.2.3


From b230812b9dda125e69ab0a5a11cda88d9c0d18a9 Mon Sep 17 00:00:00 2001
From: Sujuan Chen <sujuan.chen@mediatek.com>
Date: Mon, 18 Sep 2023 12:29:15 +0200
Subject: net: ethernet: mtk_wed: introduce partial AMSDU offload support for
 MT7988

Introduce partial AMSDU offload support for MT7988 SoC in order to merge
in hw packets belonging to the same AMSDU before passing them to the
WLAN nic.

Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h            |  1 +
 include/linux/soc/mediatek/mtk_wed.h | 12 ++++++++++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index db3d8429d50d..7e520c14eb8c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -919,6 +919,7 @@ struct net_device_path {
 			u8 queue;
 			u16 wcid;
 			u8 bss;
+			u8 amsdu;
 		} mtk_wdma;
 	};
 };
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index 5b096f9f1975..90d9c9ead3bc 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -128,6 +128,7 @@ struct mtk_wed_device {
 		enum mtk_wed_bus_tye bus_type;
 		void __iomem *base;
 		u32 phy_base;
+		u32 id;
 
 		u32 wpdma_phys;
 		u32 wpdma_int;
@@ -146,10 +147,12 @@ struct mtk_wed_device {
 		unsigned int rx_nbuf;
 		unsigned int rx_npkt;
 		unsigned int rx_size;
+		unsigned int amsdu_max_len;
 
 		u8 tx_tbit[MTK_WED_TX_QUEUES];
 		u8 rx_tbit[MTK_WED_RX_QUEUES];
 		u8 txfree_tbit;
+		u8 amsdu_max_subframes;
 
 		u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id);
 		int (*offload_enable)(struct mtk_wed_device *wed);
@@ -225,6 +228,15 @@ static inline bool mtk_wed_get_rx_capa(struct mtk_wed_device *dev)
 #endif
 }
 
+static inline bool mtk_wed_is_amsdu_supported(struct mtk_wed_device *dev)
+{
+#ifdef CONFIG_NET_MEDIATEK_SOC_WED
+	return dev->version == 3;
+#else
+	return false;
+#endif
+}
+
 #ifdef CONFIG_NET_MEDIATEK_SOC_WED
 #define mtk_wed_device_active(_dev) !!(_dev)->ops
 #define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev)
-- 
cgit v1.2.3


From 6757d345dd7dba795f5af44d4442d55a83c4b1b4 Mon Sep 17 00:00:00 2001
From: Sujuan Chen <sujuan.chen@mediatek.com>
Date: Mon, 18 Sep 2023 12:29:16 +0200
Subject: net: ethernet: mtk_wed: introduce hw_rro support for MT7988

MT7988 SoC support 802.11 receive reordering offload in hw while
MT7986 SoC implements it through the firmware running on the mcu.

Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/soc/mediatek/mtk_wed.h | 45 ++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index 90d9c9ead3bc..a476648858a6 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -10,6 +10,7 @@
 
 #define MTK_WED_TX_QUEUES		2
 #define MTK_WED_RX_QUEUES		2
+#define MTK_WED_RX_PAGE_QUEUES		3
 
 #define WED_WO_STA_REC			0x6
 
@@ -99,6 +100,9 @@ struct mtk_wed_device {
 	struct mtk_wed_ring txfree_ring;
 	struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES];
 	struct mtk_wed_ring rx_wdma[MTK_WED_RX_QUEUES];
+	struct mtk_wed_ring rx_rro_ring[MTK_WED_RX_QUEUES];
+	struct mtk_wed_ring rx_page_ring[MTK_WED_RX_PAGE_QUEUES];
+	struct mtk_wed_ring ind_cmd_ring;
 
 	struct {
 		int size;
@@ -119,6 +123,13 @@ struct mtk_wed_device {
 		dma_addr_t fdbk_phys;
 	} rro;
 
+	struct {
+		int size;
+		struct mtk_wed_buf *pages;
+		struct mtk_wed_bm_desc *desc;
+		dma_addr_t desc_phys;
+	} hw_rro;
+
 	/* filled by driver: */
 	struct {
 		union {
@@ -137,6 +148,8 @@ struct mtk_wed_device {
 		u32 wpdma_txfree;
 		u32 wpdma_rx_glo;
 		u32 wpdma_rx;
+		u32 wpdma_rx_rro[MTK_WED_RX_QUEUES];
+		u32 wpdma_rx_pg;
 
 		bool wcid_512;
 		bool hw_rro;
@@ -151,9 +164,20 @@ struct mtk_wed_device {
 
 		u8 tx_tbit[MTK_WED_TX_QUEUES];
 		u8 rx_tbit[MTK_WED_RX_QUEUES];
+		u8 rro_rx_tbit[MTK_WED_RX_QUEUES];
+		u8 rx_pg_tbit[MTK_WED_RX_PAGE_QUEUES];
 		u8 txfree_tbit;
 		u8 amsdu_max_subframes;
 
+		struct {
+			u8 se_group_nums;
+			u16 win_size;
+			u16 particular_sid;
+			u32 ack_sn_addr;
+			dma_addr_t particular_se_phys;
+			dma_addr_t addr_elem_phys[1024];
+		} ind_cmd;
+
 		u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id);
 		int (*offload_enable)(struct mtk_wed_device *wed);
 		void (*offload_disable)(struct mtk_wed_device *wed);
@@ -192,6 +216,14 @@ struct mtk_wed_ops {
 	void (*irq_set_mask)(struct mtk_wed_device *dev, u32 mask);
 	int (*setup_tc)(struct mtk_wed_device *wed, struct net_device *dev,
 			enum tc_setup_type type, void *type_data);
+	void (*start_hw_rro)(struct mtk_wed_device *dev, u32 irq_mask,
+			     bool reset);
+	void (*rro_rx_ring_setup)(struct mtk_wed_device *dev, int ring,
+				  void __iomem *regs);
+	void (*msdu_pg_rx_ring_setup)(struct mtk_wed_device *dev, int ring,
+				      void __iomem *regs);
+	int (*ind_rx_ring_setup)(struct mtk_wed_device *dev,
+				 void __iomem *regs);
 };
 
 extern const struct mtk_wed_ops __rcu *mtk_soc_wed_ops;
@@ -263,6 +295,15 @@ static inline bool mtk_wed_is_amsdu_supported(struct mtk_wed_device *dev)
 #define mtk_wed_device_dma_reset(_dev) (_dev)->ops->reset_dma(_dev)
 #define mtk_wed_device_setup_tc(_dev, _netdev, _type, _type_data) \
 	(_dev)->ops->setup_tc(_dev, _netdev, _type, _type_data)
+#define mtk_wed_device_start_hw_rro(_dev, _mask, _reset) \
+	(_dev)->ops->start_hw_rro(_dev, _mask, _reset)
+#define mtk_wed_device_rro_rx_ring_setup(_dev, _ring, _regs) \
+	(_dev)->ops->rro_rx_ring_setup(_dev, _ring, _regs)
+#define mtk_wed_device_msdu_pg_rx_ring_setup(_dev, _ring, _regs) \
+	(_dev)->ops->msdu_pg_rx_ring_setup(_dev, _ring, _regs)
+#define mtk_wed_device_ind_rx_ring_setup(_dev, _regs) \
+	(_dev)->ops->ind_rx_ring_setup(_dev, _regs)
+
 #else
 static inline bool mtk_wed_device_active(struct mtk_wed_device *dev)
 {
@@ -282,6 +323,10 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev)
 #define mtk_wed_device_stop(_dev) do {} while (0)
 #define mtk_wed_device_dma_reset(_dev) do {} while (0)
 #define mtk_wed_device_setup_tc(_dev, _netdev, _type, _type_data) -EOPNOTSUPP
+#define mtk_wed_device_start_hw_rro(_dev, _mask, _reset) do {} while (0)
+#define mtk_wed_device_rro_rx_ring_setup(_dev, _ring, _regs) -ENODEV
+#define mtk_wed_device_msdu_pg_rx_ring_setup(_dev, _ring, _regs) -ENODEV
+#define mtk_wed_device_ind_rx_ring_setup(_dev, _regs) -ENODEV
 #endif
 
 #endif
-- 
cgit v1.2.3


From 9ea9cb00a82b53ec39630eac718776d37e41b35a Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 14 Sep 2023 11:21:39 -0400
Subject: mm: memcontrol: fix GFP_NOFS recursion in memory.high enforcement

Breno and Josef report a deadlock scenario from cgroup reclaim
re-entering the filesystem:

[  361.546690] ======================================================
[  361.559210] WARNING: possible circular locking dependency detected
[  361.571703] 6.5.0-0_fbk700_debug_rc0_kbuilder_13159_gbf787a128001 #1 Tainted: G S          E
[  361.589704] ------------------------------------------------------
[  361.602277] find/9315 is trying to acquire lock:
[  361.611625] ffff88837ba140c0 (&delayed_node->mutex){+.+.}-{4:4}, at: __btrfs_release_delayed_node+0x68/0x4f0
[  361.631437]
[  361.631437] but task is already holding lock:
[  361.643243] ffff8881765b8678 (btrfs-tree-01){++++}-{4:4}, at: btrfs_tree_read_lock+0x1e/0x40

[  362.904457]  mutex_lock_nested+0x1c/0x30
[  362.912414]  __btrfs_release_delayed_node+0x68/0x4f0
[  362.922460]  btrfs_evict_inode+0x301/0x770
[  362.982726]  evict+0x17c/0x380
[  362.988944]  prune_icache_sb+0x100/0x1d0
[  363.005559]  super_cache_scan+0x1f8/0x260
[  363.013695]  do_shrink_slab+0x2a2/0x540
[  363.021489]  shrink_slab_memcg+0x237/0x3d0
[  363.050606]  shrink_slab+0xa7/0x240
[  363.083382]  shrink_node_memcgs+0x262/0x3b0
[  363.091870]  shrink_node+0x1a4/0x720
[  363.099150]  shrink_zones+0x1f6/0x5d0
[  363.148798]  do_try_to_free_pages+0x19b/0x5e0
[  363.157633]  try_to_free_mem_cgroup_pages+0x266/0x370
[  363.190575]  reclaim_high+0x16f/0x1f0
[  363.208409]  mem_cgroup_handle_over_high+0x10b/0x270
[  363.246678]  try_charge_memcg+0xaf2/0xc70
[  363.304151]  charge_memcg+0xf0/0x350
[  363.320070]  __mem_cgroup_charge+0x28/0x40
[  363.328371]  __filemap_add_folio+0x870/0xd50
[  363.371303]  filemap_add_folio+0xdd/0x310
[  363.399696]  __filemap_get_folio+0x2fc/0x7d0
[  363.419086]  pagecache_get_page+0xe/0x30
[  363.427048]  alloc_extent_buffer+0x1cd/0x6a0
[  363.435704]  read_tree_block+0x43/0xc0
[  363.443316]  read_block_for_search+0x361/0x510
[  363.466690]  btrfs_search_slot+0xc8c/0x1520

This is caused by the mem_cgroup_handle_over_high() not respecting the
gfp_mask of the allocation context.  We used to only call this function on
resume to userspace, where no locks were held.  But c9afe31ec443 ("memcg:
synchronously enforce memory.high for large overcharges") added a call
from the allocation context without considering the gfp.

Link: https://lkml.kernel.org/r/20230914152139.100822-1-hannes@cmpxchg.org
Fixes: c9afe31ec443 ("memcg: synchronously enforce memory.high for large overcharges")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Breno Leitao <leitao@debian.org>
Reported-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: <stable@vger.kernel.org>	[5.17+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h       | 4 ++--
 include/linux/resume_user_mode.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ab94ad4597d0..e4e24da16d2c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -920,7 +920,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
 	return READ_ONCE(mz->lru_zone_size[zone_idx][lru]);
 }
 
-void mem_cgroup_handle_over_high(void);
+void mem_cgroup_handle_over_high(gfp_t gfp_mask);
 
 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
 
@@ -1458,7 +1458,7 @@ static inline void mem_cgroup_unlock_pages(void)
 	rcu_read_unlock();
 }
 
-static inline void mem_cgroup_handle_over_high(void)
+static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
 {
 }
 
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h
index 285189454449..f8f3e958e9cf 100644
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -55,7 +55,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs)
 	}
 #endif
 
-	mem_cgroup_handle_over_high();
+	mem_cgroup_handle_over_high(GFP_KERNEL);
 	blkcg_maybe_throttle_current();
 
 	rseq_handle_notify_resume(NULL, regs);
-- 
cgit v1.2.3


From b724a6418f1f853bcb39c8923bf14a50c7bdbd07 Mon Sep 17 00:00:00 2001
From: Leon Hwang <hffilwlqm@gmail.com>
Date: Sun, 17 Sep 2023 23:38:46 +0800
Subject: bpf: Fix tr dereferencing

Fix 'tr' dereferencing bug when CONFIG_BPF_JIT is turned off.

When CONFIG_BPF_JIT is turned off, 'bpf_trampoline_get()' returns NULL,
which is same as the cases when CONFIG_BPF_JIT is turned on.

Closes: https://lore.kernel.org/r/202309131936.5Nc8eUD0-lkp@intel.com/
Fixes: f7b12b6fea00 ("bpf: verifier: refactor check_attach_btf_id()")
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Leon Hwang <hffilwlqm@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230917153846.88732-1-hffilwlqm@gmail.com
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 024e8b28c34b..49f8b691496c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1307,7 +1307,7 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
 static inline struct bpf_trampoline *bpf_trampoline_get(u64 key,
 							struct bpf_attach_target_info *tgt_info)
 {
-	return ERR_PTR(-EOPNOTSUPP);
+	return NULL;
 }
 static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
 #define DEFINE_BPF_DISPATCHER(name)
-- 
cgit v1.2.3


From 54e1f99d91405417b3ddb6050cfba82733c3aa41 Mon Sep 17 00:00:00 2001
From: Komal Bajaj <quic_kbajaj@quicinc.com>
Date: Wed, 30 Aug 2023 16:26:51 +0530
Subject: nvmem: core: Add stub for nvmem_cell_read_u8

Add the stub nvmem_cell_read_u8() function for drivers running with
CONFIG_NVMEM disabled.

Signed-off-by: Komal Bajaj <quic_kbajaj@quicinc.com>
Reviewed-by: Mukesh Ojha <quic_mojha@quicinc.com>
Link: https://lore.kernel.org/r/20230830105654.28057-4-quic_kbajaj@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/nvmem-consumer.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h
index 4523e4e83319..6ec4b9743e25 100644
--- a/include/linux/nvmem-consumer.h
+++ b/include/linux/nvmem-consumer.h
@@ -127,6 +127,12 @@ static inline int nvmem_cell_write(struct nvmem_cell *cell,
 	return -EOPNOTSUPP;
 }
 
+static inline int nvmem_cell_read_u8(struct device *dev,
+				     const char *cell_id, u8 *val)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int nvmem_cell_read_u16(struct device *dev,
 				      const char *cell_id, u16 *val)
 {
-- 
cgit v1.2.3


From 0bc76be64e80b15b975345b6957a87a1893c34f2 Mon Sep 17 00:00:00 2001
From: Komal Bajaj <quic_kbajaj@quicinc.com>
Date: Wed, 30 Aug 2023 16:26:53 +0530
Subject: soc: qcom: llcc: Updating the macro name

Update macro name for LLCC_DRE to LLCC_ECC as per the latest specification.

Signed-off-by: Komal Bajaj <quic_kbajaj@quicinc.com>
Reviewed-by: Mukesh Ojha <quic_mojha@quicinc.com>
Acked-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Link: https://lore.kernel.org/r/20230830105654.28057-6-quic_kbajaj@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/soc/qcom/llcc-qcom.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h
index 93417ba1ead4..1a886666bbb6 100644
--- a/include/linux/soc/qcom/llcc-qcom.h
+++ b/include/linux/soc/qcom/llcc-qcom.h
@@ -30,7 +30,7 @@
 #define LLCC_NPU         23
 #define LLCC_WLHW        24
 #define LLCC_PIMEM       25
-#define LLCC_DRE         26
+#define LLCC_ECC         26
 #define LLCC_CVP         28
 #define LLCC_MODPE       29
 #define LLCC_APTCM       30
-- 
cgit v1.2.3


From b64d143b752932ef483d0ed8d00958f1832dd6bc Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 14 Sep 2023 16:28:23 +0800
Subject: crypto: hash - Hide CRYPTO_ALG_TYPE_AHASH_MASK

Move the macro CRYPTO_ALG_TYPE_AHASH_MASK out of linux/crypto.h
and into crypto/ahash.c so that it's not visible to users of the
Crypto API.

Also remove the unused CRYPTO_ALG_TYPE_HASH_MASK macro.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/crypto.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 31f6fee0c36c..a0780deb017a 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -35,8 +35,6 @@
 #define CRYPTO_ALG_TYPE_SHASH		0x0000000e
 #define CRYPTO_ALG_TYPE_AHASH		0x0000000f
 
-#define CRYPTO_ALG_TYPE_HASH_MASK	0x0000000e
-#define CRYPTO_ALG_TYPE_AHASH_MASK	0x0000000e
 #define CRYPTO_ALG_TYPE_ACOMPRESS_MASK	0x0000000e
 
 #define CRYPTO_ALG_LARVAL		0x00000010
-- 
cgit v1.2.3


From 31865c4c4db2b742fec6ccbff80483fa3e7ab9b9 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 14 Sep 2023 16:28:24 +0800
Subject: crypto: skcipher - Add lskcipher

Add a new API type lskcipher designed for taking straight kernel
pointers instead of SG lists.  Its relationship to skcipher will
be analogous to that between shash and ahash.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/crypto.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index a0780deb017a..f3c3a3b27fac 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -24,6 +24,7 @@
 #define CRYPTO_ALG_TYPE_CIPHER		0x00000001
 #define CRYPTO_ALG_TYPE_COMPRESS	0x00000002
 #define CRYPTO_ALG_TYPE_AEAD		0x00000003
+#define CRYPTO_ALG_TYPE_LSKCIPHER	0x00000004
 #define CRYPTO_ALG_TYPE_SKCIPHER	0x00000005
 #define CRYPTO_ALG_TYPE_AKCIPHER	0x00000006
 #define CRYPTO_ALG_TYPE_SIG		0x00000007
-- 
cgit v1.2.3


From 2a86f1b56a30e242caf7ee1268af68f4f49ce847 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Wed, 20 Sep 2023 14:26:29 +0800
Subject: kasan: Cleanup the __HAVE_ARCH_SHADOW_MAP usage

As Linus suggested, __HAVE_ARCH_XYZ is "stupid" and "having historical
uses of it doesn't make it good". So migrate __HAVE_ARCH_SHADOW_MAP to
separate macros named after the respective functions.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 include/linux/kasan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 3df5499f7936..842623d708c2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -54,7 +54,7 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
 int kasan_populate_early_shadow(const void *shadow_start,
 				const void *shadow_end);
 
-#ifndef __HAVE_ARCH_SHADOW_MAP
+#ifndef kasan_mem_to_shadow
 static inline void *kasan_mem_to_shadow(const void *addr)
 {
 	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
-- 
cgit v1.2.3


From 653b7eb9d74426397c95061fd57da3063625af65 Mon Sep 17 00:00:00 2001
From: Erez Shitrit <erezsh@nvidia.com>
Date: Mon, 28 Aug 2023 14:20:00 +0300
Subject: net/mlx5: Bridge, Enable mcast in smfs steering mode

In order to have mcast offloads the driver needs the following:
It should know if that mcast comes from wire port, in addition the flow
should not be marked as any specific source, that way it will give the
flexibility for the driver not to be depended on the way iterator
implemented in the FW.

Signed-off-by: Erez Shitrit <erezsh@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Vlad Buslov <vladbu@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 1e00c2436377..6f7725238abc 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -67,6 +67,7 @@ enum {
 	MLX5_FLOW_TABLE_TERMINATION = BIT(2),
 	MLX5_FLOW_TABLE_UNMANAGED = BIT(3),
 	MLX5_FLOW_TABLE_OTHER_VPORT = BIT(4),
+	MLX5_FLOW_TABLE_UPLINK_VPORT = BIT(5),
 };
 
 #define LEFTOVERS_RULE_NUM	 2
-- 
cgit v1.2.3


From e0cc92fd945a9c8e43d4268cf1ea985d0e99a90f Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Thu, 3 Aug 2023 16:39:40 +0300
Subject: net/mlx5: Add a health error syndrome for pci data poisoned

Add new health error syndrome to indicate that pci data poisoned error
has been received while fetching device ICM data.

Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index dd8421d021cf..b23d8ff286a1 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10574,6 +10574,7 @@ enum {
 	MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_INV                       = 0xe,
 	MLX5_INITIAL_SEG_HEALTH_SYNDROME_FFSER_ERR                    = 0xf,
 	MLX5_INITIAL_SEG_HEALTH_SYNDROME_HIGH_TEMP_ERR                = 0x10,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PCI_POISONED_ERR         = 0x12,
 };
 
 struct mlx5_ifc_initial_seg_bits {
-- 
cgit v1.2.3


From 6b596e62ed9f90c4a97e68ae1f7b1af5beeb3c05 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 8 Sep 2023 18:22:51 +0200
Subject: sched: Provide rt_mutex specific scheduler helpers

With PREEMPT_RT there is a rt_mutex recursion problem where
sched_submit_work() can use an rtlock (aka spinlock_t). More
specifically what happens is:

  mutex_lock() /* really rt_mutex */
    ...
      __rt_mutex_slowlock_locked()
	task_blocks_on_rt_mutex()
          // enqueue current task as waiter
          // do PI chain walk
        rt_mutex_slowlock_block()
          schedule()
            sched_submit_work()
              ...
              spin_lock() /* really rtlock */
                ...
                  __rt_mutex_slowlock_locked()
                    task_blocks_on_rt_mutex()
                      // enqueue current task as waiter *AGAIN*
                      // *CONFUSION*

Fix this by making rt_mutex do the sched_submit_work() early, before
it enqueues itself as a waiter -- before it even knows *if* it will
wait.

[[ basically Thomas' patch but with different naming and a few asserts
   added ]]

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230908162254.999499-5-bigeasy@linutronix.de
---
 include/linux/sched.h    | 3 +++
 include/linux/sched/rt.h | 4 ++++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..67623ffd4a8e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -911,6 +911,9 @@ struct task_struct {
 	 * ->sched_remote_wakeup gets used, so it can be in this word.
 	 */
 	unsigned			sched_remote_wakeup:1;
+#ifdef CONFIG_RT_MUTEXES
+	unsigned			sched_rt_mutex:1;
+#endif
 
 	/* Bit to tell LSMs we're in execve(): */
 	unsigned			in_execve:1;
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 994c25640e15..b2b9e6eb9683 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -30,6 +30,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
 }
 
 #ifdef CONFIG_RT_MUTEXES
+extern void rt_mutex_pre_schedule(void);
+extern void rt_mutex_schedule(void);
+extern void rt_mutex_post_schedule(void);
+
 /*
  * Must hold either p->pi_lock or task_rq(p)->lock.
  */
-- 
cgit v1.2.3


From 6d2779ecaeb56f92d7105c56772346c71c88c278 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 19 Sep 2023 18:14:29 +0100
Subject: locking/atomic: scripts: fix fallback ifdeffery

Since commit:

  9257959a6e5b4fca ("locking/atomic: scripts: restructure fallback ifdeffery")

The ordering fallbacks for atomic*_read_acquire() and
atomic*_set_release() erroneously fall back to the implictly relaxed
atomic*_read() and atomic*_set() variants respectively, without any
additional barriers. This loses the ACQUIRE and RELEASE ordering
semantics, which can result in a wide variety of problems, even on
strongly-ordered architectures where the implementation of
atomic*_read() and/or atomic*_set() allows the compiler to reorder those
relative to other accesses.

In practice this has been observed to break bit spinlocks on arm64,
resulting in dentry cache corruption.

The fallback logic was intended to allow ACQUIRE/RELEASE/RELAXED ops to
be defined in terms of FULL ops, but where an op had RELAXED ordering by
default, this unintentionally permitted the ACQUIRE/RELEASE ops to be
defined in terms of the implicitly RELAXED default.

This patch corrects the logic to avoid falling back to implicitly
RELAXED ops, resulting in the same behaviour as prior to commit
9257959a6e5b4fca.

I've verified the resulting assembly on arm64 by generating outlined
wrappers of the atomics. Prior to this patch the compiler generates
sequences using relaxed load (LDR) and store (STR) instructions, e.g.

| <outlined_atomic64_read_acquire>:
|         ldr     x0, [x0]
|         ret
|
| <outlined_atomic64_set_release>:
|         str     x1, [x0]
|         ret

With this patch applied the compiler generates sequences using the
intended load-acquire (LDAR) and store-release (STLR) instructions, e.g.

| <outlined_atomic64_read_acquire>:
|         ldar    x0, [x0]
|         ret
|
| <outlined_atomic64_set_release>:
|         stlr    x1, [x0]
|         ret

To make sure that there were no other victims of the ifdeffery rewrite,
I generated outlined copies of all of the {atomic,atomic64,atomic_long}
atomic operations before and after commit 9257959a6e5b4fca. A diff of
the generated assembly on arm64 shows that only the read_acquire() and
set_release() operations were changed, and only lost their intended
ordering:

| [mark@lakrids:~/src/linux]% diff -u \
| 	<(aarch64-linux-gnu-objdump -d before-9257959a6e5b4fca.o)
| 	<(aarch64-linux-gnu-objdump -d after-9257959a6e5b4fca.o)
| --- /proc/self/fd/11    2023-09-19 16:51:51.114779415 +0100
| +++ /proc/self/fd/16    2023-09-19 16:51:51.114779415 +0100
| @@ -1,5 +1,5 @@
|
| -before-9257959a6e5b4fca.o:     file format elf64-littleaarch64
| +after-9257959a6e5b4fca.o:     file format elf64-littleaarch64
|
|
|  Disassembly of section .text:
| @@ -9,7 +9,7 @@
|         4:      d65f03c0        ret
|
|  0000000000000008 <outlined_atomic_read_acquire>:
| -       8:      88dffc00        ldar    w0, [x0]
| +       8:      b9400000        ldr     w0, [x0]
|         c:      d65f03c0        ret
|
|  0000000000000010 <outlined_atomic_set>:
| @@ -17,7 +17,7 @@
|        14:      d65f03c0        ret
|
|  0000000000000018 <outlined_atomic_set_release>:
| -      18:      889ffc01        stlr    w1, [x0]
| +      18:      b9000001        str     w1, [x0]
|        1c:      d65f03c0        ret
|
|  0000000000000020 <outlined_atomic_add>:
| @@ -1230,7 +1230,7 @@
|      1070:      d65f03c0        ret
|
|  0000000000001074 <outlined_atomic64_read_acquire>:
| -    1074:      c8dffc00        ldar    x0, [x0]
| +    1074:      f9400000        ldr     x0, [x0]
|      1078:      d65f03c0        ret
|
|  000000000000107c <outlined_atomic64_set>:
| @@ -1238,7 +1238,7 @@
|      1080:      d65f03c0        ret
|
|  0000000000001084 <outlined_atomic64_set_release>:
| -    1084:      c89ffc01        stlr    x1, [x0]
| +    1084:      f9000001        str     x1, [x0]
|      1088:      d65f03c0        ret
|
|  000000000000108c <outlined_atomic64_add>:
| @@ -2427,7 +2427,7 @@
|      207c:      d65f03c0        ret
|
|  0000000000002080 <outlined_atomic_long_read_acquire>:
| -    2080:      c8dffc00        ldar    x0, [x0]
| +    2080:      f9400000        ldr     x0, [x0]
|      2084:      d65f03c0        ret
|
|  0000000000002088 <outlined_atomic_long_set>:
| @@ -2435,7 +2435,7 @@
|      208c:      d65f03c0        ret
|
|  0000000000002090 <outlined_atomic_long_set_release>:
| -    2090:      c89ffc01        stlr    x1, [x0]
| +    2090:      f9000001        str     x1, [x0]
|      2094:      d65f03c0        ret
|
|  0000000000002098 <outlined_atomic_long_add>:

I've build tested this with a variety of configs for alpha, arm, arm64,
csky, i386, m68k, microblaze, mips, nios2, openrisc, powerpc, riscv,
s390, sh, sparc, x86_64, and xtensa, for which I've seen no issues. I
was unable to build test for ia64 and parisc due to existing build
breakage in v6.6-rc2.

Fixes: 9257959a6e5b4fca ("locking/atomic: scripts: restructure fallback ifdeffery")
Reported-by: Ming Lei <ming.lei@redhat.com>
Reported-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Baokun Li <libaokun1@huawei.com>
Link: https://lkml.kernel.org/r/20230919171430.2697727-1-mark.rutland@arm.com
---
 include/linux/atomic/atomic-arch-fallback.h | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h
index 18f5744dfb5d..b83ef19da13d 100644
--- a/include/linux/atomic/atomic-arch-fallback.h
+++ b/include/linux/atomic/atomic-arch-fallback.h
@@ -459,8 +459,6 @@ raw_atomic_read_acquire(const atomic_t *v)
 {
 #if defined(arch_atomic_read_acquire)
 	return arch_atomic_read_acquire(v);
-#elif defined(arch_atomic_read)
-	return arch_atomic_read(v);
 #else
 	int ret;
 
@@ -508,8 +506,6 @@ raw_atomic_set_release(atomic_t *v, int i)
 {
 #if defined(arch_atomic_set_release)
 	arch_atomic_set_release(v, i);
-#elif defined(arch_atomic_set)
-	arch_atomic_set(v, i);
 #else
 	if (__native_word(atomic_t)) {
 		smp_store_release(&(v)->counter, i);
@@ -2575,8 +2571,6 @@ raw_atomic64_read_acquire(const atomic64_t *v)
 {
 #if defined(arch_atomic64_read_acquire)
 	return arch_atomic64_read_acquire(v);
-#elif defined(arch_atomic64_read)
-	return arch_atomic64_read(v);
 #else
 	s64 ret;
 
@@ -2624,8 +2618,6 @@ raw_atomic64_set_release(atomic64_t *v, s64 i)
 {
 #if defined(arch_atomic64_set_release)
 	arch_atomic64_set_release(v, i);
-#elif defined(arch_atomic64_set)
-	arch_atomic64_set(v, i);
 #else
 	if (__native_word(atomic64_t)) {
 		smp_store_release(&(v)->counter, i);
@@ -4657,4 +4649,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v)
 }
 
 #endif /* _LINUX_ATOMIC_FALLBACK_H */
-// 202b45c7db600ce36198eb1f1fc2c2d5268ace2d
+// 2fdd6702823fa842f9cea57a002e6e4476ae780c
-- 
cgit v1.2.3


From 03a95cf233b5bdd65ddd4eca63cd4874b6b84d50 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Sat, 26 Aug 2023 13:53:03 +0100
Subject: firmware: arm_scmi: Simplify enable/disable clock operations

SCMI clock enable/disable operations come in 2 different flavours which
simply just differ in how the underlying SCMI transactions is carried on:
atomic or not.

Currently we expose such SCMI operations through 2 distinctly named
wrappers, that, in turn, are wrapped into another couple of similarly and
distinctly named callbacks inside SCMI clock driver user.

Reduce the churn of duplicated wrappers by adding a param to SCMI clock
enable/disable operations to ask for atomic operation while removing the
_atomic version of such operations.

No functional change.

CC: Michael Turquette <mturquette@baylibre.com>
CC: Stephen Boyd <sboyd@kernel.org>
CC: linux-clk@vger.kernel.org
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Link: https://lore.kernel.org/r/20230826125308.462328-2-cristian.marussi@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/scmi_protocol.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index e6fe4f73ffe6..b4c631a8d0ac 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -90,11 +90,10 @@ struct scmi_clk_proto_ops {
 			u64 *rate);
 	int (*rate_set)(const struct scmi_protocol_handle *ph, u32 clk_id,
 			u64 rate);
-	int (*enable)(const struct scmi_protocol_handle *ph, u32 clk_id);
-	int (*disable)(const struct scmi_protocol_handle *ph, u32 clk_id);
-	int (*enable_atomic)(const struct scmi_protocol_handle *ph, u32 clk_id);
-	int (*disable_atomic)(const struct scmi_protocol_handle *ph,
-			      u32 clk_id);
+	int (*enable)(const struct scmi_protocol_handle *ph, u32 clk_id,
+		      bool atomic);
+	int (*disable)(const struct scmi_protocol_handle *ph, u32 clk_id,
+		       bool atomic);
 };
 
 /**
-- 
cgit v1.2.3


From 34592bf0a5cb0681ce3d64de5598951768f43328 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Sat, 26 Aug 2023 13:53:05 +0100
Subject: firmware: arm_scmi: Add v3.2 clock CONFIG_GET support

Add support for v3.2 clock CONFIG_GET command and related new clock
protocol operation state_get() to retrieve the status of a clock.

Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20230826125308.462328-4-cristian.marussi@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/scmi_protocol.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index b4c631a8d0ac..d11ca4286d57 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -80,6 +80,7 @@ struct scmi_protocol_handle;
  * @rate_set: set the clock rate of a clock
  * @enable: enables the specified clock
  * @disable: disables the specified clock
+ * @state_get: get the status of the specified clock
  */
 struct scmi_clk_proto_ops {
 	int (*count_get)(const struct scmi_protocol_handle *ph);
@@ -94,6 +95,8 @@ struct scmi_clk_proto_ops {
 		      bool atomic);
 	int (*disable)(const struct scmi_protocol_handle *ph, u32 clk_id,
 		       bool atomic);
+	int (*state_get)(const struct scmi_protocol_handle *ph, u32 clk_id,
+			 bool *enabled, bool atomic);
 };
 
 /**
-- 
cgit v1.2.3


From 141b4fa0362569138653cf0165d92d48576db3fa Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Sat, 26 Aug 2023 13:53:08 +0100
Subject: firmware: arm_scmi: Add clock OEM config clock operations

Expose a couple of new SCMI clock operations to get and set OEM specific
clock configurations when talking to an SCMI v3.2 compliant.

Issuing such requests against an SCMI platform server not supporting v3.2
extension for OEM specific clock configurations will fail.

Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20230826125308.462328-7-cristian.marussi@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/scmi_protocol.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index d11ca4286d57..e09ac428fa1b 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -81,6 +81,8 @@ struct scmi_protocol_handle;
  * @enable: enables the specified clock
  * @disable: disables the specified clock
  * @state_get: get the status of the specified clock
+ * @config_oem_get: get the value of an OEM specific clock config
+ * @config_oem_set: set the value of an OEM specific clock config
  */
 struct scmi_clk_proto_ops {
 	int (*count_get)(const struct scmi_protocol_handle *ph);
@@ -97,6 +99,11 @@ struct scmi_clk_proto_ops {
 		       bool atomic);
 	int (*state_get)(const struct scmi_protocol_handle *ph, u32 clk_id,
 			 bool *enabled, bool atomic);
+	int (*config_oem_get)(const struct scmi_protocol_handle *ph, u32 clk_id,
+			      u8 oem_type, u32 *oem_val, u32 *attributes,
+			      bool atomic);
+	int (*config_oem_set)(const struct scmi_protocol_handle *ph, u32 clk_id,
+			      u8 oem_type, u32 oem_val, bool atomic);
 };
 
 /**
-- 
cgit v1.2.3


From 647aa768281f38cb1002edb3a1f673c3d66a8d81 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 20 Sep 2023 16:40:13 +0200
Subject: Revert "fs: add infrastructure for multigrain timestamps"

This reverts commit ffb6cf19e06334062744b7e3493f71e500964f8e.

Users reported regressions due to enabling multi-grained timestamps
unconditionally. As no clear consensus on a solution has come up and the
discussion has gone back to the drawing board revert the infrastructure
changes for. If it isn't code that's here to stay, make it go away.

Message-ID: <20230920-keine-eile-c9755b5825db@brauner>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 46 ++--------------------------------------------
 1 file changed, 2 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4aeb3fa11927..b528f063e8ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1508,47 +1508,18 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
 	       kgid_has_mapping(fs_userns, kgid);
 }
 
-struct timespec64 current_mgtime(struct inode *inode);
 struct timespec64 current_time(struct inode *inode);
 struct timespec64 inode_set_ctime_current(struct inode *inode);
 
-/*
- * Multigrain timestamps
- *
- * Conditionally use fine-grained ctime and mtime timestamps when there
- * are users actively observing them via getattr. The primary use-case
- * for this is NFS clients that use the ctime to distinguish between
- * different states of the file, and that are often fooled by multiple
- * operations that occur in the same coarse-grained timer tick.
- *
- * The kernel always keeps normalized struct timespec64 values in the ctime,
- * which means that only the first 30 bits of the value are used. Use the
- * 31st bit of the ctime's tv_nsec field as a flag to indicate that the value
- * has been queried since it was last updated.
- */
-#define I_CTIME_QUERIED		(1L<<30)
-
 /**
  * inode_get_ctime - fetch the current ctime from the inode
  * @inode: inode from which to fetch ctime
  *
- * Grab the current ctime tv_nsec field from the inode, mask off the
- * I_CTIME_QUERIED flag and return it. This is mostly intended for use by
- * internal consumers of the ctime that aren't concerned with ensuring a
- * fine-grained update on the next change (e.g. when preparing to store
- * the value in the backing store for later retrieval).
- *
- * This is safe to call regardless of whether the underlying filesystem
- * is using multigrain timestamps.
+ * Grab the current ctime from the inode and return it.
  */
 static inline struct timespec64 inode_get_ctime(const struct inode *inode)
 {
-	struct timespec64 ctime;
-
-	ctime.tv_sec = inode->__i_ctime.tv_sec;
-	ctime.tv_nsec = inode->__i_ctime.tv_nsec & ~I_CTIME_QUERIED;
-
-	return ctime;
+	return inode->__i_ctime;
 }
 
 /**
@@ -2334,7 +2305,6 @@ struct file_system_type {
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_DISALLOW_NOTIFY_PERM	16	/* Disable fanotify permission events */
 #define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
-#define FS_MGTIME		64	/* FS uses multigrain timestamps */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
 	const struct fs_parameter_spec *parameters;
@@ -2358,17 +2328,6 @@ struct file_system_type {
 
 #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
 
-/**
- * is_mgtime: is this inode using multigrain timestamps
- * @inode: inode to test for multigrain timestamps
- *
- * Return true if the inode uses multigrain timestamps, false otherwise.
- */
-static inline bool is_mgtime(const struct inode *inode)
-{
-	return inode->i_sb->s_type->fs_flags & FS_MGTIME;
-}
-
 extern struct dentry *mount_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
 	int (*fill_super)(struct super_block *, void *, int));
@@ -3054,7 +3013,6 @@ extern void page_put_link(void *);
 extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
 extern void kfree_link(void *);
-void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode);
 void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
 void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
 extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
-- 
cgit v1.2.3


From 41b43b6c6e30a832c790b010a06772e793bca193 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 20 Sep 2023 12:46:27 +0200
Subject: locking/seqlock: Do the lockdep annotation before locking in
 do_write_seqcount_begin_nested()

It was brought up by Tetsuo that the following sequence:

   write_seqlock_irqsave()
   printk_deferred_enter()

could lead to a deadlock if the lockdep annotation within
write_seqlock_irqsave() triggers.

The problem is that the sequence counter is incremented before the lockdep
annotation is performed. The lockdep splat would then attempt to invoke
printk() but the reader side, of the same seqcount, could have a
tty_port::lock acquired waiting for the sequence number to become even again.

The other lockdep annotations come before the actual locking because "we
want to see the locking error before it happens". There is no reason why
seqcount should be different here.

Do the lockdep annotation first then perform the locking operation (the
sequence increment).

Fixes: 1ca7d67cf5d5a ("seqcount: Add lockdep functionality to seqcount/seqlock structures")
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230920104627._DTHgPyA@linutronix.de

Closes: https://lore.kernel.org/20230621130641.-5iueY1I@linutronix.de
---
 include/linux/seqlock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 987a59d977c5..e9bd2f65d7f4 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -512,8 +512,8 @@ do {									\
 
 static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass)
 {
-	do_raw_write_seqcount_begin(s);
 	seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
+	do_raw_write_seqcount_begin(s);
 }
 
 /**
-- 
cgit v1.2.3


From 87c3a5893e865739ce78aa7192d36011022e0af7 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@linux-m68k.org>
Date: Fri, 15 Sep 2023 15:47:11 +1000
Subject: sched/core: Optimize in_task() and in_interrupt() a bit

Except on x86, preempt_count is always accessed with READ_ONCE().
Repeated invocations in macros like irq_count() produce repeated loads.
These redundant instructions appear in various fast paths. In the one
shown below, for example, irq_count() is evaluated during kernel entry
if !tick_nohz_full_cpu(smp_processor_id()).

0001ed0a <irq_enter_rcu>:
   1ed0a:       4e56 0000       linkw %fp,#0
   1ed0e:       200f            movel %sp,%d0
   1ed10:       0280 ffff e000  andil #-8192,%d0
   1ed16:       2040            moveal %d0,%a0
   1ed18:       2028 0008       movel %a0@(8),%d0
   1ed1c:       0680 0001 0000  addil #65536,%d0
   1ed22:       2140 0008       movel %d0,%a0@(8)
   1ed26:       082a 0001 000f  btst #1,%a2@(15)
   1ed2c:       670c            beqs 1ed3a <irq_enter_rcu+0x30>
   1ed2e:       2028 0008       movel %a0@(8),%d0
   1ed32:       2028 0008       movel %a0@(8),%d0
   1ed36:       2028 0008       movel %a0@(8),%d0
   1ed3a:       4e5e            unlk %fp
   1ed3c:       4e75            rts

This patch doesn't prevent the pointless btst and beqs instructions
above, but it does eliminate 2 of the 3 pointless move instructions
here and elsewhere.

On x86, preempt_count is per-cpu data and the problem does not arise
presumably because the compiler is free to optimize more effectively.

This patch was tested on m68k and x86. I was expecting no changes
to object code for x86 and mostly that's what I saw. However, there
were a few places where code generation was perturbed for some reason.

The performance issue addressed here is minor on uniprocessor m68k. I
got a 0.01% improvement from this patch for a simple "find /sys -false"
benchmark. For architectures and workloads susceptible to cache line bounce
the improvement is expected to be larger. The only SMP architecture I have
is x86, and as x86 unaffected I have not done any further measurements.

Fixes: 15115830c887 ("preempt: Cleanup the macro maze a bit")
Signed-off-by: Finn Thain <fthain@linux-m68k.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/0a403120a682a525e6db2d81d1a3ffcc137c3742.1694756831.git.fthain@linux-m68k.org
---
 include/linux/preempt.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 1424670df161..9aa6358a1a16 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -99,14 +99,21 @@ static __always_inline unsigned char interrupt_context_level(void)
 	return level;
 }
 
+/*
+ * These macro definitions avoid redundant invocations of preempt_count()
+ * because such invocations would result in redundant loads given that
+ * preempt_count() is commonly implemented with READ_ONCE().
+ */
+
 #define nmi_count()	(preempt_count() & NMI_MASK)
 #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
 #ifdef CONFIG_PREEMPT_RT
 # define softirq_count()	(current->softirq_disable_cnt & SOFTIRQ_MASK)
+# define irq_count()		((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
 #else
 # define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
+# define irq_count()		(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
 #endif
-#define irq_count()	(nmi_count() | hardirq_count() | softirq_count())
 
 /*
  * Macros to retrieve the current execution context:
@@ -119,7 +126,11 @@ static __always_inline unsigned char interrupt_context_level(void)
 #define in_nmi()		(nmi_count())
 #define in_hardirq()		(hardirq_count())
 #define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
-#define in_task()		(!(in_nmi() | in_hardirq() | in_serving_softirq()))
+#ifdef CONFIG_PREEMPT_RT
+# define in_task()		(!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq()))
+#else
+# define in_task()		(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+#endif
 
 /*
  * The following macros are deprecated and should not be used in new code:
-- 
cgit v1.2.3


From 3ba78da711940ce07c39c4cdd1f4ad284067a42d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 6 Jun 2021 13:27:15 +0200
Subject: sched/headers: Add header guard to <linux/sched/deadline.h>

It's the only non-trivial header in include/linux/sched/ missing a header guard.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/deadline.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 7c83d4d5a971..df3aca89d4f5 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_DEADLINE_H
+#define _LINUX_SCHED_DEADLINE_H
 
 /*
  * SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -34,3 +36,5 @@ extern void dl_add_task_root_domain(struct task_struct *p);
 extern void dl_clear_root_domain(struct root_domain *rd);
 
 #endif /* CONFIG_SMP */
+
+#endif /* _LINUX_SCHED_DEADLINE_H */
-- 
cgit v1.2.3


From 6eddb116dd830436afbd922568292867de6c8b9e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 21 Sep 2023 11:24:17 +0200
Subject: sched/headers: Standardize the <linux/sched/vhost_task.h> header
 guard name

Use the same _LINUX_SCHED_ prefix nomenclature as the other 29 header guards
in include/linux/sched/ do.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/vhost_task.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h
index 837a23624a66..bc60243d43b3 100644
--- a/include/linux/sched/vhost_task.h
+++ b/include/linux/sched/vhost_task.h
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_VHOST_TASK_H
-#define _LINUX_VHOST_TASK_H
-
+#ifndef _LINUX_SCHED_VHOST_TASK_H
+#define _LINUX_SCHED_VHOST_TASK_H
 
 struct vhost_task;
 
@@ -11,4 +10,4 @@ void vhost_task_start(struct vhost_task *vtsk);
 void vhost_task_stop(struct vhost_task *vtsk);
 void vhost_task_wake(struct vhost_task *vtsk);
 
-#endif
+#endif /* _LINUX_SCHED_VHOST_TASK_H */
-- 
cgit v1.2.3


From 0f9a1a4d234c064d8dff69cf3f3755554dd479ec Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 21 Sep 2023 11:27:37 +0200
Subject: sched/headers: Standardize the <linux/sched/type.h> header guard
 #endif

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h
index 3c3e049224ae..969aaf5ef9d6 100644
--- a/include/linux/sched/types.h
+++ b/include/linux/sched/types.h
@@ -20,4 +20,4 @@ struct task_cputime {
 	unsigned long long		sum_exec_runtime;
 };
 
-#endif
+#endif /* _LINUX_SCHED_TYPES_H */
-- 
cgit v1.2.3


From 1632d47fae2f2d229dd432854c4443ebb0bb27a4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 21 Sep 2023 11:28:48 +0200
Subject: sched/headers: Standardize the <linux/sched/smt.h> header guard
 #endif

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/smt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h
index 59d3736c454c..fb1e295e7e63 100644
--- a/include/linux/sched/smt.h
+++ b/include/linux/sched/smt.h
@@ -17,4 +17,4 @@ static inline bool sched_smt_active(void) { return false; }
 
 void arch_smt_update(void);
 
-#endif
+#endif /* _LINUX_SCHED_SMT_H */
-- 
cgit v1.2.3


From 0df7cd3c13e44d01f9f28e29cbce74e2931b00fe Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Sat, 16 Sep 2023 16:09:15 +0300
Subject: vsock/virtio/vhost: read data from non-linear skb

This is preparation patch for MSG_ZEROCOPY support. It adds handling of
non-linear skbs by replacing direct calls of 'memcpy_to_msg()' with
'skb_copy_datagram_iter()'. Main advantage of the second one is that it
can handle paged part of the skb by using 'kmap()' on each page, but if
there are no pages in the skb, it behaves like simple copying to iov
iterator. This patch also adds new field to the control block of skb -
this value shows current offset in the skb to read next portion of data
(it doesn't matter linear it or not). Idea behind this field is that
'skb_copy_datagram_iter()' handles both types of skb internally - it
just needs an offset from which to copy data from the given skb. This
offset is incremented on each read from skb. This approach allows to
simplify handling of both linear and non-linear skbs, because for
linear skb we need to call 'skb_pull()' after reading data from it,
while in non-linear case we need to update 'data_len'.

Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/virtio_vsock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c58453699ee9..a91fbdf233e4 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -12,6 +12,7 @@
 struct virtio_vsock_skb_cb {
 	bool reply;
 	bool tap_delivered;
+	u32 offset;
 };
 
 #define VIRTIO_VSOCK_SKB_CB(skb) ((struct virtio_vsock_skb_cb *)((skb)->cb))
-- 
cgit v1.2.3


From 581512a6dc939ef122e49336626ae159f3b8a345 Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Sat, 16 Sep 2023 16:09:18 +0300
Subject: vsock/virtio: MSG_ZEROCOPY flag support

This adds handling of MSG_ZEROCOPY flag on transmission path:

1) If this flag is set and zerocopy transmission is possible (enabled
   in socket options and transport allows zerocopy), then non-linear
   skb will be created and filled with the pages of user's buffer.
   Pages of user's buffer are locked in memory by 'get_user_pages()'.
2) Replaces way of skb owning: instead of 'skb_set_owner_sk_safe()' it
   calls 'skb_set_owner_w()'. Reason of this change is that
   '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc' of socket, so
   to decrease this field correctly, proper skb destructor is needed:
   'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'.
3) Adds new callback to 'struct virtio_transport': 'can_msgzerocopy'.
   If this callback is set, then transport needs extra check to be able
   to send provided number of buffers in zerocopy mode. Currently, the
   only transport that needs this callback set is virtio, because this
   transport adds new buffers to the virtio queue and we need to check,
   that number of these buffers is less than size of the queue (it is
   required by virtio spec). vhost and loopback transports don't need
   this check.

Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/virtio_vsock.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index a91fbdf233e4..ebb3ce63d64d 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -160,6 +160,15 @@ struct virtio_transport {
 
 	/* Takes ownership of the packet */
 	int (*send_pkt)(struct sk_buff *skb);
+
+	/* Used in MSG_ZEROCOPY mode. Checks, that provided data
+	 * (number of buffers) could be transmitted with zerocopy
+	 * mode. If this callback is not implemented for the current
+	 * transport - this means that this transport doesn't need
+	 * extra checks and can perform zerocopy transmission by
+	 * default.
+	 */
+	bool (*can_msgzerocopy)(int bufs_num);
 };
 
 ssize_t
-- 
cgit v1.2.3


From e9090e70e618cd62ab7bf2914511e5eea31a2535 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 25 Aug 2023 13:26:21 +0200
Subject: firmware: arm_scmi: Extend perf protocol ops to get number of domains

Similar to other protocol ops, it's useful for an scmi module driver to get
the number of supported performance domains, hence let's make this
available by adding a new perf protocol callback. Note that, a user is
being added from subsequent changes.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20230825112633.236607-2-ulf.hansson@linaro.org
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/scmi_protocol.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index e09ac428fa1b..14791dd54706 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -110,6 +110,7 @@ struct scmi_clk_proto_ops {
  * struct scmi_perf_proto_ops - represents the various operations provided
  *	by SCMI Performance Protocol
  *
+ * @num_domains_get: gets the number of supported performance domains
  * @limits_set: sets limits on the performance level of a domain
  * @limits_get: gets limits on the performance level of a domain
  * @level_set: sets the performance level of a domain
@@ -129,6 +130,7 @@ struct scmi_clk_proto_ops {
  *	or in some other (abstract) scale
  */
 struct scmi_perf_proto_ops {
+	int (*num_domains_get)(const struct scmi_protocol_handle *ph);
 	int (*limits_set)(const struct scmi_protocol_handle *ph, u32 domain,
 			  u32 max_perf, u32 min_perf);
 	int (*limits_get)(const struct scmi_protocol_handle *ph, u32 domain,
-- 
cgit v1.2.3


From 3d99ed60721bf2e108c8fc660775766057689a92 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 25 Aug 2023 13:26:22 +0200
Subject: firmware: arm_scmi: Extend perf protocol ops to get information of a
 domain

Similar to other protocol ops, it's useful for an scmi module driver to get
some generic information of a performance domain. Therefore, let's add a
new callback to provide this information. The information is currently
limited to the name of the performance domain and whether the set-level
operation is supported, although this can easily be extended if we find the
need for it.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20230825112633.236607-3-ulf.hansson@linaro.org
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/scmi_protocol.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 14791dd54706..9108fe8ed3e3 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -106,11 +106,17 @@ struct scmi_clk_proto_ops {
 			      u8 oem_type, u32 oem_val, bool atomic);
 };
 
+struct scmi_perf_domain_info {
+	char name[SCMI_MAX_STR_SIZE];
+	bool set_perf;
+};
+
 /**
  * struct scmi_perf_proto_ops - represents the various operations provided
  *	by SCMI Performance Protocol
  *
  * @num_domains_get: gets the number of supported performance domains
+ * @info_get: get the information of a performance domain
  * @limits_set: sets limits on the performance level of a domain
  * @limits_get: gets limits on the performance level of a domain
  * @level_set: sets the performance level of a domain
@@ -131,6 +137,8 @@ struct scmi_clk_proto_ops {
  */
 struct scmi_perf_proto_ops {
 	int (*num_domains_get)(const struct scmi_protocol_handle *ph);
+	const struct scmi_perf_domain_info __must_check *(*info_get)
+		(const struct scmi_protocol_handle *ph, u32 domain);
 	int (*limits_set)(const struct scmi_protocol_handle *ph, u32 domain,
 			  u32 max_perf, u32 min_perf);
 	int (*limits_get)(const struct scmi_protocol_handle *ph, u32 domain,
-- 
cgit v1.2.3


From 39dfa5b9e1f0fa63b811a0a87f1c2fb9c76a0456 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 25 Aug 2023 13:26:24 +0200
Subject: firmware: arm_scmi: Align perf ops to use domain-id as in-parameter

Most scmi_perf_proto_ops are already using an "u32 domain" as an
in-parameter to indicate what performance domain we shall operate upon.
However, some of the ops are using a "struct device *dev", which means that
an additional OF parsing is needed each time the perf ops gets called, to
find the corresponding domain-id.

To avoid the above, but also to make the code more consistent, let's
replace the in-parameter "struct device *dev" with an "u32 domain". Note
that, this requires us to make some corresponding changes to the scmi
cpufreq driver, so let's do that too.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lore.kernel.org/r/20230825112633.236607-5-ulf.hansson@linaro.org
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/scmi_protocol.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 9108fe8ed3e3..4e63766adc6f 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -149,9 +149,9 @@ struct scmi_perf_proto_ops {
 			 u32 *level, bool poll);
 	int (*device_domain_id)(struct device *dev);
 	int (*transition_latency_get)(const struct scmi_protocol_handle *ph,
-				      struct device *dev);
+				      u32 domain);
 	int (*device_opps_add)(const struct scmi_protocol_handle *ph,
-			       struct device *dev);
+			       struct device *dev, u32 domain);
 	int (*freq_set)(const struct scmi_protocol_handle *ph, u32 domain,
 			unsigned long rate, bool poll);
 	int (*freq_get)(const struct scmi_protocol_handle *ph, u32 domain,
@@ -159,7 +159,7 @@ struct scmi_perf_proto_ops {
 	int (*est_power_get)(const struct scmi_protocol_handle *ph, u32 domain,
 			     unsigned long *rate, unsigned long *power);
 	bool (*fast_switch_possible)(const struct scmi_protocol_handle *ph,
-				     struct device *dev);
+				     u32 domain);
 	enum scmi_power_scale (*power_scale_get)(const struct scmi_protocol_handle *ph);
 };
 
-- 
cgit v1.2.3


From 9b578d83629e13f81a53d1695a4f700cdb10f772 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 25 Aug 2023 13:26:25 +0200
Subject: firmware: arm_scmi: Drop redundant ->device_domain_id() from perf ops

There are no longer any users of the ->device_domain_id() ops in the
scmi_perf_proto_ops, therefore let's remove it.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20230825112633.236607-6-ulf.hansson@linaro.org
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/scmi_protocol.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 4e63766adc6f..27bfa5a65b45 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -121,7 +121,6 @@ struct scmi_perf_domain_info {
  * @limits_get: gets limits on the performance level of a domain
  * @level_set: sets the performance level of a domain
  * @level_get: gets the performance level of a domain
- * @device_domain_id: gets the scmi domain id for a given device
  * @transition_latency_get: gets the DVFS transition latency for a given device
  * @device_opps_add: adds all the OPPs for a given device
  * @freq_set: sets the frequency for a given device using sustained frequency
@@ -147,7 +146,6 @@ struct scmi_perf_proto_ops {
 			 u32 level, bool poll);
 	int (*level_get)(const struct scmi_protocol_handle *ph, u32 domain,
 			 u32 *level, bool poll);
-	int (*device_domain_id)(struct device *dev);
 	int (*transition_latency_get)(const struct scmi_protocol_handle *ph,
 				      u32 domain);
 	int (*device_opps_add)(const struct scmi_protocol_handle *ph,
-- 
cgit v1.2.3


From 3dd91515ef43dd43e32e2a84e4bd881b64fb33ae Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 25 Aug 2023 13:26:32 +0200
Subject: PM: domains: Allow genpd providers to manage OPP tables directly by
 its FW

In some cases the OPP tables aren't specified in device tree, but rather
encoded in the FW. To allow a genpd provider to specify them dynamically
instead, let's add a new genpd flag, GENPD_FLAG_OPP_TABLE_FW.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20230825112633.236607-13-ulf.hansson@linaro.org
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/pm_domain.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index f776fb93eaa0..05ad8cefdff1 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -61,6 +61,10 @@
  * GENPD_FLAG_MIN_RESIDENCY:	Enable the genpd governor to consider its
  *				components' next wakeup when determining the
  *				optimal idle state.
+ *
+ * GENPD_FLAG_OPP_TABLE_FW:	The genpd provider supports performance states,
+ *				but its corresponding OPP tables are not
+ *				described in DT, but are given directly by FW.
  */
 #define GENPD_FLAG_PM_CLK	 (1U << 0)
 #define GENPD_FLAG_IRQ_SAFE	 (1U << 1)
@@ -69,6 +73,7 @@
 #define GENPD_FLAG_CPU_DOMAIN	 (1U << 4)
 #define GENPD_FLAG_RPM_ALWAYS_ON (1U << 5)
 #define GENPD_FLAG_MIN_RESIDENCY (1U << 6)
+#define GENPD_FLAG_OPP_TABLE_FW	 (1U << 7)
 
 enum gpd_status {
 	GENPD_STATE_ON = 0,	/* PM domain is on */
-- 
cgit v1.2.3


From 9f6c532f59b20580acf8ede9409c9b8dce6e74e1 Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:10 +0200
Subject: futex: Add sys_futex_wake()

To complement sys_futex_waitv() add sys_futex_wake(). This syscall
implements what was previously known as FUTEX_WAKE_BITSET except it
uses 'unsigned long' for the bitmask and takes FUTEX2 flags.

The 'unsigned long' allows FUTEX2_SIZE_U64 on 64bit platforms.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20230921105247.936205525@noisy.programming.kicks-ass.net
---
 include/linux/syscalls.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 22bc6bc147f8..e174ed86da1d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -549,6 +549,9 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
 				unsigned int nr_futexes, unsigned int flags,
 				struct __kernel_timespec __user *timeout, clockid_t clockid);
+
+asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long mask, int nr, unsigned int flags);
+
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
 asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp,
-- 
cgit v1.2.3


From cb8c4312afca1b2dc64107e7e7cea81911055612 Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:12 +0200
Subject: futex: Add sys_futex_wait()

To complement sys_futex_waitv()/wake(), add sys_futex_wait(). This
syscall implements what was previously known as FUTEX_WAIT_BITSET
except it uses 'unsigned long' for the value and bitmask arguments,
takes timespec and clockid_t arguments for the absolute timeout and
uses FUTEX2 flags.

The 'unsigned long' allows FUTEX2_SIZE_U64 on 64bit platforms.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20230921105248.164324363@noisy.programming.kicks-ass.net
---
 include/linux/syscalls.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e174ed86da1d..11f3fdd1ee03 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -552,6 +552,10 @@ asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
 
 asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long mask, int nr, unsigned int flags);
 
+asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, unsigned long mask,
+			       unsigned int flags, struct __kernel_timespec __user *timespec,
+			       clockid_t clockid);
+
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
 asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp,
-- 
cgit v1.2.3


From 0f4b5f972216782a4acb1ae00dcb55173847c2ff Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:15 +0200
Subject: futex: Add sys_futex_requeue()

Finish off the 'simple' futex2 syscall group by adding
sys_futex_requeue(). Unlike sys_futex_{wait,wake}() its arguments are
too numerous to fit into a regular syscall. As such, use struct
futex_waitv to pass the 'source' and 'destination' futexes to the
syscall.

This syscall implements what was previously known as FUTEX_CMP_REQUEUE
and uses {val, uaddr, flags} for source and {uaddr, flags} for
destination.

This design explicitly allows requeueing between different types of
futex by having a different flags word per uaddr.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20230921105248.511860556@noisy.programming.kicks-ass.net
---
 include/linux/syscalls.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 11f3fdd1ee03..0901af60d971 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -556,6 +556,9 @@ asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, unsigned l
 			       unsigned int flags, struct __kernel_timespec __user *timespec,
 			       clockid_t clockid);
 
+asmlinkage long sys_futex_requeue(struct futex_waitv __user *waiters,
+				  unsigned int flags, int nr_wake, int nr_requeue);
+
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
 asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp,
-- 
cgit v1.2.3


From f31ecf671ddc498f20219453395794ff2383e06b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 10 Jul 2023 16:14:37 -0600
Subject: io_uring: add IORING_OP_WAITID support

This adds support for an async version of waitid(2), in a fully async
version. If an event isn't immediately available, wait for a callback
to trigger a retry.

The format of the sqe is as follows:

sqe->len		The 'which', the idtype being queried/waited for.
sqe->fd			The 'pid' (or id) being waited for.
sqe->file_index		The 'options' being set.
sqe->addr2		A pointer to siginfo_t, if any, being filled in.

buf_index, add3, and waitid_flags are reserved/unused for now.
waitid_flags will be used for options for this request type. One
interesting use case may be to add multi-shot support, so that the
request stays armed and posts a notification every time a monitored
process state change occurs.

Note that this does not support rusage, on Arnd's recommendation.

See the waitid(2) man page for details on the arguments.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 13d19b9be9f4..fe1c5d4ec56c 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -313,6 +313,8 @@ struct io_ring_ctx {
 	struct list_head	cq_overflow_list;
 	struct io_hash_table	cancel_table;
 
+	struct hlist_head	waitid_list;
+
 	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
 	struct io_sq_data	*sq_data;	/* if using sq thread polling */
 
-- 
cgit v1.2.3


From 41e845628511878d6e89e2a9249c095e72aab7eb Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Sat, 9 Sep 2023 21:19:32 +0200
Subject: cred: add get_cred_many and put_cred_many

Some of the frequent consumers of get_cred and put_cred operate on 2
references on the same creds back-to-back.

Switch them to doing the work in one go instead.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
[PM: removed changelog from commit description]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/cred.h | 59 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 50 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index f923528d5cc4..56bc432fe49b 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -218,6 +218,20 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
 					  cred->cap_inheritable));
 }
 
+/**
+ * get_new_cred_many - Get references on a new set of credentials
+ * @cred: The new credentials to reference
+ * @nr: Number of references to acquire
+ *
+ * Get references on the specified set of new credentials.  The caller must
+ * release all acquired references.
+ */
+static inline struct cred *get_new_cred_many(struct cred *cred, int nr)
+{
+	atomic_add(nr, &cred->usage);
+	return cred;
+}
+
 /**
  * get_new_cred - Get a reference on a new set of credentials
  * @cred: The new credentials to reference
@@ -227,16 +241,16 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
  */
 static inline struct cred *get_new_cred(struct cred *cred)
 {
-	atomic_inc(&cred->usage);
-	return cred;
+	return get_new_cred_many(cred, 1);
 }
 
 /**
- * get_cred - Get a reference on a set of credentials
+ * get_cred_many - Get references on a set of credentials
  * @cred: The credentials to reference
+ * @nr: Number of references to acquire
  *
- * Get a reference on the specified set of credentials.  The caller must
- * release the reference.  If %NULL is passed, it is returned with no action.
+ * Get references on the specified set of credentials.  The caller must release
+ * all acquired reference.  If %NULL is passed, it is returned with no action.
  *
  * This is used to deal with a committed set of credentials.  Although the
  * pointer is const, this will temporarily discard the const and increment the
@@ -244,14 +258,28 @@ static inline struct cred *get_new_cred(struct cred *cred)
  * accidental alteration of a set of credentials that should be considered
  * immutable.
  */
-static inline const struct cred *get_cred(const struct cred *cred)
+static inline const struct cred *get_cred_many(const struct cred *cred, int nr)
 {
 	struct cred *nonconst_cred = (struct cred *) cred;
 	if (!cred)
 		return cred;
 	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
-	return get_new_cred(nonconst_cred);
+	return get_new_cred_many(nonconst_cred, nr);
+}
+
+/*
+ * get_cred - Get a reference on a set of credentials
+ * @cred: The credentials to reference
+ *
+ * Get a reference on the specified set of credentials.  The caller must
+ * release the reference.  If %NULL is passed, it is returned with no action.
+ *
+ * This is used to deal with a committed set of credentials.
+ */
+static inline const struct cred *get_cred(const struct cred *cred)
+{
+	return get_cred_many(cred, 1);
 }
 
 static inline const struct cred *get_cred_rcu(const struct cred *cred)
@@ -269,6 +297,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
 /**
  * put_cred - Release a reference to a set of credentials
  * @cred: The credentials to release
+ * @nr: Number of references to release
  *
  * Release a reference to a set of credentials, deleting them when the last ref
  * is released.  If %NULL is passed, nothing is done.
@@ -277,17 +306,29 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
  * on task_struct are attached by const pointers to prevent accidental
  * alteration of otherwise immutable credential sets.
  */
-static inline void put_cred(const struct cred *_cred)
+static inline void put_cred_many(const struct cred *_cred, int nr)
 {
 	struct cred *cred = (struct cred *) _cred;
 
 	if (cred) {
 		validate_creds(cred);
-		if (atomic_dec_and_test(&(cred)->usage))
+		if (atomic_sub_and_test(nr, &cred->usage))
 			__put_cred(cred);
 	}
 }
 
+/*
+ * put_cred - Release a reference to a set of credentials
+ * @cred: The credentials to release
+ *
+ * Release a reference to a set of credentials, deleting them when the last ref
+ * is released.  If %NULL is passed, nothing is done.
+ */
+static inline void put_cred(const struct cred *cred)
+{
+	put_cred_many(cred, 1);
+}
+
 /**
  * current_cred - Access the current task's subjective credentials
  *
-- 
cgit v1.2.3


From 8bf0cdfac7f8aa3fa6151b5c5f5eebdb44a64e89 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 21 Sep 2023 11:32:58 +0200
Subject: <linux/list.h>: Introduce the list_for_each_reverse() method

The list_head counterpart of list_for_each_entry_reverse() was missing,
add it to complete the list handling APIs in <linux/list.h>.

[ This new API is also relied on by a WIP scheduler patch, so this
  variant is not a theoretical possibility only. ]

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
---
 include/linux/list.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 164b4d0e9d2a..1837caedf723 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -686,6 +686,14 @@ static inline void list_splice_tail_init(struct list_head *list,
 #define list_for_each(pos, head) \
 	for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next)
 
+/**
+ * list_for_each_reverse - iterate backwards over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each_reverse(pos, head) \
+	for (pos = (head)->prev; pos != (head); pos = pos->prev)
+
 /**
  * list_for_each_rcu - Iterate over a list in an RCU-safe fashion
  * @pos:	the &struct list_head to use as a loop cursor.
-- 
cgit v1.2.3


From b8ec60e1186cdcfce41e7db4c827cb107e459002 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Tue, 19 Sep 2023 17:17:28 -0700
Subject: x86/speculation, objtool: Use absolute relocations for annotations

.discard.retpoline_safe sections do not have the SHF_ALLOC flag.  These
sections referencing text sections' STT_SECTION symbols with PC-relative
relocations like R_386_PC32 [0] is conceptually not suitable.  Newer
LLD will report warnings for REL relocations even for relocatable links [1]:

    ld.lld: warning: vmlinux.a(drivers/i2c/busses/i2c-i801.o):(.discard.retpoline_safe+0x120): has non-ABS relocation R_386_PC32 against symbol ''

Switch to absolute relocations instead, which indicate link-time
addresses.  In a relocatable link, these addresses are also output
section offsets, used by checks in tools/objtool/check.c.  When linking
vmlinux, these .discard.* sections will be discarded, therefore it is
not a problem that R_X86_64_32 cannot represent a kernel address.

Alternatively, we could set the SHF_ALLOC flag for .discard.* sections,
but I think non-SHF_ALLOC for sections to be discarded makes more sense.

Note: if we decide to never support REL architectures (e.g. arm, i386),
we can utilize R_*_NONE relocations (.reloc ., BFD_RELOC_NONE, sym),
making .discard.* sections zero-sized.  That said, the section content
waste is 4 bytes per entry, much smaller than sizeof(Elf{32,64}_Rel).

  [0] commit 1c0c1faf5692 ("objtool: Use relative pointers for annotations")
  [1] https://github.com/ClangBuiltLinux/linux/issues/1937

Signed-off-by: Fangrui Song <maskray@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20230920001728.1439947-1-maskray@google.com
---
 include/linux/objtool.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 03f82c2c2ebf..6f6da95fe7f9 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -48,13 +48,13 @@
 #define ANNOTATE_NOENDBR					\
 	"986: \n\t"						\
 	".pushsection .discard.noendbr\n\t"			\
-	".long 986b - .\n\t"					\
+	".long 986b\n\t"					\
 	".popsection\n\t"
 
 #define ASM_REACHABLE							\
 	"998:\n\t"							\
 	".pushsection .discard.reachable\n\t"				\
-	".long 998b - .\n\t"						\
+	".long 998b\n\t"						\
 	".popsection\n\t"
 
 #else /* __ASSEMBLY__ */
@@ -66,7 +66,7 @@
 #define ANNOTATE_INTRA_FUNCTION_CALL				\
 	999:							\
 	.pushsection .discard.intra_function_calls;		\
-	.long 999b - .;						\
+	.long 999b;						\
 	.popsection;
 
 /*
@@ -118,7 +118,7 @@
 .macro ANNOTATE_NOENDBR
 .Lhere_\@:
 	.pushsection .discard.noendbr
-	.long	.Lhere_\@ - .
+	.long	.Lhere_\@
 	.popsection
 .endm
 
@@ -141,7 +141,7 @@
 .macro REACHABLE
 .Lhere_\@:
 	.pushsection .discard.reachable
-	.long	.Lhere_\@ - .
+	.long	.Lhere_\@
 	.popsection
 .endm
 
-- 
cgit v1.2.3


From f1a9be986cedbef839062428e8e9b2bcc9c58af5 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 15 Sep 2023 13:12:00 -0700
Subject: mtd: Annotate struct lpddr_private with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct lpddr_private.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Vignesh Raghavendra <vigneshr@ti.com>
Cc: linux-mtd@lists.infradead.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20230915201159.never.112-kees@kernel.org
---
 include/linux/mtd/qinfo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/qinfo.h b/include/linux/mtd/qinfo.h
index 2e3f43788d48..0421f12156b5 100644
--- a/include/linux/mtd/qinfo.h
+++ b/include/linux/mtd/qinfo.h
@@ -24,7 +24,7 @@ struct lpddr_private {
 	struct qinfo_chip *qinfo;
 	int numchips;
 	unsigned long chipshift;
-	struct flchip chips[];
+	struct flchip chips[] __counted_by(numchips);
 };
 
 /* qinfo_query_info structure contains request information for
-- 
cgit v1.2.3


From 1442d628d05c7aad86e5754fa554f32edc3e77a8 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 15 Sep 2023 13:12:06 -0700
Subject: mtd: cfi: Annotate struct cfi_private with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct cfi_private.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Vignesh Raghavendra <vigneshr@ti.com>
Cc: linux-mtd@lists.infradead.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20230915201206.never.107-kees@kernel.org
---
 include/linux/mtd/cfi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index d88bb56c18e2..947410faf9e2 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -287,7 +287,7 @@ struct cfi_private {
 	unsigned long chipshift; /* Because they're of the same type */
 	const char *im_name;	 /* inter_module name for cmdset_setup */
 	unsigned long quirks;
-	struct flchip chips[];  /* per-chip data structure for each chip */
+	struct flchip chips[] __counted_by(numchips);  /* per-chip data structure for each chip */
 };
 
 uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs,
-- 
cgit v1.2.3


From 48554df6bf2b1e83f70749bf4b4d7914f8b3c01d Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Wed, 13 Sep 2023 15:16:13 +0000
Subject: blk-mq: remove RQF_MQ_INFLIGHT

Since the previous patch change to only account active requests when
we really allocate the driver tag, the RQF_MQ_INFLIGHT can be removed
and no double account problem.

1. none elevator: flush request will use the first pending request's
   driver tag, won't double account.

2. other elevator: flush request will be accounted when allocate driver
   tag when issue, and will be unaccounted when it put the driver tag.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20230913151616.3164338-3-chengming.zhou@linux.dev
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 958ed7e89b30..1ab3081c82ed 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -32,8 +32,6 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << 4))
 /* merge of different types, fail separately */
 #define RQF_MIXED_MERGE		((__force req_flags_t)(1 << 5))
-/* track inflight for MQ */
-#define RQF_MQ_INFLIGHT		((__force req_flags_t)(1 << 6))
 /* don't call prep for this one */
 #define RQF_DONTPREP		((__force req_flags_t)(1 << 7))
 /* use hctx->sched_tags */
-- 
cgit v1.2.3


From 5f05285df691b1e82108eead7165feae238c95ef Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Date: Tue, 19 Sep 2023 13:40:48 +0530
Subject: iio: hid-sensor-als: Add light color temperature support

In most cases, ambient color sensors also support light color temperature.
As a result, add support of light color temperature.

Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Acked-by: Srinivas Pandruvada<srinivas.pandruvada@linux.intel.com>
Link: https://lore.kernel.org/r/20230919081054.2050714-4-Basavaraj.Natikar@amd.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/hid-sensor-ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index 13b1e65fbdcc..8af4fb3e0254 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -21,6 +21,7 @@
 #define HID_USAGE_SENSOR_ALS					0x200041
 #define HID_USAGE_SENSOR_DATA_LIGHT				0x2004d0
 #define HID_USAGE_SENSOR_LIGHT_ILLUM				0x2004d1
+#define HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE		0x2004d2
 
 /* PROX (200011) */
 #define HID_USAGE_SENSOR_PROX                                   0x200011
-- 
cgit v1.2.3


From ee3710f39f9d0ae5137a866138d005fe1ad18132 Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Date: Tue, 19 Sep 2023 13:40:52 +0530
Subject: iio: hid-sensor-als: Add light chromaticity support

In most cases, ambient color sensors also support the x and y light
colors, which represent the coordinates on the CIE 1931 chromaticity
diagram. Thus, add light chromaticity x and y.

Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Acked-by: Srinivas Pandruvada<srinivas.pandruvada@linux.intel.com>
Link: https://lore.kernel.org/r/20230919081054.2050714-8-Basavaraj.Natikar@amd.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/hid-sensor-ids.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index 8af4fb3e0254..6730ee900ee1 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -22,6 +22,9 @@
 #define HID_USAGE_SENSOR_DATA_LIGHT				0x2004d0
 #define HID_USAGE_SENSOR_LIGHT_ILLUM				0x2004d1
 #define HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE		0x2004d2
+#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY			0x2004d3
+#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X			0x2004d4
+#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y			0x2004d5
 
 /* PROX (200011) */
 #define HID_USAGE_SENSOR_PROX                                   0x200011
-- 
cgit v1.2.3


From a741deac787f0d2d7068638c067db20af9e63752 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 26 Jul 2023 13:57:03 -0700
Subject: torture: Make torture_hrtimeout_ns() take an hrtimer mode parameter

The current torture-test sleeps are waiting for a duration, but there
are situations where it is better to wait for an absolute time, for
example, when ending a stutter interval.  This commit therefore adds
an hrtimer mode parameter to torture_hrtimeout_ns().  Why not also the
other torture_hrtimeout_*() functions?  The theory is that most absolute
times will be in nanoseconds, especially not (say) jiffies.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/torture.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index bb466eec01e4..017f0f710815 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -81,7 +81,8 @@ static inline void torture_random_init(struct torture_random_state *trsp)
 }
 
 /* Definitions for high-resolution-timer sleeps. */
-int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp);
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode,
+			 struct torture_random_state *trsp);
 int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp);
 int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp);
 int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp);
-- 
cgit v1.2.3


From 0cfecd7d754f2ef5d7e6b56ee656a8544ade920a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 31 Jul 2023 13:10:34 -0700
Subject: torture: Move rcutorture_sched_setaffinity() out of rcutorture

The rcutorture_sched_setaffinity() function is needed by locktorture,
so move its declaration from rcu.h to torture.h and rename it to the
more generic torture_sched_setaffinity() name.

Please note that use of this function is still restricted to torture
tests, and of those, currently only rcutorture and locktorture.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/torture.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index 017f0f710815..c98d0c83d117 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -121,10 +121,15 @@ void _torture_stop_kthread(char *m, struct task_struct **tp);
 #define torture_stop_kthread(n, tp) \
 	_torture_stop_kthread("Stopping " #n " task", &(tp))
 
+/* Scheduler-related definitions. */
 #ifdef CONFIG_PREEMPTION
 #define torture_preempt_schedule() __preempt_schedule()
 #else
 #define torture_preempt_schedule()	do { } while (0)
 #endif
 
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
+#endif
+
 #endif /* __LINUX_TORTURE_H */
-- 
cgit v1.2.3


From 5970fbad1036d1015abe45651628b39b5bcb8a22 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2023 22:54:47 -0700
Subject: fscrypt: make it clearer that key_prefix is deprecated

fscrypt_operations::key_prefix should not be set by any filesystems that
aren't setting it already.  This is already documented, but apparently
it's not sufficiently clear, as both ceph and btrfs have tried to set
it.  Rename the field to legacy_key_prefix and improve the documentation
to hopefully make it clearer.

Link: https://lore.kernel.org/r/20230925055451.59499-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 include/linux/fscrypt.h | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index c895b12737a1..b0037566ce30 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -73,12 +73,16 @@ struct fscrypt_operations {
 	unsigned int flags;
 
 	/*
-	 * If set, this is a filesystem-specific key description prefix that
-	 * will be accepted for "logon" keys for v1 fscrypt policies, in
-	 * addition to the generic prefix "fscrypt:".  This functionality is
-	 * deprecated, so new filesystems shouldn't set this field.
+	 * This field exists only for backwards compatibility reasons and should
+	 * only be set by the filesystems that are setting it already.  It
+	 * contains the filesystem-specific key description prefix that is
+	 * accepted for "logon" keys for v1 fscrypt policies.  This
+	 * functionality is deprecated in favor of the generic prefix
+	 * "fscrypt:", which itself is deprecated in favor of the filesystem
+	 * keyring ioctls such as FS_IOC_ADD_ENCRYPTION_KEY.  Filesystems that
+	 * are newly adding fscrypt support should not set this field.
 	 */
-	const char *key_prefix;
+	const char *legacy_key_prefix;
 
 	/*
 	 * Get the fscrypt context of the given inode.
-- 
cgit v1.2.3


From 40e13e18168fd0f1a6ad10166f5042a21c47ab99 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2023 22:54:48 -0700
Subject: fscrypt: make the bounce page pool opt-in instead of opt-out

Replace FS_CFLG_OWN_PAGES with a bit flag 'needs_bounce_pages' which has
the opposite meaning.  I.e., filesystems now opt into the bounce page
pool instead of opt out.  Make fscrypt_alloc_bounce_page() check that
the bounce page pool has been initialized.

I believe the opt-in makes more sense, since nothing else in
fscrypt_operations is opt-out, and these days filesystems can choose to
use blk-crypto which doesn't need the fscrypt bounce page pool.  Also, I
happen to be planning to add two more flags, and I wanted to fix the
"FS_CFLG_" name anyway as it wasn't prefixed with "FSCRYPT_".

Link: https://lore.kernel.org/r/20230925055451.59499-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 include/linux/fscrypt.h | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index b0037566ce30..4505078e89b7 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -59,18 +59,20 @@ struct fscrypt_name {
 
 #ifdef CONFIG_FS_ENCRYPTION
 
-/*
- * If set, the fscrypt bounce page pool won't be allocated (unless another
- * filesystem needs it).  Set this if the filesystem always uses its own bounce
- * pages for writes and therefore won't need the fscrypt bounce page pool.
- */
-#define FS_CFLG_OWN_PAGES (1U << 1)
-
 /* Crypto operations for filesystems */
 struct fscrypt_operations {
 
-	/* Set of optional flags; see above for allowed flags */
-	unsigned int flags;
+	/*
+	 * If set, then fs/crypto/ will allocate a global bounce page pool the
+	 * first time an encryption key is set up for a file.  The bounce page
+	 * pool is required by the following functions:
+	 *
+	 * - fscrypt_encrypt_pagecache_blocks()
+	 * - fscrypt_zeroout_range() for files not using inline crypto
+	 *
+	 * If the filesystem doesn't use those, it doesn't need to set this.
+	 */
+	unsigned int needs_bounce_pages : 1;
 
 	/*
 	 * This field exists only for backwards compatibility reasons and should
-- 
cgit v1.2.3


From 753a4d531bc518633ea88ac0ed02b25a16823d51 Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Fri, 22 Sep 2023 22:55:16 +0200
Subject: ata: libata-sata: increase PMP SRST timeout to 10s

On certain SATA controllers, softreset fails after wakeup from S2RAM with
the message "softreset failed (1st FIS failed)", sometimes resulting in
drives not being detected again. With the increased timeout, this issue
is avoided. Instead, "softreset failed (device not ready)" is now
logged 1-2 times; this later failure seems to cause fewer problems
however, and the drives are detected reliably once they've spun up and
the probe is retried.

The issue was observed with the primary SATA controller of the QNAP
TS-453B, which is an "Intel Corporation Celeron/Pentium Silver Processor
SATA Controller [8086:31e3] (rev 06)" integrated in the Celeron J4125 CPU,
and the following drives:

- Seagate IronWolf ST12000VN0008
- Seagate IronWolf ST8000NE0004

The SATA controller seems to be more relevant to this issue than the
drives, as the same drives are always detected reliably on the secondary
SATA controller on the same board (an ASMedia 106x) without any "softreset
failed" errors even without the increased timeout.

Fixes: e7d3ef13d52a ("libata: change drive ready wait after hard reset to 5s")
Cc: stable@vger.kernel.org
Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index bf4913f4d7ac..84aca8c44fa3 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -259,7 +259,7 @@ enum {
 	 * advised to wait only for the following duration before
 	 * doing SRST.
 	 */
-	ATA_TMOUT_PMP_SRST_WAIT	= 5000,
+	ATA_TMOUT_PMP_SRST_WAIT	= 10000,
 
 	/* When the LPM policy is set to ATA_LPM_MAX_POWER, there might
 	 * be a spurious PHY event, so ignore the first PHY event that
-- 
cgit v1.2.3


From b3239498353484fd6ddeb513df89c4628cd623d0 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 20 Sep 2023 21:25:12 +0300
Subject: wifi: mac80211: use bandwidth indication element for CSA

In CSA, parse the (EHT) bandwidth indication element and
use it (in fact prefer it if present).

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.43ef01920556.If4f24a61cd634ab1e50eba43899b9e992bf25602@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 340d7e0f6bf7..f11b7022d9eb 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -3139,6 +3139,28 @@ ieee80211_eht_oper_size_ok(const u8 *data, u8 len)
 	return len >= needed;
 }
 
+#define IEEE80211_BW_IND_DIS_SUBCH_PRESENT	BIT(1)
+
+struct ieee80211_bandwidth_indication {
+	u8 params;
+	struct ieee80211_eht_operation_info info;
+} __packed;
+
+static inline bool
+ieee80211_bandwidth_indication_size_ok(const u8 *data, u8 len)
+{
+	const struct ieee80211_bandwidth_indication *bwi = (const void *)data;
+
+	if (len < sizeof(*bwi))
+		return false;
+
+	if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT &&
+	    len < sizeof(*bwi) + 2)
+		return false;
+
+	return true;
+}
+
 #define LISTEN_INT_USF	GENMASK(15, 14)
 #define LISTEN_INT_UI	GENMASK(13, 0)
 
@@ -3596,6 +3618,7 @@ enum ieee80211_eid_ext {
 	WLAN_EID_EXT_EHT_OPERATION = 106,
 	WLAN_EID_EXT_EHT_MULTI_LINK = 107,
 	WLAN_EID_EXT_EHT_CAPABILITY = 108,
+	WLAN_EID_EXT_BANDWIDTH_INDICATION = 135,
 };
 
 /* Action category code */
-- 
cgit v1.2.3


From 62e9c64eedfeb697ba28081ccaac59a45f9a96e1 Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Wed, 20 Sep 2023 21:25:24 +0300
Subject: wifi: mac80211: add support for parsing TID to Link mapping element

Add the relevant definitions for TID to Link mapping element
according to the P802.11be_D4.0.

Signed-off-by: Ayala Beker <ayala.beker@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.9ea9b0b4412a.I2281ab2c70e8b43a39032dc115db6a80f1f0b3f4@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index f11b7022d9eb..f2965ff3d7c1 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1246,6 +1246,30 @@ struct ieee80211_twt_setup {
 	u8 params[];
 } __packed;
 
+#define IEEE80211_TTLM_MAX_CNT				2
+#define IEEE80211_TTLM_CONTROL_DIRECTION		0x03
+#define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP		0x04
+#define IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT	0x08
+#define IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT	0x10
+#define IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE		0x20
+
+#define IEEE80211_TTLM_DIRECTION_DOWN		0
+#define IEEE80211_TTLM_DIRECTION_UP		1
+#define IEEE80211_TTLM_DIRECTION_BOTH		2
+
+/**
+ * struct ieee80211_ttlm_elem - TID-To-Link Mapping element
+ *
+ * Defined in section 9.4.2.314 in P802.11be_D4
+ *
+ * @control: the first part of control field
+ * @optional: the second part of control field
+ */
+struct ieee80211_ttlm_elem {
+	u8 control;
+	u8 optional[];
+} __packed;
+
 struct ieee80211_mgmt {
 	__le16 frame_control;
 	__le16 duration;
@@ -3618,6 +3642,7 @@ enum ieee80211_eid_ext {
 	WLAN_EID_EXT_EHT_OPERATION = 106,
 	WLAN_EID_EXT_EHT_MULTI_LINK = 107,
 	WLAN_EID_EXT_EHT_CAPABILITY = 108,
+	WLAN_EID_EXT_TID_TO_LINK_MAPPING = 109,
 	WLAN_EID_EXT_BANDWIDTH_INDICATION = 135,
 };
 
@@ -5155,6 +5180,39 @@ static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data,
 	       fixed + prof->sta_info_len - 1 <= len;
 }
 
+static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len)
+{
+	const struct ieee80211_ttlm_elem *t2l = (const void *)data;
+	u8 control, fixed = sizeof(*t2l), elem_len = 0;
+
+	if (len < fixed)
+		return false;
+
+	control = t2l->control;
+
+	if (control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT)
+		elem_len += 2;
+	if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT)
+		elem_len += 3;
+
+	if (!(control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP)) {
+		u8 bm_size;
+
+		elem_len += 1;
+		if (len < fixed + elem_len)
+			return false;
+
+		if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE)
+			bm_size = 1;
+		else
+			bm_size = 2;
+
+		elem_len += hweight8(t2l->optional[0]) * bm_size;
+	}
+
+	return len >= fixed + elem_len;
+}
+
 #define for_each_mle_subelement(_elem, _data, _len)			\
 	if (ieee80211_mle_size_ok(_data, _len))				\
 		for_each_element(_elem,					\
-- 
cgit v1.2.3


From ef246a1480cc484cd2aeda75737cb0848616ddf3 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 20 Sep 2023 21:25:26 +0300
Subject: wifi: mac80211: support antenna control in injection

Support antenna control for injection by parsing the antenna
radiotap field (which may be presented multiple times) and
telling the driver about the resulting antenna bitmap. Of
course there's no guarantee the driver will actually honour
this, just like any other injection control.

If misconfigured, i.e. the injected HT/VHT MCS needs more
chains than antennas are configured, the bitmap is reset to
zero, indicating no selection.

For now this is only set up for two anntenas so we keep more
free bits, but that can be trivially extended if any driver
implements support for it that can deal with hardware with
more antennas.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.f71001aa4da9.I00ccb762a806ea62bc3d728fa3a0d29f4f285eeb@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index f2965ff3d7c1..3b02f038d509 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1705,6 +1705,8 @@ struct ieee80211_mcs_info {
 #define		IEEE80211_HT_MCS_TX_MAX_STREAMS	4
 #define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION	0x10
 
+#define IEEE80211_HT_MCS_CHAINS(mcs) ((mcs) == 32 ? 1 : (1 + ((mcs) >> 3)))
+
 /*
  * 802.11n D5.0 20.3.5 / 20.6 says:
  * - indices 0 to 7 and 32 are single spatial stream
-- 
cgit v1.2.3


From 823a0258912b1a4f627e455bee7e78d349c29c5b Mon Sep 17 00:00:00 2001
From: Po-Hao Huang <phhuang@realtek.com>
Date: Mon, 25 Sep 2023 16:08:59 +0800
Subject: wifi: ieee80211: add UL-bandwidth definition of trigger frame

Define UL-bandwidth values of trigger frame according to 802.11 std.

Signed-off-by: Po-Hao Huang <phhuang@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://lore.kernel.org/r/20230925080902.51449-2-pkshih@realtek.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 3b02f038d509..62b4469c6866 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -307,6 +307,13 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 #define IEEE80211_TRIGGER_TYPE_BQRP		0x6
 #define IEEE80211_TRIGGER_TYPE_NFRP		0x7
 
+/* UL-bandwidth within common_info of trigger frame */
+#define IEEE80211_TRIGGER_ULBW_MASK		0xc0000
+#define IEEE80211_TRIGGER_ULBW_20MHZ		0x0
+#define IEEE80211_TRIGGER_ULBW_40MHZ		0x1
+#define IEEE80211_TRIGGER_ULBW_80MHZ		0x2
+#define IEEE80211_TRIGGER_ULBW_160_80P80MHZ	0x3
+
 struct ieee80211_hdr {
 	__le16 frame_control;
 	__le16 duration_id;
-- 
cgit v1.2.3


From df31b298477e65a01deff0af352be3a61524d930 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 13 Sep 2023 10:43:34 -0300
Subject: iommu: Add iommu_ops->identity_domain

This allows a driver to set a global static to an IDENTITY domain and
the core code will automatically use it whenever an IDENTITY domain
is requested.

By making it always available it means the IDENTITY can be used in error
handling paths to force the iommu driver into a known state. Devices
implementing global static identity domains should avoid failing their
attach_dev ops.

To make global static domains simpler allow drivers to omit their free
function and update the iommufd selftest.

Convert rockchip to use the new mechanism.

Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/1-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c50a769d569a..d0920b2a9f1c 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -260,6 +260,8 @@ struct iommu_iotlb_gather {
  *                    will be blocked by the hardware.
  * @pgsize_bitmap: bitmap of all possible supported page sizes
  * @owner: Driver module providing these ops
+ * @identity_domain: An always available, always attachable identity
+ *                   translation.
  */
 struct iommu_ops {
 	bool (*capable)(struct device *dev, enum iommu_cap);
@@ -294,6 +296,7 @@ struct iommu_ops {
 	const struct iommu_domain_ops *default_domain_ops;
 	unsigned long pgsize_bitmap;
 	struct module *owner;
+	struct iommu_domain *identity_domain;
 };
 
 /**
-- 
cgit v1.2.3


From 1c68cbc64fe6ac01dc242ba562344303031a76fb Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 13 Sep 2023 10:43:35 -0300
Subject: iommu: Add IOMMU_DOMAIN_PLATFORM

This is used when the iommu driver is taking control of the dma_ops,
currently only on S390 and power spapr. It is designed to preserve the
original ops->detach_dev() semantic that these S390 was built around.

Provide an opaque domain type and a 'default_domain' ops value that allows
the driver to trivially force any single domain as the default domain.

Update iommufd selftest to use this instead of set_platform_dma_ops

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/2-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d0920b2a9f1c..a05480be05fd 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -64,6 +64,7 @@ struct iommu_domain_geometry {
 #define __IOMMU_DOMAIN_DMA_FQ	(1U << 3)  /* DMA-API uses flush queue    */
 
 #define __IOMMU_DOMAIN_SVA	(1U << 4)  /* Shared process address space */
+#define __IOMMU_DOMAIN_PLATFORM	(1U << 5)
 
 #define IOMMU_DOMAIN_ALLOC_FLAGS ~__IOMMU_DOMAIN_DMA_FQ
 /*
@@ -81,6 +82,8 @@ struct iommu_domain_geometry {
  *				  invalidation.
  *	IOMMU_DOMAIN_SVA	- DMA addresses are shared process addresses
  *				  represented by mm_struct's.
+ *	IOMMU_DOMAIN_PLATFORM	- Legacy domain for drivers that do their own
+ *				  dma_api stuff. Do not use in new drivers.
  */
 #define IOMMU_DOMAIN_BLOCKED	(0U)
 #define IOMMU_DOMAIN_IDENTITY	(__IOMMU_DOMAIN_PT)
@@ -91,6 +94,7 @@ struct iommu_domain_geometry {
 				 __IOMMU_DOMAIN_DMA_API |	\
 				 __IOMMU_DOMAIN_DMA_FQ)
 #define IOMMU_DOMAIN_SVA	(__IOMMU_DOMAIN_SVA)
+#define IOMMU_DOMAIN_PLATFORM	(__IOMMU_DOMAIN_PLATFORM)
 
 struct iommu_domain {
 	unsigned type;
@@ -262,6 +266,9 @@ struct iommu_iotlb_gather {
  * @owner: Driver module providing these ops
  * @identity_domain: An always available, always attachable identity
  *                   translation.
+ * @default_domain: If not NULL this will always be set as the default domain.
+ *                  This should be an IDENTITY/BLOCKED/PLATFORM domain.
+ *                  Do not use in new drivers.
  */
 struct iommu_ops {
 	bool (*capable)(struct device *dev, enum iommu_cap);
@@ -297,6 +304,7 @@ struct iommu_ops {
 	unsigned long pgsize_bitmap;
 	struct module *owner;
 	struct iommu_domain *identity_domain;
+	struct iommu_domain *default_domain;
 };
 
 /**
-- 
cgit v1.2.3


From 24b1d476167df3e30c7a53b67765bf3c787c5160 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 13 Sep 2023 10:43:48 -0300
Subject: iommu: Remove ops->set_platform_dma_ops()

All drivers are now using IDENTITY or PLATFORM domains for what this did,
we can remove it now. It is no longer possible to attach to a NULL domain.

Tested-by: Heiko Stuebner <heiko@sntech.de>
Tested-by: Niklas Schnelle <schnelle@linux.ibm.com>
Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/15-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a05480be05fd..511dfeea5272 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -243,9 +243,6 @@ struct iommu_iotlb_gather {
  * @release_device: Remove device from iommu driver handling
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
  *                  group and attached to the groups domain
- * @set_platform_dma_ops: Returning control back to the platform DMA ops. This op
- *                        is to support old IOMMU drivers, new drivers should use
- *                        default domains, and the common IOMMU DMA ops.
  * @device_group: find iommu group for a particular device
  * @get_resv_regions: Request list of reserved regions for a device
  * @of_xlate: add OF master IDs to iommu grouping
@@ -280,7 +277,6 @@ struct iommu_ops {
 	struct iommu_device *(*probe_device)(struct device *dev);
 	void (*release_device)(struct device *dev);
 	void (*probe_finalize)(struct device *dev);
-	void (*set_platform_dma_ops)(struct device *dev);
 	struct iommu_group *(*device_group)(struct device *dev);
 
 	/* Request/Free a list of reserved regions for a device */
-- 
cgit v1.2.3


From 4601cd2d7c4c82c4bafc822e1ff630a709eff206 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 13 Sep 2023 10:43:55 -0300
Subject: iommu: Add ops->domain_alloc_paging()

This callback requests the driver to create only a __IOMMU_DOMAIN_PAGING
domain, so it saves a few lines in a lot of drivers needlessly checking
the type.

More critically, this allows us to sweep out all the
IOMMU_DOMAIN_UNMANAGED and IOMMU_DOMAIN_DMA checks from a lot of the
drivers, simplifying what is going on in the code and ultimately removing
the now-unused special cases in drivers where they did not support
IOMMU_DOMAIN_DMA.

domain_alloc_paging() should return a struct iommu_domain that is
functionally compatible with ARM_DMA_USE_IOMMU, dma-iommu.c and iommufd.

Be forwards looking and pass in a 'struct device *' argument. We can
provide this when allocating the default_domain. No drivers will look at
this.

Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/22-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 511dfeea5272..3f173307434d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -239,6 +239,8 @@ struct iommu_iotlb_gather {
  *           use. The information type is one of enum iommu_hw_info_type defined
  *           in include/uapi/linux/iommufd.h.
  * @domain_alloc: allocate iommu domain
+ * @domain_alloc_paging: Allocate an iommu_domain that can be used for
+ *                       UNMANAGED, DMA, and DMA_FQ domain types.
  * @probe_device: Add device to iommu driver handling
  * @release_device: Remove device from iommu driver handling
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
@@ -273,6 +275,7 @@ struct iommu_ops {
 
 	/* Domain allocation and freeing by the iommu driver */
 	struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type);
+	struct iommu_domain *(*domain_alloc_paging)(struct device *dev);
 
 	struct iommu_device *(*probe_device)(struct device *dev);
 	void (*release_device)(struct device *dev);
-- 
cgit v1.2.3


From e8f52d84cf0b6d1862ab62f7ed705f78690d11b2 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 22 Aug 2023 13:15:57 -0300
Subject: iommu: Add generic_single_device_group()

This implements the common pattern seen in drivers of a single iommu_group
for the entire iommu driver instance. Implement this in core code so the
drivers that want this can select it from their ops.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/2-v1-c869a95191f2+5e8-iommu_single_grp_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3f173307434d..5b693be3d35f 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -378,6 +378,7 @@ struct iommu_domain_ops {
  * @list: Used by the iommu-core to keep a list of registered iommus
  * @ops: iommu-ops for talking to this iommu
  * @dev: struct device for sysfs handling
+ * @singleton_group: Used internally for drivers that have only one group
  * @max_pasids: number of supported PASIDs
  */
 struct iommu_device {
@@ -385,6 +386,7 @@ struct iommu_device {
 	const struct iommu_ops *ops;
 	struct fwnode_handle *fwnode;
 	struct device *dev;
+	struct iommu_group *singleton_group;
 	u32 max_pasids;
 };
 
@@ -648,6 +650,7 @@ extern struct iommu_group *pci_device_group(struct device *dev);
 extern struct iommu_group *generic_device_group(struct device *dev);
 /* FSL-MC device grouping function */
 struct iommu_group *fsl_mc_device_group(struct device *dev);
+extern struct iommu_group *generic_single_device_group(struct device *dev);
 
 /**
  * struct iommu_fwspec - per-device IOMMU instance data
-- 
cgit v1.2.3


From 950210887670cbb7d2eb9af6fb743b70f1a1ebdc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 19 Sep 2023 20:54:37 +0200
Subject: thermal: core: Drop trips_disabled bitmask

After recent changes, thermal_zone_get_trip() cannot fail, as invoked
from thermal_zone_device_register_with_trips(), so the only role of
the trips_disabled bitmask is struct thermal_zone_device is to make
handle_thermal_trip() skip trip points whose temperature was initially
zero.  However, since the unit of temperature in the thermal core is
millicelsius, zero may very well be a valid temperature value at least
in some usage scenarios and the trip temperature may as well change
later.  Thus there is no reason to permanently disable trip points
with initial temperature equal to zero.

Accordingly, drop the trips_disabled bitmask along with the code
related to it.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 include/linux/thermal.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index a5ae4af955ff..6cfcae22ba12 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -122,7 +122,6 @@ struct thermal_cooling_device {
  * @devdata:	private pointer for device private data
  * @trips:	an array of struct thermal_trip
  * @num_trips:	number of trip points the thermal zone supports
- * @trips_disabled;	bitmap for disabled trips
  * @passive_delay_jiffies: number of jiffies to wait between polls when
  *			performing passive cooling.
  * @polling_delay_jiffies: number of jiffies to wait between polls when
@@ -163,7 +162,6 @@ struct thermal_zone_device {
 	void *devdata;
 	struct thermal_trip *trips;
 	int num_trips;
-	unsigned long trips_disabled;	/* bitmap for disabled trips */
 	unsigned long passive_delay_jiffies;
 	unsigned long polling_delay_jiffies;
 	int temperature;
-- 
cgit v1.2.3


From bd111e987e762d82dc738232c6ed4b3c9bcc5c91 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 12 Sep 2023 17:18:43 +0100
Subject: iommu: Retire map/unmap ops

With everyone now implementing the new interfaces, clean up the last
remnants of the old map/unmap ops and simplify the calling logic again.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/d2afdf13b2fbf537713c3ec642dfd49d16dd9e6a.1694525662.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5b693be3d35f..64bd20142cbe 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -322,10 +322,8 @@ struct iommu_ops {
  * * ENODEV	- device specific errors, not able to be attached
  * * <others>	- treated as ENODEV by the caller. Use is discouraged
  * @set_dev_pasid: set an iommu domain to a pasid of device
- * @map: map a physically contiguous memory region to an iommu domain
  * @map_pages: map a physically contiguous set of pages of the same size to
  *             an iommu domain.
- * @unmap: unmap a physically contiguous memory region from an iommu domain
  * @unmap_pages: unmap a number of pages of the same size from an iommu domain
  * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain
  * @iotlb_sync_map: Sync mappings created recently using @map to the hardware
@@ -344,13 +342,9 @@ struct iommu_domain_ops {
 	int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev,
 			     ioasid_t pasid);
 
-	int (*map)(struct iommu_domain *domain, unsigned long iova,
-		   phys_addr_t paddr, size_t size, int prot, gfp_t gfp);
 	int (*map_pages)(struct iommu_domain *domain, unsigned long iova,
 			 phys_addr_t paddr, size_t pgsize, size_t pgcount,
 			 int prot, gfp_t gfp, size_t *mapped);
-	size_t (*unmap)(struct iommu_domain *domain, unsigned long iova,
-			size_t size, struct iommu_iotlb_gather *iotlb_gather);
 	size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova,
 			      size_t pgsize, size_t pgcount,
 			      struct iommu_iotlb_gather *iotlb_gather);
-- 
cgit v1.2.3


From cf08fa74c716cf20e5038d1e7dbbd7dba1b76062 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wenst@chromium.org>
Date: Wed, 13 Sep 2023 16:29:17 +0800
Subject: regulator: mt6358: Add output voltage fine tuning to fixed regulators

The "fixed" LDO regulators found on the MT6358 and MT6366 PMICs have
either no voltage selection register, or only one valid setting.
However these do have a fine voltage calibration setting that can
slightly boost the output voltage from 0 mV to 100 mV, in 10 mV
increments.

Add support for this by changing these into linear range regulators.
Some register definitions that are missing are also added.

Signed-off-by: Chen-Yu Tsai <wenst@chromium.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Acked-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20230913082919.1631287-3-wenst@chromium.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/mt6358/registers.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/mt6358/registers.h b/include/linux/mfd/mt6358/registers.h
index 3d33517f178c..5ea2590be710 100644
--- a/include/linux/mfd/mt6358/registers.h
+++ b/include/linux/mfd/mt6358/registers.h
@@ -262,6 +262,12 @@
 #define MT6358_LDO_VBIF28_CON3                0x1db0
 #define MT6358_VCAMA1_ANA_CON0                0x1e08
 #define MT6358_VCAMA2_ANA_CON0                0x1e0c
+#define MT6358_VFE28_ANA_CON0                 0x1e10
+#define MT6358_VCN28_ANA_CON0                 0x1e14
+#define MT6358_VBIF28_ANA_CON0                0x1e18
+#define MT6358_VAUD28_ANA_CON0                0x1e1c
+#define MT6358_VAUX18_ANA_CON0                0x1e20
+#define MT6358_VXO22_ANA_CON0                 0x1e24
 #define MT6358_VCN33_ANA_CON0                 0x1e28
 #define MT6358_VSIM1_ANA_CON0                 0x1e2c
 #define MT6358_VSIM2_ANA_CON0                 0x1e30
-- 
cgit v1.2.3


From 581beb4fe37d17571c87058d13d298d5458e25e9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:02:58 +0100
Subject: iov_iter: Remove last_offset from iov_iter as it was for ITER_PIPE

Now that ITER_PIPE has been removed, iov_iter::last_offset is no longer
used, so remove it.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-2-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uio.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 42bce38a8e87..2000e42a6586 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -44,10 +44,7 @@ struct iov_iter {
 	bool nofault;
 	bool data_source;
 	bool user_backed;
-	union {
-		size_t iov_offset;
-		int last_offset;
-	};
+	size_t iov_offset;
 	/*
 	 * Hack alert: overlay ubuf_iovec with iovec + count, so
 	 * that the members resolve correctly regardless of the type
-- 
cgit v1.2.3


From 7d9e44a6ad8a22135e6308d330da29271f41a98a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:02 +0100
Subject: iov_iter: Renumber ITER_* constants

Renumber the ITER_* iterator-type constants to put things in the same order
as in the iteration functions and to group user-backed iterators at the
bottom.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-6-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uio.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 2000e42a6586..bef8e56aa45c 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -21,12 +21,12 @@ struct kvec {
 
 enum iter_type {
 	/* iter types */
+	ITER_UBUF,
 	ITER_IOVEC,
-	ITER_KVEC,
 	ITER_BVEC,
+	ITER_KVEC,
 	ITER_XARRAY,
 	ITER_DISCARD,
-	ITER_UBUF,
 };
 
 #define ITER_SOURCE	1	// == WRITE
-- 
cgit v1.2.3


From f1b4cb650b9a0eeba206d8f069fcdc532bfbcd74 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:03 +0100
Subject: iov_iter: Derive user-backedness from the iterator type

Use the iterator type to determine whether an iterator is user-backed or
not rather than using a special flag for it.  Now that ITER_UBUF and
ITER_IOVEC are 0 and 1, they can be checked with a single comparison.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-7-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uio.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index bef8e56aa45c..65d9143f83c8 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -43,7 +43,6 @@ struct iov_iter {
 	bool copy_mc;
 	bool nofault;
 	bool data_source;
-	bool user_backed;
 	size_t iov_offset;
 	/*
 	 * Hack alert: overlay ubuf_iovec with iovec + count, so
@@ -140,7 +139,7 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 
 static inline bool user_backed_iter(const struct iov_iter *i)
 {
-	return i->user_backed;
+	return iter_is_ubuf(i) || iter_is_iovec(i);
 }
 
 /*
@@ -380,7 +379,6 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
 	*i = (struct iov_iter) {
 		.iter_type = ITER_UBUF,
 		.copy_mc = false,
-		.user_backed = true,
 		.data_source = direction,
 		.ubuf = buf,
 		.count = count,
-- 
cgit v1.2.3


From f1982740f5e77090bde41a9b84e257d69ec46598 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:04 +0100
Subject: iov_iter: Convert iterate*() to inline funcs

Convert the iov_iter iteration macros to inline functions to make the code
easier to follow.

The functions are marked __always_inline as we don't want to end up with
indirect calls in the code.  This, however, leaves dealing with ->copy_mc
in an awkard situation since the step function (memcpy_from_iter_mc())
needs to test the flag in the iterator, but isn't passed the iterator.
This will be dealt with in a follow-up patch.

The variable names in the per-type iterator functions have been harmonised
as much as possible and made clearer as to the variable purpose.

The iterator functions are also moved to a header file so that other
operations that need to scan over an iterator can be added.  For instance,
the rbd driver could use this to scan a buffer to see if it is all zeros
and libceph could use this to generate a crc.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/3710261.1691764329@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/855.1692047347@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/20230816120741.534415-1-dhowells@redhat.com/ # v3
Link: https://lore.kernel.org/r/20230925120309.1731676-8-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/iov_iter.h | 274 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 274 insertions(+)
 create mode 100644 include/linux/iov_iter.h

(limited to 'include/linux')

diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h
new file mode 100644
index 000000000000..270454a6703d
--- /dev/null
+++ b/include/linux/iov_iter.h
@@ -0,0 +1,274 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* I/O iterator iteration building functions.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifndef _LINUX_IOV_ITER_H
+#define _LINUX_IOV_ITER_H
+
+#include <linux/uio.h>
+#include <linux/bvec.h>
+
+typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
+			     void *priv, void *priv2);
+typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len,
+			      void *priv, void *priv2);
+
+/*
+ * Handle ITER_UBUF.
+ */
+static __always_inline
+size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		    iov_ustep_f step)
+{
+	void __user *base = iter->ubuf;
+	size_t progress = 0, remain;
+
+	remain = step(base + iter->iov_offset, 0, len, priv, priv2);
+	progress = len - remain;
+	iter->iov_offset += progress;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_IOVEC.
+ */
+static __always_inline
+size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		     iov_ustep_f step)
+{
+	const struct iovec *p = iter->__iov;
+	size_t progress = 0, skip = iter->iov_offset;
+
+	do {
+		size_t remain, consumed;
+		size_t part = min(len, p->iov_len - skip);
+
+		if (likely(part)) {
+			remain = step(p->iov_base + skip, progress, part, priv, priv2);
+			consumed = part - remain;
+			progress += consumed;
+			skip += consumed;
+			len -= consumed;
+			if (skip < p->iov_len)
+				break;
+		}
+		p++;
+		skip = 0;
+	} while (len);
+
+	iter->nr_segs -= p - iter->__iov;
+	iter->__iov = p;
+	iter->iov_offset = skip;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_KVEC.
+ */
+static __always_inline
+size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		    iov_step_f step)
+{
+	const struct kvec *p = iter->kvec;
+	size_t progress = 0, skip = iter->iov_offset;
+
+	do {
+		size_t remain, consumed;
+		size_t part = min(len, p->iov_len - skip);
+
+		if (likely(part)) {
+			remain = step(p->iov_base + skip, progress, part, priv, priv2);
+			consumed = part - remain;
+			progress += consumed;
+			skip += consumed;
+			len -= consumed;
+			if (skip < p->iov_len)
+				break;
+		}
+		p++;
+		skip = 0;
+	} while (len);
+
+	iter->nr_segs -= p - iter->kvec;
+	iter->kvec = p;
+	iter->iov_offset = skip;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_BVEC.
+ */
+static __always_inline
+size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		    iov_step_f step)
+{
+	const struct bio_vec *p = iter->bvec;
+	size_t progress = 0, skip = iter->iov_offset;
+
+	do {
+		size_t remain, consumed;
+		size_t offset = p->bv_offset + skip, part;
+		void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE);
+
+		part = min3(len,
+			   (size_t)(p->bv_len - skip),
+			   (size_t)(PAGE_SIZE - offset % PAGE_SIZE));
+		remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2);
+		kunmap_local(kaddr);
+		consumed = part - remain;
+		len -= consumed;
+		progress += consumed;
+		skip += consumed;
+		if (skip >= p->bv_len) {
+			skip = 0;
+			p++;
+		}
+		if (remain)
+			break;
+	} while (len);
+
+	iter->nr_segs -= p - iter->bvec;
+	iter->bvec = p;
+	iter->iov_offset = skip;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_XARRAY.
+ */
+static __always_inline
+size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		      iov_step_f step)
+{
+	struct folio *folio;
+	size_t progress = 0;
+	loff_t start = iter->xarray_start + iter->iov_offset;
+	pgoff_t index = start / PAGE_SIZE;
+	XA_STATE(xas, iter->xarray, index);
+
+	rcu_read_lock();
+	xas_for_each(&xas, folio, ULONG_MAX) {
+		size_t remain, consumed, offset, part, flen;
+
+		if (xas_retry(&xas, folio))
+			continue;
+		if (WARN_ON(xa_is_value(folio)))
+			break;
+		if (WARN_ON(folio_test_hugetlb(folio)))
+			break;
+
+		offset = offset_in_folio(folio, start + progress);
+		flen = min(folio_size(folio) - offset, len);
+
+		while (flen) {
+			void *base = kmap_local_folio(folio, offset);
+
+			part = min_t(size_t, flen,
+				     PAGE_SIZE - offset_in_page(offset));
+			remain = step(base, progress, part, priv, priv2);
+			kunmap_local(base);
+
+			consumed = part - remain;
+			progress += consumed;
+			len -= consumed;
+
+			if (remain || len == 0)
+				goto out;
+			flen -= consumed;
+			offset += consumed;
+		}
+	}
+
+out:
+	rcu_read_unlock();
+	iter->iov_offset += progress;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_DISCARD.
+ */
+static __always_inline
+size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		      iov_step_f step)
+{
+	size_t progress = len;
+
+	iter->count -= progress;
+	return progress;
+}
+
+/**
+ * iterate_and_advance2 - Iterate over an iterator
+ * @iter: The iterator to iterate over.
+ * @len: The amount to iterate over.
+ * @priv: Data for the step functions.
+ * @priv2: More data for the step functions.
+ * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
+ * @step: Function for other iterators; given kernel addresses.
+ *
+ * Iterate over the next part of an iterator, up to the specified length.  The
+ * buffer is presented in segments, which for kernel iteration are broken up by
+ * physical pages and mapped, with the mapped address being presented.
+ *
+ * Two step functions, @step and @ustep, must be provided, one for handling
+ * mapped kernel addresses and the other is given user addresses which have the
+ * potential to fault since no pinning is performed.
+ *
+ * The step functions are passed the address and length of the segment, @priv,
+ * @priv2 and the amount of data so far iterated over (which can, for example,
+ * be added to @priv to point to the right part of a second buffer).  The step
+ * functions should return the amount of the segment they didn't process (ie. 0
+ * indicates complete processsing).
+ *
+ * This function returns the amount of data processed (ie. 0 means nothing was
+ * processed and the value of @len means processes to completion).
+ */
+static __always_inline
+size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
+			    void *priv2, iov_ustep_f ustep, iov_step_f step)
+{
+	if (unlikely(iter->count < len))
+		len = iter->count;
+	if (unlikely(!len))
+		return 0;
+
+	if (likely(iter_is_ubuf(iter)))
+		return iterate_ubuf(iter, len, priv, priv2, ustep);
+	if (likely(iter_is_iovec(iter)))
+		return iterate_iovec(iter, len, priv, priv2, ustep);
+	if (iov_iter_is_bvec(iter))
+		return iterate_bvec(iter, len, priv, priv2, step);
+	if (iov_iter_is_kvec(iter))
+		return iterate_kvec(iter, len, priv, priv2, step);
+	if (iov_iter_is_xarray(iter))
+		return iterate_xarray(iter, len, priv, priv2, step);
+	return iterate_discard(iter, len, priv, priv2, step);
+}
+
+/**
+ * iterate_and_advance - Iterate over an iterator
+ * @iter: The iterator to iterate over.
+ * @len: The amount to iterate over.
+ * @priv: Data for the step functions.
+ * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
+ * @step: Function for other iterators; given kernel addresses.
+ *
+ * As iterate_and_advance2(), but priv2 is always NULL.
+ */
+static __always_inline
+size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv,
+			   iov_ustep_f ustep, iov_step_f step)
+{
+	return iterate_and_advance2(iter, len, priv, NULL, ustep, step);
+}
+
+#endif /* _LINUX_IOV_ITER_H */
-- 
cgit v1.2.3


From 3acf8ace68230e9558cf916847f1cc9f208abdf1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 20 Sep 2023 23:31:39 +0200
Subject: bpf: Add missed value to kprobe perf link info

Add missed value to kprobe attached through perf link info to
hold the stats of missed kprobe handler execution.

The kprobe's missed counter gets incremented when kprobe handler
is not executed due to another kprobe running on the same cpu.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230920213145.1941596-4-jolsa@kernel.org
---
 include/linux/trace_events.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 21ae37e49319..5eb88a66eb68 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -761,7 +761,8 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
 void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 			    u32 *fd_type, const char **buf,
-			    u64 *probe_offset, u64 *probe_addr);
+			    u64 *probe_offset, u64 *probe_addr,
+			    unsigned long *missed);
 int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 #else
@@ -801,7 +802,7 @@ static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
 static inline int bpf_get_perf_event_info(const struct perf_event *event,
 					  u32 *prog_id, u32 *fd_type,
 					  const char **buf, u64 *probe_offset,
-					  u64 *probe_addr)
+					  u64 *probe_addr, unsigned long *missed)
 {
 	return -EOPNOTSUPP;
 }
@@ -877,6 +878,7 @@ extern void perf_kprobe_destroy(struct perf_event *event);
 extern int bpf_get_kprobe_info(const struct perf_event *event,
 			       u32 *fd_type, const char **symbol,
 			       u64 *probe_offset, u64 *probe_addr,
+			       unsigned long *missed,
 			       bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
-- 
cgit v1.2.3


From dd8657894c11b03c6eb0fd53fe9d7fec2072d18b Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 20 Sep 2023 23:31:40 +0200
Subject: bpf: Count missed stats in trace_call_bpf

Increase misses stats in case bpf array execution is skipped
because of recursion check in trace_call_bpf.

Adding bpf_prog_inc_misses_counters that increase misses
counts for all bpf programs in bpf_prog_array.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Song Liu <song@kernel.org>
Reviewed-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20230920213145.1941596-5-jolsa@kernel.org
---
 include/linux/bpf.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 30063a760b5a..a82efd34b741 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2922,6 +2922,22 @@ static inline int sock_map_bpf_prog_query(const union bpf_attr *attr,
 #endif /* CONFIG_BPF_SYSCALL */
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
+static __always_inline void
+bpf_prog_inc_misses_counters(const struct bpf_prog_array *array)
+{
+	const struct bpf_prog_array_item *item;
+	struct bpf_prog *prog;
+
+	if (unlikely(!array))
+		return;
+
+	item = &array->items[0];
+	while ((prog = READ_ONCE(item->prog))) {
+		bpf_prog_inc_misses_counter(prog);
+		item++;
+	}
+}
+
 #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
 void bpf_sk_reuseport_detach(struct sock *sk);
 int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
-- 
cgit v1.2.3


From 7a0263dc904f3467f474e4088ae092eda9a5a99b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2023 22:54:50 -0700
Subject: fscrypt: replace get_ino_and_lblk_bits with just has_32bit_inodes

Now that fs/crypto/ computes the filesystem's lblk_bits from its maximum
file size, it is no longer necessary for filesystems to provide
lblk_bits via fscrypt_operations::get_ino_and_lblk_bits.

It is still necessary for fs/crypto/ to retrieve ino_bits from the
filesystem.  However, this is used only to decide whether inode numbers
fit in 32 bits.  Also, ino_bits is static for all relevant filesystems,
i.e. it doesn't depend on the filesystem instance.

Therefore, in the interest of keeping things as simple as possible,
replace 'get_ino_and_lblk_bits' with a flag 'has_32bit_inodes'.  This
can always be changed back to a function if a filesystem needs it to be
dynamic, but for now a static flag is all that's needed.

Link: https://lore.kernel.org/r/20230925055451.59499-5-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 include/linux/fscrypt.h | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 4505078e89b7..09a3cacbf62a 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -74,6 +74,17 @@ struct fscrypt_operations {
 	 */
 	unsigned int needs_bounce_pages : 1;
 
+	/*
+	 * If set, then fs/crypto/ will allow the use of encryption settings
+	 * that assume inode numbers fit in 32 bits (i.e.
+	 * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64}), provided that the other
+	 * prerequisites for these settings are also met.  This is only useful
+	 * if the filesystem wants to support inline encryption hardware that is
+	 * limited to 32-bit or 64-bit data unit numbers and where programming
+	 * keyslots is very slow.
+	 */
+	unsigned int has_32bit_inodes : 1;
+
 	/*
 	 * This field exists only for backwards compatibility reasons and should
 	 * only be set by the filesystems that are setting it already.  It
@@ -151,21 +162,6 @@ struct fscrypt_operations {
 	 */
 	bool (*has_stable_inodes)(struct super_block *sb);
 
-	/*
-	 * Get the number of bits that the filesystem uses to represent inode
-	 * numbers and file logical block numbers.
-	 *
-	 * By default, both of these are assumed to be 64-bit.  This function
-	 * can be implemented to declare that either or both of these numbers is
-	 * shorter, which may allow the use of the
-	 * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags and/or the use of
-	 * inline crypto hardware whose maximum DUN length is less than 64 bits
-	 * (e.g., eMMC v5.2 spec compliant hardware).  This function only needs
-	 * to be implemented if support for one of these features is needed.
-	 */
-	void (*get_ino_and_lblk_bits)(struct super_block *sb,
-				      int *ino_bits_ret, int *lblk_bits_ret);
-
 	/*
 	 * Return an array of pointers to the block devices to which the
 	 * filesystem may write encrypted file contents, NULL if the filesystem
-- 
cgit v1.2.3


From 5b11888471806edf699316d4dcb9b426caebbef2 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2023 22:54:51 -0700
Subject: fscrypt: support crypto data unit size less than filesystem block
 size

Until now, fscrypt has always used the filesystem block size as the
granularity of file contents encryption.  Two scenarios have come up
where a sub-block granularity of contents encryption would be useful:

1. Inline crypto hardware that only supports a crypto data unit size
   that is less than the filesystem block size.

2. Support for direct I/O at a granularity less than the filesystem
   block size, for example at the block device's logical block size in
   order to match the traditional direct I/O alignment requirement.

(1) first came up with older eMMC inline crypto hardware that only
supports a crypto data unit size of 512 bytes.  That specific case
ultimately went away because all systems with that hardware continued
using out of tree code and never actually upgraded to the upstream
inline crypto framework.  But, now it's coming back in a new way: some
current UFS controllers only support a data unit size of 4096 bytes, and
there is a proposal to increase the filesystem block size to 16K.

(2) was discussed as a "nice to have" feature, though not essential,
when support for direct I/O on encrypted files was being upstreamed.

Still, the fact that this feature has come up several times does suggest
it would be wise to have available.  Therefore, this patch implements it
by using one of the reserved bytes in fscrypt_policy_v2 to allow users
to select a sub-block data unit size.  Supported data unit sizes are
powers of 2 between 512 and the filesystem block size, inclusively.
Support is implemented for both the FS-layer and inline crypto cases.

This patch focuses on the basic support for sub-block data units.  Some
things are out of scope for this patch but may be addressed later:

- Supporting sub-block data units in combination with
  FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64, in most cases.  Unfortunately this
  combination usually causes data unit indices to exceed 32 bits, and
  thus fscrypt_supported_policy() correctly disallows it.  The users who
  potentially need this combination are using f2fs.  To support it, f2fs
  would need to provide an option to slightly reduce its max file size.

- Supporting sub-block data units in combination with
  FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32.  This has the same problem
  described above, but also it will need special code to make DUN
  wraparound still happen on a FS block boundary.

- Supporting use case (2) mentioned above.  The encrypted direct I/O
  code will need to stop requiring and assuming FS block alignment.
  This won't be hard, but it belongs in a separate patch.

- Supporting this feature on filesystems other than ext4 and f2fs.
  (Filesystems declare support for it via their fscrypt_operations.)
  On UBIFS, sub-block data units don't make sense because UBIFS encrypts
  variable-length blocks as a result of compression.  CephFS could
  support it, but a bit more work would be needed to make the
  fscrypt_*_block_inplace functions play nicely with sub-block data
  units.  I don't think there's a use case for this on CephFS anyway.

Link: https://lore.kernel.org/r/20230925055451.59499-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 include/linux/fscrypt.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 09a3cacbf62a..b559e6f77707 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -85,6 +85,18 @@ struct fscrypt_operations {
 	 */
 	unsigned int has_32bit_inodes : 1;
 
+	/*
+	 * If set, then fs/crypto/ will allow users to select a crypto data unit
+	 * size that is less than the filesystem block size.  This is done via
+	 * the log2_data_unit_size field of the fscrypt policy.  This flag is
+	 * not compatible with filesystems that encrypt variable-length blocks
+	 * (i.e. blocks that aren't all equal to filesystem's block size), for
+	 * example as a result of compression.  It's also not compatible with
+	 * the fscrypt_encrypt_block_inplace() and
+	 * fscrypt_decrypt_block_inplace() functions.
+	 */
+	unsigned int supports_subblock_data_units : 1;
+
 	/*
 	 * This field exists only for backwards compatibility reasons and should
 	 * only be set by the filesystems that are setting it already.  It
-- 
cgit v1.2.3


From e850d9a52f4cd31521c80a7ea9718b69129af4d5 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Sat, 12 Aug 2023 01:05:07 +0800
Subject: badblocks: add more helper structure and routines in badblocks.h

This patch adds the following helper structure and routines into
badblocks.h,
- struct badblocks_context
  This structure is used in improved badblocks code for bad table
  iteration.
- BB_END()
  The macro to calculate end LBA of a bad range record from bad
  table.
- badblocks_full() and badblocks_empty()
  The inline routines to check whether bad table is full or empty.
- set_changed() and clear_changed()
  The inline routines to set and clear 'changed' tag from struct
  badblocks.

These new helper structure and routines can help to make the code more
clear, they will be used in the improved badblocks code in following
patches.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Xiao Ni <xni@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Geliang Tang <geliang.tang@suse.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: NeilBrown <neilb@suse.de>
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Acked-by: Geliang Tang <geliang.tang@suse.com>
Link: https://lore.kernel.org/r/20230811170513.2300-2-colyli@suse.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/badblocks.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h
index 2426276b9bd3..670f2dae692f 100644
--- a/include/linux/badblocks.h
+++ b/include/linux/badblocks.h
@@ -15,6 +15,7 @@
 #define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
 #define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
 #define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
+#define BB_END(x)	(BB_OFFSET(x) + BB_LEN(x))
 #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
 
 /* Bad block numbers are stored sorted in a single page.
@@ -41,6 +42,12 @@ struct badblocks {
 	sector_t size;		/* in sectors */
 };
 
+struct badblocks_context {
+	sector_t	start;
+	sector_t	len;
+	int		ack;
+};
+
 int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
 		   sector_t *first_bad, int *bad_sectors);
 int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
@@ -63,4 +70,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb)
 	}
 	badblocks_exit(bb);
 }
+
+static inline int badblocks_full(struct badblocks *bb)
+{
+	return (bb->count >= MAX_BADBLOCKS);
+}
+
+static inline int badblocks_empty(struct badblocks *bb)
+{
+	return (bb->count == 0);
+}
+
+static inline void set_changed(struct badblocks *bb)
+{
+	if (bb->changed != 1)
+		bb->changed = 1;
+}
+
+static inline void clear_changed(struct badblocks *bb)
+{
+	if (bb->changed != 0)
+		bb->changed = 0;
+}
+
 #endif
-- 
cgit v1.2.3


From 948f0bf5ad6ac1e5f19f6aa8e7da4a950d66b661 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Wed, 20 Sep 2023 13:07:42 +0300
Subject: IB/mlx5: Add support for 800G_8X lane speed

Add a check for 800G_8X speed when querying PTYS and report it back
correctly when needed.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Link: https://lore.kernel.org/r/26fd0b6e1fac071c3eb779657bb3d8ba47f47c4f.1695204156.git.leon@kernel.org
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/port.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 98b2e1e149f9..794001ebd003 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -117,6 +117,7 @@ enum mlx5e_ext_link_mode {
 	MLX5E_200GAUI_2_200GBASE_CR2_KR2	= 13,
 	MLX5E_400GAUI_8				= 15,
 	MLX5E_400GAUI_4_400GBASE_CR4_KR4	= 16,
+	MLX5E_800GAUI_8_800GBASE_CR8_KR8	= 19,
 	MLX5E_EXT_LINK_MODES_NUMBER,
 };
 
-- 
cgit v1.2.3


From b28ad32442bec2f0d9cb660d7d698a1a53c13d08 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Wed, 20 Sep 2023 13:07:43 +0300
Subject: IB/mlx5: Rename 400G_8X speed to comply to naming convention

Rename 400G_8X speed to comply to naming convention.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Link: https://lore.kernel.org/r/ac98447cac8379a43fbdb36d56e5fb2b741a97ff.1695204156.git.leon@kernel.org
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/port.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 794001ebd003..26092c78a985 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -115,7 +115,7 @@ enum mlx5e_ext_link_mode {
 	MLX5E_100GAUI_1_100GBASE_CR_KR		= 11,
 	MLX5E_200GAUI_4_200GBASE_CR4_KR4	= 12,
 	MLX5E_200GAUI_2_200GBASE_CR2_KR2	= 13,
-	MLX5E_400GAUI_8				= 15,
+	MLX5E_400GAUI_8_400GBASE_CR8		= 15,
 	MLX5E_400GAUI_4_400GBASE_CR4_KR4	= 16,
 	MLX5E_800GAUI_8_800GBASE_CR8_KR8	= 19,
 	MLX5E_EXT_LINK_MODES_NUMBER,
-- 
cgit v1.2.3


From 9cf63f3a33e929f7eca36409914b8c12102b9984 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:54:37 -0700
Subject: platform/surface: aggregator: Annotate struct ssam_event with
 __counted_by
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ssam_event.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Maximilian Luz <luzmaximilian@gmail.com>
Cc: platform-driver-x86@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Reviewed-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20230922175436.work.031-kees@kernel.org
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/surface_aggregator/controller.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h
index cb7980805920..5b67f0f47d80 100644
--- a/include/linux/surface_aggregator/controller.h
+++ b/include/linux/surface_aggregator/controller.h
@@ -44,7 +44,7 @@ struct ssam_event {
 	u8 command_id;
 	u8 instance_id;
 	u16 length;
-	u8 data[];
+	u8 data[] __counted_by(length);
 };
 
 /**
-- 
cgit v1.2.3


From 8d74f1da776da9b0306630b13a3025214fa44618 Mon Sep 17 00:00:00 2001
From: Raag Jadav <raag.jadav@intel.com>
Date: Fri, 22 Sep 2023 11:15:52 +0530
Subject: PM: sleep: Fix symbol export for _SIMPLE_ variants of _PM_OPS()

Currently EXPORT_*_SIMPLE_DEV_PM_OPS() use EXPORT_*_DEV_PM_OPS() set
of macros to export dev_pm_ops symbol, which export the symbol in case
CONFIG_PM=y but don't take CONFIG_PM_SLEEP into consideration.

Since _SIMPLE_ variants of _PM_OPS() do not include runtime PM handles
and are only used in case CONFIG_PM_SLEEP=y, we should not be exporting
dev_pm_ops symbol for them in case CONFIG_PM_SLEEP=n.

This can be fixed by having two distinct set of export macros for both
_RUNTIME_ and _SIMPLE_ variants of _PM_OPS(), such that the export of
dev_pm_ops symbol used in each variant depends on CONFIG_PM and
CONFIG_PM_SLEEP respectively.

Introduce _DEV_SLEEP_PM_OPS() set of export macros for _SIMPLE_ variants
of _PM_OPS(), which export dev_pm_ops symbol only in case CONFIG_PM_SLEEP=y
and discard it otherwise.

Fixes: 34e1ed189fab ("PM: Improve EXPORT_*_DEV_PM_OPS macros")
Signed-off-by: Raag Jadav <raag.jadav@intel.com>
Reviewed-by: Paul Cercueil <paul@crapouillou.net>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 1400c37b29c7..629c1633bbd0 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -374,24 +374,39 @@ const struct dev_pm_ops name = { \
 	RUNTIME_PM_OPS(runtime_suspend_fn, runtime_resume_fn, idle_fn) \
 }
 
-#ifdef CONFIG_PM
-#define _EXPORT_DEV_PM_OPS(name, license, ns)				\
+#define _EXPORT_PM_OPS(name, license, ns)				\
 	const struct dev_pm_ops name;					\
 	__EXPORT_SYMBOL(name, license, ns);				\
 	const struct dev_pm_ops name
-#define EXPORT_PM_FN_GPL(name)		EXPORT_SYMBOL_GPL(name)
-#define EXPORT_PM_FN_NS_GPL(name, ns)	EXPORT_SYMBOL_NS_GPL(name, ns)
-#else
-#define _EXPORT_DEV_PM_OPS(name, license, ns)				\
+
+#define _DISCARD_PM_OPS(name, license, ns)				\
 	static __maybe_unused const struct dev_pm_ops __static_##name
+
+#ifdef CONFIG_PM
+#define _EXPORT_DEV_PM_OPS(name, license, ns)		_EXPORT_PM_OPS(name, license, ns)
+#define EXPORT_PM_FN_GPL(name)				EXPORT_SYMBOL_GPL(name)
+#define EXPORT_PM_FN_NS_GPL(name, ns)			EXPORT_SYMBOL_NS_GPL(name, ns)
+#else
+#define _EXPORT_DEV_PM_OPS(name, license, ns)		_DISCARD_PM_OPS(name, license, ns)
 #define EXPORT_PM_FN_GPL(name)
 #define EXPORT_PM_FN_NS_GPL(name, ns)
 #endif
 
-#define EXPORT_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "", "")
-#define EXPORT_GPL_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "GPL", "")
-#define EXPORT_NS_DEV_PM_OPS(name, ns) _EXPORT_DEV_PM_OPS(name, "", #ns)
-#define EXPORT_NS_GPL_DEV_PM_OPS(name, ns) _EXPORT_DEV_PM_OPS(name, "GPL", #ns)
+#ifdef CONFIG_PM_SLEEP
+#define _EXPORT_DEV_SLEEP_PM_OPS(name, license, ns)	_EXPORT_PM_OPS(name, license, ns)
+#else
+#define _EXPORT_DEV_SLEEP_PM_OPS(name, license, ns)	_DISCARD_PM_OPS(name, license, ns)
+#endif
+
+#define EXPORT_DEV_PM_OPS(name)				_EXPORT_DEV_PM_OPS(name, "", "")
+#define EXPORT_GPL_DEV_PM_OPS(name)			_EXPORT_DEV_PM_OPS(name, "GPL", "")
+#define EXPORT_NS_DEV_PM_OPS(name, ns)			_EXPORT_DEV_PM_OPS(name, "", #ns)
+#define EXPORT_NS_GPL_DEV_PM_OPS(name, ns)		_EXPORT_DEV_PM_OPS(name, "GPL", #ns)
+
+#define EXPORT_DEV_SLEEP_PM_OPS(name)			_EXPORT_DEV_SLEEP_PM_OPS(name, "", "")
+#define EXPORT_GPL_DEV_SLEEP_PM_OPS(name)		_EXPORT_DEV_SLEEP_PM_OPS(name, "GPL", "")
+#define EXPORT_NS_DEV_SLEEP_PM_OPS(name, ns)		_EXPORT_DEV_SLEEP_PM_OPS(name, "", #ns)
+#define EXPORT_NS_GPL_DEV_SLEEP_PM_OPS(name, ns)	_EXPORT_DEV_SLEEP_PM_OPS(name, "GPL", #ns)
 
 /*
  * Use this if you want to use the same suspend and resume callbacks for suspend
@@ -404,19 +419,19 @@ const struct dev_pm_ops name = { \
 	_DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL)
 
 #define EXPORT_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
-	EXPORT_DEV_PM_OPS(name) = { \
+	EXPORT_DEV_SLEEP_PM_OPS(name) = { \
 		SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
 	}
 #define EXPORT_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
-	EXPORT_GPL_DEV_PM_OPS(name) = { \
+	EXPORT_GPL_DEV_SLEEP_PM_OPS(name) = { \
 		SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
 	}
 #define EXPORT_NS_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns)	\
-	EXPORT_NS_DEV_PM_OPS(name, ns) = { \
+	EXPORT_NS_DEV_SLEEP_PM_OPS(name, ns) = { \
 		SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
 	}
 #define EXPORT_NS_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns)	\
-	EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \
+	EXPORT_NS_GPL_DEV_SLEEP_PM_OPS(name, ns) = { \
 		SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
 	}
 
-- 
cgit v1.2.3


From 4d0e179a42879f7d76a5b95a2e7e7a5afa33954a Mon Sep 17 00:00:00 2001
From: Reka Norman <rekanorman@chromium.org>
Date: Fri, 25 Aug 2023 12:43:55 +1000
Subject: media: cros-ec-cec: Manage an array of ports

To support multiple CEC ports, change cros_ec_cec to contain an array of
ports, each with their own CEC adapter, etc.

For now, only create a single port and use that port everywhere, so
there is no functional change. Support for multiple ports will be added
in the following patches.

Signed-off-by: Reka Norman <rekanorman@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/linux/platform_data/cros_ec_commands.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index ab721cf13a98..cb2ddd10a613 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4436,6 +4436,8 @@ struct ec_response_i2c_passthru_protect {
  * These commands are for sending and receiving message via HDMI CEC
  */
 
+#define EC_CEC_MAX_PORTS 16
+
 #define MAX_CEC_MSG_LEN 16
 
 /* CEC message from the AP to be written on the CEC bus */
-- 
cgit v1.2.3


From e90bd1fe7cda1aa267fe683e392b4433ec2dc0d3 Mon Sep 17 00:00:00 2001
From: Reka Norman <rekanorman@chromium.org>
Date: Fri, 25 Aug 2023 12:43:56 +1000
Subject: media: cros-ec-cec: Support multiple ports in set/get host commands

Reuse the top four bits of the cmd field to specify the port number.
The reason for doing this as opposed to adding a separate uint8_t field
is it avoids the need to add new versions of these commands. The change
is backwards compatible since these bits were previously always zero, so
the default behaviour is to always operate on port 0.

Signed-off-by: Reka Norman <rekanorman@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/linux/platform_data/cros_ec_commands.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index cb2ddd10a613..e8bb05db360f 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4457,13 +4457,15 @@ struct ec_params_cec_write {
 /**
  * struct ec_params_cec_set - CEC parameters set
  * @cmd: parameter type, can be CEC_CMD_ENABLE or CEC_CMD_LOGICAL_ADDRESS
+ * @port: CEC port to set the parameter on
  * @val: in case cmd is CEC_CMD_ENABLE, this field can be 0 to disable CEC
  *	or 1 to enable CEC functionality, in case cmd is
  *	CEC_CMD_LOGICAL_ADDRESS, this field encodes the requested logical
  *	address between 0 and 15 or 0xff to unregister
  */
 struct ec_params_cec_set {
-	uint8_t cmd; /* enum cec_command */
+	uint8_t cmd : 4; /* enum cec_command */
+	uint8_t port : 4;
 	uint8_t val;
 } __ec_align1;
 
@@ -4473,9 +4475,11 @@ struct ec_params_cec_set {
 /**
  * struct ec_params_cec_get - CEC parameters get
  * @cmd: parameter type, can be CEC_CMD_ENABLE or CEC_CMD_LOGICAL_ADDRESS
+ * @port: CEC port to get the parameter on
  */
 struct ec_params_cec_get {
-	uint8_t cmd; /* enum cec_command */
+	uint8_t cmd : 4; /* enum cec_command */
+	uint8_t port : 4;
 } __ec_align1;
 
 /**
-- 
cgit v1.2.3


From adbfc747ddfb48c06d238640e16939916b7a4494 Mon Sep 17 00:00:00 2001
From: Reka Norman <rekanorman@chromium.org>
Date: Fri, 25 Aug 2023 12:43:57 +1000
Subject: media: cros-ec-cec: Support multiple ports in write command

Add a v1 of the CEC write command which contains a port parameter. Check
which versions of the write command the EC supports and use the highest
supported version. If it only supports v0, check that there is only one
port. With v0, the EC will assume all write commands are for port 0.

Signed-off-by: Reka Norman <rekanorman@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/linux/platform_data/cros_ec_commands.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index e8bb05db360f..9a0c6e28f370 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4451,6 +4451,18 @@ struct ec_params_cec_write {
 	uint8_t msg[MAX_CEC_MSG_LEN];
 } __ec_align1;
 
+/**
+ * struct ec_params_cec_write_v1 - Message to write to the CEC bus
+ * @port: CEC port to write the message on
+ * @msg_len: length of msg in bytes
+ * @msg: message content to write to the CEC bus
+ */
+struct ec_params_cec_write_v1 {
+	uint8_t port;
+	uint8_t msg_len;
+	uint8_t msg[MAX_CEC_MSG_LEN];
+} __ec_align1;
+
 /* Set various CEC parameters */
 #define EC_CMD_CEC_SET 0x00BA
 
-- 
cgit v1.2.3


From 1cabf52639d16428bc0d61028dcaf38e29c5f3b5 Mon Sep 17 00:00:00 2001
From: Reka Norman <rekanorman@chromium.org>
Date: Fri, 25 Aug 2023 12:43:58 +1000
Subject: media: cros-ec-cec: Support multiple ports in MKBP cec_events

Use the top four bits of the cec_events MKBP event to store the port
number.

Signed-off-by: Reka Norman <rekanorman@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/linux/platform_data/cros_ec_commands.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 9a0c6e28f370..b7e8573a8a49 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4440,6 +4440,16 @@ struct ec_response_i2c_passthru_protect {
 
 #define MAX_CEC_MSG_LEN 16
 
+/*
+ * Helper macros for packing/unpacking cec_events.
+ * bits[27:0] : bitmask of events from enum mkbp_cec_event
+ * bits[31:28]: port number
+ */
+#define EC_MKBP_EVENT_CEC_PACK(events, port) \
+		(((events) & GENMASK(27, 0)) | (((port) & 0xf) << 28))
+#define EC_MKBP_EVENT_CEC_GET_EVENTS(event) ((event) & GENMASK(27, 0))
+#define EC_MKBP_EVENT_CEC_GET_PORT(event) (((event) >> 28) & 0xf)
+
 /* CEC message from the AP to be written on the CEC bus */
 #define EC_CMD_CEC_WRITE_MSG 0x00B8
 
-- 
cgit v1.2.3


From 425d20518c54bc6d66d733fb117a9a4046932d50 Mon Sep 17 00:00:00 2001
From: Reka Norman <rekanorman@chromium.org>
Date: Fri, 25 Aug 2023 12:43:59 +1000
Subject: media: cros-ec-cec: Support receiving messages from multiple ports

Currently, received messages are sent from the EC in the cec_message
MKBP event. Since the size of ec_response_get_next_data_v1 is 16 bytes,
which is also the maximum size of a CEC message, there is no space to
add a port parameter. Increasing the size of
ec_response_get_next_data_v1 is an option, but this would increase
EC-kernel traffic for all MKBP event types.

Instead, use an event to notify that data is ready, and add a new read
command to read the data.

For backwards compatibility with old EC firmware, continue to handle
cec_message events as well.

Signed-off-by: Reka Norman <rekanorman@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/linux/platform_data/cros_ec_commands.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index b7e8573a8a49..ad61c7ff0b28 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4473,6 +4473,27 @@ struct ec_params_cec_write_v1 {
 	uint8_t msg[MAX_CEC_MSG_LEN];
 } __ec_align1;
 
+/* CEC message read from a CEC bus reported back to the AP */
+#define EC_CMD_CEC_READ_MSG 0x00B9
+
+/**
+ * struct ec_params_cec_read - Read a message from the CEC bus
+ * @port: CEC port to read a message on
+ */
+struct ec_params_cec_read {
+	uint8_t port;
+} __ec_align1;
+
+/**
+ * struct ec_response_cec_read - Message read from the CEC bus
+ * @msg_len: length of msg in bytes
+ * @msg: message content read from the CEC bus
+ */
+struct ec_response_cec_read {
+	uint8_t msg_len;
+	uint8_t msg[MAX_CEC_MSG_LEN];
+} __ec_align1;
+
 /* Set various CEC parameters */
 #define EC_CMD_CEC_SET 0x00BA
 
@@ -4529,6 +4550,8 @@ enum mkbp_cec_event {
 	EC_MKBP_CEC_SEND_OK			= BIT(0),
 	/* Outgoing message was not acknowledged */
 	EC_MKBP_CEC_SEND_FAILED			= BIT(1),
+	/* Incoming message can be read out by AP */
+	EC_MKBP_CEC_HAVE_DATA			= BIT(2),
 };
 
 /*****************************************************************************/
-- 
cgit v1.2.3


From 5d227f02ceb9cc120cf04efbd77e12da182a5f62 Mon Sep 17 00:00:00 2001
From: Reka Norman <rekanorman@chromium.org>
Date: Fri, 25 Aug 2023 12:44:01 +1000
Subject: media: cros-ec-cec: Get number of CEC ports from EC

Add a new CEC port count host command and use it to query the number of
CEC ports from the EC. If the host command is not supported then it must
be old EC firmware which only supports one port, so fall back to
assuming one port.

This patch completes support for multiple ports in cros-ec-cec.

Signed-off-by: Reka Norman <rekanorman@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/linux/platform_data/cros_ec_commands.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index ad61c7ff0b28..7dae17b62a4d 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4536,6 +4536,17 @@ struct ec_response_cec_get {
 	uint8_t val;
 } __ec_align1;
 
+/* Get the number of CEC ports */
+#define EC_CMD_CEC_PORT_COUNT 0x00C1
+
+/**
+ * struct ec_response_cec_port_count - CEC port count response
+ * @port_count: number of CEC ports
+ */
+struct ec_response_cec_port_count {
+	uint8_t port_count;
+} __ec_align1;
+
 /* CEC parameters command */
 enum cec_command {
 	/* CEC reading, writing and events enable */
-- 
cgit v1.2.3


From 24775700eaa93ff83b2a0f1e005879cdf186cdd9 Mon Sep 17 00:00:00 2001
From: Muralidhara M K <muralidhara.mk@amd.com>
Date: Tue, 26 Sep 2023 05:19:32 +0000
Subject: x86/amd_nb: Add AMD Family MI300 PCI IDs

Add new Root, Device 18h Function 3, and Function 4 PCI IDS
for AMD F19h Model 90h-9fh (MI300A).

Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Suma Hegde <suma.hegde@amd.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20230926051932.193239-1-suma.hegde@amd.com
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 5fb3d4c393a9..91b457de262e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -579,6 +579,7 @@
 #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3 0x12c3
 #define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3 0x16fb
 #define PCI_DEVICE_ID_AMD_MI200_DF_F3	0x14d3
+#define PCI_DEVICE_ID_AMD_MI300_DF_F3	0x152b
 #define PCI_DEVICE_ID_AMD_CNB17H_F3	0x1703
 #define PCI_DEVICE_ID_AMD_LANCE		0x2000
 #define PCI_DEVICE_ID_AMD_LANCE_HOME	0x2001
-- 
cgit v1.2.3


From 2d5780bbef8dbe6375d481cbea212606a80e4453 Mon Sep 17 00:00:00 2001
From: Petr Tesarik <petr@tesarici.cz>
Date: Tue, 26 Sep 2023 20:55:56 +0200
Subject: swiotlb: fix the check whether a device has used software IO TLB

When CONFIG_SWIOTLB_DYNAMIC=y, devices which do not use the software IO TLB
can avoid swiotlb lookup. A flag is added by commit 1395706a1490 ("swiotlb:
search the software IO TLB only if the device makes use of it"), the flag
is correctly set, but it is then never checked. Add the actual check here.

Note that this code is an alternative to the default pool check, not an
additional check, because:

1. swiotlb_find_pool() also searches the default pool;
2. if dma_uses_io_tlb is false, the default swiotlb pool is not used.

Tested in a KVM guest against a QEMU RAM-backed SATA disk over virtio and
*not* using software IO TLB, this patch increases IOPS by approx 2% for
4-way parallel I/O.

The write memory barrier in swiotlb_dyn_alloc() is not needed, because a
newly allocated pool must always be observed by swiotlb_find_slots() before
an address from that pool is passed to is_swiotlb_buffer().

Correctness was verified using the following litmus test:

C swiotlb-new-pool

(*
 * Result: Never
 *
 * Check that a newly allocated pool is always visible when the
 *  corresponding swiotlb buffer is visible.
 *)

{
	mem_pools = default;
}

P0(int **mem_pools, int *pool)
{
	/* add_mem_pool() */
	WRITE_ONCE(*pool, 999);
	rcu_assign_pointer(*mem_pools, pool);
}

P1(int **mem_pools, int *flag, int *buf)
{
	/* swiotlb_find_slots() */
	int *r0;
	int r1;

	rcu_read_lock();
	r0 = READ_ONCE(*mem_pools);
	r1 = READ_ONCE(*r0);
	rcu_read_unlock();

	if (r1) {
		WRITE_ONCE(*flag, 1);
		smp_mb();
	}

	/* device driver (presumed) */
	WRITE_ONCE(*buf, r1);
}

P2(int **mem_pools, int *flag, int *buf)
{
	/* device driver (presumed) */
	int r0 = READ_ONCE(*buf);

	/* is_swiotlb_buffer() */
	int r1;
	int *r2;
	int r3;

	smp_rmb();
	r1 = READ_ONCE(*flag);
	if (r1) {
		/* swiotlb_find_pool() */
		rcu_read_lock();
		r2 = READ_ONCE(*mem_pools);
		r3 = READ_ONCE(*r2);
		rcu_read_unlock();
	}
}

exists (2:r0<>0 /\ 2:r3=0) (* Not found. *)

Fixes: 1395706a1490 ("swiotlb: search the software IO TLB only if the device makes use of it")
Reported-by: Jonathan Corbet <corbet@lwn.net>
Closes: https://lore.kernel.org/linux-iommu/87a5uz3ob8.fsf@meer.lwn.net/
Signed-off-by: Petr Tesarik <petr@tesarici.cz>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b4536626f8ff..ecde0312dd52 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -172,14 +172,23 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 	if (!mem)
 		return false;
 
-	if (IS_ENABLED(CONFIG_SWIOTLB_DYNAMIC)) {
-		/* Pairs with smp_wmb() in swiotlb_find_slots() and
-		 * swiotlb_dyn_alloc(), which modify the RCU lists.
-		 */
-		smp_rmb();
-		return swiotlb_find_pool(dev, paddr);
-	}
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+	/*
+	 * All SWIOTLB buffer addresses must have been returned by
+	 * swiotlb_tbl_map_single() and passed to a device driver.
+	 * If a SWIOTLB address is checked on another CPU, then it was
+	 * presumably loaded by the device driver from an unspecified private
+	 * data structure. Make sure that this load is ordered before reading
+	 * dev->dma_uses_io_tlb here and mem->pools in swiotlb_find_pool().
+	 *
+	 * This barrier pairs with smp_mb() in swiotlb_find_slots().
+	 */
+	smp_rmb();
+	return READ_ONCE(dev->dma_uses_io_tlb) &&
+		swiotlb_find_pool(dev, paddr);
+#else
 	return paddr >= mem->defpool.start && paddr < mem->defpool.end;
+#endif
 }
 
 static inline bool is_swiotlb_force_bounce(struct device *dev)
-- 
cgit v1.2.3


From 2e2b547950bc09e75afe912f9683be39c2195d9d Mon Sep 17 00:00:00 2001
From: Wenchao Chen <wenchao.chen@unisoc.com>
Date: Tue, 19 Sep 2023 15:47:06 +0800
Subject: mmc: core: Allow dynamical updates of the number of requests for hsq

To allow dynamical updates of the current number of used in-flight
requests, let's move away from using a hard-coded value to a use a
corresponding variable in the struct mmc_host.

This can be valuable when optimizing for certain I/O request sequences, as
shown by subsequent changes.

Signed-off-by: Wenchao Chen <wenchao.chen@unisoc.com>
Link: https://lore.kernel.org/r/20230919074707.25517-2-wenchao.chen@unisoc.com
[Ulf: Re-wrote the commitmsg to clarify the change]
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/mmc/host.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 62a6847a3b6f..2f445c651742 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -526,6 +526,7 @@ struct mmc_host {
 
 	/* Host Software Queue support */
 	bool			hsq_enabled;
+	int			hsq_depth;
 
 	u32			err_stats[MMC_ERR_MAX];
 	unsigned long		private[] ____cacheline_aligned;
-- 
cgit v1.2.3


From 1a6a464774947920dcedcf7409be62495c7cedd0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 12 Sep 2023 12:44:06 +0200
Subject: timers: Tag (hr)timer softirq as hotplug safe

Specific stress involving frequent CPU-hotplug operations, such as
running rcutorture for example, may trigger the following message:

  NOHZ tick-stop error: local softirq work is pending, handler #02!!!"

This happens in the CPU-down hotplug process, after
CPUHP_AP_SMPBOOT_THREADS whose teardown callback parks ksoftirqd, and
before the target CPU shuts down through CPUHP_AP_IDLE_DEAD. In this
fragile intermediate state, softirqs waiting for threaded handling may be
forever ignored and eventually reported by the idle task as in the above
example.

However some vectors are known to be safe as long as the corresponding
subsystems have teardown callbacks handling the migration of their
events. The above error message reports pending timers softirq although
this vector can be considered as hotplug safe because the
CPUHP_TIMERS_PREPARE teardown callback performs the necessary migration
of timers after the death of the CPU. Hrtimers also have a similar
hotplug handling.

Therefore this error message, as far as (hr-)timers are concerned, can
be considered spurious and the relevant softirq vectors can be marked as
hotplug safe.

Fixes: 0345691b24c0 ("tick/rcu: Stop allowing RCU_SOFTIRQ in idle")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20230912104406.312185-6-frederic@kernel.org
---
 include/linux/interrupt.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a92bce40b04b..4a1dc88ddbff 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -569,8 +569,12 @@ enum
  * 	2) rcu_report_dead() reports the final quiescent states.
  *
  * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue
+ *
+ * _ (HR)TIMER_SOFTIRQ: (hr)timers_dead_cpu() migrates the queue
  */
-#define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(RCU_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ))
+#define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(TIMER_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ) |\
+				   BIT(HRTIMER_SOFTIRQ) | BIT(RCU_SOFTIRQ))
+
 
 /* map softirq index to softirq name. update 'softirq_to_name' in
  * kernel/softirq.c when adding a new softirq.
-- 
cgit v1.2.3


From c02a427f7b64ed5b840a0720a6cee5a17a1e7e07 Mon Sep 17 00:00:00 2001
From: Xueshi Hu <xueshi.hu@smartx.com>
Date: Tue, 12 Sep 2023 12:44:05 +0200
Subject: tick/nohz: Remove unused tick_nohz_idle_stop_tick_protected()

All the caller has been removed since commit 336f560a8917 ("x86/xen: don't
let xen_pv_play_dead() return")

Signed-off-by: Xueshi Hu <xueshi.hu@smartx.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230912104406.312185-5-frederic@kernel.org
---
 include/linux/tick.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 9459fef5b857..716d17f31c45 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -140,14 +140,6 @@ extern unsigned long tick_nohz_get_idle_calls(void);
 extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
-
-static inline void tick_nohz_idle_stop_tick_protected(void)
-{
-	local_irq_disable();
-	tick_nohz_idle_stop_tick();
-	local_irq_enable();
-}
-
 #else /* !CONFIG_NO_HZ_COMMON */
 #define tick_nohz_enabled (0)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
@@ -170,8 +162,6 @@ static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
-
-static inline void tick_nohz_idle_stop_tick_protected(void) { }
 #endif /* !CONFIG_NO_HZ_COMMON */
 
 #ifdef CONFIG_NO_HZ_FULL
-- 
cgit v1.2.3


From 6260ecd04594360ae2af104fb2641317728a66e4 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:51:27 -0700
Subject: irqdomain: Annotate struct irq_domain with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct irq_domain.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922175127.work.214-kees@kernel.org
---
 include/linux/irqdomain.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 51c254b7fec2..ee0a82c60508 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -174,7 +174,7 @@ struct irq_domain {
 	irq_hw_number_t			hwirq_max;
 	unsigned int			revmap_size;
 	struct radix_tree_root		revmap_tree;
-	struct irq_data __rcu		*revmap[];
+	struct irq_data __rcu		*revmap[] __counted_by(revmap_size);
 };
 
 /* Irq domain flags */
-- 
cgit v1.2.3


From d069ed6b752f91cea6341a9c60be42837678a7f5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 21 Sep 2023 20:01:43 +0200
Subject: thermal: core: Allow trip pointers to be used for cooling device
 binding

Add new helper functions, thermal_bind_cdev_to_trip() and
thermal_unbind_cdev_from_trip(), to allow a trip pointer to be used for
binding a cooling device to a trip point and unbinding it, respectively,
and redefine the existing helpers, thermal_zone_bind_cooling_device()
and thermal_zone_unbind_cooling_device(), as wrappers around the new
ones, respectively.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 include/linux/thermal.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 6cfcae22ba12..6710a4ace992 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -320,10 +320,18 @@ const char *thermal_zone_device_type(struct thermal_zone_device *tzd);
 int thermal_zone_device_id(struct thermal_zone_device *tzd);
 struct device *thermal_zone_device(struct thermal_zone_device *tzd);
 
+int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz,
+			      const struct thermal_trip *trip,
+			      struct thermal_cooling_device *cdev,
+			      unsigned long upper, unsigned long lower,
+			      unsigned int weight);
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
 				     struct thermal_cooling_device *,
 				     unsigned long, unsigned long,
 				     unsigned int);
+int thermal_unbind_cdev_from_trip(struct thermal_zone_device *tz,
+				  const struct thermal_trip *trip,
+				  struct thermal_cooling_device *cdev);
 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int,
 				       struct thermal_cooling_device *);
 void thermal_zone_device_update(struct thermal_zone_device *,
-- 
cgit v1.2.3


From 5aa4c9608d2d5fea29e211a80c29696f7d94e9f7 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Mon, 11 Sep 2023 12:38:48 +0300
Subject: net/mlx5: Introduce ifc bits for migration in a chunk mode

Introduce ifc related stuff to enable migration in a chunk mode.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20230911093856.81910-2-yishaih@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index fc3db401f8a2..3265bfcb3156 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1948,7 +1948,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_c0[0x8];
 	u8	   migration_multi_load[0x1];
 	u8	   migration_tracking_state[0x1];
-	u8	   reserved_at_ca[0x16];
+	u8	   reserved_at_ca[0x6];
+	u8	   migration_in_chunks[0x1];
+	u8	   reserved_at_d1[0xf];
 
 	u8	   reserved_at_e0[0xc0];
 
@@ -12392,7 +12394,8 @@ struct mlx5_ifc_query_vhca_migration_state_in_bits {
 	u8         op_mod[0x10];
 
 	u8         incremental[0x1];
-	u8         reserved_at_41[0xf];
+	u8         chunk[0x1];
+	u8         reserved_at_42[0xe];
 	u8         vhca_id[0x10];
 
 	u8         reserved_at_60[0x20];
@@ -12408,7 +12411,11 @@ struct mlx5_ifc_query_vhca_migration_state_out_bits {
 
 	u8         required_umem_size[0x20];
 
-	u8         reserved_at_a0[0x160];
+	u8         reserved_at_a0[0x20];
+
+	u8         remaining_total_size[0x40];
+
+	u8         reserved_at_100[0x100];
 };
 
 struct mlx5_ifc_save_vhca_state_in_bits {
@@ -12440,7 +12447,7 @@ struct mlx5_ifc_save_vhca_state_out_bits {
 
 	u8         actual_image_size[0x20];
 
-	u8         reserved_at_60[0x20];
+	u8         next_required_umem_size[0x20];
 };
 
 struct mlx5_ifc_load_vhca_state_in_bits {
-- 
cgit v1.2.3


From fb99ef17865035a6657786d4b2af11a27ba23f9b Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Fri, 25 Aug 2023 15:41:14 +0900
Subject: ata: libata-scsi: link ata port and scsi device

There is no direct device ancestry defined between an ata_device and
its scsi device which prevents the power management code from correctly
ordering suspend and resume operations. Create such ancestry with the
ata device as the parent to ensure that the scsi device (child) is
suspended before the ata device and that resume handles the ata device
before the scsi device.

The parent-child (supplier-consumer) relationship is established between
the ata_port (parent) and the scsi device (child) with the function
device_add_link(). The parent used is not the ata_device as the PM
operations are defined per port and the status of all devices connected
through that port is controlled from the port operations.

The device link is established with the new function
ata_scsi_slave_alloc(), and this function is used to define the
->slave_alloc callback of the scsi host template of all ata drivers.

Fixes: a19a93e4c6a9 ("scsi: core: pm: Rely on the device driver core for async power management")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Niklas Cassel <niklas.cassel@wdc.com>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
---
 include/linux/libata.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 84aca8c44fa3..3ce1ab408114 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1148,6 +1148,7 @@ extern int ata_std_bios_param(struct scsi_device *sdev,
 			      struct block_device *bdev,
 			      sector_t capacity, int geom[]);
 extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev);
+extern int ata_scsi_slave_alloc(struct scsi_device *sdev);
 extern int ata_scsi_slave_config(struct scsi_device *sdev);
 extern void ata_scsi_slave_destroy(struct scsi_device *sdev);
 extern int ata_scsi_change_queue_depth(struct scsi_device *sdev,
@@ -1396,6 +1397,7 @@ extern const struct attribute_group *ata_common_sdev_groups[];
 	.this_id		= ATA_SHT_THIS_ID,		\
 	.emulated		= ATA_SHT_EMULATED,		\
 	.proc_name		= drv_name,			\
+	.slave_alloc		= ata_scsi_slave_alloc,		\
 	.slave_destroy		= ata_scsi_slave_destroy,	\
 	.bios_param		= ata_std_bios_param,		\
 	.unlock_native_capacity	= ata_scsi_unlock_native_capacity,\
-- 
cgit v1.2.3


From aa3998dbeb3abce63653b7f6d4542e7dcd022590 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Sat, 26 Aug 2023 09:43:39 +0900
Subject: ata: libata-scsi: Disable scsi device manage_system_start_stop

The introduction of a device link to create a consumer/supplier
relationship between the scsi device of an ATA device and the ATA port
of that ATA device fixes the ordering of system suspend and resume
operations. For suspend, the scsi device is suspended first and the ata
port after it. This is fine as this allows the synchronize cache and
START STOP UNIT commands issued by the scsi disk driver to be executed
before the ata port is disabled.

For resume operations, the ata port is resumed first, followed
by the scsi device. This allows having the request queue of the scsi
device to be unfrozen after the ata port resume is scheduled in EH,
thus avoiding to see new requests prematurely issued to the ATA device.
Since libata sets manage_system_start_stop to 1, the scsi disk resume
operation also results in issuing a START STOP UNIT command to the
device being resumed so that the device exits standby power mode.

However, restoring the ATA device to the active power mode must be
synchronized with libata EH processing of the port resume operation to
avoid either 1) seeing the start stop unit command being received too
early when the port is not yet resumed and ready to accept commands, or
after the port resume process issues commands such as IDENTIFY to
revalidate the device. In this last case, the risk is that the device
revalidation fails with timeout errors as the drive is still spun down.

Commit 0a8589055936 ("ata,scsi: do not issue START STOP UNIT on resume")
disabled issuing the START STOP UNIT command to avoid issues with it.
But this is incorrect as transitioning a device to the active power
mode from the standby power mode set on suspend requires a media access
command. The IDENTIFY, READ LOG and SET FEATURES commands executed in
libata EH context triggered by the ata port resume operation may thus
fail.

Fix these synchronization issues is by handling a device power mode
transitions for system suspend and resume directly in libata EH context,
without relying on the scsi disk driver management triggered with the
manage_system_start_stop flag.

To do this, the following libata helper functions are introduced:

1) ata_dev_power_set_standby():

This function issues a STANDBY IMMEDIATE command to transitiom a device
to the standby power mode. For HDDs, this spins down the disks. This
function applies only to ATA and ZAC devices and does nothing otherwise.
This function also does nothing for devices that have the
ATA_FLAG_NO_POWEROFF_SPINDOWN or ATA_FLAG_NO_HIBERNATE_SPINDOWN flag
set.

For suspend, call ata_dev_power_set_standby() in
ata_eh_handle_port_suspend() before the port is disabled and frozen.
ata_eh_unload() is also modified to transition all enabled devices to
the standby power mode when the system is shutdown or devices removed.

2) ata_dev_power_set_active() and

This function applies to ATA or ZAC devices and issues a VERIFY command
for 1 sector at LBA 0 to transition the device to the active power mode.
For HDDs, since this function will complete only once the disk spin up.
Its execution uses the same timeouts as for reset, to give the drive
enough time to complete spinup without triggering a command timeout.

For resume, call ata_dev_power_set_active() in
ata_eh_revalidate_and_attach() after the port has been enabled and
before any other command is issued to the device.

With these changes, the manage_system_start_stop and no_start_on_resume
scsi device flags do not need to be set in ata_scsi_dev_config(). The
flag manage_runtime_start_stop is still set to allow the sd driver to
spinup/spindown a disk through the sd runtime operations.

Fixes: 0a8589055936 ("ata,scsi: do not issue START STOP UNIT on resume")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/libata.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 3ce1ab408114..2a7d2af0ed80 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -192,6 +192,7 @@ enum {
 	ATA_PFLAG_UNLOADING	= (1 << 9), /* driver is being unloaded */
 	ATA_PFLAG_UNLOADED	= (1 << 10), /* driver is unloaded */
 
+	ATA_PFLAG_RESUMING	= (1 << 16),  /* port is being resumed */
 	ATA_PFLAG_SUSPENDED	= (1 << 17), /* port is suspended (power) */
 	ATA_PFLAG_PM_PENDING	= (1 << 18), /* PM operation pending */
 	ATA_PFLAG_INIT_GTM_VALID = (1 << 19), /* initial gtm data valid */
@@ -318,9 +319,10 @@ enum {
 	ATA_EH_ENABLE_LINK	= (1 << 3),
 	ATA_EH_PARK		= (1 << 5), /* unload heads and stop I/O */
 	ATA_EH_GET_SUCCESS_SENSE = (1 << 6), /* Get sense data for successful cmd */
+	ATA_EH_SET_ACTIVE	= (1 << 7), /* Set a device to active power mode */
 
 	ATA_EH_PERDEV_MASK	= ATA_EH_REVALIDATE | ATA_EH_PARK |
-				  ATA_EH_GET_SUCCESS_SENSE,
+				  ATA_EH_GET_SUCCESS_SENSE | ATA_EH_SET_ACTIVE,
 	ATA_EH_ALL_ACTIONS	= ATA_EH_REVALIDATE | ATA_EH_RESET |
 				  ATA_EH_ENABLE_LINK,
 
@@ -357,7 +359,7 @@ enum {
 	/* This should match the actual table size of
 	 * ata_eh_cmd_timeout_table in libata-eh.c.
 	 */
-	ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 7,
+	ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 8,
 
 	/* Horkage types. May be set by libata or controller on drives
 	   (some horkage may be drive/controller pair dependent */
-- 
cgit v1.2.3


From 528ce6781726e022bc5dc84034360e6e8f1b89bd Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Thu, 28 Sep 2023 20:43:24 +0800
Subject: io_uring: retain top 8bits of uring_cmd flags for kernel internal use

Retain top 8bits of uring_cmd flags for kernel internal use, so that we
can move IORING_URING_CMD_POLLED out of uapi header.

Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 106cdc55ff3b..ae08d6f66e62 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -22,6 +22,9 @@ enum io_uring_cmd_flags {
 	IO_URING_F_IOPOLL		= (1 << 10),
 };
 
+/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
+#define IORING_URING_CMD_POLLED		(1U << 31)
+
 struct io_uring_cmd {
 	struct file	*file;
 	const struct io_uring_sqe *sqe;
-- 
cgit v1.2.3


From 93b8cc60c37b9d17732b7a297e5dca29b50a990d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Thu, 28 Sep 2023 20:43:25 +0800
Subject: io_uring: cancelable uring_cmd

uring_cmd may never complete, such as ublk, in which uring cmd isn't
completed until one new block request is coming from ublk block device.

Add cancelable uring_cmd to provide mechanism to driver for cancelling
pending commands in its own way.

Add API of io_uring_cmd_mark_cancelable() for driver to mark one command as
cancelable, then io_uring will cancel this command in
io_uring_cancel_generic(). ->uring_cmd() callback is reused for canceling
command in driver's way, then driver gets notified with the cancelling
from io_uring.

Add API of io_uring_cmd_get_task() to help driver cancel handler
deal with the canceling.

Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Suggested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h       | 15 +++++++++++++++
 include/linux/io_uring_types.h |  6 ++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index ae08d6f66e62..b4391e0a9bc8 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -20,9 +20,13 @@ enum io_uring_cmd_flags {
 	IO_URING_F_SQE128		= (1 << 8),
 	IO_URING_F_CQE32		= (1 << 9),
 	IO_URING_F_IOPOLL		= (1 << 10),
+
+	/* set when uring wants to cancel a previously issued command */
+	IO_URING_F_CANCEL		= (1 << 11),
 };
 
 /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
+#define IORING_URING_CMD_CANCELABLE	(1U << 30)
 #define IORING_URING_CMD_POLLED		(1U << 31)
 
 struct io_uring_cmd {
@@ -85,6 +89,9 @@ static inline void io_uring_free(struct task_struct *tsk)
 		__io_uring_free(tsk);
 }
 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
+void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags);
+struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
 #else
 static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd)
@@ -125,6 +132,14 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
 {
 	return -EOPNOTSUPP;
 }
+static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+}
+static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+	return NULL;
+}
 #endif
 
 #endif
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index fe1c5d4ec56c..e178461fa513 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -265,6 +265,12 @@ struct io_ring_ctx {
 		 */
 		struct io_wq_work_list	iopoll_list;
 		bool			poll_multi_queue;
+
+		/*
+		 * Any cancelable uring_cmd is added to this list in
+		 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
+		 */
+		struct hlist_head	cancelable_uring_cmd;
 	} ____cacheline_aligned_in_smp;
 
 	struct {
-- 
cgit v1.2.3


From a941b784b15ff65e1a3b6a259c6d6cf7fa0bb3c3 Mon Sep 17 00:00:00 2001
From: Nipun Gupta <nipun.gupta@amd.com>
Date: Fri, 15 Sep 2023 10:24:21 +0530
Subject: cdx: add support for bus mastering

Introduce cdx_set_master() and cdx_clear_master() APIs to support
enable and disable of bus mastering. Drivers need to use these APIs to
enable/disable DMAs from the CDX devices.

Signed-off-by: Nipun Gupta <nipun.gupta@amd.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansen-van-vuuren@amd.com>
Link: https://lore.kernel.org/r/20230915045423.31630-1-nipun.gupta@amd.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/cdx/cdx_bus.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index bead71b7bc73..8320ec3b9e37 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -21,11 +21,13 @@
 struct cdx_controller;
 
 enum {
+	CDX_DEV_BUS_MASTER_CONF,
 	CDX_DEV_RESET_CONF,
 };
 
 struct cdx_device_config {
 	u8 type;
+	bool bus_master_enable;
 };
 
 typedef int (*cdx_scan_cb)(struct cdx_controller *cdx);
@@ -170,4 +172,20 @@ extern struct bus_type cdx_bus_type;
  */
 int cdx_dev_reset(struct device *dev);
 
+/**
+ * cdx_set_master - enables bus-mastering for CDX device
+ * @cdx_dev: the CDX device to enable
+ *
+ * Return: 0 for success, -errno on failure
+ */
+int cdx_set_master(struct cdx_device *cdx_dev);
+
+/**
+ * cdx_clear_master - disables bus-mastering for CDX device
+ * @cdx_dev: the CDX device to disable
+ *
+ * Return: 0 for success, -errno on failure
+ */
+int cdx_clear_master(struct cdx_device *cdx_dev);
+
 #endif /* _CDX_BUS_H_ */
-- 
cgit v1.2.3


From d427da2323b093a65d8317783e76ab8fad2e2ef0 Mon Sep 17 00:00:00 2001
From: Sui Jingfeng <suijingfeng@loongson.cn>
Date: Fri, 25 Aug 2023 14:27:10 +0800
Subject: PCI: Add pci_get_base_class() helper

There is no function to get all PCI devices in a system by matching
against the base class code only, ignoring the sub-class code and
the programming interface.  Add pci_get_base_class() to suit the
need.

For example, if a driver wants to process all PCI display devices in
a system, it can do so like this:

  pdev = NULL;
  while ((pdev = pci_get_base_class(PCI_BASE_CLASS_DISPLAY, pdev))) {
    do_something_for_pci_display_device(pdev);
  }

Link: https://lore.kernel.org/r/20230825062714.6325-2-sui.jingfeng@linux.dev
Signed-off-by: Sui Jingfeng <suijingfeng@loongson.cn>
[bhelgaas: reword commit log]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/linux/pci.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8c7c2c3c6c65..40ac1288a2cc 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1181,6 +1181,8 @@ struct pci_dev *pci_get_slot(struct pci_bus *bus, unsigned int devfn);
 struct pci_dev *pci_get_domain_bus_and_slot(int domain, unsigned int bus,
 					    unsigned int devfn);
 struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from);
+struct pci_dev *pci_get_base_class(unsigned int class, struct pci_dev *from);
+
 int pci_dev_present(const struct pci_device_id *ids);
 
 int pci_bus_read_config_byte(struct pci_bus *bus, unsigned int devfn,
@@ -1924,6 +1926,9 @@ static inline struct pci_dev *pci_get_class(unsigned int class,
 					    struct pci_dev *from)
 { return NULL; }
 
+static inline struct pci_dev *pci_get_base_class(unsigned int class,
+						 struct pci_dev *from)
+{ return NULL; }
 
 static inline int pci_dev_present(const struct pci_device_id *ids)
 { return 0; }
-- 
cgit v1.2.3


From 194bb58c6090e39bd7d9b9c888a079213628e1f6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 8 Jun 2023 11:57:40 -0600
Subject: io_uring: add support for futex wake and wait

Add support for FUTEX_WAKE/WAIT primitives.

IORING_OP_FUTEX_WAKE is mix of FUTEX_WAKE and FUTEX_WAKE_BITSET, as
it does support passing in a bitset.

Similary, IORING_OP_FUTEX_WAIT is a mix of FUTEX_WAIT and
FUTEX_WAIT_BITSET.

For both of them, they are using the futex2 interface.

FUTEX_WAKE is straight forward, as those can always be done directly from
the io_uring submission without needing async handling. For FUTEX_WAIT,
things are a bit more complicated. If the futex isn't ready, then we
rely on a callback via futex_queue->wake() when someone wakes up the
futex. From that calback, we queue up task_work with the original task,
which will post a CQE and wake it, if necessary.

Cancelations are supported, both from the application point-of-view,
but also to be able to cancel pending waits if the ring exits before
all events have occurred. The return value of futex_unqueue() is used
to gate who wins the potential race between cancelation and futex
wakeups. Whomever gets a 'ret == 1' return from that claims ownership
of the io_uring futex request.

This is just the barebones wait/wake support. PI or REQUEUE support is
not added at this point, unclear if we might look into that later.

Likewise, explicit timeouts are not supported either. It is expected
that users that need timeouts would do so via the usual io_uring
mechanism to do that using linked timeouts.

The SQE format is as follows:

`addr`		Address of futex
`fd`		futex2(2) FUTEX2_* flags
`futex_flags`	io_uring specific command flags. None valid now.
`addr2`		Value of futex
`addr3`		Mask to wake/wait

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e178461fa513..990984614fca 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -321,6 +321,11 @@ struct io_ring_ctx {
 
 	struct hlist_head	waitid_list;
 
+#ifdef CONFIG_FUTEX
+	struct hlist_head	futex_list;
+	struct io_alloc_cache	futex_cache;
+#endif
+
 	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
 	struct io_sq_data	*sq_data;	/* if using sq thread polling */
 
-- 
cgit v1.2.3


From d77008421afda6208b1256c9b218457acd174ca6 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Thu, 17 Aug 2023 21:14:57 -0700
Subject: groups: Convert group_info.usage to refcount_t

atomic_t variables are currently used to implement reference counters
with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further
   increments aren't allowed
 - counter schema uses basic atomic operations
   (set, inc, inc_not_zero, dec_and_test, etc.)

Such atomic variables should be converted to a newly provided
refcount_t type and API that prevents accidental counter overflows and
underflows. This is important since overflows and underflows can lead
to use-after-free situation and be exploitable.

The variable group_info.usage is used as pure reference counter.
Convert it to refcount_t and fix up the operations.

**Important note for maintainers:

Some functions from refcount_t API defined in refcount.h have different
memory ordering guarantees than their atomic counterparts. Please check
Documentation/core-api/refcount-vs-atomic.rst for more information.

Normally the differences should not matter since refcount_t provides
enough guarantees to satisfy the refcounting use cases, but in some
rare cases it might matter. Please double check that you don't have
some undocumented memory guarantees for this variable usage.

For the group_info.usage it might make a difference in following places:
 - put_group_info(): decrement in refcount_dec_and_test() only
   provides RELEASE ordering and ACQUIRE ordering on success vs. fully
   ordered atomic counterpart

Suggested-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Reviewed-by: David Windsor <dwindsor@gmail.com>
Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com>
Link: https://lore.kernel.org/r/20230818041456.gonna.009-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/cred.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index f923528d5cc4..92f8d772da6f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/key.h>
 #include <linux/atomic.h>
+#include <linux/refcount.h>
 #include <linux/uidgid.h>
 #include <linux/sched.h>
 #include <linux/sched/user.h>
@@ -23,7 +24,7 @@ struct inode;
  * COW Supplementary groups list
  */
 struct group_info {
-	atomic_t	usage;
+	refcount_t	usage;
 	int		ngroups;
 	kgid_t		gid[];
 } __randomize_layout;
@@ -39,7 +40,7 @@ struct group_info {
  */
 static inline struct group_info *get_group_info(struct group_info *gi)
 {
-	atomic_inc(&gi->usage);
+	refcount_inc(&gi->usage);
 	return gi;
 }
 
@@ -49,7 +50,7 @@ static inline struct group_info *get_group_info(struct group_info *gi)
  */
 #define put_group_info(group_info)			\
 do {							\
-	if (atomic_dec_and_test(&(group_info)->usage))	\
+	if (refcount_dec_and_test(&(group_info)->usage))	\
 		groups_free(group_info);		\
 } while (0)
 
-- 
cgit v1.2.3


From ce60f27bb62dfeb1bf827350520f34abc84e0933 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 20 Sep 2023 05:09:58 +0100
Subject: mm: abstract moving to the next PFN

In order to fix the L1TF vulnerability, x86 can invert the PTE bits for
PROT_NONE VMAs, which means we cannot move from one PTE to the next by
adding 1 to the PFN field of the PTE.  This results in the BUG reported at
[1].

Abstract advancing the PTE to the next PFN through a pte_next_pfn()
function/macro.

Link: https://lkml.kernel.org/r/20230920040958.866520-1-willy@infradead.org
Fixes: bcc6cc832573 ("mm: add default definition of set_ptes()")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reported-by: syzbot+55cc72f8cc3a549119df@syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/000000000000d099fa0604f03351@google.com [1]
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 1fba072b3dac..af7639c3b0a3 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -206,6 +206,14 @@ static inline int pmd_young(pmd_t pmd)
 #endif
 
 #ifndef set_ptes
+
+#ifndef pte_next_pfn
+static inline pte_t pte_next_pfn(pte_t pte)
+{
+	return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+}
+#endif
+
 /**
  * set_ptes - Map consecutive pages to a contiguous range of addresses.
  * @mm: Address space to map the pages into.
@@ -231,7 +239,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
 		if (--nr == 0)
 			break;
 		ptep++;
-		pte = __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+		pte = pte_next_pfn(pte);
 	}
 	arch_leave_lazy_mmu_mode();
 }
-- 
cgit v1.2.3


From 5c590804b6b0ff933ed4e5cee5d76de3a5048d9f Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Thu, 21 Sep 2023 14:12:35 -0400
Subject: maple_tree: add mas_is_active() to detect in-tree walks

Patch series "maple_tree: Fix mas_prev() state regression".

Pedro Falcato retported an mprotect regression [1] which was bisected back
to the iterator changes for maple tree.  Root cause analysis showed the
mas_prev() running off the end of the VMA space (previous from 0) followed
by mas_find(), would skip the first value.

This patchset introduces maple state underflow/overflow so the sequence of
calls on the maple state will return what the user expects.

Users who encounter this bug may see mprotect(), userfaultfd_register(),
and mlock() fail on VMAs mapped with address 0.


This patch (of 2):

Instead of constantly checking each possibility of the maple state,
create a fast path that will skip over checking unlikely states.

Link: https://lkml.kernel.org/r/20230921181236.509072-1-Liam.Howlett@oracle.com
Link: https://lkml.kernel.org/r/20230921181236.509072-2-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index e41c70ac7744..f66f5f78f8cf 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -511,6 +511,15 @@ static inline bool mas_is_paused(const struct ma_state *mas)
 	return mas->node == MAS_PAUSE;
 }
 
+/* Check if the mas is pointing to a node or not */
+static inline bool mas_is_active(struct ma_state *mas)
+{
+	if ((unsigned long)mas->node >= MAPLE_RESERVED_RANGE)
+		return true;
+
+	return false;
+}
+
 /**
  * mas_reset() - Reset a Maple Tree operation state.
  * @mas: Maple Tree operation state.
-- 
cgit v1.2.3


From a8091f039c1ebf5cb0d5261e3613f18eb2a5d8b7 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Thu, 21 Sep 2023 14:12:36 -0400
Subject: maple_tree: add MAS_UNDERFLOW and MAS_OVERFLOW states

When updating the maple tree iterator to avoid rewalks, an issue was
introduced when shifting beyond the limits.  This can be seen by trying to
go to the previous address of 0, which would set the maple node to
MAS_NONE and keep the range as the last entry.

Subsequent calls to mas_find() would then search upwards from mas->last
and skip the value at mas->index/mas->last.  This showed up as a bug in
mprotect which skips the actual VMA at the current range after attempting
to go to the previous VMA from 0.

Since MAS_NONE may already be set when searching for a value that isn't
contained within a node, changing the handling of MAS_NONE in mas_find()
would make the code more complicated and error prone.  Furthermore, there
was no way to tell which limit was hit, and thus which action to take
(next or the entry at the current range).

This solution is to add two states to track what happened with the
previous iterator action.  This allows for the expected behaviour of the
next command to return the correct item (either the item at the range
requested, or the next/previous).

Tests are also added and updated accordingly.

Link: https://lkml.kernel.org/r/20230921181236.509072-3-Liam.Howlett@oracle.com
Link: https://gist.github.com/heatd/85d2971fae1501b55b6ea401fbbe485b
Link: https://lore.kernel.org/linux-mm/20230921181236.509072-1-Liam.Howlett@oracle.com/
Fixes: 39193685d585 ("maple_tree: try harder to keep active node with mas_prev()")
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reported-by: Pedro Falcato <pedro.falcato@gmail.com>
Closes: https://gist.github.com/heatd/85d2971fae1501b55b6ea401fbbe485b
Closes: https://bugs.archlinux.org/task/79656
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index f66f5f78f8cf..d01e850b570f 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -428,6 +428,8 @@ struct ma_wr_state {
 #define MAS_ROOT	((struct maple_enode *)5UL)
 #define MAS_NONE	((struct maple_enode *)9UL)
 #define MAS_PAUSE	((struct maple_enode *)17UL)
+#define MAS_OVERFLOW	((struct maple_enode *)33UL)
+#define MAS_UNDERFLOW	((struct maple_enode *)65UL)
 #define MA_ERROR(err) \
 		((struct maple_enode *)(((unsigned long)err << 2) | 2UL))
 
-- 
cgit v1.2.3


From 935d4f0c6dc8b3533e6e39346de7389a84490178 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Fri, 22 Sep 2023 12:58:03 +0100
Subject: mm: hugetlb: add huge page size param to set_huge_pte_at()

Patch series "Fix set_huge_pte_at() panic on arm64", v2.

This series fixes a bug in arm64's implementation of set_huge_pte_at(),
which can result in an unprivileged user causing a kernel panic.  The
problem was triggered when running the new uffd poison mm selftest for
HUGETLB memory.  This test (and the uffd poison feature) was merged for
v6.5-rc7.

Ideally, I'd like to get this fix in for v6.6 and I've cc'ed stable
(correctly this time) to get it backported to v6.5, where the issue first
showed up.


Description of Bug
==================

arm64's huge pte implementation supports multiple huge page sizes, some of
which are implemented in the page table with multiple contiguous entries.
So set_huge_pte_at() needs to work out how big the logical pte is, so that
it can also work out how many physical ptes (or pmds) need to be written.
It previously did this by grabbing the folio out of the pte and querying
its size.

However, there are cases when the pte being set is actually a swap entry.
But this also used to work fine, because for huge ptes, we only ever saw
migration entries and hwpoison entries.  And both of these types of swap
entries have a PFN embedded, so the code would grab that and everything
still worked out.

But over time, more calls to set_huge_pte_at() have been added that set
swap entry types that do not embed a PFN.  And this causes the code to go
bang.  The triggering case is for the uffd poison test, commit
99aa77215ad0 ("selftests/mm: add uffd unit test for UFFDIO_POISON"), which
causes a PTE_MARKER_POISONED swap entry to be set, coutesey of commit
8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs") -
added in v6.5-rc7.  Although review shows that there are other call sites
that set PTE_MARKER_UFFD_WP (which also has no PFN), these don't trigger
on arm64 because arm64 doesn't support UFFD WP.

If CONFIG_DEBUG_VM is enabled, we do at least get a BUG(), but otherwise,
it will dereference a bad pointer in page_folio():

    static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry)
    {
        VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry));

        return page_folio(pfn_to_page(swp_offset_pfn(entry)));
    }


Fix
===

The simplest fix would have been to revert the dodgy cleanup commit
18f3962953e4 ("mm: hugetlb: kill set_huge_swap_pte_at()"), but since
things have moved on, this would have required an audit of all the new
set_huge_pte_at() call sites to see if they should be converted to
set_huge_swap_pte_at().  As per the original intent of the change, it
would also leave us open to future bugs when people invariably get it
wrong and call the wrong helper.

So instead, I've added a huge page size parameter to set_huge_pte_at().
This means that the arm64 code has the size in all cases.  It's a bigger
change, due to needing to touch the arches that implement the function,
but it is entirely mechanical, so in my view, low risk.

I've compile-tested all touched arches; arm64, parisc, powerpc, riscv,
s390, sparc (and additionally x86_64).  I've additionally booted and run
mm selftests against arm64, where I observe the uffd poison test is fixed,
and there are no other regressions.


This patch (of 2):

In order to fix a bug, arm64 needs to be told the size of the huge page
for which the pte is being set in set_huge_pte_at().  Provide for this by
adding an `unsigned long sz` parameter to the function.  This follows the
same pattern as huge_pte_clear().

This commit makes the required interface modifications to the core mm as
well as all arches that implement this function (arm64, parisc, powerpc,
riscv, s390, sparc).  The actual arm64 bug will be fixed in a separate
commit.

No behavioral changes intended.

Link: https://lkml.kernel.org/r/20230922115804.2043771-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20230922115804.2043771-2-ryan.roberts@arm.com
Fixes: 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs")
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>	[powerpc 8xx]
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>	[vmalloc change]
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Helge Deller <deller@gmx.de>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Cc: <stable@vger.kernel.org>	[6.5+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5b2626063f4f..a30686e649f7 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -984,7 +984,9 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 						unsigned long addr, pte_t *ptep,
 						pte_t old_pte, pte_t pte)
 {
-	set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+	unsigned long psize = huge_page_size(hstate_vma(vma));
+
+	set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
 }
 #endif
 
@@ -1173,7 +1175,7 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
 }
 
 static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-				   pte_t *ptep, pte_t pte)
+				   pte_t *ptep, pte_t pte, unsigned long sz)
 {
 }
 
-- 
cgit v1.2.3


From 348cbf987ed328682af7d74ce98d9f0e6857f42f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sat, 30 Sep 2023 09:20:20 -0700
Subject: Input: mt - annotate struct input_mt with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct input_mt.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922175036.work.762-kees@kernel.org
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/input/mt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/input/mt.h b/include/linux/input/mt.h
index 3b8580bd33c1..2cf89a538b18 100644
--- a/include/linux/input/mt.h
+++ b/include/linux/input/mt.h
@@ -47,7 +47,7 @@ struct input_mt {
 	unsigned int flags;
 	unsigned int frame;
 	int *red;
-	struct input_mt_slot slots[];
+	struct input_mt_slot slots[] __counted_by(num_slots);
 };
 
 static inline void input_mt_set_value(struct input_mt_slot *slot,
-- 
cgit v1.2.3


From fce9c967820a72f600abbf061d7077861685a14d Mon Sep 17 00:00:00 2001
From: Ingo Rohloff <lundril@gmx.de>
Date: Sat, 26 Aug 2023 22:02:41 +0200
Subject: wifi: mt76: mt7921e: Support MT7992 IP in Xiaomi Redmibook 15 Pro
 (2023)

In the Xiaomi Redmibook 15 Pro (2023) laptop I have got, a wifi chip is
used, which according to its PCI Vendor ID is from "ITTIM Technology".

This chip works flawlessly with the mt7921e module.  The driver doesn't
bind to this PCI device, because the Vendor ID from "ITTIM Technology" is
not recognized.

This patch adds the PCI Vendor ID from "ITTIM Technology" to the list of
PCI Vendor IDs and lets the mt7921e driver bind to the mentioned wifi
chip.

Signed-off-by: Ingo Rohloff <lundril@gmx.de>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
 include/linux/pci_ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 2dc75df1437f..6ae1803bcd2f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -180,6 +180,8 @@
 #define PCI_DEVICE_ID_BERKOM_A4T		0xffa4
 #define PCI_DEVICE_ID_BERKOM_SCITEL_QUADRO	0xffa8
 
+#define PCI_VENDOR_ID_ITTIM		0x0b48
+
 #define PCI_VENDOR_ID_COMPAQ		0x0e11
 #define PCI_DEVICE_ID_COMPAQ_TOKENRING	0x0508
 #define PCI_DEVICE_ID_COMPAQ_TACHYON	0xa0fc
-- 
cgit v1.2.3


From 0b068c714ca9479d2783cc333fff5bc2d4a6d45c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 21 Sep 2023 08:52:16 +0000
Subject: net: add DEV_STATS_READ() helper

Companion of DEV_STATS_INC() & DEV_STATS_ADD().

This is going to be used in the series.

Use it in macsec_get_stats64().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7e520c14eb8c..e070a4540fba 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -5236,5 +5236,6 @@ extern struct net_device *blackhole_netdev;
 #define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD)
 #define DEV_STATS_ADD(DEV, FIELD, VAL) 	\
 		atomic_long_add((VAL), &(DEV)->stats.__##FIELD)
+#define DEV_STATS_READ(DEV, FIELD) atomic_long_read(&(DEV)->stats.__##FIELD)
 
 #endif	/* _LINUX_NETDEVICE_H */
-- 
cgit v1.2.3


From fa4c4507099f781ca89a748c480af9cf97629726 Mon Sep 17 00:00:00 2001
From: Niklas Schnelle <schnelle@linux.ibm.com>
Date: Thu, 28 Sep 2023 16:31:35 +0200
Subject: iommu: Allow .iotlb_sync_map to fail and handle s390's -ENOMEM return

On s390 when using a paging hypervisor, .iotlb_sync_map is used to sync
mappings by letting the hypervisor inspect the synced IOVA range and
updating a shadow table. This however means that .iotlb_sync_map can
fail as the hypervisor may run out of resources while doing the sync.
This can be due to the hypervisor being unable to pin guest pages, due
to a limit on mapped addresses such as vfio_iommu_type1.dma_entry_limit
or lack of other resources. Either way such a failure to sync a mapping
should result in a DMA_MAPPING_ERROR.

Now especially when running with batched IOTLB flushes for unmap it may
be that some IOVAs have already been invalidated but not yet synced via
.iotlb_sync_map. Thus if the hypervisor indicates running out of
resources, first do a global flush allowing the hypervisor to free
resources associated with these mappings as well a retry creating the
new mappings and only if that also fails report this error to callers.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com> # sun50i
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Link: https://lore.kernel.org/r/20230928-dma_iommu-v13-1-9e5fc4dacc36@linux.ibm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 64bd20142cbe..1eb638752781 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -350,8 +350,8 @@ struct iommu_domain_ops {
 			      struct iommu_iotlb_gather *iotlb_gather);
 
 	void (*flush_iotlb_all)(struct iommu_domain *domain);
-	void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova,
-			       size_t size);
+	int (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova,
+			      size_t size);
 	void (*iotlb_sync)(struct iommu_domain *domain,
 			   struct iommu_iotlb_gather *iotlb_gather);
 
-- 
cgit v1.2.3


From 32d5bc8b09c7cc48c511809e7c3b1755c7ecc5fa Mon Sep 17 00:00:00 2001
From: Niklas Schnelle <schnelle@linux.ibm.com>
Date: Thu, 28 Sep 2023 16:31:39 +0200
Subject: iommu/dma: Allow a single FQ in addition to per-CPU FQs

In some virtualized environments, including s390 paged memory guests,
IOTLB flushes are used to update IOMMU shadow tables. Due to this, they
are much more expensive than in typical bare metal environments or
non-paged s390 guests. In addition they may parallelize poorly in
virtualized environments. This changes the trade off for flushing IOVAs
such that minimizing the number of IOTLB flushes trumps any benefit of
cheaper queuing operations or increased paralellism.

In this scenario per-CPU flush queues pose several problems. Firstly
per-CPU memory is often quite limited prohibiting larger queues.
Secondly collecting IOVAs per-CPU but flushing via a global timeout
reduces the number of IOVAs flushed for each timeout especially on s390
where PCI interrupts may not be bound to a specific CPU.

Let's introduce a single flush queue mode that reuses the same queue
logic but only allocates a single global queue. This mode is selected by
dma-iommu if a newly introduced .shadow_on_flush flag is set in struct
dev_iommu. As a first user the s390 IOMMU driver sets this flag during
probe_device. With the unchanged small FQ size and timeouts this setting
is worse than per-CPU queues but a follow up patch will make the FQ size
and timeout variable. Together this allows the common IOVA flushing code
to more closely resemble the global flush behavior used on s390's
previous internal DMA API implementation.

Link: https://lore.kernel.org/all/9a466109-01c5-96b0-bf03-304123f435ee@arm.com/
Acked-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com> #s390
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Link: https://lore.kernel.org/r/20230928-dma_iommu-v13-5-9e5fc4dacc36@linux.ibm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 1eb638752781..0c4d8ae985ac 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -424,6 +424,7 @@ struct iommu_fault_param {
  * @attach_deferred: the dma domain attachment is deferred
  * @pci_32bit_workaround: Limit DMA allocations to 32-bit IOVAs
  * @require_direct: device requires IOMMU_RESV_DIRECT regions
+ * @shadow_on_flush: IOTLB flushes are used to sync shadow tables
  *
  * TODO: migrate other per device data pointers under iommu_dev_data, e.g.
  *	struct iommu_group	*iommu_group;
@@ -439,6 +440,7 @@ struct dev_iommu {
 	u32				attach_deferred:1;
 	u32				pci_32bit_workaround:1;
 	u32				require_direct:1;
+	u32				shadow_on_flush:1;
 };
 
 int iommu_device_register(struct iommu_device *iommu,
-- 
cgit v1.2.3


From 0d293714ac32650bfb669ceadf7cc2fad8161401 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Thu, 21 Sep 2023 15:10:27 +0300
Subject: RDMA/mlx5: Send events from IB driver about device affiliation state

Send blocking events from IB driver whenever the device is done being
affiliated or if it is removed from an affiliation.

This is useful since now the EN driver can register to those event and
know when a device is affiliated or not.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Link: https://lore.kernel.org/r/a7491c3e483cfd8d962f5f75b9a25f253043384a.1695296682.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/device.h | 2 ++
 include/linux/mlx5/driver.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 4d5be378fa8c..26333d602a50 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -366,6 +366,8 @@ enum mlx5_driver_event {
 	MLX5_DRIVER_EVENT_UPLINK_NETDEV,
 	MLX5_DRIVER_EVENT_MACSEC_SA_ADDED,
 	MLX5_DRIVER_EVENT_MACSEC_SA_DELETED,
+	MLX5_DRIVER_EVENT_AFFILIATION_DONE,
+	MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
 };
 
 enum {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3033bbaeac81..5ca4e085d813 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1027,6 +1027,8 @@ bool mlx5_cmd_is_down(struct mlx5_core_dev *dev);
 void mlx5_core_uplink_netdev_set(struct mlx5_core_dev *mdev, struct net_device *netdev);
 void mlx5_core_uplink_netdev_event_replay(struct mlx5_core_dev *mdev);
 
+void mlx5_core_mp_event_replay(struct mlx5_core_dev *dev, u32 event, void *data);
+
 void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
-- 
cgit v1.2.3


From ef36ffcb381096ed32c309c342a02bf62939c503 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Thu, 21 Sep 2023 15:10:30 +0300
Subject: net/mlx5: Add alias flow table bits

Add all the capabilities needed to check for alias object support.
As well as all the fields or commands needed for its creation and
the creation of flow table that is able to jump to an alias object.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Link: https://lore.kernel.org/r/544c030f2a78c4adf3fe6b64f97a39cc1bbdabb9.1695296682.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 56 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 3265bfcb3156..23f9780adb83 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -312,6 +312,7 @@ enum {
 	MLX5_CMD_OP_QUERY_VHCA_STATE              = 0xb0d,
 	MLX5_CMD_OP_MODIFY_VHCA_STATE             = 0xb0e,
 	MLX5_CMD_OP_SYNC_CRYPTO                   = 0xb12,
+	MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS       = 0xb16,
 	MLX5_CMD_OP_MAX
 };
 
@@ -1934,6 +1935,14 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         match_definer_format_supported[0x40];
 };
 
+enum {
+	MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_TO_REMOTE_FLOW_TABLE_MISS  = 0x80000,
+};
+
+enum {
+	MLX5_ALLOWED_OBJ_FOR_OTHER_VHCA_ACCESS_FLOW_TABLE       = 0x200,
+};
+
 struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_0[0x80];
 
@@ -1952,7 +1961,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   migration_in_chunks[0x1];
 	u8	   reserved_at_d1[0xf];
 
-	u8	   reserved_at_e0[0xc0];
+	u8	   cross_vhca_object_to_object_supported[0x20];
+
+	u8	   allowed_object_for_other_vhca_access[0x40];
+
+	u8	   reserved_at_140[0x60];
 
 	u8	   flow_table_type_2_type[0x8];
 	u8	   reserved_at_1a8[0x3];
@@ -6371,6 +6384,28 @@ struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_allow_other_vhca_access_in_bits {
+	u8 opcode[0x10];
+	u8 uid[0x10];
+	u8 reserved_at_20[0x10];
+	u8 op_mod[0x10];
+	u8 reserved_at_40[0x50];
+	u8 object_type_to_be_accessed[0x10];
+	u8 object_id_to_be_accessed[0x20];
+	u8 reserved_at_c0[0x40];
+	union {
+		u8 access_key_raw[0x100];
+		u8 access_key[8][0x20];
+	};
+};
+
+struct mlx5_ifc_allow_other_vhca_access_out_bits {
+	u8 status[0x8];
+	u8 reserved_at_8[0x18];
+	u8 syndrome[0x20];
+	u8 reserved_at_40[0x40];
+};
+
 struct mlx5_ifc_modify_header_arg_bits {
 	u8         reserved_at_0[0x80];
 
@@ -6393,6 +6428,24 @@ struct mlx5_ifc_create_match_definer_out_bits {
 	struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr;
 };
 
+struct mlx5_ifc_alias_context_bits {
+	u8 vhca_id_to_be_accessed[0x10];
+	u8 reserved_at_10[0xd];
+	u8 status[0x3];
+	u8 object_id_to_be_accessed[0x20];
+	u8 reserved_at_40[0x40];
+	union {
+		u8 access_key_raw[0x100];
+		u8 access_key[8][0x20];
+	};
+	u8 metadata[0x80];
+};
+
+struct mlx5_ifc_create_alias_obj_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+	struct mlx5_ifc_alias_context_bits alias_ctx;
+};
+
 enum {
 	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_OUTER_HEADERS    = 0x0,
 	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS  = 0x1,
@@ -11919,6 +11972,7 @@ enum {
 	MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = 0x24,
 	MLX5_GENERAL_OBJECT_TYPES_MACSEC = 0x27,
 	MLX5_GENERAL_OBJECT_TYPES_INT_KEK = 0x47,
+	MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS = 0xff15,
 };
 
 enum {
-- 
cgit v1.2.3


From d424348b060d87f92cc59d8e6ea9c612c5b708f5 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Thu, 28 Sep 2023 19:45:12 +0300
Subject: vdpa/mlx5: Expose descriptor group mkey hw capability

Necessary for improved live migration flow. Actual support will be added
in a downstream patch.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Link: https://lore.kernel.org/r/20230928164550.980832-3-dtatulea@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index fc3db401f8a2..3388007c645f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1231,7 +1231,13 @@ struct mlx5_ifc_virtio_emulation_cap_bits {
 	u8         max_emulated_devices[0x8];
 	u8         max_num_virtio_queues[0x18];
 
-	u8         reserved_at_a0[0x60];
+	u8         reserved_at_a0[0x20];
+
+	u8	   reserved_at_c0[0x13];
+	u8         desc_group_mkey_supported[0x1];
+	u8         reserved_at_d4[0xc];
+
+	u8         reserved_at_e0[0x20];
 
 	u8         umem_1_buffer_param_a[0x20];
 
-- 
cgit v1.2.3


From a60359ea32251399c00d934981f5a6fa25ec917f Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Fri, 22 Sep 2023 08:45:51 +0200
Subject: usb: renesas_usbhs: remove boilerplate from header file

There is a SPDX entry, so we can remove the boilerplate.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/20230922064551.4663-1-wsa+renesas@sang-engineering.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/renesas_usbhs.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index d418c55523a7..372898d9eeb0 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -5,16 +5,6 @@
  * Copyright (C) 2011 Renesas Solutions Corp.
  * Copyright (C) 2019 Renesas Electronics Corporation
  * Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  */
 #ifndef RENESAS_USB_H
 #define RENESAS_USB_H
-- 
cgit v1.2.3


From a17fae8fc38e91026f116a85c5068668fbf9848a Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <utkarsh.h.patel@intel.com>
Date: Tue, 19 Sep 2023 19:32:39 -0700
Subject: usb: typec: Add Displayport Alternate Mode 2.1 Support

Displayport Alternate mode 2.1 requires configuration for additional
cable details such as signalling for cable, UHBR13.5 Support, Cable type
and DPAM version.
These details can be used with mux drivers to configure SOP DP
configuration for Displayport Alternate mode 2.1.
This change also includes pertinent cable signalling support in displayport
alternate mode.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Utkarsh Patel <utkarsh.h.patel@intel.com>
Link: https://lore.kernel.org/r/20230920023243.2494410-2-utkarsh.h.patel@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/typec_dp.h | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/typec_dp.h b/include/linux/usb/typec_dp.h
index 8d09c2f0a9b8..1f358098522d 100644
--- a/include/linux/usb/typec_dp.h
+++ b/include/linux/usb/typec_dp.h
@@ -67,8 +67,10 @@ enum {
 #define   DP_CAP_UFP_D			1
 #define   DP_CAP_DFP_D			2
 #define   DP_CAP_DFP_D_AND_UFP_D	3
-#define DP_CAP_DP_SIGNALING		BIT(2) /* Always set */
-#define DP_CAP_GEN2			BIT(3) /* Reserved after v1.0b */
+#define DP_CAP_DP_SIGNALLING(_cap_)	(((_cap_) & GENMASK(5, 2)) >> 2)
+#define   DP_CAP_SIGNALLING_HBR3	1
+#define   DP_CAP_SIGNALLING_UHBR10	2
+#define   DP_CAP_SIGNALLING_UHBR20	3
 #define DP_CAP_RECEPTACLE		BIT(6)
 #define DP_CAP_USB			BIT(7)
 #define DP_CAP_DFP_D_PIN_ASSIGN(_cap_)	(((_cap_) & GENMASK(15, 8)) >> 8)
@@ -78,6 +80,13 @@ enum {
 			DP_CAP_UFP_D_PIN_ASSIGN(_cap_) : DP_CAP_DFP_D_PIN_ASSIGN(_cap_))
 #define DP_CAP_PIN_ASSIGN_DFP_D(_cap_) ((_cap_ & DP_CAP_RECEPTACLE) ? \
 			DP_CAP_DFP_D_PIN_ASSIGN(_cap_) : DP_CAP_UFP_D_PIN_ASSIGN(_cap_))
+#define DP_CAP_UHBR_13_5_SUPPORT	BIT(26)
+#define DP_CAP_CABLE_TYPE(_cap_)	(((_cap_) & GENMASK(29, 28)) >> 28)
+#define   DP_CAP_CABLE_TYPE_PASSIVE	0
+#define   DP_CAP_CABLE_TYPE_RE_TIMER	1
+#define   DP_CAP_CABLE_TYPE_RE_DRIVER	2
+#define   DP_CAP_CABLE_TYPE_OPTICAL	3
+#define DP_CAP_DPAM_VERSION		BIT(30)
 
 /* DisplayPort Status Update VDO bits */
 #define DP_STATUS_CONNECTION(_status_)	((_status_) & 3)
@@ -97,13 +106,24 @@ enum {
 #define DP_CONF_CURRENTLY(_conf_)	((_conf_) & 3)
 #define DP_CONF_UFP_U_AS_DFP_D		BIT(0)
 #define DP_CONF_UFP_U_AS_UFP_D		BIT(1)
-#define DP_CONF_SIGNALING_DP		BIT(2)
-#define DP_CONF_SIGNALING_GEN_2		BIT(3) /* Reserved after v1.0b */
+#define DP_CONF_SIGNALLING_MASK		GENMASK(5, 2)
+#define DP_CONF_SIGNALLING_SHIFT	2
+#define   DP_CONF_SIGNALLING_HBR3	1
+#define   DP_CONF_SIGNALLING_UHBR10	2
+#define   DP_CONF_SIGNALLING_UHBR20	3
 #define DP_CONF_PIN_ASSIGNEMENT_SHIFT	8
 #define DP_CONF_PIN_ASSIGNEMENT_MASK	GENMASK(15, 8)
 
 /* Helper for setting/getting the pin assignment value to the configuration */
 #define DP_CONF_SET_PIN_ASSIGN(_a_)	((_a_) << 8)
 #define DP_CONF_GET_PIN_ASSIGN(_conf_)	(((_conf_) & GENMASK(15, 8)) >> 8)
+#define DP_CONF_UHBR13_5_SUPPORT	BIT(26)
+#define DP_CONF_CABLE_TYPE_MASK		GENMASK(29, 28)
+#define DP_CONF_CABLE_TYPE_SHIFT	28
+#define   DP_CONF_CABLE_TYPE_PASSIVE	0
+#define   DP_CONF_CABLE_TYPE_RE_TIMER	1
+#define   DP_CONF_CABLE_TYPE_RE_DRIVER	2
+#define   DP_CONF_CABLE_TYPE_OPTICAL	3
+#define DP_CONF_DPAM_VERSION		BIT(30)
 
 #endif /* __USB_TYPEC_DP_H */
-- 
cgit v1.2.3


From c365b1e1f40499472433cc8fca3d0ea280ead52a Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <utkarsh.h.patel@intel.com>
Date: Tue, 19 Sep 2023 19:32:40 -0700
Subject: usb: typec: Add Active or Passive cable defination to cable discover
 mode VDO

As per USB Type-C Connector specification v2.2 section F.2.6, BIT25
represents Active or Passive cable. Added BIT25 defination to the
Thunderbolt 3 cable discover mode VDO.

Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Utkarsh Patel <utkarsh.h.patel@intel.com>
Link: https://lore.kernel.org/r/20230920023243.2494410-3-utkarsh.h.patel@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/typec_tbt.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/typec_tbt.h b/include/linux/usb/typec_tbt.h
index 63dd44b72e0c..c7a2153bd6f5 100644
--- a/include/linux/usb/typec_tbt.h
+++ b/include/linux/usb/typec_tbt.h
@@ -46,6 +46,7 @@ struct typec_thunderbolt_data {
 #define TBT_CABLE_OPTICAL		BIT(21)
 #define TBT_CABLE_RETIMER		BIT(22)
 #define TBT_CABLE_LINK_TRAINING		BIT(23)
+#define TBT_CABLE_ACTIVE_PASSIVE	BIT(25)
 
 #define TBT_SET_CABLE_SPEED(_s_)	(((_s_) & GENMASK(2, 0)) << 16)
 #define TBT_SET_CABLE_ROUNDED(_g_)	(((_g_) & GENMASK(1, 0)) << 19)
-- 
cgit v1.2.3


From f9ee6043283a78c84adf6ef3cef7a085eee4130f Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <utkarsh.h.patel@intel.com>
Date: Tue, 19 Sep 2023 19:32:41 -0700
Subject: usb: pd: Add helper macro to get Type C cable speed

Added a helper macro to get the Type C cable speed when provided the
cable VDO.

Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Utkarsh Patel <utkarsh.h.patel@intel.com>
Link: https://lore.kernel.org/r/20230920023243.2494410-4-utkarsh.h.patel@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/pd_vdo.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/pd_vdo.h b/include/linux/usb/pd_vdo.h
index b057250704e8..3a747938cdab 100644
--- a/include/linux/usb/pd_vdo.h
+++ b/include/linux/usb/pd_vdo.h
@@ -376,6 +376,7 @@
 	 | ((vbm) & 0x3) << 9 | (sbu) << 8 | (sbut) << 7 | ((cur) & 0x3) << 5	\
 	 | (vbt) << 4 | (sopp) << 3 | ((spd) & 0x7))
 
+#define VDO_TYPEC_CABLE_SPEED(vdo)	((vdo) & 0x7)
 #define VDO_TYPEC_CABLE_TYPE(vdo)	(((vdo) >> 18) & 0x3)
 
 /*
-- 
cgit v1.2.3


From 1b8a62937e0b23c41956feec778ca7776a01df48 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 26 Sep 2023 15:25:31 +0200
Subject: ASoC: ti: Convert TWL4030 to use GPIO descriptors

The TWL4030 is actually only ever populated from the device tree,
so we can just pass the right device and headphone jack GPIO name
to snd_soc_jack_add_gpios() and it will pick the right GPIO right
from the device tree.

The platform data patch is unused (no in-tree users of the pdata
method) but these can use GPIO descriptor tables rather than global
GPIO numbers if they need this.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Jarkko Nikula <jarkko.nikula@bitmer.com>
Link: https://lore.kernel.org/r/20230926-descriptors-asoc-ti-v1-3-60cf4f8adbc5@linaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/platform_data/omap-twl4030.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/omap-twl4030.h b/include/linux/platform_data/omap-twl4030.h
index 0dd851ea1c72..7fcb55fe21c9 100644
--- a/include/linux/platform_data/omap-twl4030.h
+++ b/include/linux/platform_data/omap-twl4030.h
@@ -37,9 +37,6 @@ struct omap_tw4030_pdata {
 	bool	has_digimic0;
 	bool	has_digimic1;
 	u8	has_linein;
-
-	/* Jack detect GPIO or  <= 0 if it is not implemented */
-	int jack_detect;
 };
 
 #endif /* _OMAP_TWL4030_H_ */
-- 
cgit v1.2.3


From 52e24f8c0a102ac76649c6b71224fadcc82bd5da Mon Sep 17 00:00:00 2001
From: Niklas Schnelle <schnelle@linux.ibm.com>
Date: Mon, 11 Sep 2023 14:56:52 +0200
Subject: usb: pci-quirks: handle HAS_IOPORT dependency for AMD quirk

In a future patch HAS_IOPORT=n will result in inb()/outb() and friends
not being declared. In the pci-quirks case the I/O port acceses are
used in the quirks for several AMD south bridges, Add a config option
for the AMD quirks to depend on HAS_IOPORT and #ifdef the quirk code.

Co-developed-by: Arnd Bergmann <arnd@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@kernel.org>
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Link: https://lore.kernel.org/r/20230911125653.1393895-3-schnelle@linux.ibm.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/hcd.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 61d4f0b793dc..00724b4f6e12 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -484,8 +484,25 @@ extern int usb_hcd_pci_probe(struct pci_dev *dev,
 extern void usb_hcd_pci_remove(struct pci_dev *dev);
 extern void usb_hcd_pci_shutdown(struct pci_dev *dev);
 
+#ifdef CONFIG_USB_PCI_AMD
 extern int usb_hcd_amd_remote_wakeup_quirk(struct pci_dev *dev);
 
+static inline bool usb_hcd_amd_resume_bug(struct pci_dev *dev,
+					  const struct hc_driver *driver)
+{
+	if (!usb_hcd_amd_remote_wakeup_quirk(dev))
+		return false;
+	if (driver->flags & (HCD_USB11 | HCD_USB3))
+		return true;
+	return false;
+}
+#else /* CONFIG_USB_PCI_AMD */
+static inline bool usb_hcd_amd_resume_bug(struct pci_dev *dev,
+					  const struct hc_driver *driver)
+{
+	return false;
+}
+#endif
 extern const struct dev_pm_ops usb_hcd_pci_pm_ops;
 #endif /* CONFIG_USB_PCI */
 
-- 
cgit v1.2.3


From 568441b7d45fc7198a89b522d721428f2005c356 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 2 Oct 2023 17:22:40 +0300
Subject: usb: pd: Exposing the Peak Current value of Fixed Supplies to user
 space

Exposing the value of the field as is.

The Peak Current value has to be interpreted as described
in Table 6-10 (Fixed Power Source Peak Current Capability)
of the USB Power Delivery Specification, but that
interpretation will be done in user space, not in kernel.

Suggested-by: Douglas Gilbert <dgilbert@interlog.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20231002142240.2641962-1-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/pd.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h
index c59fb79a42e8..eb626af0e4e7 100644
--- a/include/linux/usb/pd.h
+++ b/include/linux/usb/pd.h
@@ -228,6 +228,7 @@ enum pd_pdo_type {
 #define PDO_FIXED_UNCHUNK_EXT		BIT(24) /* Unchunked Extended Message supported (Source) */
 #define PDO_FIXED_FRS_CURR_MASK		(BIT(24) | BIT(23)) /* FR_Swap Current (Sink) */
 #define PDO_FIXED_FRS_CURR_SHIFT	23
+#define PDO_FIXED_PEAK_CURR_SHIFT	20
 #define PDO_FIXED_VOLT_SHIFT		10	/* 50mV units */
 #define PDO_FIXED_CURR_SHIFT		0	/* 10mA units */
 
-- 
cgit v1.2.3


From 3551ff7c5cfff4dc27fdcd14fa286edc08d78088 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@gmail.com>
Date: Mon, 28 Aug 2023 17:43:02 +0200
Subject: usb: gadget: clarify usage of USB_GADGET_DELAYED_STATUS

USB_GADGET_DELAYED_STATUS was introduced in commit 1b9ba000177e ("usb:
gadget: composite: Allow function drivers to pause control transfers").
It was initially intended for the composite framework to allow delaying
completing the status stage of a SET_CONFIGURATION request until all
functions are ready.

Unfortunately, that commit had an unintended side-effect of returning
USB_GADGET_DELAYED_STATUS from the ->setup() call of the composite
framework gadget driver.

As a result of this and the incomplete documentation, some UDC drivers
started relying on USB_GADGET_DELAYED_STATUS to decide when to avoid
autocompleting the status stage for 0-length control transfers. dwc3 was
the first in commit 5bdb1dcc6330 ("usb: dwc3: ep0: handle delayed_status
again"). And a number of other UDC drivers followed later, probably
relying on the dwc3 behavior as a reference.

Unfortunately, this violated the interface between the UDC and the
gadget driver for 0-length control transfers: the UDC driver must only
proceed with the status stage for a 0-length control transfer once the
gadget driver queued a response to EP0.

As a result, a few gadget drivers are partially broken when used with
a UDC that only delays the status stage for 0-length transfers when
USB_GADGET_DELAYED_STATUS is returned from the setup() callback.

This includes Raw Gadget and GadgetFS. For FunctionFS, a workaround was
added in commit 946ef68ad4e4 ("usb: gadget: ffs: Let setup() return
USB_GADGET_DELAYED_STATUS") and commit 4d644abf2569 ("usb: gadget: f_fs:
Only return delayed status when len is 0").

The proper solution to this issue would be to contain
USB_GADGET_DELAYED_STATUS within the composite framework and make all
UDC drivers to not complete the status stage for 0-length requests on
their own.

Unfortunately, there is quite a few UDC drivers that need to get fixed
and the required changes for some of them are not trivial.

For now, update the comments to clarify that USB_GADGET_DELAYED_STATUS
must not be used by the UDC drivers.

The following two commits also add workarounds to Raw Gadget and GadgetFS
to make them compatible with the broken UDC drivers until they are fixed.

Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Andrey Konovalov <andreyknvl@gmail.com>
Link: https://lore.kernel.org/r/7f0ee06c68c7241c844cd50f8565fdd5ead79b1b.1693237258.git.andreyknvl@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/composite.h | 8 ++++++++
 include/linux/usb/gadget.h    | 9 +++++++++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index 6014340ba980..af3cd2aae4bc 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -35,6 +35,14 @@
  * are ready. The control transfer will then be kept from completing till
  * all the function drivers that requested for USB_GADGET_DELAYED_STAUS
  * invoke usb_composite_setup_continue().
+ *
+ * NOTE: USB_GADGET_DELAYED_STATUS must not be used in UDC drivers: they
+ * must delay completing the status stage for 0-length control transfers
+ * regardless of the whether USB_GADGET_DELAYED_STATUS is returned from
+ * the gadget driver's setup() callback.
+ * Currently, a number of UDC drivers rely on USB_GADGET_DELAYED_STATUS,
+ * which is a bug. These drivers must be fixed and USB_GADGET_DELAYED_STATUS
+ * must be contained within the composite framework.
  */
 #define USB_GADGET_DELAYED_STATUS       0x7fff	/* Impossibly large value */
 
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 75bda0783395..6532beb587b1 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -711,6 +711,15 @@ static inline int usb_gadget_check_config(struct usb_gadget *gadget)
  * get_interface.  Setting a configuration (or interface) is where
  * endpoints should be activated or (config 0) shut down.
  *
+ * The gadget driver's setup() callback does not have to queue a response to
+ * ep0 within the setup() call, the driver can do it after setup() returns.
+ * The UDC driver must wait until such a response is queued before proceeding
+ * with the data/status stages of the control transfer.
+ *
+ * NOTE: Currently, a number of UDC drivers rely on USB_GADGET_DELAYED_STATUS
+ * being returned from the setup() callback, which is a bug. See the comment
+ * next to USB_GADGET_DELAYED_STATUS for details.
+ *
  * (Note that only the default control endpoint is supported.  Neither
  * hosts nor devices generally support control traffic except to ep0.)
  *
-- 
cgit v1.2.3


From 5234193ee2b997e59326b047610a8f3f64a0ce02 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 15 Sep 2023 13:15:17 -0700
Subject: ceph: Annotate struct ceph_osd_request with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ceph_osd_request.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Xiubo Li <xiubli@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: ceph-devel@vger.kernel.org
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Link: https://lore.kernel.org/r/20230915201517.never.373-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/ceph/osd_client.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index bf9823956758..b8610e9d2471 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -278,7 +278,7 @@ struct ceph_osd_request {
 	int r_attempts;
 	u32 r_map_dne_bound;
 
-	struct ceph_osd_req_op r_ops[];
+	struct ceph_osd_req_op r_ops[] __counted_by(r_num_ops);
 };
 
 struct ceph_request_redirect {
-- 
cgit v1.2.3


From 210d4e9c732fa87b584af72faae96c037d9d7957 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:28:44 -0700
Subject: ipv4/igmp: Annotate struct ip_sf_socklist with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ip_sf_socklist.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Martin KaFai Lau <martin.lau@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922172858.3822653-2-keescook@chromium.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/igmp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index ebf4349a53af..5171231f70a8 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -39,7 +39,7 @@ struct ip_sf_socklist {
 	unsigned int		sl_max;
 	unsigned int		sl_count;
 	struct rcu_head		rcu;
-	__be32			sl_addr[];
+	__be32			sl_addr[] __counted_by(sl_max);
 };
 
 #define IP_SFBLOCK	10	/* allocate this many at once */
-- 
cgit v1.2.3


From b7768e67af9a5b6d6101cbfc146969fedf8df4be Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wenst@chromium.org>
Date: Thu, 28 Sep 2023 16:55:33 +0800
Subject: regulator: mt6358: Add missing regulators for MT6366

When support for the MT6366 PMIC regulators was added, it was assumed
that it had the same functionality as MT6358. In reality there are
differences. A few regulators have different ranges, or were renamed
and repurposed, or removed altogether.

Add the 3 regulators that were missing from the original submission.
These are added for completeness. VSRAM_CORE is not used in existing
projects. VM18 and VMDDR feed DRAM related consumers, and are not used
in-kernel.

Signed-off-by: Chen-Yu Tsai <wenst@chromium.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Acked-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20230928085537.3246669-11-wenst@chromium.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/mt6358/registers.h       | 17 +++++++++++++++++
 include/linux/regulator/mt6358-regulator.h |  3 +++
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/mt6358/registers.h b/include/linux/mfd/mt6358/registers.h
index 5ea2590be710..d83e87298ac4 100644
--- a/include/linux/mfd/mt6358/registers.h
+++ b/include/linux/mfd/mt6358/registers.h
@@ -294,4 +294,21 @@
 #define MT6358_AUD_TOP_INT_CON0               0x2228
 #define MT6358_AUD_TOP_INT_STATUS0            0x2234
 
+/*
+ * MT6366 has no VCAM*, but has other regulators in its place. The names
+ * keep the MT6358 prefix for ease of use in the regulator driver.
+ */
+#define MT6358_LDO_VSRAM_CON5                 0x1bf8
+#define MT6358_LDO_VM18_CON0                  MT6358_LDO_VCAMA1_CON0
+#define MT6358_LDO_VM18_CON1                  MT6358_LDO_VCAMA1_CON1
+#define MT6358_LDO_VM18_CON2                  MT6358_LDO_VCAMA1_CON2
+#define MT6358_LDO_VMDDR_CON0                 MT6358_LDO_VCAMA2_CON0
+#define MT6358_LDO_VMDDR_CON1                 MT6358_LDO_VCAMA2_CON1
+#define MT6358_LDO_VMDDR_CON2                 MT6358_LDO_VCAMA2_CON2
+#define MT6358_LDO_VSRAM_CORE_CON0            MT6358_LDO_VCAMD_CON0
+#define MT6358_LDO_VSRAM_CORE_DBG0            0x1cb6
+#define MT6358_LDO_VSRAM_CORE_DBG1            0x1cb8
+#define MT6358_VM18_ANA_CON0                  MT6358_VCAMA1_ANA_CON0
+#define MT6358_VMDDR_ANA_CON0                 MT6358_VCAMD_ANA_CON0
+
 #endif /* __MFD_MT6358_REGISTERS_H__ */
diff --git a/include/linux/regulator/mt6358-regulator.h b/include/linux/regulator/mt6358-regulator.h
index c71a6a9fce7a..562386f9b80e 100644
--- a/include/linux/regulator/mt6358-regulator.h
+++ b/include/linux/regulator/mt6358-regulator.h
@@ -86,6 +86,9 @@ enum {
 	MT6366_ID_VMC,
 	MT6366_ID_VAUD28,
 	MT6366_ID_VSIM2,
+	MT6366_ID_VM18,
+	MT6366_ID_VMDDR,
+	MT6366_ID_VSRAM_CORE,
 	MT6366_ID_RG_MAX,
 };
 
-- 
cgit v1.2.3


From d844fe65f0957024c3e1b0bf2a0615246184d9bc Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Mon, 7 Aug 2023 20:03:57 -0700
Subject: sched/headers: Move 'struct sched_param' out of uapi, to work around
 glibc/musl breakage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both glibc and musl define 'struct sched_param' in sched.h, while kernel
has it in uapi/linux/sched/types.h, making it cumbersome to use
sched_getattr(2) or sched_setattr(2) from userspace.

For example, something like this:

	#include <sched.h>
	#include <linux/sched/types.h>

	struct sched_attr sa;

will result in "error: redefinition of ‘struct sched_param’" (note the
code doesn't need sched_param at all -- it needs struct sched_attr
plus some stuff from sched.h).

The situation is, glibc is not going to provide a wrapper for
sched_{get,set}attr, thus the need to include linux/sched_types.h
directly, which leads to the above problem.

Thus, the userspace is left with a few sub-par choices when it wants to
use e.g. sched_setattr(2), such as maintaining a copy of struct
sched_attr definition, or using some other ugly tricks.

OTOH, 'struct sched_param' is well known, defined in POSIX, and it won't
be ever changed (as that would break backward compatibility).

So, while 'struct sched_param' is indeed part of the kernel uapi,
exposing it the way it's done now creates an issue, and hiding it
(like this patch does) fixes that issue, hopefully without creating
another one: common userspace software rely on libc headers, and as
for "special" software (like libc), it looks like glibc and musl
do not rely on kernel headers for 'struct sched_param' definition
(but let's Cc their mailing lists in case it's otherwise).

The alternative to this patch would be to move struct sched_attr to,
say, linux/sched.h, or linux/sched/attr.h (the new file).

Oh, and here is the previous attempt to fix the issue:

  https://lore.kernel.org/all/20200528135552.GA87103@google.com/

While I support Linus arguments, the issue is still here
and needs to be fixed.

[ mingo: Linus is right, this shouldn't be needed - but on the other
         hand I agree that this header is not really helpful to
	 user-space as-is. So let's pretend that
	 <uapi/linux/sched/types.h> is only about sched_attr, and
	 call this commit a workaround for user-space breakage
	 that it in reality is ... Also, remove the Fixes tag. ]

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230808030357.1213829-1-kolyshkin@gmail.com
---
 include/linux/sched.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dc37ae787e33..e4235bbfad77 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -63,7 +63,6 @@ struct robust_list_head;
 struct root_domain;
 struct rq;
 struct sched_attr;
-struct sched_param;
 struct seq_file;
 struct sighand_struct;
 struct signal_struct;
@@ -370,6 +369,10 @@ extern struct root_domain def_root_domain;
 extern struct mutex sched_domains_mutex;
 #endif
 
+struct sched_param {
+	int sched_priority;
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
-- 
cgit v1.2.3


From affccb16c117d188eb09495cbdea149cecbf00b9 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@wdc.com>
Date: Wed, 6 Sep 2023 11:22:32 +0200
Subject: ata: ahci: print the lpm policy on boot

The target LPM policy can be set using either a Kconfig or a kernel module
parameter.

However, if the board type is set to anything but board_ahci_low_power,
then the LPM policy will overridden and set to ATA_LPM_UNKNOWN.

Additionally, if the default suspend is suspend to idle, depending on the
hardware capabilities of the HBA, ahci_update_initial_lpm_policy() might
override the LPM policy to either ATA_LPM_MIN_POWER_WITH_PARTIAL or
ATA_LPM_MIN_POWER.

All this means that it is very hard to know which LPM policy a user will
actually be using on a given system.

In order to make it easier to debug LPM related issues, print the LPM
policy on boot.

One common LPM related issue is that the device fails to link up.
Because of that, we cannot add this print to ata_dev_configure(), as that
function is only called after a successful link up. Instead, add the info
using ata_port_desc(), with the help of a new ata_port_desc_misc() helper.
The port description is printed once per port during boot.

Before changes:
ata1: SATA max UDMA/133 abar m524288@0xa5780000 port 0xa5780100 irq 170
ata2: SATA max UDMA/133 abar m524288@0xa5780000 port 0xa5780180 irq 170

After changes:
ata1: SATA max UDMA/133 abar m524288@0xa5780000 port 0xa5780100 irq 170 lpm-pol 4
ata2: SATA max UDMA/133 abar m524288@0xa5780000 port 0xa5780180 irq 170 lpm-pol 4

Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 2a7d2af0ed80..54a217868ad0 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1561,6 +1561,11 @@ void ata_port_desc(struct ata_port *ap, const char *fmt, ...);
 extern void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset,
 			       const char *name);
 #endif
+static inline void ata_port_desc_misc(struct ata_port *ap, int irq)
+{
+	ata_port_desc(ap, "irq %d", irq);
+	ata_port_desc(ap, "lpm-pol %d", ap->target_lpm_policy);
+}
 
 static inline bool ata_tag_internal(unsigned int tag)
 {
-- 
cgit v1.2.3


From 0e19548145d863c48c6562add768a05231d768c9 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:52:11 -0700
Subject: ata: libata: Annotate struct ata_cpr_log with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ata_cpr_log.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: linux-ide@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 54a217868ad0..b39891320271 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -656,7 +656,7 @@ struct ata_cpr {
 
 struct ata_cpr_log {
 	u8			nr_cpr;
-	struct ata_cpr		cpr[];
+	struct ata_cpr		cpr[] __counted_by(nr_cpr);
 };
 
 struct ata_device {
-- 
cgit v1.2.3


From 1b947279798fd51bef46e8241102f5b896add021 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 28 Aug 2023 13:10:28 +0900
Subject: ata: libata: Cleanup inline DMA helper functions

Simplify the inline DMA helper functions ata_using_mwdma(),
ata_using_udma() and ata_dma_enabled() to directly return as a boolean
the result of their test condition.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Tested-by: Chia-Lin Kao (AceLan) <acelan.kao@canonical.com>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/libata.h | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index b39891320271..1dbb14daccfa 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1886,23 +1886,21 @@ static inline unsigned long ata_deadline(unsigned long from_jiffies,
    change in future hardware and specs, secondly 0xFF means 'no DMA' but is
    > UDMA_0. Dyma ddreigiau */
 
-static inline int ata_using_mwdma(struct ata_device *adev)
+static inline bool ata_using_mwdma(struct ata_device *adev)
 {
-	if (adev->dma_mode >= XFER_MW_DMA_0 && adev->dma_mode <= XFER_MW_DMA_4)
-		return 1;
-	return 0;
+	return adev->dma_mode >= XFER_MW_DMA_0 &&
+		adev->dma_mode <= XFER_MW_DMA_4;
 }
 
-static inline int ata_using_udma(struct ata_device *adev)
+static inline bool ata_using_udma(struct ata_device *adev)
 {
-	if (adev->dma_mode >= XFER_UDMA_0 && adev->dma_mode <= XFER_UDMA_7)
-		return 1;
-	return 0;
+	return adev->dma_mode >= XFER_UDMA_0 &&
+		adev->dma_mode <= XFER_UDMA_7;
 }
 
-static inline int ata_dma_enabled(struct ata_device *adev)
+static inline bool ata_dma_enabled(struct ata_device *adev)
 {
-	return (adev->dma_mode == 0xFF ? 0 : 1);
+	return adev->dma_mode != 0xFF;
 }
 
 /**************************************************************************
-- 
cgit v1.2.3


From c2a36609dab3b7c937ab95bfb8b98e72391f772e Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Tue, 19 Sep 2023 10:51:47 +0200
Subject: tty: switch tty_{,un}throttle_safe() to return a bool

They return 0 or 1 -- a boolean value, so make it clear than noone
should expect negative or other values.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20230919085156.1578-7-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index f002d0f25db7..59d675f345e9 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -416,8 +416,8 @@ unsigned int tty_chars_in_buffer(struct tty_struct *tty);
 unsigned int tty_write_room(struct tty_struct *tty);
 void tty_driver_flush_buffer(struct tty_struct *tty);
 void tty_unthrottle(struct tty_struct *tty);
-int tty_throttle_safe(struct tty_struct *tty);
-int tty_unthrottle_safe(struct tty_struct *tty);
+bool tty_throttle_safe(struct tty_struct *tty);
+bool tty_unthrottle_safe(struct tty_struct *tty);
 int tty_do_resize(struct tty_struct *tty, struct winsize *ws);
 int tty_get_icount(struct tty_struct *tty,
 		struct serial_icounter_struct *icount);
-- 
cgit v1.2.3


From 71067fb797e074c468221793fe93ba1b58350f1e Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Tue, 19 Sep 2023 10:51:50 +0200
Subject: tty: fix kernel-doc for functions in tty.h

tty_kref_get() is already included in Documentation, but is not properly
formatted. Fix this.

tty_get_baud_rate() is neither properly formatted, nor is included. Fix
both.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20230919085156.1578-10-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 59d675f345e9..4b6340ac2af2 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -390,14 +390,12 @@ int vcs_init(void);
 extern const struct class tty_class;
 
 /**
- *	tty_kref_get		-	get a tty reference
- *	@tty: tty device
+ * tty_kref_get - get a tty reference
+ * @tty: tty device
  *
- *	Return a new reference to a tty object. The caller must hold
- *	sufficient locks/counts to ensure that their existing reference cannot
- *	go away
+ * Returns: a new reference to a tty object. The caller must hold sufficient
+ * locks/counts to ensure that their existing reference cannot go away
  */
-
 static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
 {
 	if (tty)
@@ -435,14 +433,13 @@ void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud,
 		speed_t obaud);
 
 /**
- *	tty_get_baud_rate	-	get tty bit rates
- *	@tty: tty to query
+ * tty_get_baud_rate - get tty bit rates
+ * @tty: tty to query
  *
- *	Returns the baud rate as an integer for this terminal. The
- *	termios lock must be held by the caller and the terminal bit
- *	flags may be updated.
+ * Returns: the baud rate as an integer for this terminal. The termios lock
+ * must be held by the caller and the terminal bit flags may be updated.
  *
- *	Locking: none
+ * Locking: none
  */
 static inline speed_t tty_get_baud_rate(struct tty_struct *tty)
 {
-- 
cgit v1.2.3


From 29bff582b74ed0bdb7e6986482ad9e6799ea4d2f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 26 Sep 2023 21:41:28 -0700
Subject: serial: core: fix kernel-doc for uart_port_unlock_irqrestore()

Fix the function name to avoid a kernel-doc warning:

include/linux/serial_core.h:666: warning: expecting prototype for uart_port_lock_irqrestore(). Prototype was for uart_port_unlock_irqrestore() instead

Fixes: b0af4bcb4946 ("serial: core: Provide port lock wrappers")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: linux-serial@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jiri Slaby <jirislaby@kernel.org>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Link: https://lore.kernel.org/r/20230927044128.4748-1-rdunlap@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 3091c62ec37b..89f7b6c63598 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -658,7 +658,7 @@ static inline void uart_port_unlock_irq(struct uart_port *up)
 }
 
 /**
- * uart_port_lock_irqrestore - Unlock the UART port, restore interrupts
+ * uart_port_unlock_irqrestore - Unlock the UART port, restore interrupts
  * @up:		Pointer to UART port structure
  * @flags:	The saved interrupt flags for restore
  */
-- 
cgit v1.2.3


From 3cd39bc3b11b8d34b7d7c961a35fdfd18b0ebf75 Mon Sep 17 00:00:00 2001
From: Alejandro Colomar <alx@kernel.org>
Date: Tue, 3 Oct 2023 14:59:53 +0300
Subject: kernel.h: Move ARRAY_SIZE() to a separate header

Touching files so used for the kernel,
forces 'make' to recompile most of the kernel.

Having those definitions in more granular files
helps avoid recompiling so much of the kernel.

Signed-off-by: Alejandro Colomar <alx@kernel.org>
Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20230817143352.132583-2-lucas.segarra.fernandez@intel.com
[andy: reduced to cover only string.h for now]
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/array_size.h | 13 +++++++++++++
 include/linux/kernel.h     |  7 +------
 include/linux/string.h     |  1 +
 3 files changed, 15 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/array_size.h

(limited to 'include/linux')

diff --git a/include/linux/array_size.h b/include/linux/array_size.h
new file mode 100644
index 000000000000..06d7d83196ca
--- /dev/null
+++ b/include/linux/array_size.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ARRAY_SIZE_H
+#define _LINUX_ARRAY_SIZE_H
+
+#include <linux/compiler.h>
+
+/**
+ * ARRAY_SIZE - get the number of elements in array @arr
+ * @arr: array to be sized
+ */
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
+
+#endif  /* _LINUX_ARRAY_SIZE_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index cee8fe87e9f4..d9ad21058eed 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -13,6 +13,7 @@
 
 #include <linux/stdarg.h>
 #include <linux/align.h>
+#include <linux/array_size.h>
 #include <linux/limits.h>
 #include <linux/linkage.h>
 #include <linux/stddef.h>
@@ -50,12 +51,6 @@
 #define READ			0
 #define WRITE			1
 
-/**
- * ARRAY_SIZE - get the number of elements in array @arr
- * @arr: array to be sized
- */
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
-
 #define PTR_IF(cond, ptr)	((cond) ? (ptr) : NULL)
 
 #define u64_to_user_ptr(x) (		\
diff --git a/include/linux/string.h b/include/linux/string.h
index dbfc66400050..3c920b6d609b 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_STRING_H_
 #define _LINUX_STRING_H_
 
+#include <linux/array_size.h>
 #include <linux/compiler.h>	/* for inline */
 #include <linux/types.h>	/* for size_t */
 #include <linux/stddef.h>	/* for NULL */
-- 
cgit v1.2.3


From 82cc14c9930c7613da2fcb41a8d4f90c8b4cb048 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 3 Oct 2023 15:10:11 +0300
Subject: pinctrl: Replace kernel.h by what is actually being used

The kernel.h is a mess of unrelated things and we only used it
as a proxy to array_size.h, hence switch from former to the latter.

While at it, group and sort the headers where it makes sense.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/pinctrl/machine.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/machine.h b/include/linux/pinctrl/machine.h
index 0639b36f43c5..ee8803f6ad07 100644
--- a/include/linux/pinctrl/machine.h
+++ b/include/linux/pinctrl/machine.h
@@ -11,7 +11,7 @@
 #ifndef __LINUX_PINCTRL_MACHINE_H
 #define __LINUX_PINCTRL_MACHINE_H
 
-#include <linux/kernel.h>	/* ARRAY_SIZE() */
+#include <linux/array_size.h>
 
 #include <linux/pinctrl/pinctrl-state.h>
 
-- 
cgit v1.2.3


From 75cec20345fa8e05a2b5f861fada95ad8e165257 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 8 Sep 2023 16:32:15 +0200
Subject: bpf: Remove xdp_do_flush_map().
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

xdp_do_flush_map() can be removed because there is no more user in tree.

Remove xdp_do_flush_map().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Link: https://lore.kernel.org/r/20230908143215.869913-3-bigeasy@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/filter.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 27406aee2d40..e8822bd595f9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1025,12 +1025,6 @@ int xdp_do_redirect_frame(struct net_device *dev,
 			  struct bpf_prog *prog);
 void xdp_do_flush(void);
 
-/* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as
- * it is no longer only flushing maps. Keep this define for compatibility
- * until all drivers are updated - do not use xdp_do_flush_map() in new code!
- */
-#define xdp_do_flush_map xdp_do_flush
-
 void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act);
 
 #ifdef CONFIG_INET
-- 
cgit v1.2.3


From ccab211af3c2b90ed792eb5f33707d2f0d59fe50 Mon Sep 17 00:00:00 2001
From: Sohil Mehta <sohil.mehta@intel.com>
Date: Mon, 10 Jul 2023 18:51:24 +0000
Subject: syscalls: Cleanup references to sys_lookup_dcookie()

commit 'be65de6b03aa ("fs: Remove dcookies support")' removed the
syscall definition for lookup_dcookie.  However, syscall tables still
point to the old sys_lookup_dcookie() definition. Update syscall tables
of all architectures to directly point to sys_ni_syscall() instead.

Signed-off-by: Sohil Mehta <sohil.mehta@intel.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Namhyung Kim <namhyung@kernel.org> # for perf
Acked-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/compat.h   | 1 -
 include/linux/syscalls.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 1cfa4f0f490a..233f61ec8afc 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -581,7 +581,6 @@ asmlinkage long compat_sys_io_pgetevents_time64(compat_aio_context_t ctx_id,
 					struct io_event __user *events,
 					struct __kernel_timespec __user *timeout,
 					const struct __compat_aio_sigset __user *usig);
-asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, compat_size_t);
 asmlinkage long compat_sys_epoll_pwait(int epfd,
 			struct epoll_event __user *events,
 			int maxevents, int timeout,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 22bc6bc147f8..a031613bf966 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -355,7 +355,6 @@ asmlinkage long sys_lremovexattr(const char __user *path,
 				 const char __user *name);
 asmlinkage long sys_fremovexattr(int fd, const char __user *name);
 asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
-asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user *buf, size_t len);
 asmlinkage long sys_eventfd2(unsigned int count, int flags);
 asmlinkage long sys_epoll_create1(int flags);
 asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
-- 
cgit v1.2.3


From 26dd68d293fd1c5ac966fb5dd5f6d89de322a541 Mon Sep 17 00:00:00 2001
From: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Date: Tue, 12 Sep 2023 07:59:31 -0400
Subject: overflow: add DEFINE_FLEX() for on-stack allocs

Add DEFINE_FLEX() macro for on-stack allocations of structs with
flexible array member.

Expose __struct_size() macro outside of fortify-string.h, as it could be
used to read size of structs allocated by DEFINE_FLEX().
Move __member_size() alongside it.
-Kees

Using underlying array for on-stack storage lets us to declare
known-at-compile-time structures without kzalloc().

Actual usage for ice driver is in following patches of the series.

Missing __has_builtin() workaround is moved up to serve also assembly
compilation with m68k-linux-gcc, see [1].
Error was (note the .S file extension):
In file included from ../include/linux/linkage.h:5,
                 from ../arch/m68k/fpsp040/skeleton.S:40:
../include/linux/compiler_types.h:331:5: warning: "__has_builtin" is not defined, evaluates to 0 [-Wundef]
  331 | #if __has_builtin(__builtin_dynamic_object_size)
      |     ^~~~~~~~~~~~~
../include/linux/compiler_types.h:331:18: error: missing binary operator before token "("
  331 | #if __has_builtin(__builtin_dynamic_object_size)
      |                  ^

[1] https://lore.kernel.org/netdev/202308112122.OuF0YZqL-lkp@intel.com/
Co-developed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Link: https://lore.kernel.org/r/20230912115937.1645707-2-przemyslaw.kitszel@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/compiler_types.h | 32 +++++++++++++++++++++-----------
 include/linux/fortify-string.h |  4 ----
 include/linux/overflow.h       | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index c523c6683789..6f1ca49306d2 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -2,6 +2,15 @@
 #ifndef __LINUX_COMPILER_TYPES_H
 #define __LINUX_COMPILER_TYPES_H
 
+/*
+ * __has_builtin is supported on gcc >= 10, clang >= 3 and icc >= 21.
+ * In the meantime, to support gcc < 10, we implement __has_builtin
+ * by hand.
+ */
+#ifndef __has_builtin
+#define __has_builtin(x) (0)
+#endif
+
 #ifndef __ASSEMBLY__
 
 /*
@@ -134,17 +143,6 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { }
 # define __preserve_most
 #endif
 
-/* Builtins */
-
-/*
- * __has_builtin is supported on gcc >= 10, clang >= 3 and icc >= 21.
- * In the meantime, to support gcc < 10, we implement __has_builtin
- * by hand.
- */
-#ifndef __has_builtin
-#define __has_builtin(x) (0)
-#endif
-
 /* Compiler specific macros. */
 #ifdef __clang__
 #include <linux/compiler-clang.h>
@@ -352,6 +350,18 @@ struct ftrace_likely_data {
 # define __realloc_size(x, ...)
 #endif
 
+/*
+ * When the size of an allocated object is needed, use the best available
+ * mechanism to find it. (For cases where sizeof() cannot be used.)
+ */
+#if __has_builtin(__builtin_dynamic_object_size)
+#define __struct_size(p)	__builtin_dynamic_object_size(p, 0)
+#define __member_size(p)	__builtin_dynamic_object_size(p, 1)
+#else
+#define __struct_size(p)	__builtin_object_size(p, 0)
+#define __member_size(p)	__builtin_object_size(p, 1)
+#endif
+
 #ifndef asm_volatile_goto
 #define asm_volatile_goto(x...) asm goto(x)
 #endif
diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index da51a83b2829..1e7711185ec6 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -93,13 +93,9 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size)
 #if __has_builtin(__builtin_dynamic_object_size)
 #define POS			__pass_dynamic_object_size(1)
 #define POS0			__pass_dynamic_object_size(0)
-#define __struct_size(p)	__builtin_dynamic_object_size(p, 0)
-#define __member_size(p)	__builtin_dynamic_object_size(p, 1)
 #else
 #define POS			__pass_object_size(1)
 #define POS0			__pass_object_size(0)
-#define __struct_size(p)	__builtin_object_size(p, 0)
-#define __member_size(p)	__builtin_object_size(p, 1)
 #endif
 
 #define __compiletime_lessthan(bounds, length)	(	\
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index f9b60313eaea..7b5cf4a5cd19 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -309,4 +309,39 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
 #define struct_size_t(type, member, count)					\
 	struct_size((type *)NULL, member, count)
 
+/**
+ * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family.
+ * Enables caller macro to pass (different) initializer.
+ *
+ * @type: structure type name, including "struct" keyword.
+ * @name: Name for a variable to define.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array; must be compile-time const.
+ * @initializer: initializer expression (could be empty for no init).
+ */
+#define _DEFINE_FLEX(type, name, member, count, initializer)			\
+	_Static_assert(__builtin_constant_p(count),				\
+		       "onstack flex array members require compile-time const count"); \
+	union {									\
+		u8 bytes[struct_size_t(type, member, count)];			\
+		type obj;							\
+	} name##_u initializer;							\
+	type *name = (type *)&name##_u
+
+/**
+ * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing
+ * flexible array member.
+ *
+ * @type: structure type name, including "struct" keyword.
+ * @name: Name for a variable to define.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array; must be compile-time const.
+ *
+ * Define a zeroed, on-stack, instance of @type structure with a trailing
+ * flexible array member.
+ * Use __struct_size(@name) to get compile-time size of it afterwards.
+ */
+#define DEFINE_FLEX(type, name, member, count)			\
+	_DEFINE_FLEX(type, name, member, count, = {})
+
 #endif /* __LINUX_OVERFLOW_H */
-- 
cgit v1.2.3


From eeb6d1d6f4ec0e304608f72c3ead584bbb155714 Mon Sep 17 00:00:00 2001
From: GuoHua Cheng <chenguohua@jari.cn>
Date: Wed, 27 Sep 2023 13:56:08 +0800
Subject: PNP: Clean up coding style in pnp.h

Address the following checkpatch complaints:

ERROR: "foo * bar" should be "foo *bar"
ERROR: space required after that ';' (ctx:VxV)

Signed-off-by: GuoHua Cheng <chenguohua@jari.cn>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pnp.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index c2a7cfbca713..267fb8a4fb6e 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -291,7 +291,7 @@ static inline void pnp_set_drvdata(struct pnp_dev *pdev, void *data)
 
 struct pnp_fixup {
 	char id[7];
-	void (*quirk_function) (struct pnp_dev * dev);	/* fixup function */
+	void (*quirk_function) (struct pnp_dev *dev);	/* fixup function */
 };
 
 /* config parameters */
@@ -419,8 +419,8 @@ struct pnp_protocol {
 
 	/* protocol specific suspend/resume */
 	bool (*can_wakeup) (struct pnp_dev *dev);
-	int (*suspend) (struct pnp_dev * dev, pm_message_t state);
-	int (*resume) (struct pnp_dev * dev);
+	int (*suspend) (struct pnp_dev *dev, pm_message_t state);
+	int (*resume) (struct pnp_dev *dev);
 
 	/* used by pnp layer only (look but don't touch) */
 	unsigned char number;	/* protocol number */
@@ -492,7 +492,7 @@ static inline int pnp_start_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline int pnp_stop_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline int pnp_activate_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline int pnp_disable_dev(struct pnp_dev *dev) { return -ENODEV; }
-static inline int pnp_range_reserved(resource_size_t start, resource_size_t end) { return 0;}
+static inline int pnp_range_reserved(resource_size_t start, resource_size_t end) { return 0; }
 
 /* protocol helpers */
 static inline int pnp_is_active(struct pnp_dev *dev) { return 0; }
-- 
cgit v1.2.3


From 473267a4911f2469722c74ca58087d951072f72a Mon Sep 17 00:00:00 2001
From: Patrick Rohr <prohr@google.com>
Date: Mon, 25 Sep 2023 14:47:11 -0700
Subject: net: add sysctl to disable rfc4862 5.5.3e lifetime handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change adds a sysctl to opt-out of RFC4862 section 5.5.3e's valid
lifetime derivation mechanism.

RFC4862 section 5.5.3e prescribes that the valid lifetime in a Router
Advertisement PIO shall be ignored if it less than 2 hours and to reset
the lifetime of the corresponding address to 2 hours. An in-progress
6man draft (see draft-ietf-6man-slaac-renum-07 section 4.2) is currently
looking to remove this mechanism. While this draft has not been moving
particularly quickly for other reasons, there is widespread consensus on
section 4.2 which updates RFC4862 section 5.5.3e.

Cc: Maciej Żenczykowski <maze@google.com>
Cc: Lorenzo Colitti <lorenzo@google.com>
Cc: Jen Linkova <furry@google.com>
Signed-off-by: Patrick Rohr <prohr@google.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20230925214711.959704-1-prohr@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e400ff757f13..5e605e384aac 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -82,6 +82,7 @@ struct ipv6_devconf {
 	__u32		ioam6_id_wide;
 	__u8		ioam6_enabled;
 	__u8		ndisc_evict_nocarrier;
+	__u8		ra_honor_pio_life;
 
 	struct ctl_table_header *sysctl_header;
 };
-- 
cgit v1.2.3


From 2632bb84d1d53cfd6cf65261064273ded4f759d5 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 28 Sep 2023 20:24:34 -0700
Subject: mm: Remove unused vm_brk()

With fs/binfmt_elf.c fully refactored to use the new elf_load() helper,
there are no more users of vm_brk(), so remove it.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Suggested-by: Eric Biederman <ebiederm@xmission.com>
Tested-by: Pedro Falcato <pedro.falcato@gmail.com>
Signed-off-by: Sebastian Ott <sebott@redhat.com>
Link: https://lore.kernel.org/r/20230929032435.2391507-6-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/mm.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..216dd0c6dcf8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3308,8 +3308,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
 static inline void mm_populate(unsigned long addr, unsigned long len) {}
 #endif
 
-/* These take the mm semaphore themselves */
-extern int __must_check vm_brk(unsigned long, unsigned long);
+/* This takes the mm semaphore itself */
 extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
 extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
-- 
cgit v1.2.3


From 2f3dd39e2b492bec366487a2c9bcbdbd7792f77c Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Mon, 2 Oct 2023 17:34:28 -0700
Subject: platform/chrome: cros_ec_proto: Mark outdata as const

The 'outdata' is copied to the data buffer in cros_ec_cmd() before being
sent over to the EC. Mark the argument as const so that callers can pass
const pointers to this function and so that callers know the data won't
be modified.

Cc: Prashant Malani <pmalani@chromium.org>
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Acked-by: Prashant Malani <pmalani@chromium.org>
Link: https://lore.kernel.org/r/20231003003429.1378109-5-swboyd@chromium.org
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
---
 include/linux/platform_data/cros_ec_proto.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 4f9f756bc17c..8865e350c12a 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -258,7 +258,7 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature);
 
 int cros_ec_get_sensor_count(struct cros_ec_dev *ec);
 
-int cros_ec_cmd(struct cros_ec_device *ec_dev, unsigned int version, int command, void *outdata,
+int cros_ec_cmd(struct cros_ec_device *ec_dev, unsigned int version, int command, const void *outdata,
 		    size_t outsize, void *indata, size_t insize);
 
 /**
-- 
cgit v1.2.3


From 8874e414fe78718d0f2861fe511cecbd1cd73f4d Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 3 Oct 2023 11:49:14 -0700
Subject: platform/x86/intel/tpmi: Add defines to get version information
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add defines to get major and minor version from a TPMI version field
value. This will avoid code duplication to convert in every feature
driver. Also add define for invalid version field.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Link: https://lore.kernel.org/r/20231003184916.1860084-2-srinivas.pandruvada@linux.intel.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/intel_tpmi.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h
index 04d937ad4dc4..ee07393445f9 100644
--- a/include/linux/intel_tpmi.h
+++ b/include/linux/intel_tpmi.h
@@ -6,6 +6,12 @@
 #ifndef _INTEL_TPMI_H_
 #define _INTEL_TPMI_H_
 
+#include <linux/bitfield.h>
+
+#define TPMI_VERSION_INVALID	0xff
+#define TPMI_MINOR_VERSION(val)	FIELD_GET(GENMASK(4, 0), val)
+#define TPMI_MAJOR_VERSION(val)	FIELD_GET(GENMASK(7, 5), val)
+
 /**
  * struct intel_tpmi_plat_info - Platform information for a TPMI device instance
  * @package_id:	CPU Package id
-- 
cgit v1.2.3


From 968118fcf0546ef74cf306bf8f8c1e06efff10e3 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 12 Sep 2023 10:44:52 +0200
Subject: OMAP/gpio: drop MPUIO static base

The OMAP GPIO driver hardcodes the MPIO chip base, but there
is no point: we have already moved all consumers over to using
descriptor look-ups.

Drop the MPUIO GPIO base and use dynamic assignment.

Root out the unused instances of the OMAP_MPUIO() macro and
delete the unused OMAP_GPIO_IS_MPUIO() macro.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Tony Lindgren <tony@atomide.com>
Tested-by: Janusz Krzysztofik <jmkrzyszt@gmail.com>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/platform_data/gpio-omap.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/gpio-omap.h b/include/linux/platform_data/gpio-omap.h
index f377817ce75c..cdd8cfb424f5 100644
--- a/include/linux/platform_data/gpio-omap.h
+++ b/include/linux/platform_data/gpio-omap.h
@@ -144,9 +144,6 @@
 
 #define OMAP_MAX_GPIO_LINES		192
 
-#define OMAP_MPUIO(nr)		(OMAP_MAX_GPIO_LINES + (nr))
-#define OMAP_GPIO_IS_MPUIO(nr)	((nr) >= OMAP_MAX_GPIO_LINES)
-
 #ifndef __ASSEMBLER__
 struct omap_gpio_reg_offs {
 	u16 revision;
-- 
cgit v1.2.3


From 36aa129f221c9070afd8dff03154ab49702a5b1b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 27 Sep 2023 16:29:21 +0200
Subject: gpiolib: make gpio_device_get() and gpio_device_put() public

In order to start migrating away from accessing struct gpio_chip by
users other than their owners, let's first make the reference management
functions for the opaque struct gpio_device public in the driver.h
header.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/gpio/driver.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 8f0859ba7065..a2060dc3344b 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -606,6 +606,9 @@ int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc,
 struct gpio_chip *gpiochip_find(void *data,
 				int (*match)(struct gpio_chip *gc, void *data));
 
+struct gpio_device *gpio_device_get(struct gpio_device *gdev);
+void gpio_device_put(struct gpio_device *gdev);
+
 bool gpiochip_line_is_irq(struct gpio_chip *gc, unsigned int offset);
 int gpiochip_reqres_irq(struct gpio_chip *gc, unsigned int offset);
 void gpiochip_relres_irq(struct gpio_chip *gc, unsigned int offset);
-- 
cgit v1.2.3


From 9e4555d1e54a18946d7ca363b9fc8ed1fe7dfde4 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 27 Sep 2023 16:29:22 +0200
Subject: gpiolib: add support for scope-based management to gpio_device

As the few users that need to get the reference to the GPIO device often
release it right after inspecting its properties, let's add support for
the automatic reference release to struct gpio_device.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/gpio/driver.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index a2060dc3344b..1cedbc3d3200 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -3,6 +3,8 @@
 #define __LINUX_GPIO_DRIVER_H
 
 #include <linux/bits.h>
+#include <linux/cleanup.h>
+#include <linux/err.h>
 #include <linux/irqchip/chained_irq.h>
 #include <linux/irqdomain.h>
 #include <linux/irqhandler.h>
@@ -609,6 +611,9 @@ struct gpio_chip *gpiochip_find(void *data,
 struct gpio_device *gpio_device_get(struct gpio_device *gdev);
 void gpio_device_put(struct gpio_device *gdev);
 
+DEFINE_FREE(gpio_device_put, struct gpio_device *,
+	    if (IS_ERR_OR_NULL(_T)) gpio_device_put(_T));
+
 bool gpiochip_line_is_irq(struct gpio_chip *gc, unsigned int offset);
 int gpiochip_reqres_irq(struct gpio_chip *gc, unsigned int offset);
 void gpiochip_relres_irq(struct gpio_chip *gc, unsigned int offset);
-- 
cgit v1.2.3


From cfe102f63308c8c8e01199a682868a64b83f653e Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 27 Sep 2023 16:29:23 +0200
Subject: gpiolib: provide gpio_device_find()

gpiochip_find() is wrong and its kernel doc is misleading as the
function doesn't return a reference to the gpio_chip but just a raw
pointer. The chip itself is not guaranteed to stay alive, in fact it can
be deleted at any point. Also: other than GPIO drivers themselves,
nobody else has any business accessing gpio_chip structs.

Provide a new gpio_device_find() function that returns a real reference
to the opaque gpio_device structure that is guaranteed to stay alive for
as long as there are active users of it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 1cedbc3d3200..6ad1f1a8ef2e 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -608,6 +608,9 @@ int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc,
 struct gpio_chip *gpiochip_find(void *data,
 				int (*match)(struct gpio_chip *gc, void *data));
 
+struct gpio_device *gpio_device_find(void *data,
+				int (*match)(struct gpio_chip *gc, void *data));
+
 struct gpio_device *gpio_device_get(struct gpio_device *gdev);
 void gpio_device_put(struct gpio_device *gdev);
 
-- 
cgit v1.2.3


From d62fcd9f1897d587f8f9db3f77ccae8797a44b5d Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 27 Sep 2023 16:29:24 +0200
Subject: gpiolib: provide gpio_device_find_by_label()

By far the most common way of looking up GPIO devices is using their
label. Provide a helpers for that to avoid every user implementing their
own matching function.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 6ad1f1a8ef2e..24996cba6465 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -610,6 +610,7 @@ struct gpio_chip *gpiochip_find(void *data,
 
 struct gpio_device *gpio_device_find(void *data,
 				int (*match)(struct gpio_chip *gc, void *data));
+struct gpio_device *gpio_device_find_by_label(const char *label);
 
 struct gpio_device *gpio_device_get(struct gpio_device *gdev);
 void gpio_device_put(struct gpio_device *gdev);
-- 
cgit v1.2.3


From 93548f8bbbbf5b62dbd37c8f61a037a06666787b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 27 Sep 2023 16:29:25 +0200
Subject: gpiolib: provide gpio_device_get_desc()

Getting the GPIO descriptor directly from the gpio_chip struct is
dangerous as we don't take the reference to the underlying GPIO device.
In order to start working towards removing gpiochip_get_desc(), let's
provide a safer variant that works with an existing reference to struct
gpio_device.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 24996cba6465..3fdf3f14bb13 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -770,6 +770,8 @@ struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *gc,
 void gpiochip_free_own_desc(struct gpio_desc *desc);
 
 struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum);
+struct gpio_desc *
+gpio_device_get_desc(struct gpio_device *gdev, unsigned int hwnum);
 
 #ifdef CONFIG_GPIOLIB
 
-- 
cgit v1.2.3


From 9b418780844c677669ab474f6f29940ef545c954 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 27 Sep 2023 16:29:26 +0200
Subject: gpiolib: reluctantly provide gpio_device_get_chip()

The process of converting all unauthorized users of struct gpio_chip to
using dedicated struct gpio_device function will be long so in the
meantime we must provide a way of retrieving the pointer to struct
gpio_chip from a GPIO device.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 3fdf3f14bb13..f8ad7f40100c 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -773,6 +773,8 @@ struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum);
 struct gpio_desc *
 gpio_device_get_desc(struct gpio_device *gdev, unsigned int hwnum);
 
+struct gpio_chip *gpio_device_get_chip(struct gpio_device *gdev);
+
 #ifdef CONFIG_GPIOLIB
 
 /* lock/unlock as IRQ */
-- 
cgit v1.2.3


From 8e56b063c86569e51eed1c5681ce6361fa97fc7a Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 3 Oct 2023 13:17:53 -0400
Subject: netfilter: handle the connecting collision properly in
 nf_conntrack_proto_sctp

In Scenario A and B below, as the delayed INIT_ACK always changes the peer
vtag, SCTP ct with the incorrect vtag may cause packet loss.

Scenario A: INIT_ACK is delayed until the peer receives its own INIT_ACK

  192.168.1.2 > 192.168.1.1: [INIT] [init tag: 1328086772]
    192.168.1.1 > 192.168.1.2: [INIT] [init tag: 1414468151]
    192.168.1.2 > 192.168.1.1: [INIT ACK] [init tag: 1328086772]
  192.168.1.1 > 192.168.1.2: [INIT ACK] [init tag: 1650211246] *
  192.168.1.2 > 192.168.1.1: [COOKIE ECHO]
    192.168.1.1 > 192.168.1.2: [COOKIE ECHO]
    192.168.1.2 > 192.168.1.1: [COOKIE ACK]

Scenario B: INIT_ACK is delayed until the peer completes its own handshake

  192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 3922216408]
    192.168.1.1 > 192.168.1.2: sctp (1) [INIT] [init tag: 144230885]
    192.168.1.2 > 192.168.1.1: sctp (1) [INIT ACK] [init tag: 3922216408]
    192.168.1.1 > 192.168.1.2: sctp (1) [COOKIE ECHO]
    192.168.1.2 > 192.168.1.1: sctp (1) [COOKIE ACK]
  192.168.1.1 > 192.168.1.2: sctp (1) [INIT ACK] [init tag: 3914796021] *

This patch fixes it as below:

In SCTP_CID_INIT processing:
- clear ct->proto.sctp.init[!dir] if ct->proto.sctp.init[dir] &&
  ct->proto.sctp.init[!dir]. (Scenario E)
- set ct->proto.sctp.init[dir].

In SCTP_CID_INIT_ACK processing:
- drop it if !ct->proto.sctp.init[!dir] && ct->proto.sctp.vtag[!dir] &&
  ct->proto.sctp.vtag[!dir] != ih->init_tag. (Scenario B, Scenario C)
- drop it if ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] &&
  ct->proto.sctp.vtag[!dir] != ih->init_tag. (Scenario A)

In SCTP_CID_COOKIE_ACK processing:
- clear ct->proto.sctp.init[dir] and ct->proto.sctp.init[!dir].
  (Scenario D)

Also, it's important to allow the ct state to move forward with cookie_echo
and cookie_ack from the opposite dir for the collision scenarios.

There are also other Scenarios where it should allow the packet through,
addressed by the processing above:

Scenario C: new CT is created by INIT_ACK.

Scenario D: start INIT on the existing ESTABLISHED ct.

Scenario E: start INIT after the old collision on the existing ESTABLISHED
ct.

  192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 3922216408]
  192.168.1.1 > 192.168.1.2: sctp (1) [INIT] [init tag: 144230885]
  (both side are stopped, then start new connection again in hours)
  192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 242308742]

Fixes: 9fb9cbb1082d ("[NETFILTER]: Add nf_conntrack subsystem.")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter/nf_conntrack_sctp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_conntrack_sctp.h b/include/linux/netfilter/nf_conntrack_sctp.h
index 625f491b95de..fb31312825ae 100644
--- a/include/linux/netfilter/nf_conntrack_sctp.h
+++ b/include/linux/netfilter/nf_conntrack_sctp.h
@@ -9,6 +9,7 @@ struct ip_ct_sctp {
 	enum sctp_conntrack state;
 
 	__be32 vtag[IP_CT_DIR_MAX];
+	u8 init[IP_CT_DIR_MAX];
 	u8 last_dir;
 	u8 flags;
 };
-- 
cgit v1.2.3


From 09361abc346102c505657db4f3ae19ed70e1b703 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Wed, 9 Aug 2023 16:00:12 +0800
Subject: dmaengine: Remove unused declaration dma_chan_cleanup()

Commit f27c580c3628 ("dmaengine: remove 'bigref' infrastructure")
removed the implementation but left declaration in place. Remove it.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230809080012.22000-1-yuehaibing@huawei.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dmaengine.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index c3656e590213..3df70d6131c8 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -517,8 +517,6 @@ static inline const char *dma_chan_name(struct dma_chan *chan)
 	return dev_name(&chan->dev->device);
 }
 
-void dma_chan_cleanup(struct kref *kref);
-
 /**
  * typedef dma_filter_fn - callback filter for dma_request_channel
  * @chan: channel to be reviewed
-- 
cgit v1.2.3


From 4785aa8028536c2be656d22c74ec1995b97056f3 Mon Sep 17 00:00:00 2001
From: Oza Pawandeep <quic_poza@quicinc.com>
Date: Tue, 3 Oct 2023 10:33:33 -0700
Subject: cpuidle, ACPI: Evaluate LPI arch_flags for broadcast timer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Arm® Functional Fixed Hardware Specification defines LPI states,
which provide an architectural context loss flags field that can
be used to describe the context that might be lost when an LPI
state is entered.

- Core context Lost
        - General purpose registers.
        - Floating point and SIMD registers.
        - System registers, include the System register based
        - generic timer for the core.
        - Debug register in the core power domain.
        - PMU registers in the core power domain.
        - Trace register in the core power domain.
- Trace context loss
- GICR
- GICD

Qualcomm's custom CPUs preserves the architectural state,
including keeping the power domain for local timers active.
when core is power gated, the local timers are sufficient to
wake the core up without needing broadcast timer.

The patch fixes the evaluation of cpuidle arch_flags, and moves only to
broadcast timer if core context lost is defined in ACPI LPI.

Fixes: a36a7fecfe60 ("ACPI / processor_idle: Add support for Low Power Idle(LPI) states")
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Oza Pawandeep <quic_poza@quicinc.com>
Link: https://lore.kernel.org/r/20231003173333.2865323-1-quic_poza@quicinc.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/acpi.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a73246c3c35e..afd94c9b8b8a 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1480,6 +1480,15 @@ static inline int lpit_read_residency_count_address(u64 *address)
 }
 #endif
 
+#ifdef CONFIG_ACPI_PROCESSOR_IDLE
+#ifndef arch_get_idle_state_flags
+static inline unsigned int arch_get_idle_state_flags(u32 arch_flags)
+{
+	return 0;
+}
+#endif
+#endif /* CONFIG_ACPI_PROCESSOR_IDLE */
+
 #ifdef CONFIG_ACPI_PPTT
 int acpi_pptt_cpu_is_thread(unsigned int cpu);
 int find_acpi_cpu_topology(unsigned int cpu, int level);
-- 
cgit v1.2.3


From c964c1f5ee96e1460606d44f80a47bdacd8fe568 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Sep 2023 22:35:59 +0200
Subject: rcu: Assume rcu_report_dead() is always called locally

rcu_report_dead() has to be called locally by the CPU that is going to
exit the RCU state machine. Passing a cpu argument here is error-prone
and leaves the possibility for a racy remote call.

Use local access instead.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/rcupdate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 5e5f920ade90..aa351ddcbe8d 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -122,7 +122,7 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
 void rcu_init(void);
 extern int rcu_scheduler_active;
 void rcu_sched_clock_irq(int user);
-void rcu_report_dead(unsigned int cpu);
+void rcu_report_dead(void);
 void rcutree_migrate_callbacks(int cpu);
 
 #ifdef CONFIG_TASKS_RCU_GENERIC
-- 
cgit v1.2.3


From bc0c3357601e3ff1b006600530079bd246ef0d82 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Wed, 23 Aug 2023 19:05:56 +0200
Subject: mm: remove remnants of SPLIT_RSS_COUNTING

The feature got retired in f1a7941243c1 ("mm: convert mm's rss stats into
percpu_counter"), but the patch failed to fully clean it up.

Link: https://lkml.kernel.org/r/20230823170556.2281747-1-mjguzik@gmail.com
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..7613150acab9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2628,14 +2628,6 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
 		*maxrss = hiwater_rss;
 }
 
-#if defined(SPLIT_RSS_COUNTING)
-void sync_mm_rss(struct mm_struct *mm);
-#else
-static inline void sync_mm_rss(struct mm_struct *mm)
-{
-}
-#endif
-
 #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
 static inline int pte_special(pte_t pte)
 {
-- 
cgit v1.2.3


From 91e79d22be75fec88ae58d274a7c9e49d6215099 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 23 Aug 2023 00:13:14 +0100
Subject: mm: convert DAX lock/unlock page to lock/unlock folio

The one caller of DAX lock/unlock page already calls compound_head(), so
use page_folio() instead, then use a folio throughout the DAX code to
remove uses of page->mapping and page->index.

[jane.chu@oracle.com: add comment to mf_generic_kill_procss(), simplify mf_generic_kill_procs:folio initialization]
  Link: https://lkml.kernel.org/r/20230908222336.186313-1-jane.chu@oracle.com
Link: https://lkml.kernel.org/r/20230822231314.349200-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jane Chu <jane.chu@oracle.com>
Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jane Chu <jane.chu@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/dax.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 22cd9902345d..b463502b16e1 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -159,8 +159,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 
 struct page *dax_layout_busy_page(struct address_space *mapping);
 struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
-dax_entry_t dax_lock_page(struct page *page);
-void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock_folio(struct folio *folio);
+void dax_unlock_folio(struct folio *folio, dax_entry_t cookie);
 dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
 		unsigned long index, struct page **page);
 void dax_unlock_mapping_entry(struct address_space *mapping,
@@ -182,14 +182,14 @@ static inline int dax_writeback_mapping_range(struct address_space *mapping,
 	return -EOPNOTSUPP;
 }
 
-static inline dax_entry_t dax_lock_page(struct page *page)
+static inline dax_entry_t dax_lock_folio(struct folio *folio)
 {
-	if (IS_DAX(page->mapping->host))
+	if (IS_DAX(folio->mapping->host))
 		return ~0UL;
 	return 0;
 }
 
-static inline void dax_unlock_page(struct page *page, dax_entry_t cookie)
+static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
 {
 }
 
-- 
cgit v1.2.3


From b1e5a3dee255a11cbdd5a0e814829276bd33a793 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sun, 3 Sep 2023 15:13:23 +0000
Subject: mm/mremap: allow moves within the same VMA for stack moves

For the stack move happening in shift_arg_pages(), the move is happening
within the same VMA which spans the old and new ranges.

In case the aligned address happens to fall within that VMA, allow such
moves and don't abort the mremap alignment optimization.

In the regular non-stack mremap case, we cannot allow any such moves as
will end up destroying some part of the mapping (either the source of the
move, or part of the existing mapping).  So just avoid it for stack moves.

Link: https://lkml.kernel.org/r/20230903151328.2981432-3-joel@joelfernandes.org
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7613150acab9..21c3d4e8a282 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2480,7 +2480,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len,
-		bool need_rmap_locks);
+		bool need_rmap_locks, bool for_stack);
 
 /*
  * Flags used by change_protection().  For now we make it a bitmap so
-- 
cgit v1.2.3


From d896073fc767ebb40c11a6a9de71c390757ac64b Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 7 Sep 2023 02:29:27 +0000
Subject: mm/damon/core: add more comments for nr_accesses

The comment on struct damon_region about nr_accesses field looks not
sufficient.  Many people actually used to ask what nr_accesses mean.
There is more detailed explanation of the mechanism on the comment for
struct damon_attrs, but it is also ambiguous, as it doesn't specify the
name of the counter for aggregating the access check results.  Make
those more detailed.

Link: https://lkml.kernel.org/r/20230907022929.91361-10-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index ae2664d1d5f1..266f92b34dd2 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -43,6 +43,10 @@ struct damon_addr_range {
  * @list:		List head for siblings.
  * @age:		Age of this region.
  *
+ * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be
+ * increased for every &damon_attrs->sample_interval if an access to the region
+ * during the last sampling interval is found.
+ *
  * @age is initially zero, increased for each aggregation interval, and reset
  * to zero again if the access frequency is significantly changed.  If two
  * regions are merged into a new region, both @nr_accesses and @age of the new
@@ -472,13 +476,14 @@ struct damon_callback {
  *				regions.
  *
  * For each @sample_interval, DAMON checks whether each region is accessed or
- * not.  It aggregates and keeps the access information (number of accesses to
- * each region) for @aggr_interval time.  DAMON also checks whether the target
- * memory regions need update (e.g., by ``mmap()`` calls from the application,
- * in case of virtual memory monitoring) and applies the changes for each
- * @ops_update_interval.  All time intervals are in micro-seconds.
- * Please refer to &struct damon_operations and &struct damon_callback for more
- * detail.
+ * not during the last @sample_interval.  If such access is found, DAMON
+ * aggregates the information by increasing &damon_region->nr_accesses for
+ * @aggr_interval time.  For each @aggr_interval, the count is reset.  DAMON
+ * also checks whether the target memory regions need update (e.g., by
+ * ``mmap()`` calls from the application, in case of virtual memory monitoring)
+ * and applies the changes for each @ops_update_interval.  All time intervals
+ * are in micro-seconds.  Please refer to &struct damon_operations and &struct
+ * damon_callback for more detail.
  */
 struct damon_attrs {
 	unsigned long sample_interval;
-- 
cgit v1.2.3


From cf0a96bd3ab4d9d8a1c92baf1a822f2ddbca3a34 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 7 Sep 2023 02:29:28 +0000
Subject: mm/damon/core: remove duplicated comment for watermarks-based
 deactivation

The comment for explaining about watermarks-based monitoring part
deactivation is duplicated in two paragraphs.  Remove one.

Link: https://lkml.kernel.org/r/20230907022929.91361-11-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 266f92b34dd2..ab3089de1478 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -317,9 +317,6 @@ struct damos_access_pattern {
  * monitoring context are inactive, DAMON stops monitoring either, and just
  * repeatedly checks the watermarks.
  *
- * If all schemes that registered to a &struct damon_ctx are inactive, DAMON
- * stops monitoring and just repeatedly checks the watermarks.
- *
  * Before applying the &action to a memory region, &struct damon_operations
  * implementation could check pages of the region and skip &action to respect
  * &filters
-- 
cgit v1.2.3


From 7cd34dd3c9bf1d67ce7d1ab3fe8886c583ae0d9a Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Mon, 11 Sep 2023 14:21:13 +0300
Subject: efi/unaccepted: do not let /proc/vmcore try to access unaccepted
 memory

Patch series "Do not try to access unaccepted memory", v2.

Support for unaccepted memory was added recently, refer commit
dcdfdd40fa82 ("mm: Add support for unaccepted memory"), whereby
a virtual machine may need to accept memory before it can be used.

Plug a few gaps where RAM is exposed without checking if it is
unaccepted memory.


This patch (of 2):

Support for unaccepted memory was added recently, refer commit
dcdfdd40fa82 ("mm: Add support for unaccepted memory"), whereby a virtual
machine may need to accept memory before it can be used.

Do not let /proc/vmcore try to access unaccepted memory because it can
cause the guest to fail.

For /proc/vmcore, which is read-only, this means a read or mmap of
unaccepted memory will return zeros.

Link: https://lkml.kernel.org/r/20230911112114.91323-1-adrian.hunter@intel.com
Link: https://lkml.kernel.org/r/20230911112114.91323-2-adrian.hunter@intel.com
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21c3d4e8a282..31dc25d3f6b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4054,4 +4054,11 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end)
 
 #endif
 
+static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
+{
+	phys_addr_t paddr = pfn << PAGE_SHIFT;
+
+	return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
+}
+
 #endif /* _LINUX_MM_H */
-- 
cgit v1.2.3


From 3ee0aa9f06756e959633ffda37856c6741d948ed Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:25:14 +0800
Subject: mm: move some shrinker-related function declarations to mm/internal.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "cleanups for lockless slab shrink", v4.

This series is some cleanups for lockless slab shrink.


This patch (of 4):

The following functions are only used inside the mm subsystem, so it's
better to move their declarations to the mm/internal.h file.

1. shrinker_debugfs_add()
2. shrinker_debugfs_detach()
3. shrinker_debugfs_remove()

Link: https://lkml.kernel.org/r/20230911092517.64141-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911092517.64141-2-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian König <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sean Paul <sean@poorly.run>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 224293b2dd06..8dc15aa37410 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -106,28 +106,9 @@ extern void free_prealloced_shrinker(struct shrinker *shrinker);
 extern void synchronize_shrinkers(void);
 
 #ifdef CONFIG_SHRINKER_DEBUG
-extern int shrinker_debugfs_add(struct shrinker *shrinker);
-extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
-					      int *debugfs_id);
-extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
-				    int debugfs_id);
 extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
 						  const char *fmt, ...);
 #else /* CONFIG_SHRINKER_DEBUG */
-static inline int shrinker_debugfs_add(struct shrinker *shrinker)
-{
-	return 0;
-}
-static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
-						     int *debugfs_id)
-{
-	*debugfs_id = -1;
-	return NULL;
-}
-static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
-					   int debugfs_id)
-{
-}
 static inline __printf(2, 3)
 int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
 {
-- 
cgit v1.2.3


From 0b2f5ea1aa39c0ed34bdadb53faf519e3d84ac4a Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:25:17 +0800
Subject: drm/ttm: introduce pool_shrink_rwsem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, synchronize_shrinkers() is only used by TTM pool.  It only
requires that no shrinkers run in parallel.

After we use RCU+refcount method to implement the lockless slab shrink, we
can not use shrinker_rwsem or synchronize_rcu() to guarantee that all
shrinker invocations have seen an update before freeing memory.

So we introduce a new pool_shrink_rwsem to implement a private
ttm_pool_synchronize_shrinkers(), so as to achieve the same purpose.

Link: https://lkml.kernel.org/r/20230911092517.64141-5-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Chuck Lever <cel@kernel.org>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sean Paul <sean@poorly.run>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 8dc15aa37410..6b5843c3b827 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -103,7 +103,6 @@ extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker,
 					    const char *fmt, ...);
 extern void unregister_shrinker(struct shrinker *shrinker);
 extern void free_prealloced_shrinker(struct shrinker *shrinker);
-extern void synchronize_shrinkers(void);
 
 #ifdef CONFIG_SHRINKER_DEBUG
 extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
-- 
cgit v1.2.3


From c42d50aefd17a6bad3ed617769edbbb579137545 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:00 +0800
Subject: mm: shrinker: add infrastructure for dynamically allocating shrinker

Patch series "use refcount+RCU method to implement lockless slab shrink",
v6.

1. Background
=============

We used to implement the lockless slab shrink with SRCU [1], but then kernel
test robot reported -88.8% regression in stress-ng.ramfs.ops_per_sec test
case [2], so we reverted it [3].

This patch series aims to re-implement the lockless slab shrink using the
refcount+RCU method proposed by Dave Chinner [4].

[1]. https://lore.kernel.org/lkml/20230313112819.38938-1-zhengqi.arch@bytedance.com/
[2]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/
[3]. https://lore.kernel.org/all/20230609081518.3039120-1-qi.zheng@linux.dev/
[4]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/

2. Implementation
=================

Currently, the shrinker instances can be divided into the following three types:

a) global shrinker instance statically defined in the kernel, such as
   workingset_shadow_shrinker.

b) global shrinker instance statically defined in the kernel modules, such as
   mmu_shrinker in x86.

c) shrinker instance embedded in other structures.

For case a, the memory of shrinker instance is never freed. For case b, the
memory of shrinker instance will be freed after synchronize_rcu() when the
module is unloaded. For case c, the memory of shrinker instance will be freed
along with the structure it is embedded in.

In preparation for implementing lockless slab shrink, we need to dynamically
allocate those shrinker instances in case c, then the memory can be dynamically
freed alone by calling kfree_rcu().

This patchset adds the following new APIs for dynamically allocating shrinker,
and add a private_data field to struct shrinker to record and get the original
embedded structure.

1. shrinker_alloc()
2. shrinker_register()
3. shrinker_free()

In order to simplify shrinker-related APIs and make shrinker more independent of
other kernel mechanisms, this patchset uses the above APIs to convert all
shrinkers (including case a and b) to dynamically allocated, and then remove all
existing APIs. This will also have another advantage mentioned by Dave Chinner:

```
The other advantage of this is that it will break all the existing out of tree
code and third party modules using the old API and will no longer work with a
kernel using lockless slab shrinkers. They need to break (both at the source and
binary levels) to stop bad things from happening due to using uncoverted
shrinkers in the new setup.
```

Then we free the shrinker by calling call_rcu(), and use rcu_read_{lock,unlock}()
to ensure that the shrinker instance is valid. And the shrinker::refcount
mechanism ensures that the shrinker instance will not be run again after
unregistration. So the structure that records the pointer of shrinker instance
can be safely freed without waiting for the RCU read-side critical section.

In this way, while we implement the lockless slab shrink, we don't need to be
blocked in unregister_shrinker() to wait RCU read-side critical section.

PATCH 1: introduce new APIs
PATCH 2~38: convert all shrinnkers to use new APIs
PATCH 39: remove old APIs
PATCH 40~41: some cleanups and preparations
PATCH 42-43: implement the lockless slab shrink
PATCH 44~45: convert shrinker_rwsem to mutex

3. Testing
==========

3.1 slab shrink stress test
---------------------------

We can reproduce the down_read_trylock() hotspot through the following script:

```

DIR="/root/shrinker/memcg/mnt"

do_create()
{
    mkdir -p /sys/fs/cgroup/memory/test
    echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
    for i in `seq 0 $1`;
    do
        mkdir -p /sys/fs/cgroup/memory/test/$i;
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        mkdir -p $DIR/$i;
    done
}

do_mount()
{
    for i in `seq $1 $2`;
    do
        mount -t tmpfs $i $DIR/$i;
    done
}

do_touch()
{
    for i in `seq $1 $2`;
    do
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
    done
}

case "$1" in
  touch)
    do_touch $2 $3
    ;;
  test)
    do_create 4000
    do_mount 0 4000
    do_touch 0 3000
    ;;
  *)
    exit 1
    ;;
esac
```

Save the above script, then run test and touch commands. Then we can use the
following perf command to view hotspots:

perf top -U -F 999

1) Before applying this patchset:

  33.15%  [kernel]          [k] down_read_trylock
  25.38%  [kernel]          [k] shrink_slab
  21.75%  [kernel]          [k] up_read
   4.45%  [kernel]          [k] _find_next_bit
   2.27%  [kernel]          [k] do_shrink_slab
   1.80%  [kernel]          [k] intel_idle_irq
   1.79%  [kernel]          [k] shrink_lruvec
   0.67%  [kernel]          [k] xas_descend
   0.41%  [kernel]          [k] mem_cgroup_iter
   0.40%  [kernel]          [k] shrink_node
   0.38%  [kernel]          [k] list_lru_count_one

2) After applying this patchset:

  64.56%  [kernel]          [k] shrink_slab
  12.18%  [kernel]          [k] do_shrink_slab
   3.30%  [kernel]          [k] __rcu_read_unlock
   2.61%  [kernel]          [k] shrink_lruvec
   2.49%  [kernel]          [k] __rcu_read_lock
   1.93%  [kernel]          [k] intel_idle_irq
   0.89%  [kernel]          [k] shrink_node
   0.81%  [kernel]          [k] mem_cgroup_iter
   0.77%  [kernel]          [k] mem_cgroup_calculate_protection
   0.66%  [kernel]          [k] list_lru_count_one

We can see that the first perf hotspot becomes shrink_slab, which is what we
expect.

3.2 registration and unregistration stress test
-----------------------------------------------

Run the command below to test:

stress-ng --timeout 60 --times --verify --metrics-brief --ramfs 9 &

1) Before applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            473062     60.00      8.00    279.13      7884.12        1647.59
for a 60.01s run time:
   1440.34s available CPU time
      7.99s user time   (  0.55%)
    279.13s system time ( 19.38%)
    287.12s total time  ( 19.93%)
load average: 7.12 2.99 1.15
successful run completed in 60.01s (1 min, 0.01 secs)

2) After applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            477165     60.00      8.13    281.34      7952.55        1648.40
for a 60.01s run time:
   1440.33s available CPU time
      8.12s user time   (  0.56%)
    281.34s system time ( 19.53%)
    289.46s total time  ( 20.10%)
load average: 6.98 3.03 1.19
successful run completed in 60.01s (1 min, 0.01 secs)

We can see that the ops/s has hardly changed.


This patch (of 45):

Currently, the shrinker instances can be divided into the following three
types:

a) global shrinker instance statically defined in the kernel, such as
   workingset_shadow_shrinker.

b) global shrinker instance statically defined in the kernel modules, such
   as mmu_shrinker in x86.

c) shrinker instance embedded in other structures.

For case a, the memory of shrinker instance is never freed. For case b,
the memory of shrinker instance will be freed after synchronize_rcu() when
the module is unloaded. For case c, the memory of shrinker instance will
be freed along with the structure it is embedded in.

In preparation for implementing lockless slab shrink, we need to
dynamically allocate those shrinker instances in case c, then the memory
can be dynamically freed alone by calling kfree_rcu().

So this commit adds the following new APIs for dynamically allocating
shrinker, and add a private_data field to struct shrinker to record and
get the original embedded structure.

1. shrinker_alloc()

Used to allocate shrinker instance itself and related memory, it will
return a pointer to the shrinker instance on success and NULL on failure.

2. shrinker_register()

Used to register the shrinker instance, which is same as the current
register_shrinker_prepared().

3. shrinker_free()

Used to unregister (if needed) and free the shrinker instance.

In order to simplify shrinker-related APIs and make shrinker more
independent of other kernel mechanisms, subsequent submissions will use
the above API to convert all shrinkers (including case a and b) to
dynamically allocated, and then remove all existing APIs.

This will also have another advantage mentioned by Dave Chinner:

```
The other advantage of this is that it will break all the existing
out of tree code and third party modules using the old API and will
no longer work with a kernel using lockless slab shrinkers. They
need to break (both at the source and binary levels) to stop bad
things from happening due to using unconverted shrinkers in the new
setup.
```

[zhengqi.arch@bytedance.com: mm: shrinker: some cleanup]
  Link: https://lkml.kernel.org/r/20230919024607.65463-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911094444.68966-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911094444.68966-2-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Chuck Lever <cel@kernel.org>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sean Paul <sean@poorly.run>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 6b5843c3b827..f4a5249f00b2 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -70,6 +70,8 @@ struct shrinker {
 	int seeks;	/* seeks to recreate an obj */
 	unsigned flags;
 
+	void *private_data;
+
 	/* These are for internal use */
 	struct list_head list;
 #ifdef CONFIG_MEMCG
@@ -86,15 +88,22 @@ struct shrinker {
 };
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
 
-/* Flags */
-#define SHRINKER_REGISTERED	(1 << 0)
-#define SHRINKER_NUMA_AWARE	(1 << 1)
-#define SHRINKER_MEMCG_AWARE	(1 << 2)
+/* Internal flags */
+#define SHRINKER_REGISTERED	BIT(0)
+#define SHRINKER_ALLOCATED	BIT(1)
+
+/* Flags for users to use */
+#define SHRINKER_NUMA_AWARE	BIT(2)
+#define SHRINKER_MEMCG_AWARE	BIT(3)
 /*
  * It just makes sense when the shrinker is also MEMCG_AWARE for now,
  * non-MEMCG_AWARE shrinker should not have this flag set.
  */
-#define SHRINKER_NONSLAB	(1 << 3)
+#define SHRINKER_NONSLAB	BIT(4)
+
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
+void shrinker_register(struct shrinker *shrinker);
+void shrinker_free(struct shrinker *shrinker);
 
 extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker,
 					    const char *fmt, ...);
-- 
cgit v1.2.3


From 4b403dfa8ea8dfbc9d317b25a0433c1ac6765f7f Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:30 +0800
Subject: jbd2,ext4: dynamically allocate the jbd2-journal shrinker

In preparation for implementing lockless slab shrink, use new APIs to
dynamically allocate the jbd2-journal shrinker, so that it can be freed
asynchronously via RCU. Then it doesn't need to wait for RCU read-side
critical section when releasing the struct journal_s.

Link: https://lkml.kernel.org/r/20230911094444.68966-32-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Jan Kara <jack@suse.cz>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/jbd2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 52772c826c86..6dcbb4eb80fb 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -886,7 +886,7 @@ struct journal_s
 	 * Journal head shrinker, reclaim buffer's journal head which
 	 * has been written back.
 	 */
-	struct shrinker		j_shrinker;
+	struct shrinker		*j_shrinker;
 
 	/**
 	 * @j_checkpoint_jh_count:
-- 
cgit v1.2.3


From 1720f5dd8d3af34d6023fb9e8c35e5e60e8b6643 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:37 +0800
Subject: fs: super: dynamically allocate the s_shrink

In preparation for implementing lockless slab shrink, use new APIs to
dynamically allocate the s_shrink, so that it can be freed asynchronously
via RCU. Then it doesn't need to wait for RCU read-side critical section
when releasing the struct super_block.

Link: https://lkml.kernel.org/r/20230911094444.68966-39-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: David Sterba <dsterba@suse.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b528f063e8ff..fd539c9fef8e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1265,7 +1265,7 @@ struct super_block {
 
 	const struct dentry_operations *s_d_op; /* default d_op for dentries */
 
-	struct shrinker s_shrink;	/* per-sb shrinker handle */
+	struct shrinker *s_shrink;	/* per-sb shrinker handle */
 
 	/* Number of inodes with nlink == 0 but still referenced */
 	atomic_long_t s_remove_count;
-- 
cgit v1.2.3


From f2383e01507eeee8a1c1283d61a117a97d6c4ebe Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:38 +0800
Subject: mm: shrinker: remove old APIs

Now no users are using the old APIs, just remove them.

Link: https://lkml.kernel.org/r/20230911094444.68966-40-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index f4a5249f00b2..1d3899f37229 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -105,14 +105,6 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
 void shrinker_register(struct shrinker *shrinker);
 void shrinker_free(struct shrinker *shrinker);
 
-extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker,
-					    const char *fmt, ...);
-extern void register_shrinker_prepared(struct shrinker *shrinker);
-extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker,
-					    const char *fmt, ...);
-extern void unregister_shrinker(struct shrinker *shrinker);
-extern void free_prealloced_shrinker(struct shrinker *shrinker);
-
 #ifdef CONFIG_SHRINKER_DEBUG
 extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
 						  const char *fmt, ...);
-- 
cgit v1.2.3


From 307bececcd1205bcb67a3c0d53a69db237ccc9d4 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:39 +0800
Subject: mm: shrinker: add a secondary array for shrinker_info::{map,
 nr_deferred}

Currently, we maintain two linear arrays per node per memcg, which are
shrinker_info::map and shrinker_info::nr_deferred. And we need to resize
them when the shrinker_nr_max is exceeded, that is, allocate a new array,
and then copy the old array to the new array, and finally free the old
array by RCU.

For shrinker_info::map, we do set_bit() under the RCU lock, so we may set
the value into the old map which is about to be freed. This may cause the
value set to be lost. The current solution is not to copy the old map when
resizing, but to set all the corresponding bits in the new map to 1. This
solves the data loss problem, but bring the overhead of more pointless
loops while doing memcg slab shrink.

For shrinker_info::nr_deferred, we will only modify it under the read lock
of shrinker_rwsem, so it will not run concurrently with the resizing. But
after we make memcg slab shrink lockless, there will be the same data loss
problem as shrinker_info::map, and we can't work around it like the map.

For such resizable arrays, the most straightforward idea is to change it
to xarray, like we did for list_lru [1]. We need to do xa_store() in the
list_lru_add()-->set_shrinker_bit(), but this will cause memory
allocation, and the list_lru_add() doesn't accept failure. A possible
solution is to pre-allocate, but the location of pre-allocation is not
well determined (such as deferred_split_shrinker case).

Therefore, this commit chooses to introduce the following secondary array
for shrinker_info::{map, nr_deferred}:

+---------------+--------+--------+-----+
| shrinker_info | unit 0 | unit 1 | ... | (secondary array)
+---------------+--------+--------+-----+
                     |
                     v
                +---------------+-----+
                | nr_deferred[] | map | (leaf array)
                +---------------+-----+
                (shrinker_info_unit)

The leaf array is never freed unless the memcg is destroyed. The secondary
array will be resized every time the shrinker id exceeds shrinker_nr_max.
So the shrinker_info_unit can be indexed from both the old and the new
shrinker_info->unit[x]. Then even if we get the old secondary array under
the RCU lock, the found map and nr_deferred are also true, so the updated
nr_deferred and map will not be lost.

[1]. https://lore.kernel.org/all/20220228122126.37293-13-songmuchun@bytedance.com/

[zhengqi.arch@bytedance.com: unlock the &shrinker_rwsem before the call to free_shrinker_info()]
  Link: https://lkml.kernel.org/r/20230928141517.12164-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911094444.68966-41-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 12 +-----------
 include/linux/shrinker.h   | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e4e24da16d2c..031102ac9311 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,6 +21,7 @@
 #include <linux/vmstat.h>
 #include <linux/writeback.h>
 #include <linux/page-flags.h>
+#include <linux/shrinker.h>
 
 struct mem_cgroup;
 struct obj_cgroup;
@@ -88,17 +89,6 @@ struct mem_cgroup_reclaim_iter {
 	unsigned int generation;
 };
 
-/*
- * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
- * shrinkers, which have elements charged to this memcg.
- */
-struct shrinker_info {
-	struct rcu_head rcu;
-	atomic_long_t *nr_deferred;
-	unsigned long *map;
-	int map_nr_max;
-};
-
 struct lruvec_stats_percpu {
 	/* Local (CPU and cgroup) state */
 	long state[NR_VM_NODE_STAT_ITEMS];
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 1d3899f37229..ba5ac82b5dbd 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -5,6 +5,23 @@
 #include <linux/atomic.h>
 #include <linux/types.h>
 
+#define SHRINKER_UNIT_BITS	BITS_PER_LONG
+
+/*
+ * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
+ * shrinkers, which have elements charged to the memcg.
+ */
+struct shrinker_info_unit {
+	atomic_long_t nr_deferred[SHRINKER_UNIT_BITS];
+	DECLARE_BITMAP(map, SHRINKER_UNIT_BITS);
+};
+
+struct shrinker_info {
+	struct rcu_head rcu;
+	int map_nr_max;
+	struct shrinker_info_unit *unit[];
+};
+
 /*
  * This struct is used to pass information from page reclaim to the shrinkers.
  * We consolidate the values for easier extension later.
-- 
cgit v1.2.3


From ca1d36b823944f24b5755311e95883fb5fdb807b Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:41 +0800
Subject: mm: shrinker: make global slab shrink lockless

The shrinker_rwsem is a global read-write lock in shrinkers subsystem,
which protects most operations such as slab shrink, registration and
unregistration of shrinkers, etc. This can easily cause problems in the
following cases.

1) When the memory pressure is high and there are many filesystems
   mounted or unmounted at the same time, slab shrink will be affected
   (down_read_trylock() failed).

   Such as the real workload mentioned by Kirill Tkhai:

   ```
   One of the real workloads from my experience is start
   of an overcommitted node containing many starting
   containers after node crash (or many resuming containers
   after reboot for kernel update). In these cases memory
   pressure is huge, and the node goes round in long reclaim.
   ```

2) If a shrinker is blocked (such as the case mentioned
   in [1]) and a writer comes in (such as mount a fs),
   then this writer will be blocked and cause all
   subsequent shrinker-related operations to be blocked.

Even if there is no competitor when shrinking slab, there may still be a
problem. The down_read_trylock() may become a perf hotspot with frequent
calls to shrink_slab(). Because of the poor multicore scalability of
atomic operations, this can lead to a significant drop in IPC
(instructions per cycle).

We used to implement the lockless slab shrink with SRCU [2], but then
kernel test robot reported -88.8% regression in
stress-ng.ramfs.ops_per_sec test case [3], so we reverted it [4].

This commit uses the refcount+RCU method [5] proposed by Dave Chinner
to re-implement the lockless global slab shrink. The memcg slab shrink is
handled in the subsequent patch.

For now, all shrinker instances are converted to dynamically allocated and
will be freed by call_rcu(). So we can use rcu_read_{lock,unlock}() to
ensure that the shrinker instance is valid.

And the shrinker instance will not be run again after unregistration. So
the structure that records the pointer of shrinker instance can be safely
freed without waiting for the RCU read-side critical section.

In this way, while we implement the lockless slab shrink, we don't need to
be blocked in unregister_shrinker().

The following are the test results:

stress-ng --timeout 60 --times --verify --metrics-brief --ramfs 9 &

1) Before applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            473062     60.00      8.00    279.13      7884.12        1647.59
for a 60.01s run time:
   1440.34s available CPU time
      7.99s user time   (  0.55%)
    279.13s system time ( 19.38%)
    287.12s total time  ( 19.93%)
load average: 7.12 2.99 1.15
successful run completed in 60.01s (1 min, 0.01 secs)

2) After applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            477165     60.00      8.13    281.34      7952.55        1648.40
for a 60.01s run time:
   1440.33s available CPU time
      8.12s user time   (  0.56%)
    281.34s system time ( 19.53%)
    289.46s total time  ( 20.10%)
load average: 6.98 3.03 1.19
successful run completed in 60.01s (1 min, 0.01 secs)

We can see that the ops/s has hardly changed.

[1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
[2]. https://lore.kernel.org/lkml/20230313112819.38938-1-zhengqi.arch@bytedance.com/
[3]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/
[4]. https://lore.kernel.org/all/20230609081518.3039120-1-qi.zheng@linux.dev/
[5]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/

Link: https://lkml.kernel.org/r/20230911094444.68966-43-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index ba5ac82b5dbd..e4f93120e0ab 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -4,6 +4,8 @@
 
 #include <linux/atomic.h>
 #include <linux/types.h>
+#include <linux/refcount.h>
+#include <linux/completion.h>
 
 #define SHRINKER_UNIT_BITS	BITS_PER_LONG
 
@@ -87,6 +89,17 @@ struct shrinker {
 	int seeks;	/* seeks to recreate an obj */
 	unsigned flags;
 
+	/*
+	 * The reference count of this shrinker. Registered shrinker have an
+	 * initial refcount of 1, then the lookup operations are now allowed
+	 * to use it via shrinker_try_get(). Later in the unregistration step,
+	 * the initial refcount will be discarded, and will free the shrinker
+	 * asynchronously via RCU after its refcount reaches 0.
+	 */
+	refcount_t refcount;
+	struct completion done;	/* use to wait for refcount to reach 0 */
+	struct rcu_head rcu;
+
 	void *private_data;
 
 	/* These are for internal use */
@@ -122,6 +135,17 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
 void shrinker_register(struct shrinker *shrinker);
 void shrinker_free(struct shrinker *shrinker);
 
+static inline bool shrinker_try_get(struct shrinker *shrinker)
+{
+	return refcount_inc_not_zero(&shrinker->refcount);
+}
+
+static inline void shrinker_put(struct shrinker *shrinker)
+{
+	if (refcount_dec_and_test(&shrinker->refcount))
+		complete(&shrinker->done);
+}
+
 #ifdef CONFIG_SHRINKER_DEBUG
 extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
 						  const char *fmt, ...);
-- 
cgit v1.2.3


From 09c550508a4b8f7844b197cc16877dd0f7c42d8f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 13 Sep 2023 14:51:13 +0200
Subject: mm/rmap: pass folio to hugepage_add_anon_rmap()

Let's pass a folio; we are always mapping the entire thing.

Link: https://lkml.kernel.org/r/20230913125113.313322-7-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 51cc21ebb568..d22f4d21a11c 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -203,7 +203,7 @@ void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
 void page_remove_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
 
-void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
+void hugepage_add_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
 void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
-- 
cgit v1.2.3


From 73eab3ca481e5be0f1fd8140365d604482f84ee1 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 13 Sep 2023 17:51:27 +0800
Subject: mm: migrate: convert migrate_misplaced_page() to
 migrate_misplaced_folio()

At present, numa balance only support base page and PMD-mapped THP, but we
will expand to support to migrate large folio/pte-mapped THP in the
future, it is better to make migrate_misplaced_page() to take a folio
instead of a page, and rename it to migrate_misplaced_folio(), it is a
preparation, also this remove several compound_head() calls.

Link: https://lkml.kernel.org/r/20230913095131.2426871-5-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/migrate.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 711dd9412561..2ce13e8a309b 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -142,10 +142,10 @@ const struct movable_operations *page_movable_ops(struct page *page)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
 			   int node);
 #else
-static inline int migrate_misplaced_page(struct page *page,
+static inline int migrate_misplaced_folio(struct folio *folio,
 					 struct vm_area_struct *vma, int node)
 {
 	return -EAGAIN; /* can't migrate now */
-- 
cgit v1.2.3


From 2a41815784e029cbef571511384f00fa40f2a82e Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:04 +0100
Subject: buffer: pass GFP flags to folio_alloc_buffers()

Patch series "Add and use bdev_getblk()", v2.

This patch series fixes a bug reported by Hui Zhu; see proposed
patches v1 and v2:
https://lore.kernel.org/linux-fsdevel/20230811035705.3296-1-teawaterz@linux.alibaba.com/
https://lore.kernel.org/linux-fsdevel/20230811071519.1094-1-teawaterz@linux.alibaba.com/

I decided to go in a rather different direction for this fix, and fix a
related problem at the same time.  I don't think there's any urgency to
rush this into Linus' tree, nor have I marked it for stable.  Reasonable
people may disagree.


This patch (of 8):

Instead of creating entirely new flags, inherit them from grow_dev_page().
The other callers create the same flags that this function used to
create.

Link: https://lkml.kernel.org/r/20230914150011.843330-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20230914150011.843330-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 44e9de51eedf..67d94d2be475 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -198,7 +198,7 @@ void touch_buffer(struct buffer_head *bh);
 void folio_set_bh(struct buffer_head *bh, struct folio *folio,
 		  unsigned long offset);
 struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
-					bool retry);
+					gfp_t gfp);
 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
 void create_empty_buffers(struct page *, unsigned long,
-- 
cgit v1.2.3


From 3ed65f04aac4d1cd025f30ee3fac174bcbf2b018 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:05 +0100
Subject: buffer: hoist GFP flags from grow_dev_page() to __getblk_gfp()

grow_dev_page() is only called by grow_buffers().  grow_buffers() is only
called by __getblk_slow() and __getblk_slow() is only called from
__getblk_gfp(), so it is safe to move the GFP flags setting all the way
up.  With that done, add a new bdev_getblk() entry point that leaves the
GFP flags the way the caller specified them.

[willy@infradead.org: fix grow_dev_page() error handling]
  Link: https://lkml.kernel.org/r/ZRREEIwqiy5DijKB@casper.infradead.org
Link: https://lkml.kernel.org/r/20230914150011.843330-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 67d94d2be475..7825bb3d63a7 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -227,6 +227,8 @@ void __wait_on_buffer(struct buffer_head *);
 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
 struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
 			unsigned size);
+struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
+		unsigned size, gfp_t gfp);
 struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
 				  unsigned size, gfp_t gfp);
 void __brelse(struct buffer_head *);
-- 
cgit v1.2.3


From c645e65c0675dd4df7ee68b995154dc1c1e7ce3b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:08 +0100
Subject: buffer: convert getblk_unmovable() and __getblk() to use
 bdev_getblk()

Move these two functions up in the file for the benefit of the next patch,
and pass in all of the GFP flags to use instead of the partial GFP flags
used by __getblk_gfp().

Link: https://lkml.kernel.org/r/20230914150011.843330-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7825bb3d63a7..9a3ca5f6d63d 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -340,6 +340,28 @@ sb_breadahead(struct super_block *sb, sector_t block)
 	__breadahead(sb->s_bdev, block, sb->s_blocksize);
 }
 
+static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
+		sector_t block, unsigned size)
+{
+	gfp_t gfp;
+
+	gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+	gfp |= __GFP_NOFAIL;
+
+	return bdev_getblk(bdev, block, size, gfp);
+}
+
+static inline struct buffer_head *__getblk(struct block_device *bdev,
+		sector_t block, unsigned size)
+{
+	gfp_t gfp;
+
+	gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+	gfp |= __GFP_MOVABLE | __GFP_NOFAIL;
+
+	return bdev_getblk(bdev, block, size, gfp);
+}
+
 static inline struct buffer_head *
 sb_getblk(struct super_block *sb, sector_t block)
 {
@@ -387,20 +409,6 @@ static inline void lock_buffer(struct buffer_head *bh)
 		__lock_buffer(bh);
 }
 
-static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
-						   sector_t block,
-						   unsigned size)
-{
-	return __getblk_gfp(bdev, block, size, 0);
-}
-
-static inline struct buffer_head *__getblk(struct block_device *bdev,
-					   sector_t block,
-					   unsigned size)
-{
-	return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
-}
-
 static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags)
 {
 	if (!buffer_uptodate(bh) && trylock_buffer(bh)) {
-- 
cgit v1.2.3


From 4b9c8b1919323f7f359376ca31a4c721cb2b3acf Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:09 +0100
Subject: buffer: convert sb_getblk() to call __getblk()

Now that __getblk() is in the right place in the file, it is trivial to
call it from sb_getblk().

Link: https://lkml.kernel.org/r/20230914150011.843330-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 9a3ca5f6d63d..b294e2cccbae 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -362,13 +362,12 @@ static inline struct buffer_head *__getblk(struct block_device *bdev,
 	return bdev_getblk(bdev, block, size, gfp);
 }
 
-static inline struct buffer_head *
-sb_getblk(struct super_block *sb, sector_t block)
+static inline struct buffer_head *sb_getblk(struct super_block *sb,
+		sector_t block)
 {
-	return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
+	return __getblk(sb->s_bdev, block, sb->s_blocksize);
 }
 
-
 static inline struct buffer_head *
 sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
 {
-- 
cgit v1.2.3


From 8a83ac54940d27b8f56d766e1cb270d150fedd50 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:10 +0100
Subject: ext4: call bdev_getblk() from sb_getblk_gfp()

Most of the callers of sb_getblk_gfp() already assumed that they were
passing the entire GFP flags to use.  Fix up the two callers that didn't,
and remove the __GFP_NOFAIL from them since they both appear to correctly
handle failure.

Link: https://lkml.kernel.org/r/20230914150011.843330-8-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index b294e2cccbae..22f13eece719 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -368,10 +368,10 @@ static inline struct buffer_head *sb_getblk(struct super_block *sb,
 	return __getblk(sb->s_bdev, block, sb->s_blocksize);
 }
 
-static inline struct buffer_head *
-sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
+static inline struct buffer_head *sb_getblk_gfp(struct super_block *sb,
+		sector_t block, gfp_t gfp)
 {
-	return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, gfp);
+	return bdev_getblk(sb->s_bdev, block, sb->s_blocksize, gfp);
 }
 
 static inline struct buffer_head *
-- 
cgit v1.2.3


From 93b13ecaa713e1fcbf23b7483eec065d300c5ad8 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:11 +0100
Subject: buffer: remove __getblk_gfp()

Inline it into __bread_gfp().

Link: https://lkml.kernel.org/r/20230914150011.843330-9-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 22f13eece719..3dc4720e4773 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -229,8 +229,6 @@ struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
 			unsigned size);
 struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
 		unsigned size, gfp_t gfp);
-struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
-				  unsigned size, gfp_t gfp);
 void __brelse(struct buffer_head *);
 void __bforget(struct buffer_head *);
 void __breadahead(struct block_device *, sector_t block, unsigned int size);
-- 
cgit v1.2.3


From 3dfbb555c98ac55b9d911f9af0e35014b445fb41 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 14 Sep 2023 15:16:39 +0200
Subject: mm, vmscan: remove ISOLATE_UNMAPPED

This isolate_mode_t flag is effectively unused since 89f6c88a6ab4 ("mm:
__isolate_lru_page_prepare() in isolate_migratepages_block()") as
sc->may_unmap is now checked directly (and only node_reclaim has a mode
that sets it to 0).  The last remaining place is mm_vmscan_lru_isolate
tracepoint for the isolate_mode parameter.  That one was mainly used to
indicate the active/inactive mode, which the trace-vmscan-postprocess.pl
script consumed, but that got silently broken.  After fixing the script by
the previous patch, it does not need the isolate_mode anymore.  So just
remove the parameter and with that the whole ISOLATE_UNMAPPED flag.

Link: https://lkml.kernel.org/r/20230914131637.12204-4-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4106fbc5b4b3..486587fcd27f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -639,8 +639,6 @@ struct lruvec {
 #endif
 };
 
-/* Isolate unmapped pages */
-#define ISOLATE_UNMAPPED	((__force isolate_mode_t)0x2)
 /* Isolate for asynchronous migration */
 #define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)
 /* Isolate unevictable pages */
-- 
cgit v1.2.3


From 4472edf63d6630e6cf65e205b4fc8c3c94d0afe5 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 14 Sep 2023 02:15:23 +0000
Subject: mm/damon/core: use number of passed access sampling as a timer

DAMON sleeps for sampling interval after each sampling, and check if the
aggregation interval and the ops update interval have passed using
ktime_get_coarse_ts64() and baseline timestamps for the intervals.  That
design is for making the operations occur at deterministic timing
regardless of the time that spend for each work.  However, it turned out
it is not that useful, and incur not-that-intuitive results.

After all, timer functions, and especially sleep functions that DAMON uses
to wait for specific timing, are not necessarily strictly accurate.  It is
legal design, so no problem.  However, depending on such inaccuracies, the
nr_accesses can be larger than aggregation interval divided by sampling
interval.  For example, with the default setting (5 ms sampling interval
and 100 ms aggregation interval) we frequently show regions having
nr_accesses larger than 20.  Also, if the execution of a DAMOS scheme
takes a long time, next aggregation could happen before enough number of
samples are collected.  This is not what usual users would intuitively
expect.

Since access check sampling is the smallest unit work of DAMON, using the
number of passed sampling intervals as the DAMON-internal timer can easily
avoid these problems.  That is, convert aggregation and ops update
intervals to numbers of sampling intervals that need to be passed before
those operations be executed, count the number of passed sampling
intervals, and invoke the operations as soon as the specific amount of
sampling intervals passed.  Make the change.

Note that this could make a behavioral change to settings that using
intervals that not aligned by the sampling interval.  For example, if the
sampling interval is 5 ms and the aggregation interval is 12 ms, DAMON
effectively uses 15 ms as its aggregation interval, because it checks
whether the aggregation interval after sleeping the sampling interval.
This change will make DAMON to effectively use 10 ms as aggregation
interval, since it uses 'aggregation interval / sampling interval *
sampling interval' as the effective aggregation interval, and we don't use
floating point types.  Usual users would have used aligned intervals, so
this behavioral change is not expected to make any meaningful impact, so
just make this change.

Link: https://lkml.kernel.org/r/20230914021523.60649-1-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index ab3089de1478..9a32b8fd0bd3 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -524,8 +524,18 @@ struct damon_ctx {
 	struct damon_attrs attrs;
 
 /* private: internal use only */
-	struct timespec64 last_aggregation;
-	struct timespec64 last_ops_update;
+	/* number of sample intervals that passed since this context started */
+	unsigned long passed_sample_intervals;
+	/*
+	 * number of sample intervals that should be passed before next
+	 * aggregation
+	 */
+	unsigned long next_aggregation_sis;
+	/*
+	 * number of sample intervals that should be passed before next ops
+	 * update
+	 */
+	unsigned long next_ops_update_sis;
 
 /* public: */
 	struct task_struct *kdamond;
-- 
cgit v1.2.3


From 78fbfb155d204428119310d1b9df665ab88da6e8 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:44 +0000
Subject: mm/damon/core: define and use a dedicated function for region access
 rate update

Patch series "mm/damon: provide pseudo-moving sum based access rate".

DAMON checks the access to each region for every sampling interval,
increase the access rate counter of the region, namely nr_accesses, if the
access was made.  For every aggregation interval, the counter is reset.
The counter is exposed to users to be used as a metric showing the
relative access rate (frequency) of each region.  In other words, DAMON
provides access rate of each region in every aggregation interval.  The
aggregation avoids temporal access pattern changes making things
confusing.  However, this also makes a few DAMON-related operations to
unnecessarily need to be aligned to the aggregation interval.  This can
restrict the flexibility of DAMON applications, especially when the
aggregation interval is huge.

To provide the monitoring results in finer-grained timing while keeping
handling of temporal access pattern change, this patchset implements a
pseudo-moving sum based access rate metric.  It is pseudo-moving sum
because strict moving sum implementation would need to keep all values for
last time window, and that could incur high overhead of there could be
arbitrary number of values in a time window.  Especially in case of the
nr_accesses, since the sampling interval and aggregation interval can
arbitrarily set and the past values should be maintained for every region,
it could be risky.  The pseudo-moving sum assumes there were no temporal
access pattern change in last discrete time window to remove the needs for
keeping the list of the last time window values.  As a result, it beocmes
not strict moving sum implementation, but provides a reasonable accuracy.

Also, it keeps an important property of the moving sum.  That is, the
moving sum becomes same to discrete-window based sum at the time that
aligns to the time window.  This means using the pseudo moving sum based
nr_accesses makes no change to users who shows the value for every
aggregation interval.

Patches Sequence
----------------

The sequence of the patches is as follows.  The first four patches are for
preparation of the change.  The first two (patches 1 and 2) implements a
helper function for nr_accesses update and eliminate corner case that
skips use of the function, respectively.  Following two (patches 3 and 4)
respectively implement the pseudo-moving sum function and its simple unit
test case.

Two patches for making DAMON to use the pseudo-moving sum follow.  The
fifthe one (patch 5) introduces a new field for representing the
pseudo-moving sum-based access rate of each region, and the sixth one
makes the new representation to actually updated with the pseudo-moving
sum function.

Last two patches (patches 7 and 8) makes followup fixes for skipping
unnecessary updates and marking the moving sum function as static,
respectively.


This patch (of 8):

Each DAMON operarions set is updating nr_accesses field of each
damon_region for each of their access check results, from the
check_accesses() callback.  Directly accessing the field could make things
complex to manage and change in future.  Define and use a dedicated
function for the purpose.

Link: https://lkml.kernel.org/r/20230915025251.72816-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20230915025251.72816-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 9a32b8fd0bd3..17c504d236b9 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -45,7 +45,9 @@ struct damon_addr_range {
  *
  * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be
  * increased for every &damon_attrs->sample_interval if an access to the region
- * during the last sampling interval is found.
+ * during the last sampling interval is found.  The update of this field should
+ * not be done with direct access but with the helper function,
+ * damon_update_region_access_rate().
  *
  * @age is initially zero, increased for each aggregation interval, and reset
  * to zero again if the access frequency is significantly changed.  If two
@@ -620,6 +622,7 @@ void damon_add_region(struct damon_region *r, struct damon_target *t);
 void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
 		unsigned int nr_ranges);
+void damon_update_region_access_rate(struct damon_region *r, bool accessed);
 
 struct damos_filter *damos_new_filter(enum damos_filter_type type,
 		bool matching);
-- 
cgit v1.2.3


From d2c062ade07ffd206dd16bf085f02abc59651309 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:46 +0000
Subject: mm/damon/core: implement a pseudo-moving sum function

For values that continuously change, moving average or sum are good ways
to provide fast updates while handling temporal and errorneous variability
of the value.  For example, the access rate counter (nr_accesses) is
calculated as a sum of the number of positive sampled access check results
that collected during a discrete time window (aggregation interval), and
hence it handles temporal and errorneous access check results, but
provides the update only for every aggregation interval.  Using a moving
sum method for that could allow providing the value for every sampling
interval.  That could be useful for getting monitoring results snapshot or
running DAMOS in fine-grained timing.

However, supporting the moving sum for cases that number of samples in the
time window is arbirary could impose high overhead, since the number of
past values that it needs to keep could be too high.  The nr_accesses
would also be one of the cases.  To mitigate the overhead, implement a
pseudo-moving sum function that only provides an estimated pseudo-moving
sum.  It assumes there was no error in last discrete time window and
subtract constant portion of last discrete time window sum.

Note that the function is not strictly implementing the moving sum, but it
keeps a property of moving sum, which makes the value same to the
dsicrete-window based sum for each time window-aligned timing.  Hence,
people collecting the value in the old timings would show no difference.

Link: https://lkml.kernel.org/r/20230915025251.72816-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 17c504d236b9..487a545a11b4 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -622,6 +622,8 @@ void damon_add_region(struct damon_region *r, struct damon_target *t);
 void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
 		unsigned int nr_ranges);
+unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
+		unsigned int len_window, unsigned int new_value);
 void damon_update_region_access_rate(struct damon_region *r, bool accessed);
 
 struct damos_filter *damos_new_filter(enum damos_filter_type type,
-- 
cgit v1.2.3


From 80333828ea7728ebe85d079bb5c1467eb9fc6c8c Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:48 +0000
Subject: mm/damon/core: introduce nr_accesses_bp

Add yet another representation of the access rate of each region, namely
nr_accesses_bp.  It is just same to the nr_accesses but represents the
value in basis point (1 in 10,000), and updated at once in every
aggregation interval.  That is, moving_accesses_bp is just nr_accesses *
10000.  This may seems useless at the moment.  However, it will be useful
for representing less than one nr_accesses value that will be needed to
make moving sum-based nr_accesses.

Link: https://lkml.kernel.org/r/20230915025251.72816-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 487a545a11b4..15f24b23c9a0 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -40,6 +40,7 @@ struct damon_addr_range {
  * @ar:			The address range of the region.
  * @sampling_addr:	Address of the sample for the next access check.
  * @nr_accesses:	Access frequency of this region.
+ * @nr_accesses_bp:	@nr_accesses in basis point (0.01%).
  * @list:		List head for siblings.
  * @age:		Age of this region.
  *
@@ -49,6 +50,9 @@ struct damon_addr_range {
  * not be done with direct access but with the helper function,
  * damon_update_region_access_rate().
  *
+ * @nr_accesses_bp is another representation of @nr_accesses in basis point
+ * (1 in 10,000) that updated every aggregation interval.
+ *
  * @age is initially zero, increased for each aggregation interval, and reset
  * to zero again if the access frequency is significantly changed.  If two
  * regions are merged into a new region, both @nr_accesses and @age of the new
@@ -58,6 +62,7 @@ struct damon_region {
 	struct damon_addr_range ar;
 	unsigned long sampling_addr;
 	unsigned int nr_accesses;
+	unsigned int nr_accesses_bp;
 	struct list_head list;
 
 	unsigned int age;
-- 
cgit v1.2.3


From ace30fb21af5f1be1605db72c16040b95b1557ef Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:49 +0000
Subject: mm/damon/core: use pseudo-moving sum for nr_accesses_bp

Let nr_accesses_bp be calculated as a pseudo-moving sum that updated for
every sampling interval, using damon_moving_sum().  This is assumed to be
useful for cases that the aggregation interval is set quite huge, but the
monivoting results need to be collected earlier than next aggregation
interval is passed.

Link: https://lkml.kernel.org/r/20230915025251.72816-7-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 15f24b23c9a0..0fe13482df63 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -40,7 +40,8 @@ struct damon_addr_range {
  * @ar:			The address range of the region.
  * @sampling_addr:	Address of the sample for the next access check.
  * @nr_accesses:	Access frequency of this region.
- * @nr_accesses_bp:	@nr_accesses in basis point (0.01%).
+ * @nr_accesses_bp:	@nr_accesses in basis point (0.01%) that updated for
+ *			each sampling interval.
  * @list:		List head for siblings.
  * @age:		Age of this region.
  *
@@ -51,7 +52,11 @@ struct damon_addr_range {
  * damon_update_region_access_rate().
  *
  * @nr_accesses_bp is another representation of @nr_accesses in basis point
- * (1 in 10,000) that updated every aggregation interval.
+ * (1 in 10,000) that updated for every &damon_attrs->sample_interval in a
+ * manner similar to moving sum.  By the algorithm, this value becomes
+ * @nr_accesses * 10000 for every &struct damon_attrs->aggr_interval.  This can
+ * be used when the aggregation interval is too huge and therefore cannot wait
+ * for it before getting the access monitoring results.
  *
  * @age is initially zero, increased for each aggregation interval, and reset
  * to zero again if the access frequency is significantly changed.  If two
@@ -629,7 +634,8 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
 		unsigned int nr_ranges);
 unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
 		unsigned int len_window, unsigned int new_value);
-void damon_update_region_access_rate(struct damon_region *r, bool accessed);
+void damon_update_region_access_rate(struct damon_region *r, bool accessed,
+		struct damon_attrs *attrs);
 
 struct damos_filter *damos_new_filter(enum damos_filter_type type,
 		bool matching);
-- 
cgit v1.2.3


From 863803a7948c8e33e6a7b002017747ca83ecfd63 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:51 +0000
Subject: mm/damon/core: mark damon_moving_sum() as a static function

The function is used by only mm/damon/core.c.  Mark it as a static
function.

Link: https://lkml.kernel.org/r/20230915025251.72816-9-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 0fe13482df63..491fdd3e4c76 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -632,8 +632,6 @@ void damon_add_region(struct damon_region *r, struct damon_target *t);
 void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
 		unsigned int nr_ranges);
-unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
-		unsigned int len_window, unsigned int new_value);
 void damon_update_region_access_rate(struct damon_region *r, bool accessed,
 		struct damon_attrs *attrs);
 
-- 
cgit v1.2.3


From 77e6c43e137c130138c3fbadc847351a83c4befe Mon Sep 17 00:00:00 2001
From: Usama Arif <usama.arif@bytedance.com>
Date: Wed, 13 Sep 2023 11:54:00 +0100
Subject: memblock: introduce MEMBLOCK_RSRV_NOINIT flag

For reserved memory regions marked with this flag, reserve_bootmem_region
is not called during memmap_init_reserved_pages.  This can be used to
avoid struct page initialization for regions which won't need them, for
e.g.  hugepages with Hugepage Vmemmap Optimization enabled.

Link: https://lkml.kernel.org/r/20230913105401.519709-4-usama.arif@bytedance.com
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Punit Agrawal <punit.agrawal@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memblock.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 1c1072e3ca06..ae3bde302f70 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -40,6 +40,8 @@ extern unsigned long long max_possible_pfn;
  * via a driver, and never indicated in the firmware-provided memory map as
  * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the
  * kernel resource tree.
+ * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are
+ * not initialized (only for reserved regions).
  */
 enum memblock_flags {
 	MEMBLOCK_NONE		= 0x0,	/* No special request */
@@ -47,6 +49,7 @@ enum memblock_flags {
 	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */
 	MEMBLOCK_NOMAP		= 0x4,	/* don't add to kernel direct mapping */
 	MEMBLOCK_DRIVER_MANAGED = 0x8,	/* always detected via a driver */
+	MEMBLOCK_RSRV_NOINIT	= 0x10,	/* don't initialize struct pages */
 };
 
 /**
@@ -125,6 +128,7 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
 int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
+int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size);
 
 void memblock_free_all(void);
 void memblock_free(void *ptr, size_t size);
@@ -259,6 +263,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m)
 	return m->flags & MEMBLOCK_NOMAP;
 }
 
+static inline bool memblock_is_reserved_noinit(struct memblock_region *m)
+{
+	return m->flags & MEMBLOCK_RSRV_NOINIT;
+}
+
 static inline bool memblock_is_driver_managed(struct memblock_region *m)
 {
 	return m->flags & MEMBLOCK_DRIVER_MANAGED;
-- 
cgit v1.2.3


From 42f994b71404b17abcd6b170de7a6aa95ffe5d4a Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Sat, 16 Sep 2023 02:09:40 +0000
Subject: mm/damon/core: implement scheme-specific apply interval

DAMON-based operation schemes are applied for every aggregation interval.
That was mainly because schemes were using nr_accesses, which be complete
to be used for every aggregation interval.  However, the schemes are now
using nr_accesses_bp, which is updated for each sampling interval in a way
that reasonable to be used.  Therefore, there is no reason to apply
schemes for each aggregation interval.

The unnecessary alignment with aggregation interval was also making some
use cases of DAMOS tricky.  Quotas setting under long aggregation interval
is one such example.  Suppose the aggregation interval is ten seconds, and
there is a scheme having CPU quota 100ms per 1s.  The scheme will actually
uses 100ms per ten seconds, since it cannobe be applied before next
aggregation interval.  The feature is working as intended, but the results
might not that intuitive for some users.  This could be fixed by updating
the quota to 1s per 10s.  But, in the case, the CPU usage of DAMOS could
look like spikes, and would actually make a bad effect to other
CPU-sensitive workloads.

Implement a dedicated timing interval for each DAMON-based operation
scheme, namely apply_interval.  The interval will be sampling interval
aligned, and each scheme will be applied for its apply_interval.  The
interval is set to 0 by default, and it means the scheme should use the
aggregation interval instead.  This avoids old users getting any
behavioral difference.

Link: https://lkml.kernel.org/r/20230916020945.47296-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 491fdd3e4c76..27b995c22497 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -314,16 +314,19 @@ struct damos_access_pattern {
  * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
  * @pattern:		Access pattern of target regions.
  * @action:		&damo_action to be applied to the target regions.
+ * @apply_interval_us:	The time between applying the @action.
  * @quota:		Control the aggressiveness of this scheme.
  * @wmarks:		Watermarks for automated (in)activation of this scheme.
  * @filters:		Additional set of &struct damos_filter for &action.
  * @stat:		Statistics of this scheme.
  * @list:		List head for siblings.
  *
- * For each aggregation interval, DAMON finds regions which fit in the
+ * For each @apply_interval_us, DAMON finds regions which fit in the
  * &pattern and applies &action to those. To avoid consuming too much
  * CPU time or IO resources for the &action, &quota is used.
  *
+ * If @apply_interval_us is zero, &damon_attrs->aggr_interval is used instead.
+ *
  * To do the work only when needed, schemes can be activated for specific
  * system situations using &wmarks.  If all schemes that registered to the
  * monitoring context are inactive, DAMON stops monitoring either, and just
@@ -340,6 +343,14 @@ struct damos_access_pattern {
 struct damos {
 	struct damos_access_pattern pattern;
 	enum damos_action action;
+	unsigned long apply_interval_us;
+/* private: internal use only */
+	/*
+	 * number of sample intervals that should be passed before applying
+	 * @action
+	 */
+	unsigned long next_apply_sis;
+/* public: */
 	struct damos_quota quota;
 	struct damos_watermarks wmarks;
 	struct list_head filters;
@@ -641,7 +652,9 @@ void damos_add_filter(struct damos *s, struct damos_filter *f);
 void damos_destroy_filter(struct damos_filter *f);
 
 struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
-			enum damos_action action, struct damos_quota *quota,
+			enum damos_action action,
+			unsigned long apply_interval_us,
+			struct damos_quota *quota,
 			struct damos_watermarks *wmarks);
 void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);
 void damon_destroy_scheme(struct damos *s);
-- 
cgit v1.2.3


From a8306f2d4dcea03538c70c26d2948483f70254ff Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Thu, 31 Aug 2023 09:33:40 -0700
Subject: compiler.h: unify __UNIQUE_ID

commit 6f33d58794ef ("__UNIQUE_ID()")
added a fallback definition of __UNIQUE_ID because gcc 4.2 and older did
not support __COUNTER__.

Also, this commit is effectively a revert of
commit b41c29b0527c ("Kbuild: provide a __UNIQUE_ID for clang")
which mentions clang 2.6+ supporting __COUNTER__.

Documentation/process/changes.rst currently lists the minimum supported
version of these compilers as:
- gcc: 5.1
- clang: 11.0.0
It should be safe to say that __COUNTER__ is well supported by this
point.

Link: https://lkml.kernel.org/r/20230831-unique_id-v1-1-28bacd18eb1d@google.com
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Cc: Michal rarek <mmarek@suse.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Paul Russel <rusty@rustcorp.com.au>
Cc: Tom Rix <trix@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/compiler-clang.h | 5 -----
 include/linux/compiler-gcc.h   | 2 --
 include/linux/compiler.h       | 5 +----
 3 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 9b673fefcef8..ddab1ef22bee 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -14,11 +14,6 @@
 #undef __cleanup
 #define __cleanup(func) __maybe_unused __attribute__((__cleanup__(func)))
 
-/* same as gcc, this was present in clang-2.6 so we can assume it works
- * with any version that can compile the kernel
- */
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
-
 /* all clang versions usable with the kernel support KASAN ABI version 5 */
 #define KASAN_ABI_VERSION 5
 
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 7af9e34ec261..2ceba3fe4ec1 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -39,8 +39,6 @@
 #define __noretpoline __attribute__((__indirect_branch__("keep")))
 #endif
 
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
-
 #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__)
 #define __latent_entropy __attribute__((latent_entropy))
 #endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index d7779a18b24f..174099fdc485 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -177,10 +177,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	__asm__ ("" : "=r" (var) : "0" (var))
 #endif
 
-/* Not-quite-unique ID. */
-#ifndef __UNIQUE_ID
-# define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__)
-#endif
+#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
 
 /**
  * data_race - mark an expression as containing intentional data races
-- 
cgit v1.2.3


From 33a9813825710fdc2b980d566ee391fd093a36c6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 24 Aug 2023 16:31:42 +0200
Subject: introduce __next_thread(), fix next_tid() vs exec() race

Patch series "introduce __next_thread(), change next_thread()".

After commit dce8f8ed1de1 ("document while_each_thread(), change
first_tid() to use for_each_thread()") + this series

1. We have only one lockless user of next_thread(), task_group_seq_get_next().
   I think it should be changed too.

2. We have only one user of task_struct->thread_group, thread_group_empty().
   The next patches will change thread_group_empty() and kill ->thread_group.


This patch (of 2):

next_tid(start) does:

	rcu_read_lock();
	if (pid_alive(start)) {
		pos = next_thread(start);
		if (thread_group_leader(pos))
			pos = NULL;
		else
			get_task_struct(pos);

it should return pos = NULL when next_thread() wraps to the 1st thread
in the thread group, group leader, and the thread_group_leader() check
tries to detect this case.

But this can race with exec. To simplify, suppose we have a main thread
M and a single sub-thread T, next_tid(T) should return NULL.

Now suppose that T execs. If next_tid(T) is called after T changes the
leadership and before it does release_task() which removes the old leader
from list, then next_thread() returns M and thread_group_leader(M) = F.

Lockless use of next_thread() should be avoided. After this change only
task_group_seq_get_next() does this, and I believe it should be changed
as well.

Link: https://lkml.kernel.org/r/20230824143112.GA31208@redhat.com
Link: https://lkml.kernel.org/r/20230824143142.GA31222@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/signal.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..7fb34b8cda54 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -715,6 +715,17 @@ bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
 	return p1->signal == p2->signal;
 }
 
+/*
+ * returns NULL if p is the last thread in the thread group
+ */
+static inline struct task_struct *__next_thread(struct task_struct *p)
+{
+	return list_next_or_null_rcu(&p->signal->thread_head,
+					&p->thread_node,
+					struct task_struct,
+					thread_node);
+}
+
 static inline struct task_struct *next_thread(const struct task_struct *p)
 {
 	return list_entry_rcu(p->thread_group.next,
-- 
cgit v1.2.3


From d639cf4abb4d171ab2456904da5668c42b5c1937 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 24 Aug 2023 16:32:01 +0200
Subject: change next_thread() to use __next_thread() ?: group_leader

This relies on fact that group leader is always the 1st entry in the
signal->thread_head list.

With or without this change, if the lockless next_thread(last_thread)
races with exec it can return the old or the new leader.

We are almost ready to kill task->thread_group, after this change its
only user is thread_group_empty().

Link: https://lkml.kernel.org/r/20230824143201.GB31222@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/signal.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 7fb34b8cda54..cffc882d367f 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -726,10 +726,9 @@ static inline struct task_struct *__next_thread(struct task_struct *p)
 					thread_node);
 }
 
-static inline struct task_struct *next_thread(const struct task_struct *p)
+static inline struct task_struct *next_thread(struct task_struct *p)
 {
-	return list_entry_rcu(p->thread_group.next,
-			      struct task_struct, thread_group);
+	return __next_thread(p) ?: p->group_leader;
 }
 
 static inline int thread_group_empty(struct task_struct *p)
-- 
cgit v1.2.3


From e34a35ee1f52312af130b5ebd42fa28313fc6149 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 26 Aug 2023 13:14:06 +0200
Subject: change thread_group_empty() to use task_struct->thread_node

Patch series "kill task_struct->thread_group".


This patch (of 2):

It could use list_is_singular() but this way it is cheaper.  Plus the
thread_group_leader() check makes it clear that thread_group_empty() can
only return true if p is a group leader.  This was not immediately obvious
before this patch.

task_struct->thread_group no longer has users, it can die.

Link: https://lkml.kernel.org/r/20230826111200.GA22982@redhat.com
Link: https://lkml.kernel.org/r/20230826111406.GA23238@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/signal.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index cffc882d367f..d7fa3ca2fa53 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -733,7 +733,8 @@ static inline struct task_struct *next_thread(struct task_struct *p)
 
 static inline int thread_group_empty(struct task_struct *p)
 {
-	return list_empty(&p->thread_group);
+	return thread_group_leader(p) &&
+	       list_is_last(&p->thread_node, &p->signal->thread_head);
 }
 
 #define delay_group_leader(p) \
-- 
cgit v1.2.3


From 8e1f385104ac044f1552686ad6e1cbc71cc05a30 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 26 Aug 2023 13:14:09 +0200
Subject: kill task_struct->thread_group

The last user was removed by the previous patch.

Link: https://lkml.kernel.org/r/20230826111409.GA23243@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..6d1341b1673f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1002,7 +1002,6 @@ struct task_struct {
 	/* PID/PID hash table linkage. */
 	struct pid			*thread_pid;
 	struct hlist_node		pid_links[PIDTYPE_MAX];
-	struct list_head		thread_group;
 	struct list_head		thread_node;
 
 	struct completion		*vfork_done;
-- 
cgit v1.2.3


From 9cba82bba500e3ce875381350f289cfb3aa633ba Mon Sep 17 00:00:00 2001
From: Xingui Yang <yangxingui@huawei.com>
Date: Tue, 5 Sep 2023 02:48:33 +0000
Subject: seq_file: add helper macro to define attribute for rw file

Patch series "Add helper macro DEFINE_SHOW_STORE_ATTRIBUTE() at
seq_file.c", v6.

We already own DEFINE_SHOW_ATTRIBUTE() helper macro for defining attribute
for read-only file, but we found many of drivers also want a helper macro
for read-write file too.

So we add this helper macro to reduce duplicated code.


This patch (of 3):

We already own DEFINE_SHOW_ATTRIBUTE() helper macro for defining attribute
for read-only file, but many of drivers want a helper macro for read-write
file too.

So we add DEFINE_SHOW_STORE_ATTRIBUTE() helper to reduce duplicated code.

Link: https://lkml.kernel.org/r/20230905024835.43219-1-yangxingui@huawei.com
Link: https://lkml.kernel.org/r/20230905024835.43219-2-yangxingui@huawei.com
Signed-off-by: Luo Jiaxing <luojiaxing@huawei.com>
Co-developed-by: Xingui Yang <yangxingui@huawei.com>
Signed-off-by: Xingui Yang <yangxingui@huawei.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Animesh Manna <animesh.manna@intel.com>
Cc: Anshuman Gupta <anshuman.gupta@intel.com>
Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Cc: Felipe Balbi <felipe.balbi@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Himanshu Madhani <himanshu.madhani@cavium.com>
Cc: James Bottomley <jejb@linux.ibm.com>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Uma Shankar <uma.shankar@intel.com>
Cc: Xiang Chen <chenxiang66@hisilicon.com>
Cc: Zeng Tao <prime.zeng@hisilicon.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/seq_file.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 386ab580b839..234bcdb1fba4 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -207,6 +207,21 @@ static const struct file_operations __name ## _fops = {			\
 	.release	= single_release,				\
 }
 
+#define DEFINE_SHOW_STORE_ATTRIBUTE(__name)				\
+static int __name ## _open(struct inode *inode, struct file *file)	\
+{									\
+	return single_open(file, __name ## _show, inode->i_private);	\
+}									\
+									\
+static const struct file_operations __name ## _fops = {			\
+	.owner		= THIS_MODULE,					\
+	.open		= __name ## _open,				\
+	.read		= seq_read,					\
+	.write		= __name ## _write,				\
+	.llseek		= seq_lseek,					\
+	.release	= single_release,				\
+}
+
 #define DEFINE_PROC_SHOW_ATTRIBUTE(__name)				\
 static int __name ## _open(struct inode *inode, struct file *file)	\
 {									\
-- 
cgit v1.2.3


From 6309727ef27162deabd5c095c11af24970fba5a2 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Fri, 8 Sep 2023 01:40:48 +0200
Subject: kthread: add kthread_stop_put

Add a kthread_stop_put() helper that stops a thread and puts its task
struct.  Use it to replace the various instances of kthread_stop()
followed by put_task_struct().

Remove the kthread_stop_put() macro in usbip that is similar but doesn't
return the result of kthread_stop().

[agruenba@redhat.com: fix kerneldoc comment]
  Link: https://lkml.kernel.org/r/20230911111730.2565537-1-agruenba@redhat.com
[akpm@linux-foundation.org: document kthread_stop_put()'s argument]
Link: https://lkml.kernel.org/r/20230907234048.2499820-1-agruenba@redhat.com
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kthread.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 2c30ade43bc8..b11f53c1ba2e 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -86,6 +86,7 @@ void free_kthread_struct(struct task_struct *k);
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
+int kthread_stop_put(struct task_struct *k);
 bool kthread_should_stop(void);
 bool kthread_should_park(void);
 bool kthread_should_stop_or_park(void);
-- 
cgit v1.2.3


From 5e57418a2031cd5e1863efdf3d7447a16a368172 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 11 Sep 2023 18:49:13 +0300
Subject: minmax: deduplicate __unconst_integer_typeof()

It appears that compiler_types.h already have an implementation of the
__unconst_integer_typeof() called __unqual_scalar_typeof().  Use it
instead of the copy.

Link: https://lkml.kernel.org/r/20230911154913.4176033-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Herve Codina <herve.codina@bootlin.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 83aebc244cba..69bbe987fa87 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_MINMAX_H
 #define _LINUX_MINMAX_H
 
+#include <linux/compiler_types.h>
 #include <linux/const.h>
 #include <linux/types.h>
 
@@ -134,27 +135,6 @@
  */
 #define max_t(type, x, y)	__careful_cmp((type)(x), (type)(y), >)
 
-/*
- * Remove a const qualifier from integer types
- * _Generic(foo, type-name: association, ..., default: association) performs a
- * comparison against the foo type (not the qualified type).
- * Do not use the const keyword in the type-name as it will not match the
- * unqualified type of foo.
- */
-#define __unconst_integer_type_cases(type)	\
-	unsigned type:  (unsigned type)0,	\
-	signed type:    (signed type)0
-
-#define __unconst_integer_typeof(x) typeof(			\
-	_Generic((x),						\
-		char: (char)0,					\
-		__unconst_integer_type_cases(char),		\
-		__unconst_integer_type_cases(short),		\
-		__unconst_integer_type_cases(int),		\
-		__unconst_integer_type_cases(long),		\
-		__unconst_integer_type_cases(long long),	\
-		default: (x)))
-
 /*
  * Do not check the array parameter using __must_be_array().
  * In the following legit use-case where the "array" passed is a simple pointer,
@@ -169,13 +149,13 @@
  * 'int *buff' and 'int buff[N]' types.
  *
  * The array can be an array of const items.
- * typeof() keeps the const qualifier. Use __unconst_integer_typeof() in order
+ * typeof() keeps the const qualifier. Use __unqual_scalar_typeof() in order
  * to discard the const qualifier for the __element variable.
  */
 #define __minmax_array(op, array, len) ({				\
 	typeof(&(array)[0]) __array = (array);				\
 	typeof(len) __len = (len);					\
-	__unconst_integer_typeof(__array[0]) __element = __array[--__len]; \
+	__unqual_scalar_typeof(__array[0]) __element = __array[--__len];\
 	while (__len--)							\
 		__element = op(__element, __array[__len]);		\
 	__element; })
-- 
cgit v1.2.3


From f6e9d38f8eb00ac8b52e6d15f6aa9bcecacb081b Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 12 Sep 2023 12:23:55 +0300
Subject: minmax: fix header inclusions

BUILD_BUG_ON*() macros are defined in build_bug.h.  Include it.  Replace
compiler_types.h by compiler.h, which provides the former, to have a
definition of the __UNIQUE_ID().

Link: https://lkml.kernel.org/r/20230912092355.79280-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Herve Codina <herve.codina@bootlin.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 69bbe987fa87..ca69abd6151e 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -2,7 +2,8 @@
 #ifndef _LINUX_MINMAX_H
 #define _LINUX_MINMAX_H
 
-#include <linux/compiler_types.h>
+#include <linux/build_bug.h>
+#include <linux/compiler.h>
 #include <linux/const.h>
 #include <linux/types.h>
 
-- 
cgit v1.2.3


From a9e1a3d84e4a0ea560ed4d84c28d06dbfdffed22 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:35 +0800
Subject: crash_core: change the prototype of function parse_crashkernel()

Add two parameters 'low_size' and 'high' to function parse_crashkernel(),
later crashkernel=,high|low parsing will be added.  Make adjustments in
all call sites of parse_crashkernel() in arch.

Link: https://lkml.kernel.org/r/20230914033142.676708-3-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 0c06561bf5ff..6156355ef831 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -80,7 +80,8 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 void final_note(Elf_Word *buf);
 
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
-		unsigned long long *crash_size, unsigned long long *crash_base);
+		unsigned long long *crash_size, unsigned long long *crash_base,
+		unsigned long long *low_size, bool *high);
 int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base);
 int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
-- 
cgit v1.2.3


From 70916e9c8d9f1a286c99727072b22e395097909f Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:36 +0800
Subject: crash_core: change parse_crashkernel() to support
 crashkernel=,high|low parsing

Now parse_crashkernel() is a real entry point for all kinds of crahskernel
parsing on any architecture.

And wrap the crahskernel=,high|low handling inside
CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION ifdeffery scope.

Link: https://lkml.kernel.org/r/20230914033142.676708-4-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 6156355ef831..d8050a7eab01 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -79,6 +79,12 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
 void final_note(Elf_Word *buf);
 
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+#define DEFAULT_CRASH_KERNEL_LOW_SIZE  (128UL << 20)
+#endif
+#endif
+
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base,
 		unsigned long long *low_size, bool *high);
-- 
cgit v1.2.3


From 0ab97169aa0517079b22c2e64192906caa5dc6d5 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:37 +0800
Subject: crash_core: add generic function to do reservation

In architecture like x86_64, arm64 and riscv, they have vast virtual
address space and usually have huge physical memory RAM.  Their
crashkernel reservation doesn't have to be limited under 4G RAM, but can
be extended to the whole physical memory via crashkernel=,high support.

Now add function reserve_crashkernel_generic() to reserve crashkernel
memory if users specify any case of kernel pamameters, like
crashkernel=xM[@offset] or crashkernel=,high|low.

This is preparation to simplify code of crashkernel=,high support in
architecutures.

Link: https://lkml.kernel.org/r/20230914033142.676708-5-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index d8050a7eab01..4dbd6565e0ff 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -93,6 +93,34 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
 int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base);
 
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+#define DEFAULT_CRASH_KERNEL_LOW_SIZE	(128UL << 20)
+#endif
+#ifndef CRASH_ALIGN
+#define CRASH_ALIGN			SZ_2M
+#endif
+#ifndef CRASH_ADDR_LOW_MAX
+#define CRASH_ADDR_LOW_MAX		SZ_4G
+#endif
+#ifndef CRASH_ADDR_HIGH_MAX
+#define CRASH_ADDR_HIGH_MAX		memblock_end_of_DRAM()
+#endif
+
+void __init reserve_crashkernel_generic(char *cmdline,
+		unsigned long long crash_size,
+		unsigned long long crash_base,
+		unsigned long long crash_low_size,
+		bool high);
+#else
+static inline void __init reserve_crashkernel_generic(char *cmdline,
+		unsigned long long crash_size,
+		unsigned long long crash_base,
+		unsigned long long crash_low_size,
+		bool high)
+{}
+#endif
+
 /* Alignment required for elf header segment */
 #define ELF_CORE_HEADER_ALIGN   4096
 
-- 
cgit v1.2.3


From b631b95dded5e7f007a3a79cbaf82ef50c1e2cf7 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:38 +0800
Subject: crash_core: move crashk_*res definition into crash_core.c

Both crashk_res and crashk_low_res are used to mark the reserved
crashkernel regions in iomem_resource tree.  And later the generic
crashkernel resrvation will be added into crash_core.c.  So move
crashk_res and crashk_low_res definition into crash_core.c to avoid
compiling error if CONFIG_CRASH_CORE=on while CONFIG_KEXEC_CORE is unset.

Meanwhile include <asm/crash_core.h> in <linux/crash_core.h> if generic
reservation is needed.  In that case, <asm/crash_core.h> need be added by
ARCH.  In asm/crash_core.h, ARCH can provide its own macro definitions to
override macros in <linux/crash_core.h> if needed.  Wrap the including
into CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION ifdeffery scope to
avoid compiling error in other ARCH-es which don't take the generic
reservation way yet.

Link: https://lkml.kernel.org/r/20230914033142.676708-6-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h | 8 ++++++++
 include/linux/kexec.h      | 4 ----
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 4dbd6565e0ff..3c735a7e33fb 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -5,6 +5,14 @@
 #include <linux/linkage.h>
 #include <linux/elfcore.h>
 #include <linux/elf.h>
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#include <asm/crash_core.h>
+#endif
+
+/* Location of a reserved region to hold the crash kernel.
+ */
+extern struct resource crashk_res;
+extern struct resource crashk_low_res;
 
 #define CRASH_CORE_NOTE_NAME	   "CORE"
 #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 32c78078552c..8227455192b7 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -22,10 +22,6 @@
 #include <uapi/linux/kexec.h>
 #include <linux/verification.h>
 
-/* Location of a reserved region to hold the crash kernel.
- */
-extern struct resource crashk_res;
-extern struct resource crashk_low_res;
 extern note_buf_t __percpu *crash_notes;
 
 #ifdef CONFIG_KEXEC_CORE
-- 
cgit v1.2.3


From c37e56cac3d62c69f093904afbc58fc428484d14 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:42 +0800
Subject: crash_core.c: remove unneeded functions

So far, nobody calls functions parse_crashkernel_high() and
parse_crashkernel_low(), remove both of them.

Link: https://lkml.kernel.org/r/20230914033142.676708-10-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 3c735a7e33fb..3426f6eef60b 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -96,10 +96,6 @@ void final_note(Elf_Word *buf);
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base,
 		unsigned long long *low_size, bool *high);
-int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
-		unsigned long long *crash_size, unsigned long long *crash_base);
-int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
-		unsigned long long *crash_size, unsigned long long *crash_base);
 
 #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
 #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
-- 
cgit v1.2.3


From 2cb1f6e9a743af58a23cf14563b5eada1e0d3fde Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Sep 2023 22:36:00 +0200
Subject: rcu: Conditionally build CPU-hotplug teardown callbacks

Among the three CPU-hotplug teardown RCU callbacks, two of them early
exit if CONFIG_HOTPLUG_CPU=n, and one is left unchanged. In any case
all of them have an implementation when CONFIG_HOTPLUG_CPU=n.

Align instead with the common way to deal with CPU-hotplug teardown
callbacks and provide a proper stub when they are not supported.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/rcutree.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 153cfc7bbffd..46875c4e9f56 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -110,9 +110,16 @@ void rcu_all_qs(void);
 /* RCUtree hotplug events */
 int rcutree_prepare_cpu(unsigned int cpu);
 int rcutree_online_cpu(unsigned int cpu);
-int rcutree_offline_cpu(unsigned int cpu);
+void rcu_cpu_starting(unsigned int cpu);
+
+#ifdef CONFIG_HOTPLUG_CPU
 int rcutree_dead_cpu(unsigned int cpu);
 int rcutree_dying_cpu(unsigned int cpu);
-void rcu_cpu_starting(unsigned int cpu);
+int rcutree_offline_cpu(unsigned int cpu);
+#else
+#define rcutree_dead_cpu NULL
+#define rcutree_dying_cpu NULL
+#define rcutree_offline_cpu NULL
+#endif
 
 #endif /* __LINUX_RCUTREE_H */
-- 
cgit v1.2.3


From 448e9f34d91d1a4799fdb06a93c2c24b34b6fd9d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Sep 2023 22:36:01 +0200
Subject: rcu: Standardize explicit CPU-hotplug calls

rcu_report_dead() and rcutree_migrate_callbacks() have their headers in
rcupdate.h while those are pure rcutree calls, like the other CPU-hotplug
functions.

Also rcu_cpu_starting() and rcu_report_dead() have different naming
conventions while they mirror each other's effects.

Fix the headers and propose a naming that relates both functions and
aligns with the prefix of other rcutree CPU-hotplug functions.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/interrupt.h | 2 +-
 include/linux/rcupdate.h  | 2 --
 include/linux/rcutiny.h   | 2 +-
 include/linux/rcutree.h   | 7 ++++++-
 4 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a92bce40b04b..d05e1e9a553c 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -566,7 +566,7 @@ enum
  *
  * _ RCU:
  * 	1) rcutree_migrate_callbacks() migrates the queue.
- * 	2) rcu_report_dead() reports the final quiescent states.
+ * 	2) rcutree_report_cpu_dead() reports the final quiescent states.
  *
  * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue
  */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index aa351ddcbe8d..f7206b2623c9 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -122,8 +122,6 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
 void rcu_init(void);
 extern int rcu_scheduler_active;
 void rcu_sched_clock_irq(int user);
-void rcu_report_dead(void);
-void rcutree_migrate_callbacks(int cpu);
 
 #ifdef CONFIG_TASKS_RCU_GENERIC
 void rcu_init_tasks_generic(void);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 7b949292908a..d9ac7b136aea 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -171,6 +171,6 @@ static inline void rcu_all_qs(void) { barrier(); }
 #define rcutree_offline_cpu      NULL
 #define rcutree_dead_cpu         NULL
 #define rcutree_dying_cpu        NULL
-static inline void rcu_cpu_starting(unsigned int cpu) { }
+static inline void rcutree_report_cpu_starting(unsigned int cpu) { }
 
 #endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 46875c4e9f56..254244202ea9 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -110,7 +110,7 @@ void rcu_all_qs(void);
 /* RCUtree hotplug events */
 int rcutree_prepare_cpu(unsigned int cpu);
 int rcutree_online_cpu(unsigned int cpu);
-void rcu_cpu_starting(unsigned int cpu);
+void rcutree_report_cpu_starting(unsigned int cpu);
 
 #ifdef CONFIG_HOTPLUG_CPU
 int rcutree_dead_cpu(unsigned int cpu);
@@ -122,4 +122,9 @@ int rcutree_offline_cpu(unsigned int cpu);
 #define rcutree_offline_cpu NULL
 #endif
 
+void rcutree_migrate_callbacks(int cpu);
+
+/* Called from hotplug and also arm64 early secondary boot failure */
+void rcutree_report_cpu_dead(void);
+
 #endif /* __LINUX_RCUTREE_H */
-- 
cgit v1.2.3


From 5790b1fb3d672d9a1fe3881a7181dfdbe741568f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 4 Oct 2023 16:50:07 -0400
Subject: eventfs: Remove eventfs_file and just use eventfs_inode

Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.

struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
						const struct eventfs_entry *entries,
						int size, void *data);

is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:

struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
					 const struct eventfs_entry *entries,
					 int size, void *data);

where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.

The entries are defined by:

typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
				const struct file_operations **fops);

struct eventfs_entry {
	const char			*name;
	eventfs_callback		callback;
};

Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.

If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.

This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.

The "show_events_dentry" file has been updated to show the directories,
and any files they have.

With just the eventfs_file allocations:

 Before after deltas for meminfo (in kB):

   MemFree:		-14360
   MemAvailable:	-14260
   Buffers:		40
   Cached:		24
   Active:		44
   Inactive:		48
   Inactive(anon):	28
   Active(file):	44
   Inactive(file):	20
   Dirty:		-4
   AnonPages:		28
   Mapped:		4
   KReclaimable:	132
   Slab:		1604
   SReclaimable:	132
   SUnreclaim:		1472
   Committed_AS:	12

 Before after deltas for slabinfo:

   <slab>:		<objects>	[ * <size> = <total>]

   ext4_inode_cache	27		[* 1184 = 31968 ]
   extent_status	102		[*   40 = 4080 ]
   tracefs_inode_cache	144		[*  656 = 94464 ]
   buffer_head		39		[*  104 = 4056 ]
   shmem_inode_cache	49		[*  800 = 39200 ]
   filp			-53		[*  256 = -13568 ]
   dentry		251		[*  192 = 48192 ]
   lsm_file_cache	277		[*   32 = 8864 ]
   vm_area_struct	-14		[*  184 = -2576 ]
   trace_event_file	1748		[*   88 = 153824 ]
   kmalloc-1k		35		[* 1024 = 35840 ]
   kmalloc-256		49		[*  256 = 12544 ]
   kmalloc-192		-28		[*  192 = -5376 ]
   kmalloc-128		-30		[*  128 = -3840 ]
   kmalloc-96		10581		[*   96 = 1015776 ]
   kmalloc-64		3056		[*   64 = 195584 ]
   kmalloc-32		1291		[*   32 = 41312 ]
   kmalloc-16		2310		[*   16 = 36960 ]
   kmalloc-8		9216		[*    8 = 73728 ]

 Free memory dropped by 14,360 kB
 Available memory dropped by 14,260 kB
 Total slab additions in size: 1,771,032 bytes

With this change:

 Before after deltas for meminfo (in kB):

   MemFree:		-12084
   MemAvailable:	-11976
   Buffers:		32
   Cached:		32
   Active:		72
   Inactive:		168
   Inactive(anon):	176
   Active(file):	72
   Inactive(file):	-8
   Dirty:		24
   AnonPages:		196
   Mapped:		8
   KReclaimable:	148
   Slab:		836
   SReclaimable:	148
   SUnreclaim:		688
   Committed_AS:	324

 Before after deltas for slabinfo:

   <slab>:		<objects>	[ * <size> = <total>]

   tracefs_inode_cache	144		[* 656 = 94464 ]
   shmem_inode_cache	-23		[* 800 = -18400 ]
   filp			-92		[* 256 = -23552 ]
   dentry		179		[* 192 = 34368 ]
   lsm_file_cache	-3		[* 32 = -96 ]
   vm_area_struct	-13		[* 184 = -2392 ]
   trace_event_file	1748		[* 88 = 153824 ]
   kmalloc-1k		-49		[* 1024 = -50176 ]
   kmalloc-256		-27		[* 256 = -6912 ]
   kmalloc-128		1864		[* 128 = 238592 ]
   kmalloc-64		4685		[* 64 = 299840 ]
   kmalloc-32		-72		[* 32 = -2304 ]
   kmalloc-16		256		[* 16 = 4096 ]
   total = 721352

 Free memory dropped by 12,084 kB
 Available memory dropped by 11,976 kB
 Total slab additions in size:  721,352 bytes

That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.

Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  2 +-
 include/linux/tracefs.h      | 29 ++++++++++++++---------------
 2 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 21ae37e49319..12207dc6722d 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -649,7 +649,7 @@ struct trace_event_file {
 	struct list_head		list;
 	struct trace_event_call		*event_call;
 	struct event_filter __rcu	*filter;
-	struct eventfs_file             *ef;
+	struct eventfs_inode		*ei;
 	struct trace_array		*tr;
 	struct trace_subsystem_dir	*system;
 	struct list_head		triggers;
diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index 009072792fa3..0c39704455d9 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -23,26 +23,25 @@ struct file_operations;
 
 struct eventfs_file;
 
-struct dentry *eventfs_create_events_dir(const char *name,
-					 struct dentry *parent);
+typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
+				const struct file_operations **fops);
 
-struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
-					       struct dentry *parent);
+struct eventfs_entry {
+	const char			*name;
+	eventfs_callback		callback;
+};
 
-struct eventfs_file *eventfs_add_dir(const char *name,
-				     struct eventfs_file *ef_parent);
+struct eventfs_inode;
 
-int eventfs_add_file(const char *name, umode_t mode,
-		     struct eventfs_file *ef_parent, void *data,
-		     const struct file_operations *fops);
+struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
+						const struct eventfs_entry *entries,
+						int size, void *data);
 
-int eventfs_add_events_file(const char *name, umode_t mode,
-			 struct dentry *parent, void *data,
-			 const struct file_operations *fops);
+struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
+					 const struct eventfs_entry *entries,
+					 int size, void *data);
 
-void eventfs_remove(struct eventfs_file *ef);
-
-void eventfs_remove_events_dir(struct dentry *dentry);
+void eventfs_remove_dir(struct eventfs_inode *ei);
 
 struct dentry *tracefs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
-- 
cgit v1.2.3


From d7e9a9037de27b642d5a3edef7c69e2a2b460287 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg <drosen@google.com>
Date: Mon, 2 Oct 2023 16:09:35 -0700
Subject: f2fs: Support Block Size == Page Size

This allows f2fs to support cases where the block size = page size for
both 4K and 16K block sizes. Other sizes should work as well, should the
need arise. This does not currently support 4K Block size filesystems if
the page size is 16K.

Signed-off-by: Daniel Rosenberg <drosen@google.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/linux/f2fs_fs.h | 69 +++++++++++++++++++++++++++++--------------------
 1 file changed, 41 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index a82a4bb6ce68..07ed69c2840d 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -13,10 +13,10 @@
 
 #define F2FS_SUPER_OFFSET		1024	/* byte-size offset */
 #define F2FS_MIN_LOG_SECTOR_SIZE	9	/* 9 bits for 512 bytes */
-#define F2FS_MAX_LOG_SECTOR_SIZE	12	/* 12 bits for 4096 bytes */
-#define F2FS_LOG_SECTORS_PER_BLOCK	3	/* log number for sector/blk */
-#define F2FS_BLKSIZE			4096	/* support only 4KB block */
-#define F2FS_BLKSIZE_BITS		12	/* bits for F2FS_BLKSIZE */
+#define F2FS_MAX_LOG_SECTOR_SIZE	PAGE_SHIFT	/* Max is Block Size */
+#define F2FS_LOG_SECTORS_PER_BLOCK	(PAGE_SHIFT - 9) /* log number for sector/blk */
+#define F2FS_BLKSIZE			PAGE_SIZE /* support only block == page */
+#define F2FS_BLKSIZE_BITS		PAGE_SHIFT /* bits for F2FS_BLKSIZE */
 #define F2FS_MAX_EXTENSION		64	/* # of extension entries */
 #define F2FS_EXTENSION_LEN		8	/* max size of extension */
 #define F2FS_BLK_ALIGN(x)	(((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS)
@@ -210,14 +210,14 @@ struct f2fs_checkpoint {
 	unsigned char sit_nat_version_bitmap[];
 } __packed;
 
-#define CP_CHKSUM_OFFSET	4092	/* default chksum offset in checkpoint */
+#define CP_CHKSUM_OFFSET	(F2FS_BLKSIZE - sizeof(__le32))	/* default chksum offset in checkpoint */
 #define CP_MIN_CHKSUM_OFFSET						\
 	(offsetof(struct f2fs_checkpoint, sit_nat_version_bitmap))
 
 /*
  * For orphan inode management
  */
-#define F2FS_ORPHANS_PER_BLOCK	1020
+#define F2FS_ORPHANS_PER_BLOCK	((F2FS_BLKSIZE - 4 * sizeof(__le32)) / sizeof(__le32))
 
 #define GET_ORPHAN_BLOCKS(n)	(((n) + F2FS_ORPHANS_PER_BLOCK - 1) / \
 					F2FS_ORPHANS_PER_BLOCK)
@@ -243,14 +243,31 @@ struct f2fs_extent {
 #define F2FS_NAME_LEN		255
 /* 200 bytes for inline xattrs by default */
 #define DEFAULT_INLINE_XATTR_ADDRS	50
-#define DEF_ADDRS_PER_INODE	923	/* Address Pointers in an Inode */
+
+#define OFFSET_OF_END_OF_I_EXT		360
+#define SIZE_OF_I_NID			20
+
+struct node_footer {
+	__le32 nid;		/* node id */
+	__le32 ino;		/* inode number */
+	__le32 flag;		/* include cold/fsync/dentry marks and offset */
+	__le64 cp_ver;		/* checkpoint version */
+	__le32 next_blkaddr;	/* next node page block address */
+} __packed;
+
+/* Address Pointers in an Inode */
+#define DEF_ADDRS_PER_INODE	((F2FS_BLKSIZE - OFFSET_OF_END_OF_I_EXT	\
+					- SIZE_OF_I_NID	\
+					- sizeof(struct node_footer)) / sizeof(__le32))
 #define CUR_ADDRS_PER_INODE(inode)	(DEF_ADDRS_PER_INODE - \
 					get_extra_isize(inode))
 #define DEF_NIDS_PER_INODE	5	/* Node IDs in an Inode */
 #define ADDRS_PER_INODE(inode)	addrs_per_inode(inode)
-#define DEF_ADDRS_PER_BLOCK	1018	/* Address Pointers in a Direct Block */
+/* Address Pointers in a Direct Block */
+#define DEF_ADDRS_PER_BLOCK	((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32))
 #define ADDRS_PER_BLOCK(inode)	addrs_per_block(inode)
-#define NIDS_PER_BLOCK		1018	/* Node IDs in an Indirect Block */
+/* Node IDs in an Indirect Block */
+#define NIDS_PER_BLOCK		((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32))
 
 #define ADDRS_PER_PAGE(page, inode)	\
 	(IS_INODE(page) ? ADDRS_PER_INODE(inode) : ADDRS_PER_BLOCK(inode))
@@ -342,14 +359,6 @@ enum {
 
 #define OFFSET_BIT_MASK		GENMASK(OFFSET_BIT_SHIFT - 1, 0)
 
-struct node_footer {
-	__le32 nid;		/* node id */
-	__le32 ino;		/* inode number */
-	__le32 flag;		/* include cold/fsync/dentry marks and offset */
-	__le64 cp_ver;		/* checkpoint version */
-	__le32 next_blkaddr;	/* next node page block address */
-} __packed;
-
 struct f2fs_node {
 	/* can be one of three types: inode, direct, and indirect types */
 	union {
@@ -363,7 +372,7 @@ struct f2fs_node {
 /*
  * For NAT entries
  */
-#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry))
+#define NAT_ENTRY_PER_BLOCK (F2FS_BLKSIZE / sizeof(struct f2fs_nat_entry))
 
 struct f2fs_nat_entry {
 	__u8 version;		/* latest version of cached nat entry */
@@ -378,12 +387,13 @@ struct f2fs_nat_block {
 /*
  * For SIT entries
  *
- * Each segment is 2MB in size by default so that a bitmap for validity of
- * there-in blocks should occupy 64 bytes, 512 bits.
+ * A validity bitmap of 64 bytes covers 512 blocks of area. For a 4K page size,
+ * this results in a segment size of 2MB. For 16k pages, the default segment size
+ * is 8MB.
  * Not allow to change this.
  */
 #define SIT_VBLOCK_MAP_SIZE 64
-#define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry))
+#define SIT_ENTRY_PER_BLOCK (F2FS_BLKSIZE / sizeof(struct f2fs_sit_entry))
 
 /*
  * F2FS uses 4 bytes to represent block address. As a result, supported size of
@@ -418,7 +428,7 @@ struct f2fs_sit_block {
  * For segment summary
  *
  * One summary block contains exactly 512 summary entries, which represents
- * exactly 2MB segment by default. Not allow to change the basic units.
+ * exactly one segment by default. Not allow to change the basic units.
  *
  * NOTE: For initializing fields, you must use set_summary
  *
@@ -429,12 +439,12 @@ struct f2fs_sit_block {
  * from node's page's beginning to get a data block address.
  * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node)
  */
-#define ENTRIES_IN_SUM		512
+#define ENTRIES_IN_SUM		(F2FS_BLKSIZE / 8)
 #define	SUMMARY_SIZE		(7)	/* sizeof(struct summary) */
 #define	SUM_FOOTER_SIZE		(5)	/* sizeof(struct summary_footer) */
 #define SUM_ENTRY_SIZE		(SUMMARY_SIZE * ENTRIES_IN_SUM)
 
-/* a summary entry for a 4KB-sized block in a segment */
+/* a summary entry for a block in a segment */
 struct f2fs_summary {
 	__le32 nid;		/* parent node id */
 	union {
@@ -518,7 +528,7 @@ struct f2fs_journal {
 	};
 } __packed;
 
-/* 4KB-sized summary block structure */
+/* Block-sized summary block structure */
 struct f2fs_summary_block {
 	struct f2fs_summary entries[ENTRIES_IN_SUM];
 	struct f2fs_journal journal;
@@ -559,11 +569,14 @@ typedef __le32	f2fs_hash_t;
  * Note: there are more reserved space in inline dentry than in regular
  * dentry, when converting inline dentry we should handle this carefully.
  */
-#define NR_DENTRY_IN_BLOCK	214	/* the number of dentry in a block */
+
+/* the number of dentry in a block */
+#define NR_DENTRY_IN_BLOCK	((BITS_PER_BYTE * F2FS_BLKSIZE) / \
+					((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * BITS_PER_BYTE + 1))
 #define SIZE_OF_DIR_ENTRY	11	/* by byte */
 #define SIZE_OF_DENTRY_BITMAP	((NR_DENTRY_IN_BLOCK + BITS_PER_BYTE - 1) / \
 					BITS_PER_BYTE)
-#define SIZE_OF_RESERVED	(PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \
+#define SIZE_OF_RESERVED	(F2FS_BLKSIZE - ((SIZE_OF_DIR_ENTRY + \
 				F2FS_SLOT_LEN) * \
 				NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP))
 #define MIN_INLINE_DENTRY_SIZE		40	/* just include '.' and '..' entries */
@@ -576,7 +589,7 @@ struct f2fs_dir_entry {
 	__u8 file_type;		/* file type */
 } __packed;
 
-/* 4KB-sized directory entry block */
+/* Block-sized directory entry block */
 struct f2fs_dentry_block {
 	/* validity bitmap for directory entries in each block */
 	__u8 dentry_bitmap[SIZE_OF_DENTRY_BITMAP];
-- 
cgit v1.2.3


From 1cf56299f9bc7d4b8e1e39af08f01d6380e28173 Mon Sep 17 00:00:00 2001
From: Randy Li <ayaka@soulik.info>
Date: Fri, 15 Sep 2023 01:23:23 +0800
Subject: USB: dma: remove unused function prototype

usb_buffer_map_sg() and usb_buffer_unmap_sg() have no definition
since the beginning of v5.4. The rest are gone from 2.6.12.

Signed-off-by: Randy Li <ayaka@soulik.info>
Link: https://lore.kernel.org/r/20230914172336.18761-2-ayaka@soulik.info
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index a21074861f91..8c61643acd49 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1823,22 +1823,6 @@ void *usb_alloc_coherent(struct usb_device *dev, size_t size,
 void usb_free_coherent(struct usb_device *dev, size_t size,
 	void *addr, dma_addr_t dma);
 
-#if 0
-struct urb *usb_buffer_map(struct urb *urb);
-void usb_buffer_dmasync(struct urb *urb);
-void usb_buffer_unmap(struct urb *urb);
-#endif
-
-struct scatterlist;
-int usb_buffer_map_sg(const struct usb_device *dev, int is_in,
-		      struct scatterlist *sg, int nents);
-#if 0
-void usb_buffer_dmasync_sg(const struct usb_device *dev, int is_in,
-			   struct scatterlist *sg, int n_hw_ents);
-#endif
-void usb_buffer_unmap_sg(const struct usb_device *dev, int is_in,
-			 struct scatterlist *sg, int n_hw_ents);
-
 /*-------------------------------------------------------------------*
  *                         SYNCHRONOUS CALL SUPPORT                  *
  *-------------------------------------------------------------------*/
-- 
cgit v1.2.3


From 0f28ada1fbf0054557cddcdb93ad17f767105208 Mon Sep 17 00:00:00 2001
From: Jorge Sanjuan Garcia <jorge.sanjuangarcia@duagon.com>
Date: Wed, 6 Sep 2023 11:49:26 +0000
Subject: mcb: remove is_added flag from mcb_device struct

When calling mcb_bus_add_devices(), both mcb devices and the mcb
bus will attempt to attach a device to a driver because they share
the same bus_type. This causes an issue when trying to cast the
container of the device to mcb_device struct using to_mcb_device(),
leading to a wrong cast when the mcb_bus is added. A crash occurs
when freing the ida resources as the bus numbering of mcb_bus gets
confused with the is_added flag on the mcb_device struct.

The only reason for this cast was to keep an is_added flag on the
mcb_device struct that does not seem necessary. The function
device_attach() handles already bound devices and the mcb subsystem
does nothing special with this is_added flag so remove it completely.

Fixes: 18d288198099 ("mcb: Correctly initialize the bus's device")
Cc: stable <stable@kernel.org>
Signed-off-by: Jorge Sanjuan Garcia <jorge.sanjuangarcia@duagon.com>
Co-developed-by: Jose Javier Rodriguez Barbarin <JoseJavier.Rodriguez@duagon.com>
Signed-off-by: Jose Javier Rodriguez Barbarin <JoseJavier.Rodriguez@duagon.com>
Link: https://lore.kernel.org/r/20230906114901.63174-2-JoseJavier.Rodriguez@duagon.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mcb.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mcb.h b/include/linux/mcb.h
index 1e5893138afe..0b971b24a804 100644
--- a/include/linux/mcb.h
+++ b/include/linux/mcb.h
@@ -63,7 +63,6 @@ static inline struct mcb_bus *to_mcb_bus(struct device *dev)
 struct mcb_device {
 	struct device dev;
 	struct mcb_bus *bus;
-	bool is_added;
 	struct mcb_driver *driver;
 	u16 id;
 	int inst;
-- 
cgit v1.2.3


From e6814ec3ba1994561db9b1c05a80227d30cc18fa Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Fri, 21 Jul 2023 09:06:07 +0000
Subject: perf/core: Rename perf_proc_update_handler() ->
 perf_event_max_sample_rate_handler(), for readability

Follow the naming pattern of the other sysctl handlers in perf.

Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230721090607.172002-1-xiujianfeng@huawei.com
---
 include/linux/perf_event.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e85cd1c0eaf3..f31f962a6445 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1573,7 +1573,7 @@ extern int sysctl_perf_cpu_time_max_percent;
 
 extern void perf_sample_event_took(u64 sample_len_ns);
 
-int perf_proc_update_handler(struct ctl_table *table, int write,
+int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
-- 
cgit v1.2.3


From 0cff993e08a7578e2c1df93a95fc5059f447e7ae Mon Sep 17 00:00:00 2001
From: "pangzizhen001@208suo.com" <pangzizhen001@208suo.com>
Date: Thu, 20 Jul 2023 23:45:39 +0800
Subject: locking/seqlock: Fix typo in comment

s/the the
 /the

[ mingo: Cleaned up the changelog. ]

Signed-off-by: Zizhen Pang <pangzizhen001@208suo.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/70293ecd5bb7a1cd370fd4d95c35f936@208suo.com
---
 include/linux/seqlock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 987a59d977c5..ea7a58258af6 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -864,7 +864,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
 }
 
 /*
- * For all seqlock_t write side functions, use the the internal
+ * For all seqlock_t write side functions, use the internal
  * do_write_seqcount_begin() instead of generic write_seqcount_begin().
  * This way, no redundant lockdep_assert_held() checks are added.
  */
-- 
cgit v1.2.3


From b83ce9cb4a465b8f9a3fa45561b721a9551f60e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Fri, 8 Sep 2023 10:27:23 +0200
Subject: dma-buf: add dma_fence_timestamp helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a fence signals there is a very small race window where the timestamp
isn't updated yet. sync_file solves this by busy waiting for the
timestamp to appear, but on other ocassions didn't handled this
correctly.

Provide a dma_fence_timestamp() helper function for this and use it in
all appropriate cases.

Another alternative would be to grab the spinlock when that happens.

v2 by teddy: add a wait parameter to wait for the timestamp to show up, in case
   the accurate timestamp is needed and/or the timestamp is not based on
   ktime (e.g. hw timestamp)
v3 chk: drop the parameter again for unified handling

Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Fixes: 1774baa64f93 ("drm/scheduler: Change scheduled fence track v2")
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
CC: stable@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20230929104725.2358-1-christian.koenig@amd.com
---
 include/linux/dma-fence.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 0d678e9a7b24..ebe78bd3d121 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -568,6 +568,25 @@ static inline void dma_fence_set_error(struct dma_fence *fence,
 	fence->error = error;
 }
 
+/**
+ * dma_fence_timestamp - helper to get the completion timestamp of a fence
+ * @fence: fence to get the timestamp from.
+ *
+ * After a fence is signaled the timestamp is updated with the signaling time,
+ * but setting the timestamp can race with tasks waiting for the signaling. This
+ * helper busy waits for the correct timestamp to appear.
+ */
+static inline ktime_t dma_fence_timestamp(struct dma_fence *fence)
+{
+	if (WARN_ON(!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)))
+		return ktime_get();
+
+	while (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags))
+		cpu_relax();
+
+	return fence->timestamp;
+}
+
 signed long dma_fence_wait_timeout(struct dma_fence *,
 				   bool intr, signed long timeout);
 signed long dma_fence_wait_any_timeout(struct dma_fence **fences,
-- 
cgit v1.2.3


From a083c755e136844a934bc9b4416cd23b5c19c617 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Fri, 8 Sep 2023 22:58:40 +0900
Subject: devres: rename the first parameter of devm_add_action(_or_reset)

The first parameter of devm_add_action(_or_reset) is a device.
The name 'release' is confusing because it is often used for
dr_release_t in the devres context.

Rename it to 'dev'. No functional change intended.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20230908135840.2362708-1-masahiroy@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 56d93a1ffb7b..d7a72a8749ea 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -389,8 +389,8 @@ void devm_remove_action(struct device *dev, void (*action)(void *), void *data);
 void devm_release_action(struct device *dev, void (*action)(void *), void *data);
 
 int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name);
-#define devm_add_action(release, action, data) \
-	__devm_add_action(release, action, data, #action)
+#define devm_add_action(dev, action, data) \
+	__devm_add_action(dev, action, data, #action)
 
 static inline int __devm_add_action_or_reset(struct device *dev, void (*action)(void *),
 					     void *data, const char *name)
@@ -403,8 +403,8 @@ static inline int __devm_add_action_or_reset(struct device *dev, void (*action)(
 
 	return ret;
 }
-#define devm_add_action_or_reset(release, action, data) \
-	__devm_add_action_or_reset(release, action, data, #action)
+#define devm_add_action_or_reset(dev, action, data) \
+	__devm_add_action_or_reset(dev, action, data, #action)
 
 /**
  * devm_alloc_percpu - Resource-managed alloc_percpu
-- 
cgit v1.2.3


From 5831fc1fd4a578232fea708b82de0c666ed17153 Mon Sep 17 00:00:00 2001
From: Longfang Liu <liulongfang@huawei.com>
Date: Thu, 28 Sep 2023 16:57:22 +0800
Subject: crypto: hisilicon/qm - fix PF queue parameter issue

If the queue isolation feature is enabled, the number of queues
supported by the device changes. When PF is enabled using the
current default number of queues, the default number of queues may
be greater than the number supported by the device. As a result,
the PF fails to be bound to the driver.

After modification, if queue isolation feature is enabled, when
the default queue parameter is greater than the number supported
by the device, the number of enabled queues will be changed to
the number supported by the device, so that the PF and driver
can be properly bound.

Fixes: 8bbecfb402f7 ("crypto: hisilicon/qm - add queue isolation support for Kunpeng930")
Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/hisi_acc_qm.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 39fbfb4be944..9da4f3f1e6d6 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -144,6 +144,13 @@ enum qm_vf_state {
 	QM_NOT_READY,
 };
 
+enum qm_misc_ctl_bits {
+	QM_DRIVER_REMOVING = 0x0,
+	QM_RST_SCHED,
+	QM_RESETTING,
+	QM_MODULE_PARAM,
+};
+
 enum qm_cap_bits {
 	QM_SUPPORT_DB_ISOLATION = 0x0,
 	QM_SUPPORT_FUNC_QOS,
-- 
cgit v1.2.3


From b42ab1c61a77832040ad42ebf9adf237360e49f7 Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Thu, 28 Sep 2023 17:21:47 +0800
Subject: crypto: hisilicon/qm - check function qp num before alg register

When the Kunpeng accelerator executes tasks such as encryption
and decryption have minimum requirements on the number of device
queues. If the number of queues does not meet the requirement,
the process initialization will fail. Therefore, the driver checks
the number of queues on the device before registering the algorithm.
If the number does not meet the requirements, the driver does not register
the algorithm to crypto subsystem, the device is still added to the
qm_list.

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/hisi_acc_qm.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 9da4f3f1e6d6..34c64a02712c 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -478,6 +478,20 @@ static inline void hisi_qm_init_list(struct hisi_qm_list *qm_list)
 	mutex_init(&qm_list->lock);
 }
 
+static inline void hisi_qm_add_list(struct hisi_qm *qm, struct hisi_qm_list *qm_list)
+{
+	mutex_lock(&qm_list->lock);
+	list_add_tail(&qm->list, &qm_list->list);
+	mutex_unlock(&qm_list->lock);
+}
+
+static inline void hisi_qm_del_list(struct hisi_qm *qm, struct hisi_qm_list *qm_list)
+{
+	mutex_lock(&qm_list->lock);
+	list_del(&qm->list);
+	mutex_unlock(&qm_list->lock);
+}
+
 int hisi_qm_init(struct hisi_qm *qm);
 void hisi_qm_uninit(struct hisi_qm *qm);
 int hisi_qm_start(struct hisi_qm *qm);
@@ -523,8 +537,8 @@ int hisi_qm_alloc_qps_node(struct hisi_qm_list *qm_list, int qp_num,
 void hisi_qm_free_qps(struct hisi_qp **qps, int qp_num);
 void hisi_qm_dev_shutdown(struct pci_dev *pdev);
 void hisi_qm_wait_task_finish(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
-int hisi_qm_alg_register(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
-void hisi_qm_alg_unregister(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
+int hisi_qm_alg_register(struct hisi_qm *qm, struct hisi_qm_list *qm_list, int guard);
+void hisi_qm_alg_unregister(struct hisi_qm *qm, struct hisi_qm_list *qm_list, int guard);
 int hisi_qm_resume(struct device *dev);
 int hisi_qm_suspend(struct device *dev);
 void hisi_qm_pm_uninit(struct hisi_qm *qm);
-- 
cgit v1.2.3


From 8468516f9f93a41dc65158b6428a1a1039c68f20 Mon Sep 17 00:00:00 2001
From: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Date: Mon, 2 Oct 2023 00:57:15 +0100
Subject: crypto: pkcs7 - remove md4 md5 x.509 support

Remove support for md4 md5 hash and signatures in x.509 certificate
parsers, pkcs7 signature parser, authenticode parser.

All of these are insecure or broken, and everyone has long time ago
migrated to alternative hash implementations.

Also remove md2 & md3 oids which have already didn't have support.

This is also likely the last user of md4 in the kernel, and thus
crypto/md4.c and related tests in tcrypt & testmgr can likely be
removed. Other users such as cifs smbfs ext modpost sumversions have
their own internal implementation as needed.

Signed-off-by: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/oid_registry.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index f86a08ba0207..4d04fa5d1eec 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -30,9 +30,6 @@ enum OID {
 
 	/* PKCS#1 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-1(1)} */
 	OID_rsaEncryption,		/* 1.2.840.113549.1.1.1 */
-	OID_md2WithRSAEncryption,	/* 1.2.840.113549.1.1.2 */
-	OID_md3WithRSAEncryption,	/* 1.2.840.113549.1.1.3 */
-	OID_md4WithRSAEncryption,	/* 1.2.840.113549.1.1.4 */
 	OID_sha1WithRSAEncryption,	/* 1.2.840.113549.1.1.5 */
 	OID_sha256WithRSAEncryption,	/* 1.2.840.113549.1.1.11 */
 	OID_sha384WithRSAEncryption,	/* 1.2.840.113549.1.1.12 */
@@ -49,11 +46,6 @@ enum OID {
 	OID_smimeCapabilites,		/* 1.2.840.113549.1.9.15 */
 	OID_smimeAuthenticatedAttrs,	/* 1.2.840.113549.1.9.16.2.11 */
 
-	/* {iso(1) member-body(2) us(840) rsadsi(113549) digestAlgorithm(2)} */
-	OID_md2,			/* 1.2.840.113549.2.2 */
-	OID_md4,			/* 1.2.840.113549.2.4 */
-	OID_md5,			/* 1.2.840.113549.2.5 */
-
 	OID_mskrb5,			/* 1.2.840.48018.1.2.2 */
 	OID_krb5,			/* 1.2.840.113554.1.2.2 */
 	OID_krb5u2u,			/* 1.2.840.113554.1.2.2.3 */
-- 
cgit v1.2.3


From 7523d330aac7190f738998a52df8d5aa14293280 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 4 Sep 2023 13:40:46 +0300
Subject: device property: Clarify usage scope of some struct fwnode_handle
 members

Most of the struct fwnode_handle members are for exclusive use with
device links framework. Clarify this by adding a respective comment.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Link: https://lore.kernel.org/r/20230904104046.1682875-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/fwnode.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index 5700451b300f..2a72f55d26eb 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -41,6 +41,8 @@ struct device;
 struct fwnode_handle {
 	struct fwnode_handle *secondary;
 	const struct fwnode_operations *ops;
+
+	/* The below is used solely by device links, don't use otherwise */
 	struct device *dev;
 	struct list_head suppliers;
 	struct list_head consumers;
-- 
cgit v1.2.3


From 1dc05a274a7b13fd61b6c43f0136153752e6f731 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 20 Sep 2023 18:38:19 +0300
Subject: device property: Replace custom implementation of COUNT_ARGS()

Replace custom and non-portable implementation of COUNT_ARGS().

Fixes: e64b674bc9d7 ("software node: implement reference properties")
Reported-by: Nick Desaulniers <ndesaulniers@google.com>
Closes: https://lore.kernel.org/r/ZQoILN6QCjzosCOs@google.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Closes: https://github.com/ClangBuiltLinux/linux/issues/1935
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Link: https://lore.kernel.org/r/20230920153819.2069869-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/property.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 8c3c6685a2ae..9f2585d705a8 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -10,6 +10,7 @@
 #ifndef _LINUX_PROPERTY_H_
 #define _LINUX_PROPERTY_H_
 
+#include <linux/args.h>
 #include <linux/bits.h>
 #include <linux/fwnode.h>
 #include <linux/stddef.h>
@@ -288,7 +289,7 @@ struct software_node_ref_args {
 #define SOFTWARE_NODE_REFERENCE(_ref_, ...)			\
 (const struct software_node_ref_args) {				\
 	.node = _ref_,						\
-	.nargs = ARRAY_SIZE(((u64[]){ 0, ##__VA_ARGS__ })) - 1,	\
+	.nargs = COUNT_ARGS(__VA_ARGS__),			\
 	.args = { __VA_ARGS__ },				\
 }
 
-- 
cgit v1.2.3


From a56cc0a8338523f709892696cc229527617c1316 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 3 Oct 2023 15:17:24 +0200
Subject: thermal: core: Add function to walk trips under zone lock

Add a wrapper around for_each_thermal_trip(), called
thermal_zone_for_each_trip(), that will invoke the former under the
thermal zone lock and pass its return value to the caller.

Two drivers will be modified subsequently to use this new function.

No functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 include/linux/thermal.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 6710a4ace992..2bab72149bbf 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -286,6 +286,9 @@ int thermal_zone_set_trip(struct thermal_zone_device *tz, int trip_id,
 int for_each_thermal_trip(struct thermal_zone_device *tz,
 			  int (*cb)(struct thermal_trip *, void *),
 			  void *data);
+int thermal_zone_for_each_trip(struct thermal_zone_device *tz,
+			       int (*cb)(struct thermal_trip *, void *),
+			       void *data);
 int thermal_zone_get_num_trips(struct thermal_zone_device *tz);
 
 int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp);
-- 
cgit v1.2.3


From 4963e34ce7b95237021575d208fa576f88697839 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 3 Oct 2023 15:25:33 +0200
Subject: thermal: core: Drop thermal_zone_device_exec()

Because thermal_zone_device_exec() has no users any more and there are
no plans to use it anywhere, revert commit 9a99a996d1ec ("thermal: core:
Introduce thermal_zone_device_exec()") that introduced it.

No functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 include/linux/thermal.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 2bab72149bbf..c8600e313909 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -339,10 +339,6 @@ int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int,
 				       struct thermal_cooling_device *);
 void thermal_zone_device_update(struct thermal_zone_device *,
 				enum thermal_notify_event);
-void thermal_zone_device_exec(struct thermal_zone_device *tz,
-			      void (*cb)(struct thermal_zone_device *,
-					 unsigned long),
-			      unsigned long data);
 
 struct thermal_cooling_device *thermal_cooling_device_register(const char *,
 		void *, const struct thermal_cooling_device_ops *);
-- 
cgit v1.2.3


From c62f5032f72a745542aa6a7e777c7819c96adfbe Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 13 Sep 2023 18:07:01 +0100
Subject: comedi: comedi_8254: Use a call-back function for register access

Rework the comedi_8254 module to use a call-back function for register
access.  This will make it easier to isolate the parts that will depend
on the `CONFIG_HAS_IOPORT` macro being defined and also allows the
possibility of supplying an external callback function during
initialization by a variant of the `comedi_8254_init()` and
`comedi_8254_mm_init()` functions, although that has not been
implemented yet.

The `struct comedi_8254` members have been changed to use a pointer to a
callback function and a context of type `unsigned long`.  The
`comedi_8254_init()` and `comedi_8254_mm_init()` functions use an
internal callback function and set the context to the base address of
the registers (for `comedi_8254_mm_init()` that involves converting a
`void __iomem *` to `unsigned long`).

A minor change to `dio200_subdev_8254_offset()` in the
amplc_dio200_common module has been made due to the changes in `struct
comedi_8254`.

Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20230913170712.111719-3-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/comedi/comedi_8254.h | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/comedi/comedi_8254.h b/include/linux/comedi/comedi_8254.h
index d8264417e53c..18d12321c87d 100644
--- a/include/linux/comedi/comedi_8254.h
+++ b/include/linux/comedi/comedi_8254.h
@@ -57,10 +57,24 @@ struct comedi_subdevice;
 /* counter maps zero to 0x10000 */
 #define I8254_MAX_COUNT			0x10000
 
+struct comedi_8254;
+
+/**
+ * typedef comedi_8254_iocb_fn - call-back function type for 8254 register access
+ * @i8254:		pointer to struct comedi_8254
+ * @dir:		direction (0 = read, 1 = write)
+ * @reg:		register number
+ * @val:		value to write
+ *
+ * Return: Register value when reading, 0 when writing.
+ */
+typedef unsigned int comedi_8254_iocb_fn(struct comedi_8254 *i8254, int dir,
+					 unsigned int reg, unsigned int val);
+
 /**
  * struct comedi_8254 - private data used by this module
- * @iobase:		PIO base address of the registers (in/out)
- * @mmio:		MMIO base address of the registers (read/write)
+ * @iocb:		I/O call-back function for register access
+ * @context:		context for register access (e.g. a base address)
  * @iosize:		I/O size used to access the registers (b/w/l)
  * @regshift:		register gap shift
  * @osc_base:		cascaded oscillator speed in ns
@@ -76,8 +90,8 @@ struct comedi_subdevice;
  * @insn_config:	driver specific (*insn_config) callback
  */
 struct comedi_8254 {
-	unsigned long iobase;
-	void __iomem *mmio;
+	comedi_8254_iocb_fn *iocb;
+	unsigned long context;
 	unsigned int iosize;
 	unsigned int regshift;
 	unsigned int osc_base;
-- 
cgit v1.2.3


From fade5e5b0b2a2cc3855f64be6407b0bdcd837714 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 13 Sep 2023 18:07:02 +0100
Subject: comedi: comedi_8254: Replace comedi_8254_init() and
 comedi_8254_mm_init()

`comedi_8254_init()` and `comedi_8254_mm_init()` return `NULL` on
failure, but the failure is not necessarily due to lack of memory.
Change them to return an `ERR_PTR` value on failure and rename the
functions to make it obvious the API has changed.  `comedi_8254_init()`
has been replaced with `comedi_8254_io_alloc()`, and
`comedi_8254_mm_init()` has been replaced with `comedi_8254_mm_alloc()`.

Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20230913170712.111719-4-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/comedi/comedi_8254.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/comedi/comedi_8254.h b/include/linux/comedi/comedi_8254.h
index 18d12321c87d..393ccb301028 100644
--- a/include/linux/comedi/comedi_8254.h
+++ b/include/linux/comedi/comedi_8254.h
@@ -136,13 +136,13 @@ void comedi_8254_set_busy(struct comedi_8254 *i8254,
 void comedi_8254_subdevice_init(struct comedi_subdevice *s,
 				struct comedi_8254 *i8254);
 
-struct comedi_8254 *comedi_8254_init(unsigned long iobase,
-				     unsigned int osc_base,
-				     unsigned int iosize,
-				     unsigned int regshift);
-struct comedi_8254 *comedi_8254_mm_init(void __iomem *mmio,
-					unsigned int osc_base,
-					unsigned int iosize,
-					unsigned int regshift);
+struct comedi_8254 *comedi_8254_io_alloc(unsigned long iobase,
+					 unsigned int osc_base,
+					 unsigned int iosize,
+					 unsigned int regshift);
+struct comedi_8254 *comedi_8254_mm_alloc(void __iomem *mmio,
+					 unsigned int osc_base,
+					 unsigned int iosize,
+					 unsigned int regshift);
 
 #endif	/* _COMEDI_8254_H */
-- 
cgit v1.2.3


From 90d256757e0bffd7a9beafd7c2bdc40a0236f9ec Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 13 Sep 2023 18:07:03 +0100
Subject: comedi: comedi_8254: Conditionally remove I/O port support

The comedi_8254 module supports both port I/O and memory-mapped I/O.
In a future patch, the port I/O functions (`inb()`, `outb()`, and
friends) will only be declared if the `HAS_IOPORT` configuration option
is enabled.

Conditionally compile the parts of the module that use port I/O so they
are compiled if and only if the `CONFIG_HAS_IOPORT` macro is defined, so
that it can still be built if the port I/O functions have not been
declared.  If `CONFIG_HAS_IOPORT` is undefined, replace the GPL-exported
`comedi_8254_io_alloc()` function with a dummy static inline version
that just returns `ERR_PTR(-ENXIO)`.

Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20230913170712.111719-5-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/comedi/comedi_8254.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/comedi/comedi_8254.h b/include/linux/comedi/comedi_8254.h
index 393ccb301028..d527f04400df 100644
--- a/include/linux/comedi/comedi_8254.h
+++ b/include/linux/comedi/comedi_8254.h
@@ -12,6 +12,8 @@
 #define _COMEDI_8254_H
 
 #include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/err.h>
 
 struct comedi_device;
 struct comedi_insn;
@@ -136,10 +138,21 @@ void comedi_8254_set_busy(struct comedi_8254 *i8254,
 void comedi_8254_subdevice_init(struct comedi_subdevice *s,
 				struct comedi_8254 *i8254);
 
+#ifdef CONFIG_HAS_IOPORT
 struct comedi_8254 *comedi_8254_io_alloc(unsigned long iobase,
 					 unsigned int osc_base,
 					 unsigned int iosize,
 					 unsigned int regshift);
+#else
+static inline struct comedi_8254 *comedi_8254_io_alloc(unsigned long iobase,
+						       unsigned int osc_base,
+						       unsigned int iosize,
+						       unsigned int regshift)
+{
+	return ERR_PTR(-ENXIO);
+}
+#endif
+
 struct comedi_8254 *comedi_8254_mm_alloc(void __iomem *mmio,
 					 unsigned int osc_base,
 					 unsigned int iosize,
-- 
cgit v1.2.3


From 5c57b1ccecc72738d4b9be2dfcdfb9001be76bd7 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 13 Sep 2023 18:07:05 +0100
Subject: comedi: comedi_8255: Rework subdevice initialization functions

Comedi drivers can initialize an 8255 subdevice in I/O space by calling
`subdev_8255_init()`, or in memory-mapped I/O space by calling
`subdev_8255_mm_init()`, or by supplying a call-back function pointer
and context to either of those functions.  Change it so that a new
function `subdev_8255_cb_init()` shall be called instead when supplying
a callback function and context, and remove the call-back function
parameter from `subdev_8255_init()` and `subdev_8255_mm_init()`.

Also rename `subdev_8255_init()` to `subdev_8255_io_init()`.  The
parameters are changing, so might as well rename it at the same time.

Also rename the `regbase` member of `struct subdev_8255_private` to
`context` since this holds the context for the call-back function call.

Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20230913170712.111719-7-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/comedi/comedi_8255.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/comedi/comedi_8255.h b/include/linux/comedi/comedi_8255.h
index b2a5bc6b3a49..b396fcfbf8b0 100644
--- a/include/linux/comedi/comedi_8255.h
+++ b/include/linux/comedi/comedi_8255.h
@@ -27,16 +27,17 @@
 struct comedi_device;
 struct comedi_subdevice;
 
-int subdev_8255_init(struct comedi_device *dev, struct comedi_subdevice *s,
-		     int (*io)(struct comedi_device *dev, int dir, int port,
-			       int data, unsigned long regbase),
-		     unsigned long regbase);
+int subdev_8255_io_init(struct comedi_device *dev, struct comedi_subdevice *s,
+			unsigned long regbase);
 
 int subdev_8255_mm_init(struct comedi_device *dev, struct comedi_subdevice *s,
-			int (*io)(struct comedi_device *dev, int dir, int port,
-				  int data, unsigned long regbase),
 			unsigned long regbase);
 
+int subdev_8255_cb_init(struct comedi_device *dev, struct comedi_subdevice *s,
+			int (*io)(struct comedi_device *dev, int dir, int port,
+				  int data, unsigned long context),
+			unsigned long context);
+
 unsigned long subdev_8255_regbase(struct comedi_subdevice *s);
 
 #endif
-- 
cgit v1.2.3


From 7187a0939a1773e6a2663a02a6c456047a5e6289 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 13 Sep 2023 18:07:06 +0100
Subject: comedi: comedi_8255: Conditionally remove I/O port support

In a future patch, the port I/O functions (`inb()`, `outb()`, and
friends will only be declared in the `HAS_IOPORT` configuration option
is enabled.

The comedi_8255 module supports both port I/O and memory-mapped I/O.

Conditionally compile the parts of the module that use port I/O if and
only if the `CONFIG_HAS_IOPORT` macro is defined so that it can still be
built if the port I/O functions have not been declared.  If the
`CONFIG_HAS_IOPORT` macro is undefined, replace the GPL-exported
`subdev_8255_io_init()` function with a dummy static inline version that
just returns `-ENXIO`.

Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20230913170712.111719-8-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/comedi/comedi_8255.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/comedi/comedi_8255.h b/include/linux/comedi/comedi_8255.h
index b396fcfbf8b0..d24a69da389b 100644
--- a/include/linux/comedi/comedi_8255.h
+++ b/include/linux/comedi/comedi_8255.h
@@ -10,6 +10,8 @@
 #ifndef _COMEDI_8255_H
 #define _COMEDI_8255_H
 
+#include <linux/errno.h>
+
 #define I8255_SIZE		0x04
 
 #define I8255_DATA_A_REG	0x00
@@ -27,8 +29,17 @@
 struct comedi_device;
 struct comedi_subdevice;
 
+#ifdef CONFIG_HAS_IOPORT
 int subdev_8255_io_init(struct comedi_device *dev, struct comedi_subdevice *s,
 			unsigned long regbase);
+#else
+static inline int subdev_8255_io_init(struct comedi_device *dev,
+				      struct comedi_subdevice *s,
+				      unsigned long regbase)
+{
+	return -ENXIO;
+}
+#endif
 
 int subdev_8255_mm_init(struct comedi_device *dev, struct comedi_subdevice *s,
 			unsigned long regbase);
-- 
cgit v1.2.3


From 77f048bcbf07f7dc961f3b2b7815038b5405ec60 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 30 Sep 2023 11:14:47 +0200
Subject: comedi: Annotate struct comedi_lrange with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/5c3b7459b820e22e2ac6ce892d4aadcc119cc919.1696065263.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/comedi/comedidev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/comedi/comedidev.h b/include/linux/comedi/comedidev.h
index 0a1150900ef3..c08416a7364b 100644
--- a/include/linux/comedi/comedidev.h
+++ b/include/linux/comedi/comedidev.h
@@ -633,7 +633,7 @@ extern const struct comedi_lrange range_unknown;
  */
 struct comedi_lrange {
 	int length;
-	struct comedi_krange range[];
+	struct comedi_krange range[] __counted_by(length);
 };
 
 /**
-- 
cgit v1.2.3


From 0fedefd4c4e33dd24f726b13b5d7c143e2b483be Mon Sep 17 00:00:00 2001
From: Valentine Sinitsyn <valesini@yandex-team.ru>
Date: Mon, 25 Sep 2023 11:40:12 +0300
Subject: kernfs: sysfs: support custom llseek method for sysfs entries

As of now, seeking in sysfs files is handled by generic_file_llseek().
There are situations where one may want to customize seeking logic:

- Many sysfs entries are fixed files while generic_file_llseek() accepts
  past-the-end positions. Not only being useless by itself, this
  also means a bug in userspace code will trigger not at lseek(), but at
  some later point making debugging harder.
- generic_file_llseek() relies on f_mapping->host to get the file size
  which might not be correct for all sysfs entries.
  See commit 636b21b50152 ("PCI: Revoke mappings like devmem") as an example.

Implement llseek method to override this behavior at sysfs attribute
level. The method is optional, and if it is absent,
generic_file_llseek() is called to preserve backwards compatibility.

Signed-off-by: Valentine Sinitsyn <valesini@yandex-team.ru>
Link: https://lore.kernel.org/r/20230925084013.309399-1-valesini@yandex-team.ru
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kernfs.h | 1 +
 include/linux/sysfs.h  | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 2a36f3218b51..99aaa050ccb7 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -316,6 +316,7 @@ struct kernfs_ops {
 			 struct poll_table_struct *pt);
 
 	int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
+	loff_t (*llseek)(struct kernfs_open_file *of, loff_t offset, int whence);
 };
 
 /*
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index fd3fe5c8c17f..b717a70219f6 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -181,6 +181,8 @@ struct bin_attribute {
 			char *, loff_t, size_t);
 	ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *,
 			 char *, loff_t, size_t);
+	loff_t (*llseek)(struct file *, struct kobject *, struct bin_attribute *,
+			 loff_t, int);
 	int (*mmap)(struct file *, struct kobject *, struct bin_attribute *attr,
 		    struct vm_area_struct *vma);
 };
-- 
cgit v1.2.3


From eb7581deb4c2eef77f6368e1891e123b69349bb0 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 12 Sep 2023 19:53:12 +0300
Subject: resource: Constify resource crosscheck APIs

Constify APIs: _contains(), _overlaps(), _intersection(), _union().

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230912165312.402422-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/ioport.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 25d768d48970..14f5cfabbbc8 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -229,7 +229,7 @@ static inline unsigned long resource_ext_type(const struct resource *res)
 	return res->flags & IORESOURCE_EXT_TYPE_BITS;
 }
 /* True iff r1 completely contains r2 */
-static inline bool resource_contains(struct resource *r1, struct resource *r2)
+static inline bool resource_contains(const struct resource *r1, const struct resource *r2)
 {
 	if (resource_type(r1) != resource_type(r2))
 		return false;
@@ -239,13 +239,13 @@ static inline bool resource_contains(struct resource *r1, struct resource *r2)
 }
 
 /* True if any part of r1 overlaps r2 */
-static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
+static inline bool resource_overlaps(const struct resource *r1, const struct resource *r2)
 {
        return r1->start <= r2->end && r1->end >= r2->start;
 }
 
-static inline bool
-resource_intersection(struct resource *r1, struct resource *r2, struct resource *r)
+static inline bool resource_intersection(const struct resource *r1, const struct resource *r2,
+					 struct resource *r)
 {
 	if (!resource_overlaps(r1, r2))
 		return false;
@@ -254,8 +254,8 @@ resource_intersection(struct resource *r1, struct resource *r2, struct resource
 	return true;
 }
 
-static inline bool
-resource_union(struct resource *r1, struct resource *r2, struct resource *r)
+static inline bool resource_union(const struct resource *r1, const struct resource *r2,
+				  struct resource *r)
 {
 	if (!resource_overlaps(r1, r2))
 		return false;
-- 
cgit v1.2.3


From 8a76356e7db02ec7b1913db06605e70294d94672 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Wed, 27 Sep 2023 11:27:41 +0300
Subject: iio: improve doc for available_scan_mask

The available_scan_mask is an array of bitmaps representing the channels
which can be simultaneously enabled by the driver. In many cases, the
hardware can offer more channels than what the user is interested in
obtaining. In such cases, it may be preferred that only a subset of
channels are enabled, and the driver reads only a subset of the channels
from the hardware.

Some devices can't support all channel combinations. For example, the
BM1390 pressure sensor must always read the pressure data in order to
acknowledge the watermark IRQ, while reading temperature can be omitted.
So, the available scan masks would be 'pressure and temperature' and
'pressure only'.

When IIO searches for the scan mask it asks the driver to use, it will
pick the first suitable one from the 'available_scan_mask' array. Hence,
ordering the masks in the array makes a difference. We should 'prefer'
reading just the pressure from the hardware (as it is a cheaper operation
than reading both pressure and temperature) over reading both pressure
and temperature. Hence, we should set the 'only pressure' as the first
scan mask in available_scan_mask array. If we set the 'pressure and
temperature' as first in the array, then the 'only temperature' will never
get used as 'pressure and temperature' can always serve the user's
needs.

Add (minimal) kerneldoc to the 'available_scan_mask' to hint the user
that the ordering of masks matters.

Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Link: https://lore.kernel.org/r/4e43bf0186df5c8a56b470318b4827605f9cad6c.1695727471.git.mazziesaccount@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 202e55b0a28b..7bfa1b9bc8a2 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -556,7 +556,9 @@ struct iio_buffer_setup_ops {
  *			and owner
  * @buffer:		[DRIVER] any buffer present
  * @scan_bytes:		[INTERN] num bytes captured to be fed to buffer demux
- * @available_scan_masks: [DRIVER] optional array of allowed bitmasks
+ * @available_scan_masks: [DRIVER] optional array of allowed bitmasks. Sort the
+ *			   array in order of preference, the most preferred
+ *			   masks first.
  * @masklength:		[INTERN] the length of the mask established from
  *			channels
  * @active_scan_mask:	[INTERN] union of all scan masks requested by buffers
-- 
cgit v1.2.3


From b3a4dbc89d4021b3f90ff6a13537111a004f9d07 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@suse.de>
Date: Wed, 4 Oct 2023 20:05:31 -0400
Subject: io_uring/kbuf: Use slab for struct io_buffer objects

The allocation of struct io_buffer for metadata of provided buffers is
done through a custom allocator that directly gets pages and
fragments them.  But, slab would do just fine, as this is not a hot path
(in fact, it is a deprecated feature) and, by keeping a custom allocator
implementation we lose benefits like tracking, poisoning,
sanitizers. Finally, the custom code is more complex and requires
keeping the list of pages in struct ctx for no good reason.  This patch
cleans this path up and just uses slab.

I microbenchmarked it by forcing the allocation of a large number of
objects with the least number of io_uring commands possible (keeping
nbufs=USHRT_MAX), with and without the patch.  There is a slight
increase in time spent in the allocation with slab, of course, but even
when allocating to system resources exhaustion, which is not very
realistic and happened around 1/2 billion provided buffers for me, it
wasn't a significant hit in system time.  Specially if we think of a
real-world scenario, an application doing register/unregister of
provided buffers will hit ctx->io_buffers_cache more often than actually
going to slab.

Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Link: https://lore.kernel.org/r/20231005000531.30800-4-krisman@suse.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e178461fa513..e4e67899b134 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -350,8 +350,6 @@ struct io_ring_ctx {
 	struct wait_queue_head		rsrc_quiesce_wq;
 	unsigned			rsrc_quiesce;
 
-	struct list_head		io_buffers_pages;
-
 	#if defined(CONFIG_UNIX)
 		struct socket		*ring_sock;
 	#endif
-- 
cgit v1.2.3


From 2819f23ac12ce93ff79ca7a54597df9a4a1f6331 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 5 Oct 2023 09:13:48 -0400
Subject: eventfs: Use eventfs_remove_events_dir()

The update to removing the eventfs_file changed the way the events top
level directory was handled. Instead of returning a dentry, it now returns
the eventfs_inode. In this changed, the removing of the events top level
directory is not much different than removing any of the other
directories. Because of this, the removal just called eventfs_remove_dir()
instead of eventfs_remove_events_dir().

Although eventfs_remove_dir() does the clean up, it misses out on the
dget() of the ei->dentry done in eventfs_create_events_dir(). It makes
more sense to match eventfs_create_events_dir() with a specific function
eventfs_remove_events_dir() and this specific function can then perform
the dput() to the dentry that had the dget() when it was created.

Fixes: 5790b1fb3d67 ("eventfs: Remove eventfs_file and just use eventfs_inode")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202310051743.y9EobbUr-lkp@intel.com/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracefs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index 0c39704455d9..13359b1a35d1 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -41,6 +41,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
 					 const struct eventfs_entry *entries,
 					 int size, void *data);
 
+void eventfs_remove_events_dir(struct eventfs_inode *ei);
 void eventfs_remove_dir(struct eventfs_inode *ei);
 
 struct dentry *tracefs_create_file(const char *name, umode_t mode,
-- 
cgit v1.2.3


From 7e6f3b6d2c352b5fde37ce3fed83bdf6172eebd4 Mon Sep 17 00:00:00 2001
From: Vicki Pfau <vi@endrift.com>
Date: Wed, 27 Sep 2023 13:22:12 -0700
Subject: PCI: Prevent xHCI driver from claiming AMD VanGogh USB3 DRD device

The AMD VanGogh SoC contains a DesignWare USB3 Dual-Role Device that can be
operated as either a USB Host or a USB Device, similar to on the AMD Nolan
platform.

be6646bfbaec ("PCI: Prevent xHCI driver from claiming AMD Nolan USB3 DRD
device") added a quirk to let the dwc3 driver claim the Nolan device since
it provides more specific support.

Extend that quirk to include the VanGogh SoC USB3 device.

Link: https://lore.kernel.org/r/20230927202212.2388216-1-vi@endrift.com
Signed-off-by: Vicki Pfau <vi@endrift.com>
[bhelgaas: include be6646bfbaec reference, add stable tag]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: stable@vger.kernel.org	# v3.19+
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 5fb3d4c393a9..3a8e24e9a93f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -579,6 +579,7 @@
 #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3 0x12c3
 #define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3 0x16fb
 #define PCI_DEVICE_ID_AMD_MI200_DF_F3	0x14d3
+#define PCI_DEVICE_ID_AMD_VANGOGH_USB	0x163a
 #define PCI_DEVICE_ID_AMD_CNB17H_F3	0x1703
 #define PCI_DEVICE_ID_AMD_LANCE		0x2000
 #define PCI_DEVICE_ID_AMD_LANCE_HOME	0x2001
-- 
cgit v1.2.3


From 5b9ceb63c49b9934cf2ec70b3b76951927e50a24 Mon Sep 17 00:00:00 2001
From: John Sanpe <sanpeqf@gmail.com>
Date: Fri, 15 Sep 2023 22:06:50 +0800
Subject: logic_pio: Remove logic_outb(), _outw(), outl() duplicate
 declarations

Remove duplicate declarations of logic_out* functions.

Link: https://lore.kernel.org/r/20230915140650.3562504-1-sanpeqf@gmail.com
Signed-off-by: John Sanpe <sanpeqf@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/logic_pio.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/logic_pio.h b/include/linux/logic_pio.h
index 54945aa824b4..babf4e3c28ba 100644
--- a/include/linux/logic_pio.h
+++ b/include/linux/logic_pio.h
@@ -39,9 +39,6 @@ struct logic_pio_host_ops {
 
 #ifdef CONFIG_INDIRECT_PIO
 u8 logic_inb(unsigned long addr);
-void logic_outb(u8 value, unsigned long addr);
-void logic_outw(u16 value, unsigned long addr);
-void logic_outl(u32 value, unsigned long addr);
 u16 logic_inw(unsigned long addr);
 u32 logic_inl(unsigned long addr);
 void logic_outb(u8 value, unsigned long addr);
-- 
cgit v1.2.3


From 7151d87a175c6618fe81705755eb3dc4199cad4e Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Wed, 27 Sep 2023 10:31:30 +0200
Subject: virtchnl: Add header dependencies

The <linux/avf/virtchnl.h> uses BIT, struct_size and ETH_ALEN macros
but does not include appropriate header files that defines them.
Add these dependencies so this header file can be included anywhere.

Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index dd71d3009771..6b3acf15be5c 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -4,6 +4,10 @@
 #ifndef _VIRTCHNL_H_
 #define _VIRTCHNL_H_
 
+#include <linux/bitops.h>
+#include <linux/overflow.h>
+#include <uapi/linux/if_ether.h>
+
 /* Description:
  * This header file describes the Virtual Function (VF) - Physical Function
  * (PF) communication protocol used by the drivers for all devices starting
-- 
cgit v1.2.3


From 9beebc2b5d0038a65977a7a14909598c64ce070f Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Thu, 28 Sep 2023 09:24:28 +0200
Subject: can: dev: add can_state_get_by_berr_counter() to return the CAN state
 based on the current error counters

Some CAN controllers do not have a register that contains the current
CAN state, but only a register that contains the error counters.

Introduce a new function can_state_get_by_berr_counter() that returns
the current TX and RX state depending on the provided CAN bit error
counters.

Link: https://lore.kernel.org/all/20231005-at91_can-rx_offload-v2-1-9987d53600e0@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/dev.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 982ba245eb41..1b92aed49363 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -195,6 +195,10 @@ int can_restart_now(struct net_device *dev);
 void can_bus_off(struct net_device *dev);
 
 const char *can_get_state_str(const enum can_state state);
+void can_state_get_by_berr_counter(const struct net_device *dev,
+				   const struct can_berr_counter *bec,
+				   enum can_state *tx_state,
+				   enum can_state *rx_state);
 void can_change_state(struct net_device *dev, struct can_frame *cf,
 		      enum can_state tx_state, enum can_state rx_state);
 
-- 
cgit v1.2.3


From 9c8c3fa3a52bc55696ccc4dfcb8a49f969b5fb0e Mon Sep 17 00:00:00 2001
From: Akihiko Odaki <akihiko.odaki@daynix.com>
Date: Thu, 5 Oct 2023 16:21:36 +0900
Subject: bpf: Fix the comment for bpf_restore_data_end()

The comment used to say:
> Restore data saved by bpf_compute_data_pointers().

But bpf_compute_data_pointers() does not save the data;
bpf_compute_and_save_data_end() does.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20231005072137.29870-1-akihiko.odaki@daynix.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/filter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 27406aee2d40..ff7ecc89d3dd 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -736,7 +736,7 @@ static inline void bpf_compute_and_save_data_end(
 	cb->data_end  = skb->data + skb_headlen(skb);
 }
 
-/* Restore data saved by bpf_compute_data_pointers(). */
+/* Restore data saved by bpf_compute_and_save_data_end(). */
 static inline void bpf_restore_data_end(
 	struct sk_buff *skb, void *saved_data_end)
 {
-- 
cgit v1.2.3


From 3fbc5c3b8522d655cf91d32c158261060fdc02fe Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 25 Sep 2023 15:17:07 +0200
Subject: PM: domains: Introduce dev_pm_domain_set_performance_state()

The generic PM domain is currently the only PM domain variant that supports
performance scaling. To allow performance scaling to be supported through a
common interface, let's add an optional callback ->set_performance_state(),
in the struct dev_pm_domain.

Moreover, let's add a function, dev_pm_domain_set_performance_state(), that
may be called by consumers to request a new performance state for a device
through its PM domain.

Note that, in most cases it's preferred that a consumer use the OPP library
to request a new performance state for its device. Although, this requires
some additional changes to be supported, which are being implemented from
subsequent changes.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/pm.h        | 2 ++
 include/linux/pm_domain.h | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 1400c37b29c7..4c9f571609c8 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -719,6 +719,7 @@ extern void dev_pm_put_subsys_data(struct device *dev);
  * @activate: Called before executing probe routines for bus types and drivers.
  * @sync: Called after successful driver probe.
  * @dismiss: Called after unsuccessful driver probe and after driver removal.
+ * @set_performance_state: Called to request a new performance state.
  *
  * Power domains provide callbacks that are executed during system suspend,
  * hibernation, system resume and during runtime PM transitions instead of
@@ -731,6 +732,7 @@ struct dev_pm_domain {
 	int (*activate)(struct device *dev);
 	void (*sync)(struct device *dev);
 	void (*dismiss)(struct device *dev);
+	int (*set_performance_state)(struct device *dev, unsigned int state);
 };
 
 /*
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index f776fb93eaa0..bda2964a0a56 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -430,6 +430,7 @@ struct device *dev_pm_domain_attach_by_name(struct device *dev,
 void dev_pm_domain_detach(struct device *dev, bool power_off);
 int dev_pm_domain_start(struct device *dev);
 void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd);
+int dev_pm_domain_set_performance_state(struct device *dev, unsigned int state);
 #else
 static inline int dev_pm_domain_attach(struct device *dev, bool power_on)
 {
@@ -452,6 +453,11 @@ static inline int dev_pm_domain_start(struct device *dev)
 }
 static inline void dev_pm_domain_set(struct device *dev,
 				     struct dev_pm_domain *pd) {}
+static inline int dev_pm_domain_set_performance_state(struct device *dev,
+						      unsigned int state)
+{
+	return 0;
+}
 #endif
 
 #endif /* _LINUX_PM_DOMAIN_H */
-- 
cgit v1.2.3


From 248a38d5cc3f3505e6cfbbc0514435c9f1ba00af Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 25 Sep 2023 15:17:09 +0200
Subject: OPP: Add dev_pm_opp_add_dynamic() to allow more flexibility

The dev_pm_opp_add() API is limited to add dynamic OPPs with a frequency
and a voltage level. To enable more flexibility, let's add a new API,
dev_pm_opp_add_dynamic() that's takes a struct dev_pm_opp_data* instead of
a list of in-parameters.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/pm_opp.h | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 91f87d7e807c..a8ee93ba41d8 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -92,6 +92,16 @@ struct dev_pm_opp_config {
 	struct device ***virt_devs;
 };
 
+/**
+ * struct dev_pm_opp_data - The data to use to initialize an OPP.
+ * @freq: The clock rate in Hz for the OPP.
+ * @u_volt: The voltage in uV for the OPP.
+ */
+struct dev_pm_opp_data {
+	unsigned long freq;
+	unsigned long u_volt;
+};
+
 #if defined(CONFIG_PM_OPP)
 
 struct opp_table *dev_pm_opp_get_opp_table(struct device *dev);
@@ -152,8 +162,8 @@ struct dev_pm_opp *dev_pm_opp_find_bw_floor(struct device *dev,
 
 void dev_pm_opp_put(struct dev_pm_opp *opp);
 
-int dev_pm_opp_add(struct device *dev, unsigned long freq,
-		   unsigned long u_volt);
+int dev_pm_opp_add_dynamic(struct device *dev, struct dev_pm_opp_data *opp);
+
 void dev_pm_opp_remove(struct device *dev, unsigned long freq);
 void dev_pm_opp_remove_all_dynamic(struct device *dev);
 
@@ -322,8 +332,8 @@ static inline struct dev_pm_opp *dev_pm_opp_find_bw_floor(struct device *dev,
 
 static inline void dev_pm_opp_put(struct dev_pm_opp *opp) {}
 
-static inline int dev_pm_opp_add(struct device *dev, unsigned long freq,
-					unsigned long u_volt)
+static inline int
+dev_pm_opp_add_dynamic(struct device *dev, struct dev_pm_opp_data *opp)
 {
 	return -EOPNOTSUPP;
 }
@@ -519,6 +529,17 @@ static inline int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_ta
 
 /* OPP Configuration helpers */
 
+static inline int dev_pm_opp_add(struct device *dev, unsigned long freq,
+				 unsigned long u_volt)
+{
+	struct dev_pm_opp_data data = {
+		.freq = freq,
+		.u_volt = u_volt,
+	};
+
+	return dev_pm_opp_add_dynamic(dev, &data);
+}
+
 /* Regulators helpers */
 static inline int dev_pm_opp_set_regulators(struct device *dev,
 					    const char * const names[])
-- 
cgit v1.2.3


From a0242c81bb759ef03184be8eddcc7d5bdf36cc16 Mon Sep 17 00:00:00 2001
From: Krishna chaitanya chundru <quic_krichai@quicinc.com>
Date: Thu, 7 Sep 2023 11:30:31 +0530
Subject: OPP: Add dev_pm_opp_find_level_floor()

Add dev_pm_opp_find_level_floor(), as is done for frequency and
bandwidth.

Signed-off-by: Krishna chaitanya chundru <quic_krichai@quicinc.com>
[ Viresh: Updated commit log and rearranged code ]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/pm_opp.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 9ad168f4cbf1..ccd97bcef269 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -156,6 +156,9 @@ struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev,
 struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
 					      unsigned int *level);
 
+struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev,
+					       unsigned long *level);
+
 struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev,
 					   unsigned int *bw, int index);
 
@@ -320,6 +323,12 @@ static inline struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev,
+							     unsigned long *level)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev,
 					unsigned int *bw, int index)
 {
-- 
cgit v1.2.3


From 3166383da081461244918aeed7ad028ef11b17cc Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 25 Sep 2023 15:17:10 +0200
Subject: OPP: Extend dev_pm_opp_data with a level

Let's extend the dev_pm_opp_data with a level variable, to allow users to
specify a corresponding level (performance state) for a dynamically added
OPP.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/pm_opp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index a8ee93ba41d8..9ad168f4cbf1 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -94,10 +94,12 @@ struct dev_pm_opp_config {
 
 /**
  * struct dev_pm_opp_data - The data to use to initialize an OPP.
+ * @level: The performance level for the OPP.
  * @freq: The clock rate in Hz for the OPP.
  * @u_volt: The voltage in uV for the OPP.
  */
 struct dev_pm_opp_data {
+	unsigned int level;
 	unsigned long freq;
 	unsigned long u_volt;
 };
-- 
cgit v1.2.3


From 869b6ea1609f655a43251bf41757aa44e5350a8f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 4 Oct 2023 15:32:01 +0200
Subject: quota: Fix slow quotaoff

Eric has reported that commit dabc8b207566 ("quota: fix dqput() to
follow the guarantees dquot_srcu should provide") heavily increases
runtime of generic/270 xfstest for ext4 in nojournal mode. The reason
for this is that ext4 in nojournal mode leaves dquots dirty until the last
dqput() and thus the cleanup done in quota_release_workfn() has to write
them all. Due to the way quota_release_workfn() is written this results
in synchronize_srcu() call for each dirty dquot which makes the dquot
cleanup when turning quotas off extremely slow.

To be able to avoid synchronize_srcu() for each dirty dquot we need to
rework how we track dquots to be cleaned up. Instead of keeping the last
dquot reference while it is on releasing_dquots list, we drop it right
away and mark the dquot with new DQ_RELEASING_B bit instead. This way we
can we can remove dquot from releasing_dquots list when new reference to
it is acquired and thus there's no need to call synchronize_srcu() each
time we drop dq_list_lock.

References: https://lore.kernel.org/all/ZRytn6CxFK2oECUt@debian-BULLSEYE-live-builder-AMD64
Reported-by: Eric Whitney <enwlinux@gmail.com>
Fixes: dabc8b207566 ("quota: fix dqput() to follow the guarantees dquot_srcu should provide")
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/quota.h    | 4 +++-
 include/linux/quotaops.h | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/quota.h b/include/linux/quota.h
index fd692b4a41d5..07071e64abf3 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -285,7 +285,9 @@ static inline void dqstats_dec(unsigned int type)
 #define DQ_FAKE_B	3	/* no limits only usage */
 #define DQ_READ_B	4	/* dquot was read into memory */
 #define DQ_ACTIVE_B	5	/* dquot is active (dquot_release not called) */
-#define DQ_LASTSET_B	6	/* Following 6 bits (see QIF_) are reserved\
+#define DQ_RELEASING_B	6	/* dquot is in releasing_dquots list waiting
+				 * to be cleaned up */
+#define DQ_LASTSET_B	7	/* Following 6 bits (see QIF_) are reserved\
 				 * for the mask of entries set via SETQUOTA\
 				 * quotactl. They are set under dq_data_lock\
 				 * and the quota format handling dquot can\
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 11a4becff3a9..4fa4ef0a173a 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -57,7 +57,7 @@ static inline bool dquot_is_busy(struct dquot *dquot)
 {
 	if (test_bit(DQ_MOD_B, &dquot->dq_flags))
 		return true;
-	if (atomic_read(&dquot->dq_count) > 1)
+	if (atomic_read(&dquot->dq_count) > 0)
 		return true;
 	return false;
 }
-- 
cgit v1.2.3


From 5a0b11a180a9b82b4437a4be1cf73530053f139b Mon Sep 17 00:00:00 2001
From: Vasant Hegde <vasant.hegde@amd.com>
Date: Fri, 6 Oct 2023 09:57:02 +0000
Subject: iommu/amd: Remove iommu_v2 module

AMD GPU driver which was the only in-kernel user of iommu_v2 module
removed dependency on iommu_v2 module.

Also we are working on adding SVA support in AMD IOMMU driver. Device
drivers are expected to use common SVA framework to enable device
PASID/PRI features.

Removing iommu_v2 module and then adding SVA simplifies the development.
Hence remove iommu_v2 module.

Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Tested-by: Alex Deucher <alexander.deucher@amd.com>
Link: https://lore.kernel.org/r/20231006095706.5694-2-vasant.hegde@amd.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/amd-iommu.h | 94 -----------------------------------------------
 1 file changed, 94 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 99a5201d9e62..35ca00b09384 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -33,84 +33,6 @@ struct pci_dev;
 
 extern int amd_iommu_detect(void);
 
-/**
- * amd_iommu_init_device() - Init device for use with IOMMUv2 driver
- * @pdev: The PCI device to initialize
- * @pasids: Number of PASIDs to support for this device
- *
- * This function does all setup for the device pdev so that it can be
- * used with IOMMUv2.
- * Returns 0 on success or negative value on error.
- */
-extern int amd_iommu_init_device(struct pci_dev *pdev, int pasids);
-
-/**
- * amd_iommu_free_device() - Free all IOMMUv2 related device resources
- *			     and disable IOMMUv2 usage for this device
- * @pdev: The PCI device to disable IOMMUv2 usage for'
- */
-extern void amd_iommu_free_device(struct pci_dev *pdev);
-
-/**
- * amd_iommu_bind_pasid() - Bind a given task to a PASID on a device
- * @pdev: The PCI device to bind the task to
- * @pasid: The PASID on the device the task should be bound to
- * @task: the task to bind
- *
- * The function returns 0 on success or a negative value on error.
- */
-extern int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid,
-				struct task_struct *task);
-
-/**
- * amd_iommu_unbind_pasid() - Unbind a PASID from its task on
- *			      a device
- * @pdev: The device of the PASID
- * @pasid: The PASID to unbind
- *
- * When this function returns the device is no longer using the PASID
- * and the PASID is no longer bound to its task.
- */
-extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid);
-
-/**
- * amd_iommu_set_invalid_ppr_cb() - Register a call-back for failed
- *				    PRI requests
- * @pdev: The PCI device the call-back should be registered for
- * @cb: The call-back function
- *
- * The IOMMUv2 driver invokes this call-back when it is unable to
- * successfully handle a PRI request. The device driver can then decide
- * which PRI response the device should see. Possible return values for
- * the call-back are:
- *
- * - AMD_IOMMU_INV_PRI_RSP_SUCCESS - Send SUCCESS back to the device
- * - AMD_IOMMU_INV_PRI_RSP_INVALID - Send INVALID back to the device
- * - AMD_IOMMU_INV_PRI_RSP_FAIL    - Send Failure back to the device,
- *				     the device is required to disable
- *				     PRI when it receives this response
- *
- * The function returns 0 on success or negative value on error.
- */
-#define AMD_IOMMU_INV_PRI_RSP_SUCCESS	0
-#define AMD_IOMMU_INV_PRI_RSP_INVALID	1
-#define AMD_IOMMU_INV_PRI_RSP_FAIL	2
-
-typedef int (*amd_iommu_invalid_ppr_cb)(struct pci_dev *pdev,
-					u32 pasid,
-					unsigned long address,
-					u16);
-
-extern int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
-					amd_iommu_invalid_ppr_cb cb);
-
-#define PPR_FAULT_EXEC	(1 << 1)
-#define PPR_FAULT_READ  (1 << 2)
-#define PPR_FAULT_WRITE (1 << 5)
-#define PPR_FAULT_USER  (1 << 6)
-#define PPR_FAULT_RSVD  (1 << 7)
-#define PPR_FAULT_GN    (1 << 8)
-
 /**
  * amd_iommu_device_info() - Get information about IOMMUv2 support of a
  *			     PCI device
@@ -137,22 +59,6 @@ struct amd_iommu_device_info {
 extern int amd_iommu_device_info(struct pci_dev *pdev,
 				 struct amd_iommu_device_info *info);
 
-/**
- * amd_iommu_set_invalidate_ctx_cb() - Register a call-back for invalidating
- *				       a pasid context. This call-back is
- *				       invoked when the IOMMUv2 driver needs to
- *				       invalidate a PASID context, for example
- *				       because the task that is bound to that
- *				       context is about to exit.
- *
- * @pdev: The PCI device the call-back should be registered for
- * @cb: The call-back function
- */
-
-typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, u32 pasid);
-
-extern int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
-					   amd_iommu_invalidate_ctx cb);
 #else /* CONFIG_AMD_IOMMU */
 
 static inline int amd_iommu_detect(void) { return -ENODEV; }
-- 
cgit v1.2.3


From 37b282fa04dd7b9bfa7c290fe07ef1730444debb Mon Sep 17 00:00:00 2001
From: Vasant Hegde <vasant.hegde@amd.com>
Date: Fri, 6 Oct 2023 09:57:04 +0000
Subject: iommu/amd: Remove amd_iommu_device_info()

No one is using this function. Hence remove it. Also move PCI device
feature detection flags to amd_iommu_types.h as its only used inside
AMD IOMMU driver.

Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Tested-by: Alex Deucher <alexander.deucher@amd.com>
Link: https://lore.kernel.org/r/20231006095706.5694-4-vasant.hegde@amd.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/amd-iommu.h | 26 --------------------------
 1 file changed, 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 35ca00b09384..dc7ed2f46886 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -33,32 +33,6 @@ struct pci_dev;
 
 extern int amd_iommu_detect(void);
 
-/**
- * amd_iommu_device_info() - Get information about IOMMUv2 support of a
- *			     PCI device
- * @pdev: PCI device to query information from
- * @info: A pointer to an amd_iommu_device_info structure which will contain
- *	  the information about the PCI device
- *
- * Returns 0 on success, negative value on error
- */
-
-#define AMD_IOMMU_DEVICE_FLAG_ATS_SUP     0x1    /* ATS feature supported */
-#define AMD_IOMMU_DEVICE_FLAG_PRI_SUP     0x2    /* PRI feature supported */
-#define AMD_IOMMU_DEVICE_FLAG_PASID_SUP   0x4    /* PASID context supported */
-#define AMD_IOMMU_DEVICE_FLAG_EXEC_SUP    0x8    /* Device may request execution
-						    on memory pages */
-#define AMD_IOMMU_DEVICE_FLAG_PRIV_SUP   0x10    /* Device may request
-						    super-user privileges */
-
-struct amd_iommu_device_info {
-	int max_pasids;
-	u32 flags;
-};
-
-extern int amd_iommu_device_info(struct pci_dev *pdev,
-				 struct amd_iommu_device_info *info);
-
 #else /* CONFIG_AMD_IOMMU */
 
 static inline int amd_iommu_detect(void) { return -ENODEV; }
-- 
cgit v1.2.3


From 1609626c32c4538439f6333d0b6c912af9f13b77 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:44:54 +0100
Subject: firmware: arm_ffa: Update the FF-A command list with v1.1 additions

Arm Firmware Framework for A-profile(FFA) v1.1 introduces notifications
and indirect messaging based upon notifications support and extends some
of the memory interfaces.

Let us add all the newly supported FF-A function IDs in the spec.
Also update to the error values and associated handling.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-1-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index cc060da51bec..2ea1717a0825 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -20,6 +20,7 @@
 
 #define FFA_ERROR			FFA_SMC_32(0x60)
 #define FFA_SUCCESS			FFA_SMC_32(0x61)
+#define FFA_FN64_SUCCESS		FFA_SMC_64(0x61)
 #define FFA_INTERRUPT			FFA_SMC_32(0x62)
 #define FFA_VERSION			FFA_SMC_32(0x63)
 #define FFA_FEATURES			FFA_SMC_32(0x64)
@@ -54,6 +55,23 @@
 #define FFA_MEM_FRAG_RX			FFA_SMC_32(0x7A)
 #define FFA_MEM_FRAG_TX			FFA_SMC_32(0x7B)
 #define FFA_NORMAL_WORLD_RESUME		FFA_SMC_32(0x7C)
+#define FFA_NOTIFICATION_BITMAP_CREATE	FFA_SMC_32(0x7D)
+#define FFA_NOTIFICATION_BITMAP_DESTROY FFA_SMC_32(0x7E)
+#define FFA_NOTIFICATION_BIND		FFA_SMC_32(0x7F)
+#define FFA_NOTIFICATION_UNBIND		FFA_SMC_32(0x80)
+#define FFA_NOTIFICATION_SET		FFA_SMC_32(0x81)
+#define FFA_NOTIFICATION_GET		FFA_SMC_32(0x82)
+#define FFA_NOTIFICATION_INFO_GET	FFA_SMC_32(0x83)
+#define FFA_FN64_NOTIFICATION_INFO_GET	FFA_SMC_64(0x83)
+#define FFA_RX_ACQUIRE			FFA_SMC_32(0x84)
+#define FFA_SPM_ID_GET			FFA_SMC_32(0x85)
+#define FFA_MSG_SEND2			FFA_SMC_32(0x86)
+#define FFA_SECONDARY_EP_REGISTER	FFA_SMC_32(0x87)
+#define FFA_FN64_SECONDARY_EP_REGISTER	FFA_SMC_64(0x87)
+#define FFA_MEM_PERM_GET		FFA_SMC_32(0x88)
+#define FFA_FN64_MEM_PERM_GET		FFA_SMC_64(0x88)
+#define FFA_MEM_PERM_SET		FFA_SMC_32(0x89)
+#define FFA_FN64_MEM_PERM_SET		FFA_SMC_64(0x89)
 
 /*
  * For some calls it is necessary to use SMC64 to pass or return 64-bit values.
@@ -76,6 +94,7 @@
 #define FFA_RET_DENIED             (-6)
 #define FFA_RET_RETRY              (-7)
 #define FFA_RET_ABORTED            (-8)
+#define FFA_RET_NO_DATA            (-9)
 
 /* FFA version encoding */
 #define FFA_MAJOR_VERSION_MASK	GENMASK(30, 16)
@@ -86,6 +105,7 @@
 	(FIELD_PREP(FFA_MAJOR_VERSION_MASK, (major)) |		\
 	 FIELD_PREP(FFA_MINOR_VERSION_MASK, (minor)))
 #define FFA_VERSION_1_0		FFA_PACK_VERSION_INFO(1, 0)
+#define FFA_VERSION_1_1		FFA_PACK_VERSION_INFO(1, 1)
 
 /**
  * FF-A specification mentions explicitly about '4K pages'. This should
-- 
cgit v1.2.3


From fe2ddb6b42358ad25a6aed30512fb284522335f3 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:44:57 +0100
Subject: firmware: arm_ffa: Implement the FFA_RUN interface

FFA_RUN is used by a scheduler to allocate CPU cycles to a target
endpoint execution context specified in the target information parameter.

If the endpoint execution context is in the waiting/blocked state, it
transitions to the running state.

Expose the ability to call FFA_RUN in order to give any partition in the
system cpu cycles to perform IMPDEF functionality.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-4-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 2ea1717a0825..12fd134bf670 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -387,10 +387,15 @@ struct ffa_mem_ops {
 	int (*memory_lend)(struct ffa_mem_ops_args *args);
 };
 
+struct ffa_cpu_ops {
+	int (*run)(struct ffa_device *dev, u16 vcpu);
+};
+
 struct ffa_ops {
 	const struct ffa_info_ops *info_ops;
 	const struct ffa_msg_ops *msg_ops;
 	const struct ffa_mem_ops *mem_ops;
+	const struct ffa_cpu_ops *cpu_ops;
 };
 
 #endif /* _LINUX_ARM_FFA_H */
-- 
cgit v1.2.3


From 0184450b8b1e7734110472616c4758839e1aff96 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:45:02 +0100
Subject: firmware: arm_ffa: Add schedule receiver callback mechanism

Enable client drivers to register a callback function that will be
called when one or more notifications are pending for a target
partition as part of schedule receiver interrupt handling.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-9-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 12fd134bf670..f9cf6114ef82 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -391,11 +391,19 @@ struct ffa_cpu_ops {
 	int (*run)(struct ffa_device *dev, u16 vcpu);
 };
 
+typedef void (*ffa_sched_recv_cb)(u16 vcpu, bool is_per_vcpu, void *cb_data);
+struct ffa_notifier_ops {
+	int (*sched_recv_cb_register)(struct ffa_device *dev,
+				      ffa_sched_recv_cb cb, void *cb_data);
+	int (*sched_recv_cb_unregister)(struct ffa_device *dev);
+};
+
 struct ffa_ops {
 	const struct ffa_info_ops *info_ops;
 	const struct ffa_msg_ops *msg_ops;
 	const struct ffa_mem_ops *mem_ops;
 	const struct ffa_cpu_ops *cpu_ops;
+	const struct ffa_notifier_ops *notifier_ops;
 };
 
 #endif /* _LINUX_ARM_FFA_H */
-- 
cgit v1.2.3


From e0573444edbf4ee7e3c191d3d08a4ccbd26628be Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:45:03 +0100
Subject: firmware: arm_ffa: Add interfaces to request notification callbacks

Add interface to the FFA driver to allow for client drivers to request
and relinquish a notification as well as provide a callback for the
notification.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-10-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index f9cf6114ef82..fb6f388a3737 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -392,10 +392,15 @@ struct ffa_cpu_ops {
 };
 
 typedef void (*ffa_sched_recv_cb)(u16 vcpu, bool is_per_vcpu, void *cb_data);
+typedef void (*ffa_notifier_cb)(int notify_id, void *cb_data);
+
 struct ffa_notifier_ops {
 	int (*sched_recv_cb_register)(struct ffa_device *dev,
 				      ffa_sched_recv_cb cb, void *cb_data);
 	int (*sched_recv_cb_unregister)(struct ffa_device *dev);
+	int (*notify_request)(struct ffa_device *dev, bool per_vcpu,
+			      ffa_notifier_cb cb, void *cb_data, int notify_id);
+	int (*notify_relinquish)(struct ffa_device *dev, int notify_id);
 };
 
 struct ffa_ops {
-- 
cgit v1.2.3


From e5adb3b20e39dbf18651322a9bc24eb55050188f Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:45:04 +0100
Subject: firmware: arm_ffa: Add interface to send a notification to a given
 partition

The framework provides an interface to the sender endpoint to specify
the notification to signal to the receiver endpoint. A sender signals
a notification by requesting its partition manager to set the
corresponding bit in the notifications bitmap of the receiver.

Expose the ability to send a notification to another partition.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-11-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index fb6f388a3737..f6df81f14b6d 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -401,6 +401,8 @@ struct ffa_notifier_ops {
 	int (*notify_request)(struct ffa_device *dev, bool per_vcpu,
 			      ffa_notifier_cb cb, void *cb_data, int notify_id);
 	int (*notify_relinquish)(struct ffa_device *dev, int notify_id);
+	int (*notify_send)(struct ffa_device *dev, int notify_id, bool per_vcpu,
+			   u16 vcpu);
 };
 
 struct ffa_ops {
-- 
cgit v1.2.3


From c9b21ef0d0a87695d7bfeee9a04b89760b49ccf5 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:45:06 +0100
Subject: firmware: arm_ffa: Simplify the computation of transmit and fragment
 length

The computation of endpoint memory access descriptor's composite memory
region descriptor offset is using COMPOSITE_CONSTITUENTS_OFFSET which is
unnecessary complicated. Composite memory region descriptor always follow
the endpoint memory access descriptor array and hence it is computed
accordingly. COMPOSITE_CONSTITUENTS_OFFSET is useless and wrong for any
input other than endpoint memory access descriptor count.

Let us drop the usage of COMPOSITE_CONSTITUENTS_OFFSET to simplify the
computation of total transmit and fragment length in the memory
transactions.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-13-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index f6df81f14b6d..748d0a83a4bc 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -356,8 +356,6 @@ struct ffa_mem_region {
 	(offsetof(struct ffa_mem_region, ep_mem_access[x]))
 #define CONSTITUENTS_OFFSET(x)	\
 	(offsetof(struct ffa_composite_mem_region, constituents[x]))
-#define COMPOSITE_CONSTITUENTS_OFFSET(x, y)	\
-	(COMPOSITE_OFFSET(x) + CONSTITUENTS_OFFSET(y))
 
 struct ffa_mem_ops_args {
 	bool use_txbuf;
-- 
cgit v1.2.3


From bc5bc309db45a7ab218ce8259ba9bc7659be61ca Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Thu, 5 Oct 2023 08:41:23 +0000
Subject: bpf: Inherit system settings for CPU security mitigations

Currently, there exists a system-wide setting related to CPU security
mitigations, denoted as 'mitigations='. When set to 'mitigations=off', it
deactivates all optional CPU mitigations. Therefore, if we implement a
system-wide 'mitigations=off' setting, it should inherently bypass Spectre
v1 and Spectre v4 in the BPF subsystem.

Please note that there is also a more specific 'nospectre_v1' setting on
x86 and ppc architectures, though it is not currently exported. For the
time being, let's disregard more fine-grained options.

This idea emerged during our discussion about potential Spectre v1 attacks
with Luis [0].

  [0] https://lore.kernel.org/bpf/b4fc15f7-b204-767e-ebb9-fdb4233961fb@iogearbox.net

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Song Liu <song@kernel.org>
Acked-by: KP Singh <kpsingh@kernel.org>
Cc: Luis Gerhorst <gerhorst@cs.fau.de>
Link: https://lore.kernel.org/bpf/20231005084123.1338-1-laoar.shao@gmail.com
---
 include/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a82efd34b741..61bde4520f5c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2164,12 +2164,12 @@ static inline bool bpf_allow_uninit_stack(void)
 
 static inline bool bpf_bypass_spec_v1(void)
 {
-	return perfmon_capable();
+	return perfmon_capable() || cpu_mitigations_off();
 }
 
 static inline bool bpf_bypass_spec_v4(void)
 {
-	return perfmon_capable();
+	return perfmon_capable() || cpu_mitigations_off();
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
-- 
cgit v1.2.3


From e7a1b32e43b194bbf930281ae7f5149c420cd122 Mon Sep 17 00:00:00 2001
From: Pierre Gondois <pierre.gondois@arm.com>
Date: Thu, 5 Oct 2023 15:41:20 +0200
Subject: cpufreq: Rebuild sched-domains when removing cpufreq driver

The Energy Aware Scheduler (EAS) relies on the schedutil governor.
When moving to/from the schedutil governor, sched domains must be
rebuilt to allow re-evaluating the enablement conditions of EAS.
This is done through sched_cpufreq_governor_change().

Having a cpufreq governor assumes a cpufreq driver is running.
Inserting/removing a cpufreq driver should trigger a re-evaluation
of EAS enablement conditions, avoiding to see EAS enabled when
removing a running cpufreq driver.

Rebuild the sched domains in schedutil's sugov_init()/sugov_exit(),
allowing to check EAS's enablement condition whenever schedutil
governor is initialized/exited from.
Move relevant code up in schedutil.c to avoid a split and conditional
function declaration.
Rename sched_cpufreq_governor_change() to sugov_eas_rebuild_sd().

Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpufreq.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 71d186d6933a..1c5ca92a0555 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1193,14 +1193,6 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_
 }
 #endif
 
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
-void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
-			struct cpufreq_governor *old_gov);
-#else
-static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
-			struct cpufreq_governor *old_gov) { }
-#endif
-
 extern unsigned int arch_freq_get_on_cpu(int cpu);
 
 #ifndef arch_set_freq_scale
-- 
cgit v1.2.3


From 51a23b1be92046f0cc52384d30cf060700f1a54e Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@cn.fujitsu.com>
Date: Wed, 2 Aug 2023 17:28:56 +0800
Subject: acpi,mm: fix typo sibiling -> sibling

First found this typo as reviewing memory tier code. Fix it by sed like:
$ sed -i 's/sibiling/sibling/g' $(git grep -l sibiling)

so the acpi one will be corrected as well.

Link: https://lkml.kernel.org/r/20230802092856.819328-1-lizhijian@cn.fujitsu.com
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Len Brown <lenb@kernel.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory-tiers.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 437441cdf78f..4fa178b50784 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -22,7 +22,7 @@
 struct memory_tier;
 struct memory_dev_type {
 	/* list of memory types that are part of same tier as this type */
-	struct list_head tier_sibiling;
+	struct list_head tier_sibling;
 	/* abstract distance for this specific memory type */
 	int adistance;
 	/* Nodes of same abstract distance */
-- 
cgit v1.2.3


From 55d2a0bd5eadaade850efa9d3a7ffbb0aeb67198 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Mon, 18 Sep 2023 14:31:42 +0800
Subject: mm: add statistics for PUD level pagetable

Recently, we found that cross-die access to pagetable pages on ARM64
machines can cause performance fluctuations in our business.  Currently,
there are no PMU events available to track this situation on our ARM64
machines, so accurate pagetable accounting can help to analyze this issue,
but now the PUD level pagetable accounting is missed.

So introduce pagetable_pud_ctor/dtor() to help to get accurate PUD
pagetable accounting, as well as converting the architectures which use
generic PUD pagetable allocation to add corresponding PUD pagetable
accounting.  Moreover this patch will mark the PUD level pagetable with
PG_table flag, which will help to do sanity validation in
unpoison_memory().

On my testing machine, I can see more pagetables statistics after the patch
with page-types tool:

Before patch:
        flags           page-count      MB  symbolic-flags                     long-symbolic-flags
0x0000000004000000           27326      106  __________________________g_________________       pgtable
After patch:
0x0000000004000000           27541      107  __________________________g_________________       pgtable

Link: https://lkml.kernel.org/r/876c71c03a7e69c17722a690e3225a4f7b172fb2.1695017383.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 31dc25d3f6b5..126b54b45442 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3049,6 +3049,22 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
 	return ptl;
 }
 
+static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
+{
+	struct folio *folio = ptdesc_folio(ptdesc);
+
+	__folio_set_pgtable(folio);
+	lruvec_stat_add_folio(folio, NR_PAGETABLE);
+}
+
+static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
+{
+	struct folio *folio = ptdesc_folio(ptdesc);
+
+	__folio_clear_pgtable(folio);
+	lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+}
+
 extern void __init pagecache_init(void);
 extern void free_initmem(void);
 
-- 
cgit v1.2.3


From 7ced098fcfe596feab3cea4f40128b0119c7bf1a Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 18 Sep 2023 22:18:32 +0200
Subject: mm: document mmu_notifier_invalidate_range_start_nonblock()

Document what mmu_notifier_invalidate_range_start_nonblock() is for.  Also
add a __must_check annotation to signal that callers must bail out if a
notifier vetoes the operation.

Link: https://lkml.kernel.org/r/20230918201832.265108-1-jannh@google.com
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmu_notifier.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 6e3c857606f1..f349e08a9dfe 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -459,7 +459,14 @@ mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
 }
 
-static inline int
+/*
+ * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it
+ * can return an error if a notifier can't proceed without blocking, in which
+ * case you're not allowed to modify PTEs in the specified range.
+ *
+ * This is mainly intended for OOM handling.
+ */
+static inline int __must_check
 mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
 {
 	int ret = 0;
-- 
cgit v1.2.3


From 24e41bf8a6b424c76c5902fb999e9eca61bdf83d Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Mon, 28 Aug 2023 17:08:57 +0200
Subject: mm: add a NO_INHERIT flag to the PR_SET_MDWE prctl

This extends the current PR_SET_MDWE prctl arg with a bit to indicate that
the process doesn't want MDWE protection to propagate to children.

To implement this no-inherit mode, the tag in current->mm->flags must be
absent from MMF_INIT_MASK.  This means that the encoding for "MDWE but
without inherit" is different in the prctl than in the mm flags.  This
leads to a bit of bit-mangling in the prctl implementation.

Link: https://lkml.kernel.org/r/20230828150858.393570-6-revest@chromium.org
Signed-off-by: Florent Revest <revest@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alexey Izbyshev <izbyshev@ispras.ru>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ayush Jain <ayush.jain3@amd.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Joey Gouly <joey.gouly@arm.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Szabolcs Nagy <Szabolcs.Nagy@arm.com>
Cc: Topi Miettinen <toiwoton@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/coredump.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 0ee96ea7a0e9..1b37fa8fc723 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -91,4 +91,14 @@ static inline int get_dumpable(struct mm_struct *mm)
 				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
 
 #define MMF_VM_MERGE_ANY	29
+#define MMF_HAS_MDWE_NO_INHERIT	30
+
+static inline unsigned long mmf_init_flags(unsigned long flags)
+{
+	if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
+		flags &= ~((1UL << MMF_HAS_MDWE) |
+			   (1UL << MMF_HAS_MDWE_NO_INHERIT));
+	return flags & MMF_INIT_MASK;
+}
+
 #endif /* _LINUX_SCHED_COREDUMP_H */
-- 
cgit v1.2.3


From 7e845ecb2fbfa1bf800e703df29ee2e06592c2a0 Mon Sep 17 00:00:00 2001
From: Sui Jingfeng <suijingfeng@loongson.cn>
Date: Wed, 30 Aug 2023 19:15:28 +0800
Subject: PCI: Add pci_is_vga() helper

The PCI Code and ID Assignment spec, r1.15, secs 1.4 and 1.1, define
VGA Base Class and Sub-Classes:

  03 00  PCI_CLASS_DISPLAY_VGA      VGA-compatible or 8514-compatible
  00 01  PCI_CLASS_NOT_DEFINED_VGA  VGA-compatible (before Class Code)

Add a pci_is_vga() helper to return true if a device is in either category.

These VGA devices use the hardwired legacy VGA resources
([mem 0xa0000-0xbffff], [io 0x3b0-0x3bb], [io 0x3c0-0x3df] and aliases), so
they require special handling if more than one is present in the system.

Link: https://lore.kernel.org/r/20230830111532.444535-2-sui.jingfeng@linux.dev
Signed-off-by: Sui Jingfeng <suijingfeng@loongson.cn>
[bhelgaas: commit log, drop !pdev test]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: "Maciej W. Rozycki" <macro@orcam.me.uk>
---
 include/linux/pci.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8c7c2c3c6c65..7bab234391cb 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -713,6 +713,30 @@ static inline bool pci_is_bridge(struct pci_dev *dev)
 		dev->hdr_type == PCI_HEADER_TYPE_CARDBUS;
 }
 
+/**
+ * pci_is_vga - check if the PCI device is a VGA device
+ *
+ * The PCI Code and ID Assignment spec, r1.15, secs 1.4 and 1.1, define
+ * VGA Base Class and Sub-Classes:
+ *
+ *   03 00  PCI_CLASS_DISPLAY_VGA      VGA-compatible or 8514-compatible
+ *   00 01  PCI_CLASS_NOT_DEFINED_VGA  VGA-compatible (before Class Code)
+ *
+ * Return true if the PCI device is a VGA device and uses the legacy VGA
+ * resources ([mem 0xa0000-0xbffff], [io 0x3b0-0x3bb], [io 0x3c0-0x3df] and
+ * aliases).
+ */
+static inline bool pci_is_vga(struct pci_dev *pdev)
+{
+	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
+		return true;
+
+	if ((pdev->class >> 8) == PCI_CLASS_NOT_DEFINED_VGA)
+		return true;
+
+	return false;
+}
+
 #define for_each_pci_bridge(dev, bus)				\
 	list_for_each_entry(dev, &bus->devices, bus_list)	\
 		if (!pci_is_bridge(dev)) {} else
-- 
cgit v1.2.3


From 3abbd0699b678fc48e0100704338cff9180fe4bb Mon Sep 17 00:00:00 2001
From: Giulio Benetti <giulio.benetti+tekvox@benettiengineering.com>
Date: Thu, 5 Oct 2023 20:29:15 +0200
Subject: net: phy: broadcom: add support for BCM5221 phy

This patch adds the BCM5221 PHY support by reusing brcm_fet_*()
callbacks and adding quirks for BCM5221 when needed.

Cc: Jim Reinhart <jimr@tekvox.com>
Cc: James Autry <jautry@tekvox.com>
Cc: Matthew Maron <matthewm@tekvox.com>
Signed-off-by: Giulio Benetti <giulio.benetti+tekvox@benettiengineering.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/20231005182915.153815-1-giulio.benetti@benettiengineering.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/brcmphy.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index c55810a43541..1394ba302367 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -11,6 +11,7 @@
 
 #define PHY_ID_BCM50610			0x0143bd60
 #define PHY_ID_BCM50610M		0x0143bd70
+#define PHY_ID_BCM5221			0x004061e0
 #define PHY_ID_BCM5241			0x0143bc30
 #define PHY_ID_BCMAC131			0x0143bc70
 #define PHY_ID_BCM5481			0x0143bca0
@@ -331,6 +332,15 @@
 
 #define BCM54XX_WOL_INT_STATUS		(MII_BCM54XX_EXP_SEL_WOL + 0x94)
 
+/* BCM5221 Registers */
+#define BCM5221_AEGSR			0x1C
+#define BCM5221_AEGSR_MDIX_STATUS	BIT(13)
+#define BCM5221_AEGSR_MDIX_MAN_SWAP	BIT(12)
+#define BCM5221_AEGSR_MDIX_DIS		BIT(11)
+
+#define BCM5221_SHDW_AM4_EN_CLK_LPM	BIT(2)
+#define BCM5221_SHDW_AM4_FORCE_LPM	BIT(1)
+
 /*****************************************************************************/
 /* Fast Ethernet Transceiver definitions. */
 /*****************************************************************************/
-- 
cgit v1.2.3


From 77bbfe607b1d306c88bf96fed00c030f6bf462f1 Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Wed, 4 Oct 2023 07:42:23 +0800
Subject: firmware: arm_scmi: Add support for clock parents

SCMI v3.2 spec introduces CLOCK_POSSIBLE_PARENTS_GET, CLOCK_PARENT_SET
and CLOCK_PARENT_GET. Add support for these to enable clock parents
and use them in the clock driver.

Reviewed-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20231004-scmi-clock-v3-v5-1-1b8a1435673e@nxp.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/scmi_protocol.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 27bfa5a65b45..f2f05fb42d28 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -58,6 +58,8 @@ struct scmi_clock_info {
 			u64 step_size;
 		} range;
 	};
+	int num_parents;
+	u32 *parents;
 };
 
 enum scmi_power_scale {
@@ -83,6 +85,8 @@ struct scmi_protocol_handle;
  * @state_get: get the status of the specified clock
  * @config_oem_get: get the value of an OEM specific clock config
  * @config_oem_set: set the value of an OEM specific clock config
+ * @parent_get: get the parent id of a clk
+ * @parent_set: set the parent of a clock
  */
 struct scmi_clk_proto_ops {
 	int (*count_get)(const struct scmi_protocol_handle *ph);
@@ -104,6 +108,8 @@ struct scmi_clk_proto_ops {
 			      bool atomic);
 	int (*config_oem_set)(const struct scmi_protocol_handle *ph, u32 clk_id,
 			      u8 oem_type, u32 oem_val, bool atomic);
+	int (*parent_get)(const struct scmi_protocol_handle *ph, u32 clk_id, u32 *parent_id);
+	int (*parent_set)(const struct scmi_protocol_handle *ph, u32 clk_id, u32 parent_id);
 };
 
 struct scmi_perf_domain_info {
-- 
cgit v1.2.3


From 76cf932c95b9e7c07b065b5c71e56957e2826ae2 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:45:07 +0100
Subject: KVM: arm64: FFA: Remove access of endpoint memory access descriptor
 array

FF-A v1.1 removes the fixed location of endpoint memory access descriptor
array within the memory transaction descriptor structure. In preparation
to remove the ep_mem_access member from the ffa_mem_region structure,
provide the accessor to fetch the offset and use the same in FF-A proxy
implementation.

The accessor take the FF-A version as the argument from which the memory
access descriptor format can be determined. v1.0 uses the old format while
v1.1 onwards use the new format specified in the v1.1 specification.

Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Will Deacon <will@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-14-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 748d0a83a4bc..2444d596b703 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -357,6 +357,12 @@ struct ffa_mem_region {
 #define CONSTITUENTS_OFFSET(x)	\
 	(offsetof(struct ffa_composite_mem_region, constituents[x]))
 
+static inline u32
+ffa_mem_desc_offset(struct ffa_mem_region *buf, int count, u32 ffa_version)
+{
+	return COMPOSITE_OFFSET(0);
+}
+
 struct ffa_mem_ops_args {
 	bool use_txbuf;
 	u32 nattrs;
-- 
cgit v1.2.3


From 113580530ee7dc61e668b641d657920734533b9f Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:45:09 +0100
Subject: firmware: arm_ffa: Update memory descriptor to support v1.1 format

Update memory transaction descriptor structure to accommodate couple of
new entries in v1.1 which were previously marked reserved and MBZ(must
be zero).

It also removes the flexible array member ep_mem_access in the memory
transaction descriptor structure as it need not be at fixed offset.
Also update ffa_mem_desc_offset() accessor to handle both old and new
formats of memory transaction descriptors.

The updated ffa_mem_region structure aligns with new format in v1.1 and
hence the driver/user must take care not to use members beyond and
including ep_mem_offset when using the old format.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-16-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 2444d596b703..1abedb5b2e48 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -6,6 +6,7 @@
 #ifndef _LINUX_ARM_FFA_H
 #define _LINUX_ARM_FFA_H
 
+#include <linux/bitfield.h>
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -298,8 +299,8 @@ struct ffa_mem_region {
 #define FFA_MEM_NON_SHAREABLE	(0)
 #define FFA_MEM_OUTER_SHAREABLE	(2)
 #define FFA_MEM_INNER_SHAREABLE	(3)
-	u8 attributes;
-	u8 reserved_0;
+	/* Memory region attributes, upper byte MBZ pre v1.1 */
+	u16 attributes;
 /*
  * Clear memory region contents after unmapping it from the sender and
  * before mapping it for any receiver.
@@ -337,30 +338,40 @@ struct ffa_mem_region {
 	 * memory region.
 	 */
 	u64 tag;
-	u32 reserved_1;
+	/* Size of each endpoint memory access descriptor, MBZ pre v1.1 */
+	u32 ep_mem_size;
 	/*
 	 * The number of `ffa_mem_region_attributes` entries included in this
 	 * transaction.
 	 */
 	u32 ep_count;
 	/*
-	 * An array of endpoint memory access descriptors.
-	 * Each one specifies a memory region offset, an endpoint and the
-	 * attributes with which this memory region should be mapped in that
-	 * endpoint's page table.
+	 * 16-byte aligned offset from the base address of this descriptor
+	 * to the first element of the endpoint memory access descriptor array
+	 * Valid only from v1.1
 	 */
-	struct ffa_mem_region_attributes ep_mem_access[];
+	u32 ep_mem_offset;
+	/* MBZ, valid only from v1.1 */
+	u32 reserved[3];
 };
 
-#define	COMPOSITE_OFFSET(x)	\
-	(offsetof(struct ffa_mem_region, ep_mem_access[x]))
 #define CONSTITUENTS_OFFSET(x)	\
 	(offsetof(struct ffa_composite_mem_region, constituents[x]))
 
 static inline u32
 ffa_mem_desc_offset(struct ffa_mem_region *buf, int count, u32 ffa_version)
 {
-	return COMPOSITE_OFFSET(0);
+	u32 offset = count * sizeof(struct ffa_mem_region_attributes);
+	/*
+	 * Earlier to v1.1, the endpoint memory descriptor array started at
+	 * offset 32(i.e. offset of ep_mem_offset in the current structure)
+	 */
+	if (ffa_version <= FFA_VERSION_1_0)
+		offset += offsetof(struct ffa_mem_region, ep_mem_offset);
+	else
+		offset += sizeof(struct ffa_mem_region);
+
+	return offset;
 }
 
 struct ffa_mem_ops_args {
-- 
cgit v1.2.3


From 3e7807d5a7d770c59837026e9967fe99ad043174 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 4 Oct 2023 22:55:32 -0400
Subject: fscrypt: rename fscrypt_info => fscrypt_inode_info

We are going to track per-extent information, so it'll be necessary to
distinguish between inode infos and extent infos.  Rename fscrypt_info
to fscrypt_inode_info, adjusting any lines that now exceed 80
characters.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ebiggers: rebased onto fscrypt tree, renamed fscrypt_get_info(),
 adjusted two comments, and fixed some lines over 80 characters]
Link: https://lore.kernel.org/r/20231005025757.33521-1-ebiggers@kernel.org
Reviewed-by: Neal Gompa <neal@gompa.dev>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 include/linux/fs.h      |  4 ++--
 include/linux/fscrypt.h | 10 ++++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b528f063e8ff..a3df96736473 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -67,7 +67,7 @@ struct swap_info_struct;
 struct seq_file;
 struct workqueue_struct;
 struct iov_iter;
-struct fscrypt_info;
+struct fscrypt_inode_info;
 struct fscrypt_operations;
 struct fsverity_info;
 struct fsverity_operations;
@@ -738,7 +738,7 @@ struct inode {
 #endif
 
 #ifdef CONFIG_FS_ENCRYPTION
-	struct fscrypt_info	*i_crypt_info;
+	struct fscrypt_inode_info	*i_crypt_info;
 #endif
 
 #ifdef CONFIG_FS_VERITY
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index b559e6f77707..12f9e455d569 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -31,7 +31,7 @@
 #define FSCRYPT_CONTENTS_ALIGNMENT 16
 
 union fscrypt_policy;
-struct fscrypt_info;
+struct fscrypt_inode_info;
 struct fs_parameter;
 struct seq_file;
 
@@ -192,7 +192,8 @@ struct fscrypt_operations {
 					     unsigned int *num_devs);
 };
 
-static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode)
+static inline struct fscrypt_inode_info *
+fscrypt_get_inode_info(const struct inode *inode)
 {
 	/*
 	 * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info().
@@ -404,7 +405,8 @@ static inline void fscrypt_set_ops(struct super_block *sb,
 }
 #else  /* !CONFIG_FS_ENCRYPTION */
 
-static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode)
+static inline struct fscrypt_inode_info *
+fscrypt_get_inode_info(const struct inode *inode)
 {
 	return NULL;
 }
@@ -882,7 +884,7 @@ static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode)
  */
 static inline bool fscrypt_has_encryption_key(const struct inode *inode)
 {
-	return fscrypt_get_info(inode) != NULL;
+	return fscrypt_get_inode_info(inode) != NULL;
 }
 
 /**
-- 
cgit v1.2.3


From 4a530cb932af31b0c919a109bc107dd186653381 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:50:53 -0700
Subject: hwmon: Annotate struct gsc_hwmon_platform_data with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct gsc_hwmon_platform_data.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Tim Harvey <tharvey@gateworks.com>
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922175053.work.564-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/platform_data/gsc_hwmon.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/gsc_hwmon.h b/include/linux/platform_data/gsc_hwmon.h
index f2781aa7eff8..70e8a6bec0f6 100644
--- a/include/linux/platform_data/gsc_hwmon.h
+++ b/include/linux/platform_data/gsc_hwmon.h
@@ -40,6 +40,6 @@ struct gsc_hwmon_platform_data {
 	unsigned int resolution;
 	unsigned int vreference;
 	unsigned int fan_base;
-	struct gsc_hwmon_channel channels[];
+	struct gsc_hwmon_channel channels[] __counted_by(nchannels);
 };
 #endif
-- 
cgit v1.2.3


From a48e1f656b3c9b8192b6ca6fc92ef4daa30535fb Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:51:21 -0700
Subject: KVM: Annotate struct kvm_irq_routing_table with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct kvm_irq_routing_table.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: kvm@vger.kernel.org
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922175121.work.660-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fb6c6109fdca..4944136efaa2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -664,7 +664,7 @@ struct kvm_irq_routing_table {
 	 * Array indexed by gsi. Each entry contains list of irq chips
 	 * the gsi is connected to.
 	 */
-	struct hlist_head map[];
+	struct hlist_head map[] __counted_by(nr_rt_entries);
 };
 #endif
 
-- 
cgit v1.2.3


From 313ebe47d75558511aa1237b6e35c663b5c0ec6f Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Wed, 20 Sep 2023 14:36:09 +0200
Subject: string.h: add array-wrappers for (v)memdup_user()

Currently, user array duplications are sometimes done without an
overflow check. Sometimes the checks are done manually; sometimes the
array size is calculated with array_size() and sometimes by calculating
n * size directly in code.

Introduce wrappers for arrays for memdup_user() and vmemdup_user() to
provide a standardized and safe way for duplicating user arrays.

This is both for new code as well as replacing usage of (v)memdup_user()
in existing code that uses, e.g., n * size to calculate array sizes.

Suggested-by: David Airlie <airlied@redhat.com>
Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Zack Rusin <zackr@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230920123612.16914-3-pstanner@redhat.com
---
 include/linux/string.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index dbfc66400050..debf4ef1098f 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -5,7 +5,9 @@
 #include <linux/compiler.h>	/* for inline */
 #include <linux/types.h>	/* for size_t */
 #include <linux/stddef.h>	/* for NULL */
+#include <linux/err.h>		/* for ERR_PTR() */
 #include <linux/errno.h>	/* for E2BIG */
+#include <linux/overflow.h>	/* for check_mul_overflow() */
 #include <linux/stdarg.h>
 #include <uapi/linux/string.h>
 
@@ -14,6 +16,44 @@ extern void *memdup_user(const void __user *, size_t);
 extern void *vmemdup_user(const void __user *, size_t);
 extern void *memdup_user_nul(const void __user *, size_t);
 
+/**
+ * memdup_array_user - duplicate array from user space
+ * @src: source address in user space
+ * @n: number of array members to copy
+ * @size: size of one array member
+ *
+ * Return: an ERR_PTR() on failure. Result is physically
+ * contiguous, to be freed by kfree().
+ */
+static inline void *memdup_array_user(const void __user *src, size_t n, size_t size)
+{
+	size_t nbytes;
+
+	if (check_mul_overflow(n, size, &nbytes))
+		return ERR_PTR(-EOVERFLOW);
+
+	return memdup_user(src, nbytes);
+}
+
+/**
+ * vmemdup_array_user - duplicate array from user space
+ * @src: source address in user space
+ * @n: number of array members to copy
+ * @size: size of one array member
+ *
+ * Return: an ERR_PTR() on failure. Result may be not
+ * physically contiguous. Use kvfree() to free.
+ */
+static inline void *vmemdup_array_user(const void __user *src, size_t n, size_t size)
+{
+	size_t nbytes;
+
+	if (check_mul_overflow(n, size, &nbytes))
+		return ERR_PTR(-EOVERFLOW);
+
+	return vmemdup_user(src, nbytes);
+}
+
 /*
  * Include machine specific inline routines
  */
-- 
cgit v1.2.3


From 6d0d419914286a1b255530812a38d992c6c4e608 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:06 +0100
Subject: iov_iter, net: Move csum_and_copy_to/from_iter() to net/

Move csum_and_copy_to/from_iter() to net code now that the iteration
framework can be #included.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-10-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: netdev@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/skbuff.h | 25 +++++++++++++++++++++++++
 include/linux/uio.h    | 18 ------------------
 2 files changed, 25 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4174c4b82d13..d0656cc11c16 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3679,6 +3679,31 @@ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int l
 	return __skb_put_padto(skb, len, true);
 }
 
+static inline __wsum csum_and_memcpy(void *to, const void *from, size_t len,
+				     __wsum sum, size_t off)
+{
+	__wsum next = csum_partial_copy_nocheck(from, to, len);
+	return csum_block_add(sum, next, off);
+}
+
+struct csum_state {
+	__wsum csum;
+	size_t off;
+};
+
+size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
+
+static __always_inline __must_check
+bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
+				  __wsum *csum, struct iov_iter *i)
+{
+	size_t copied = csum_and_copy_from_iter(addr, bytes, csum, i);
+	if (likely(copied == bytes))
+		return true;
+	iov_iter_revert(i, copied);
+	return false;
+}
+
 static inline int skb_add_data(struct sk_buff *skb,
 			       struct iov_iter *from, int copy)
 {
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 65d9143f83c8..0a5426c97e02 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -338,24 +338,6 @@ iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
 	return npages;
 }
 
-struct csum_state {
-	__wsum csum;
-	size_t off;
-};
-
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i);
-size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
-
-static __always_inline __must_check
-bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
-				  __wsum *csum, struct iov_iter *i)
-{
-	size_t copied = csum_and_copy_from_iter(addr, bytes, csum, i);
-	if (likely(copied == bytes))
-		return true;
-	iov_iter_revert(i, copied);
-	return false;
-}
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 		struct iov_iter *i);
 
-- 
cgit v1.2.3


From dc32bff195b45e8571c442954beee259e9500dac Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:07 +0100
Subject: iov_iter, net: Fold in csum_and_memcpy()

Fold csum_and_memcpy() in to its callers.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-11-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: netdev@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/skbuff.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d0656cc11c16..c81ef5d76953 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3679,13 +3679,6 @@ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int l
 	return __skb_put_padto(skb, len, true);
 }
 
-static inline __wsum csum_and_memcpy(void *to, const void *from, size_t len,
-				     __wsum sum, size_t off)
-{
-	__wsum next = csum_partial_copy_nocheck(from, to, len);
-	return csum_block_add(sum, next, off);
-}
-
 struct csum_state {
 	__wsum csum;
 	size_t off;
-- 
cgit v1.2.3


From 7c6f353e8a73cbb8919a20e0912021a9f9d24170 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:08 +0100
Subject: iov_iter, net: Merge csum_and_copy_from_iter{,_full}() together

Move csum_and_copy_from_iter_full() out of line and then merge
csum_and_copy_from_iter() into its only caller.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-12-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: netdev@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/skbuff.h | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c81ef5d76953..be402f55f6d6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3679,23 +3679,8 @@ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int l
 	return __skb_put_padto(skb, len, true);
 }
 
-struct csum_state {
-	__wsum csum;
-	size_t off;
-};
-
-size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
-
-static __always_inline __must_check
-bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
-				  __wsum *csum, struct iov_iter *i)
-{
-	size_t copied = csum_and_copy_from_iter(addr, bytes, csum, i);
-	if (likely(copied == bytes))
-		return true;
-	iov_iter_revert(i, copied);
-	return false;
-}
+bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i)
+	__must_check;
 
 static inline int skb_add_data(struct sk_buff *skb,
 			       struct iov_iter *from, int copy)
-- 
cgit v1.2.3


From b5f0e20f444cd150121e0ce912ebd3f2dabd12bc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:09 +0100
Subject: iov_iter, net: Move hash_and_copy_to_iter() to net/

Move hash_and_copy_to_iter() to be with its only caller in networking code.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-13-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: netdev@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uio.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 0a5426c97e02..b6214cbf2a43 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -338,9 +338,6 @@ iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
 	return npages;
 }
 
-size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
-		struct iov_iter *i);
-
 struct iovec *iovec_from_user(const struct iovec __user *uvector,
 		unsigned long nr_segs, unsigned long fast_segs,
 		struct iovec *fast_iov, bool compat);
-- 
cgit v1.2.3


From 2b76129c5ae710423cfb55806803341af6a403a7 Mon Sep 17 00:00:00 2001
From: David Meriin <dmeriin@habana.ai>
Date: Mon, 24 Jul 2023 23:30:44 +0300
Subject: accel/habanalabs: move cpucp interface to linux/habanalabs

The CPUCP interface is moved to a shared folder outside of accel as
a pre-requisite to upstream the NIC drivers that will also include
this file.

Signed-off-by: David Meriin <dmeriin@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 include/linux/habanalabs/cpucp_if.h   | 1407 +++++++++++++++++++++++++++++++++
 include/linux/habanalabs/hl_boot_if.h |  790 ++++++++++++++++++
 2 files changed, 2197 insertions(+)
 create mode 100644 include/linux/habanalabs/cpucp_if.h
 create mode 100644 include/linux/habanalabs/hl_boot_if.h

(limited to 'include/linux')

diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
new file mode 100644
index 000000000000..4cdedb603ecb
--- /dev/null
+++ b/include/linux/habanalabs/cpucp_if.h
@@ -0,0 +1,1407 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2020-2022 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef CPUCP_IF_H
+#define CPUCP_IF_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+#include "hl_boot_if.h"
+
+#define NUM_HBM_PSEUDO_CH				2
+#define NUM_HBM_CH_PER_DEV				8
+#define CPUCP_PKT_HBM_ECC_INFO_WR_PAR_SHIFT		0
+#define CPUCP_PKT_HBM_ECC_INFO_WR_PAR_MASK		0x00000001
+#define CPUCP_PKT_HBM_ECC_INFO_RD_PAR_SHIFT		1
+#define CPUCP_PKT_HBM_ECC_INFO_RD_PAR_MASK		0x00000002
+#define CPUCP_PKT_HBM_ECC_INFO_CA_PAR_SHIFT		2
+#define CPUCP_PKT_HBM_ECC_INFO_CA_PAR_MASK		0x00000004
+#define CPUCP_PKT_HBM_ECC_INFO_DERR_SHIFT		3
+#define CPUCP_PKT_HBM_ECC_INFO_DERR_MASK		0x00000008
+#define CPUCP_PKT_HBM_ECC_INFO_SERR_SHIFT		4
+#define CPUCP_PKT_HBM_ECC_INFO_SERR_MASK		0x00000010
+#define CPUCP_PKT_HBM_ECC_INFO_TYPE_SHIFT		5
+#define CPUCP_PKT_HBM_ECC_INFO_TYPE_MASK		0x00000020
+#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_SHIFT		6
+#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK		0x000007C0
+
+#define PLL_MAP_MAX_BITS	128
+#define PLL_MAP_LEN		(PLL_MAP_MAX_BITS / 8)
+
+/*
+ * info of the pkt queue pointers in the first async occurrence
+ */
+struct cpucp_pkt_sync_err {
+	__le32 pi;
+	__le32 ci;
+};
+
+struct hl_eq_hbm_ecc_data {
+	/* SERR counter */
+	__le32 sec_cnt;
+	/* DERR counter */
+	__le32 dec_cnt;
+	/* Supplemental Information according to the mask bits */
+	__le32 hbm_ecc_info;
+	/* Address in hbm where the ecc happened */
+	__le32 first_addr;
+	/* SERR continuous address counter */
+	__le32 sec_cont_cnt;
+	__le32 pad;
+};
+
+/*
+ * EVENT QUEUE
+ */
+
+struct hl_eq_header {
+	__le32 reserved;
+	__le32 ctl;
+};
+
+struct hl_eq_ecc_data {
+	__le64 ecc_address;
+	__le64 ecc_syndrom;
+	__u8 memory_wrapper_idx;
+	__u8 is_critical;
+	__le16 block_id;
+	__u8 pad[4];
+};
+
+enum hl_sm_sei_cause {
+	SM_SEI_SO_OVERFLOW,
+	SM_SEI_LBW_4B_UNALIGNED,
+	SM_SEI_AXI_RESPONSE_ERR
+};
+
+struct hl_eq_sm_sei_data {
+	__le32 sei_log;
+	/* enum hl_sm_sei_cause */
+	__u8 sei_cause;
+	__u8 pad[3];
+};
+
+enum hl_fw_alive_severity {
+	FW_ALIVE_SEVERITY_MINOR,
+	FW_ALIVE_SEVERITY_CRITICAL
+};
+
+struct hl_eq_fw_alive {
+	__le64 uptime_seconds;
+	__le32 process_id;
+	__le32 thread_id;
+	/* enum hl_fw_alive_severity */
+	__u8 severity;
+	__u8 pad[7];
+};
+
+struct hl_eq_intr_cause {
+	__le64 intr_cause_data;
+};
+
+struct hl_eq_pcie_drain_ind_data {
+	struct hl_eq_intr_cause intr_cause;
+	__le64 drain_wr_addr_lbw;
+	__le64 drain_rd_addr_lbw;
+	__le64 drain_wr_addr_hbw;
+	__le64 drain_rd_addr_hbw;
+};
+
+struct hl_eq_razwi_lbw_info_regs {
+	__le32 rr_aw_razwi_reg;
+	__le32 rr_aw_razwi_id_reg;
+	__le32 rr_ar_razwi_reg;
+	__le32 rr_ar_razwi_id_reg;
+};
+
+struct hl_eq_razwi_hbw_info_regs {
+	__le32 rr_aw_razwi_hi_reg;
+	__le32 rr_aw_razwi_lo_reg;
+	__le32 rr_aw_razwi_id_reg;
+	__le32 rr_ar_razwi_hi_reg;
+	__le32 rr_ar_razwi_lo_reg;
+	__le32 rr_ar_razwi_id_reg;
+};
+
+/* razwi_happened masks */
+#define RAZWI_HAPPENED_HBW	0x1
+#define RAZWI_HAPPENED_LBW	0x2
+#define RAZWI_HAPPENED_AW	0x4
+#define RAZWI_HAPPENED_AR	0x8
+
+struct hl_eq_razwi_info {
+	__le32 razwi_happened_mask;
+	union {
+		struct hl_eq_razwi_lbw_info_regs lbw;
+		struct hl_eq_razwi_hbw_info_regs hbw;
+	};
+	__le32 pad;
+};
+
+struct hl_eq_razwi_with_intr_cause {
+	struct hl_eq_razwi_info razwi_info;
+	struct hl_eq_intr_cause intr_cause;
+};
+
+#define HBM_CA_ERR_CMD_LIFO_LEN		8
+#define HBM_RD_ERR_DATA_LIFO_LEN	8
+#define HBM_WR_PAR_CMD_LIFO_LEN		11
+
+enum hl_hbm_sei_cause {
+	/* Command/address parity error event is split into 2 events due to
+	 * size limitation: ODD suffix for odd HBM CK_t cycles and EVEN  suffix
+	 * for even HBM CK_t cycles
+	 */
+	HBM_SEI_CMD_PARITY_EVEN,
+	HBM_SEI_CMD_PARITY_ODD,
+	/* Read errors can be reflected as a combination of SERR/DERR/parity
+	 * errors. Therefore, we define one event for all read error types.
+	 * LKD will perform further proccessing.
+	 */
+	HBM_SEI_READ_ERR,
+	HBM_SEI_WRITE_DATA_PARITY_ERR,
+	HBM_SEI_CATTRIP,
+	HBM_SEI_MEM_BIST_FAIL,
+	HBM_SEI_DFI,
+	HBM_SEI_INV_TEMP_READ_OUT,
+	HBM_SEI_BIST_FAIL,
+};
+
+/* Masks for parsing hl_hbm_sei_headr fields */
+#define HBM_ECC_SERR_CNTR_MASK		0xFF
+#define HBM_ECC_DERR_CNTR_MASK		0xFF00
+#define HBM_RD_PARITY_CNTR_MASK		0xFF0000
+
+/* HBM index and MC index are known by the event_id */
+struct hl_hbm_sei_header {
+	union {
+		/* relevant only in case of HBM read error */
+		struct {
+			__u8 ecc_serr_cnt;
+			__u8 ecc_derr_cnt;
+			__u8 read_par_cnt;
+			__u8 reserved;
+		};
+		/* All other cases */
+		__le32 cnt;
+	};
+	__u8 sei_cause;		/* enum hl_hbm_sei_cause */
+	__u8 mc_channel;		/* range: 0-3 */
+	__u8 mc_pseudo_channel;	/* range: 0-7 */
+	__u8 is_critical;
+};
+
+#define HBM_RD_ADDR_SID_SHIFT		0
+#define HBM_RD_ADDR_SID_MASK		0x1
+#define HBM_RD_ADDR_BG_SHIFT		1
+#define HBM_RD_ADDR_BG_MASK		0x6
+#define HBM_RD_ADDR_BA_SHIFT		3
+#define HBM_RD_ADDR_BA_MASK		0x18
+#define HBM_RD_ADDR_COL_SHIFT		5
+#define HBM_RD_ADDR_COL_MASK		0x7E0
+#define HBM_RD_ADDR_ROW_SHIFT		11
+#define HBM_RD_ADDR_ROW_MASK		0x3FFF800
+
+struct hbm_rd_addr {
+	union {
+		/* bit fields are only for FW use */
+		struct {
+			u32 dbg_rd_err_addr_sid:1;
+			u32 dbg_rd_err_addr_bg:2;
+			u32 dbg_rd_err_addr_ba:2;
+			u32 dbg_rd_err_addr_col:6;
+			u32 dbg_rd_err_addr_row:15;
+			u32 reserved:6;
+		};
+		__le32 rd_addr_val;
+	};
+};
+
+#define HBM_RD_ERR_BEAT_SHIFT		2
+/* dbg_rd_err_misc fields: */
+/* Read parity is calculated per DW on every beat */
+#define HBM_RD_ERR_PAR_ERR_BEAT0_SHIFT	0
+#define HBM_RD_ERR_PAR_ERR_BEAT0_MASK	0x3
+#define HBM_RD_ERR_PAR_DATA_BEAT0_SHIFT	8
+#define HBM_RD_ERR_PAR_DATA_BEAT0_MASK	0x300
+/* ECC is calculated per PC on every beat */
+#define HBM_RD_ERR_SERR_BEAT0_SHIFT	16
+#define HBM_RD_ERR_SERR_BEAT0_MASK	0x10000
+#define HBM_RD_ERR_DERR_BEAT0_SHIFT	24
+#define HBM_RD_ERR_DERR_BEAT0_MASK	0x100000
+
+struct hl_eq_hbm_sei_read_err_intr_info {
+	/* DFI_RD_ERR_REP_ADDR */
+	struct hbm_rd_addr dbg_rd_err_addr;
+	/* DFI_RD_ERR_REP_ERR */
+	union {
+		struct {
+			/* bit fields are only for FW use */
+			u32 dbg_rd_err_par:8;
+			u32 dbg_rd_err_par_data:8;
+			u32 dbg_rd_err_serr:4;
+			u32 dbg_rd_err_derr:4;
+			u32 reserved:8;
+		};
+		__le32 dbg_rd_err_misc;
+	};
+	/* DFI_RD_ERR_REP_DM */
+	__le32 dbg_rd_err_dm;
+	/* DFI_RD_ERR_REP_SYNDROME */
+	__le32 dbg_rd_err_syndrome;
+	/* DFI_RD_ERR_REP_DATA */
+	__le32 dbg_rd_err_data[HBM_RD_ERR_DATA_LIFO_LEN];
+};
+
+struct hl_eq_hbm_sei_ca_par_intr_info {
+	/* 14 LSBs */
+	__le16 dbg_row[HBM_CA_ERR_CMD_LIFO_LEN];
+	/* 18 LSBs */
+	__le32 dbg_col[HBM_CA_ERR_CMD_LIFO_LEN];
+};
+
+#define WR_PAR_LAST_CMD_COL_SHIFT	0
+#define WR_PAR_LAST_CMD_COL_MASK	0x3F
+#define WR_PAR_LAST_CMD_BG_SHIFT	6
+#define WR_PAR_LAST_CMD_BG_MASK		0xC0
+#define WR_PAR_LAST_CMD_BA_SHIFT	8
+#define WR_PAR_LAST_CMD_BA_MASK		0x300
+#define WR_PAR_LAST_CMD_SID_SHIFT	10
+#define WR_PAR_LAST_CMD_SID_MASK	0x400
+
+/* Row address isn't latched */
+struct hbm_sei_wr_cmd_address {
+	/* DFI_DERR_LAST_CMD */
+	union {
+		struct {
+			/* bit fields are only for FW use */
+			u32 col:6;
+			u32 bg:2;
+			u32 ba:2;
+			u32 sid:1;
+			u32 reserved:21;
+		};
+		__le32 dbg_wr_cmd_addr;
+	};
+};
+
+struct hl_eq_hbm_sei_wr_par_intr_info {
+	/* entry 0: WR command address from the 1st cycle prior to the error
+	 * entry 1: WR command address from the 2nd cycle prior to the error
+	 * and so on...
+	 */
+	struct hbm_sei_wr_cmd_address dbg_last_wr_cmds[HBM_WR_PAR_CMD_LIFO_LEN];
+	/* derr[0:1] - 1st HBM cycle DERR output
+	 * derr[2:3] - 2nd HBM cycle DERR output
+	 */
+	__u8 dbg_derr;
+	/* extend to reach 8B */
+	__u8 pad[3];
+};
+
+/*
+ * this struct represents the following sei causes:
+ * command parity, ECC double error, ECC single error, dfi error, cattrip,
+ * temperature read-out, read parity error and write parity error.
+ * some only use the header while some have extra data.
+ */
+struct hl_eq_hbm_sei_data {
+	struct hl_hbm_sei_header hdr;
+	union {
+		struct hl_eq_hbm_sei_ca_par_intr_info ca_parity_even_info;
+		struct hl_eq_hbm_sei_ca_par_intr_info ca_parity_odd_info;
+		struct hl_eq_hbm_sei_read_err_intr_info read_err_info;
+		struct hl_eq_hbm_sei_wr_par_intr_info wr_parity_info;
+	};
+};
+
+/* Engine/farm arc interrupt type */
+enum hl_engine_arc_interrupt_type {
+	/* Qman/farm ARC DCCM QUEUE FULL interrupt type */
+	ENGINE_ARC_DCCM_QUEUE_FULL_IRQ = 1
+};
+
+/* Data structure specifies details of payload of DCCM QUEUE FULL interrupt */
+struct hl_engine_arc_dccm_queue_full_irq {
+	/* Queue index value which caused DCCM QUEUE FULL */
+	__le32 queue_index;
+	__le32 pad;
+};
+
+/* Data structure specifies details of QM/FARM ARC interrupt */
+struct hl_eq_engine_arc_intr_data {
+	/* ARC engine id e.g.  DCORE0_TPC0_QM_ARC, DCORE0_TCP1_QM_ARC */
+	__le32 engine_id;
+	__le32 intr_type; /* enum hl_engine_arc_interrupt_type */
+	/* More info related to the interrupt e.g. queue index
+	 * incase of DCCM_QUEUE_FULL interrupt.
+	 */
+	__le64 payload;
+	__le64 pad[5];
+};
+
+#define ADDR_DEC_ADDRESS_COUNT_MAX 4
+
+/* Data structure specifies details of ADDR_DEC interrupt */
+struct hl_eq_addr_dec_intr_data {
+	struct hl_eq_intr_cause intr_cause;
+	__le64 addr[ADDR_DEC_ADDRESS_COUNT_MAX];
+	__u8 addr_cnt;
+	__u8 pad[7];
+};
+
+struct hl_eq_entry {
+	struct hl_eq_header hdr;
+	union {
+		__le64 data_placeholder;
+		struct hl_eq_ecc_data ecc_data;
+		struct hl_eq_hbm_ecc_data hbm_ecc_data;	/* Obsolete */
+		struct hl_eq_sm_sei_data sm_sei_data;
+		struct cpucp_pkt_sync_err pkt_sync_err;
+		struct hl_eq_fw_alive fw_alive;
+		struct hl_eq_intr_cause intr_cause;
+		struct hl_eq_pcie_drain_ind_data pcie_drain_ind_data;
+		struct hl_eq_razwi_info razwi_info;
+		struct hl_eq_razwi_with_intr_cause razwi_with_intr_cause;
+		struct hl_eq_hbm_sei_data sei_data;	/* Gaudi2 HBM */
+		struct hl_eq_engine_arc_intr_data arc_data;
+		struct hl_eq_addr_dec_intr_data addr_dec;
+		__le64 data[7];
+	};
+};
+
+#define HL_EQ_ENTRY_SIZE		sizeof(struct hl_eq_entry)
+
+#define EQ_CTL_READY_SHIFT		31
+#define EQ_CTL_READY_MASK		0x80000000
+
+#define EQ_CTL_EVENT_TYPE_SHIFT		16
+#define EQ_CTL_EVENT_TYPE_MASK		0x0FFF0000
+
+#define EQ_CTL_INDEX_SHIFT		0
+#define EQ_CTL_INDEX_MASK		0x0000FFFF
+
+enum pq_init_status {
+	PQ_INIT_STATUS_NA = 0,
+	PQ_INIT_STATUS_READY_FOR_CP,
+	PQ_INIT_STATUS_READY_FOR_HOST,
+	PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI,
+	PQ_INIT_STATUS_LEN_NOT_POWER_OF_TWO_ERR,
+	PQ_INIT_STATUS_ILLEGAL_Q_ADDR_ERR
+};
+
+/*
+ * CpuCP Primary Queue Packets
+ *
+ * During normal operation, the host's kernel driver needs to send various
+ * messages to CpuCP, usually either to SET some value into a H/W periphery or
+ * to GET the current value of some H/W periphery. For example, SET the
+ * frequency of MME/TPC and GET the value of the thermal sensor.
+ *
+ * These messages can be initiated either by the User application or by the
+ * host's driver itself, e.g. power management code. In either case, the
+ * communication from the host's driver to CpuCP will *always* be in
+ * synchronous mode, meaning that the host will send a single message and poll
+ * until the message was acknowledged and the results are ready (if results are
+ * needed).
+ *
+ * This means that only a single message can be sent at a time and the host's
+ * driver must wait for its result before sending the next message. Having said
+ * that, because these are control messages which are sent in a relatively low
+ * frequency, this limitation seems acceptable. It's important to note that
+ * in case of multiple devices, messages to different devices *can* be sent
+ * at the same time.
+ *
+ * The message, inputs/outputs (if relevant) and fence object will be located
+ * on the device DDR at an address that will be determined by the host's driver.
+ * During device initialization phase, the host will pass to CpuCP that address.
+ * Most of the message types will contain inputs/outputs inside the message
+ * itself. The common part of each message will contain the opcode of the
+ * message (its type) and a field representing a fence object.
+ *
+ * When the host's driver wishes to send a message to CPU CP, it will write the
+ * message contents to the device DDR, clear the fence object and then write to
+ * the PSOC_ARC1_AUX_SW_INTR, to issue interrupt 121 to ARC Management CPU.
+ *
+ * Upon receiving the interrupt (#121), CpuCP will read the message from the
+ * DDR. In case the message is a SET operation, CpuCP will first perform the
+ * operation and then write to the fence object on the device DDR. In case the
+ * message is a GET operation, CpuCP will first fill the results section on the
+ * device DDR and then write to the fence object. If an error occurred, CpuCP
+ * will fill the rc field with the right error code.
+ *
+ * In the meantime, the host's driver will poll on the fence object. Once the
+ * host sees that the fence object is signaled, it will read the results from
+ * the device DDR (if relevant) and resume the code execution in the host's
+ * driver.
+ *
+ * To use QMAN packets, the opcode must be the QMAN opcode, shifted by 8
+ * so the value being put by the host's driver matches the value read by CpuCP
+ *
+ * Non-QMAN packets should be limited to values 1 through (2^8 - 1)
+ *
+ * Detailed description:
+ *
+ * CPUCP_PACKET_DISABLE_PCI_ACCESS -
+ *       After receiving this packet the embedded CPU must NOT issue PCI
+ *       transactions (read/write) towards the Host CPU. This also include
+ *       sending MSI-X interrupts.
+ *       This packet is usually sent before the device is moved to D3Hot state.
+ *
+ * CPUCP_PACKET_ENABLE_PCI_ACCESS -
+ *       After receiving this packet the embedded CPU is allowed to issue PCI
+ *       transactions towards the Host CPU, including sending MSI-X interrupts.
+ *       This packet is usually send after the device is moved to D0 state.
+ *
+ * CPUCP_PACKET_TEMPERATURE_GET -
+ *       Fetch the current temperature / Max / Max Hyst / Critical /
+ *       Critical Hyst of a specified thermal sensor. The packet's
+ *       arguments specify the desired sensor and the field to get.
+ *
+ * CPUCP_PACKET_VOLTAGE_GET -
+ *       Fetch the voltage / Max / Min of a specified sensor. The packet's
+ *       arguments specify the sensor and type.
+ *
+ * CPUCP_PACKET_CURRENT_GET -
+ *       Fetch the current / Max / Min of a specified sensor. The packet's
+ *       arguments specify the sensor and type.
+ *
+ * CPUCP_PACKET_FAN_SPEED_GET -
+ *       Fetch the speed / Max / Min of a specified fan. The packet's
+ *       arguments specify the sensor and type.
+ *
+ * CPUCP_PACKET_PWM_GET -
+ *       Fetch the pwm value / mode of a specified pwm. The packet's
+ *       arguments specify the sensor and type.
+ *
+ * CPUCP_PACKET_PWM_SET -
+ *       Set the pwm value / mode of a specified pwm. The packet's
+ *       arguments specify the sensor, type and value.
+ *
+ * CPUCP_PACKET_FREQUENCY_SET -
+ *       Set the frequency of a specified PLL. The packet's arguments specify
+ *       the PLL and the desired frequency. The actual frequency in the device
+ *       might differ from the requested frequency.
+ *
+ * CPUCP_PACKET_FREQUENCY_GET -
+ *       Fetch the frequency of a specified PLL. The packet's arguments specify
+ *       the PLL.
+ *
+ * CPUCP_PACKET_LED_SET -
+ *       Set the state of a specified led. The packet's arguments
+ *       specify the led and the desired state.
+ *
+ * CPUCP_PACKET_I2C_WR -
+ *       Write 32-bit value to I2C device. The packet's arguments specify the
+ *       I2C bus, address and value.
+ *
+ * CPUCP_PACKET_I2C_RD -
+ *       Read 32-bit value from I2C device. The packet's arguments specify the
+ *       I2C bus and address.
+ *
+ * CPUCP_PACKET_INFO_GET -
+ *       Fetch information from the device as specified in the packet's
+ *       structure. The host's driver passes the max size it allows the CpuCP to
+ *       write to the structure, to prevent data corruption in case of
+ *       mismatched driver/FW versions.
+ *
+ * CPUCP_PACKET_FLASH_PROGRAM_REMOVED - this packet was removed
+ *
+ * CPUCP_PACKET_UNMASK_RAZWI_IRQ -
+ *       Unmask the given IRQ. The IRQ number is specified in the value field.
+ *       The packet is sent after receiving an interrupt and printing its
+ *       relevant information.
+ *
+ * CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY -
+ *       Unmask the given IRQs. The IRQs numbers are specified in an array right
+ *       after the cpucp_packet structure, where its first element is the array
+ *       length. The packet is sent after a soft reset was done in order to
+ *       handle any interrupts that were sent during the reset process.
+ *
+ * CPUCP_PACKET_TEST -
+ *       Test packet for CpuCP connectivity. The CPU will put the fence value
+ *       in the result field.
+ *
+ * CPUCP_PACKET_FREQUENCY_CURR_GET -
+ *       Fetch the current frequency of a specified PLL. The packet's arguments
+ *       specify the PLL.
+ *
+ * CPUCP_PACKET_MAX_POWER_GET -
+ *       Fetch the maximal power of the device.
+ *
+ * CPUCP_PACKET_MAX_POWER_SET -
+ *       Set the maximal power of the device. The packet's arguments specify
+ *       the power.
+ *
+ * CPUCP_PACKET_EEPROM_DATA_GET -
+ *       Get EEPROM data from the CpuCP kernel. The buffer is specified in the
+ *       addr field. The CPU will put the returned data size in the result
+ *       field. In addition, the host's driver passes the max size it allows the
+ *       CpuCP to write to the structure, to prevent data corruption in case of
+ *       mismatched driver/FW versions.
+ *
+ * CPUCP_PACKET_NIC_INFO_GET -
+ *       Fetch information from the device regarding the NIC. the host's driver
+ *       passes the max size it allows the CpuCP to write to the structure, to
+ *       prevent data corruption in case of mismatched driver/FW versions.
+ *
+ * CPUCP_PACKET_TEMPERATURE_SET -
+ *       Set the value of the offset property of a specified thermal sensor.
+ *       The packet's arguments specify the desired sensor and the field to
+ *       set.
+ *
+ * CPUCP_PACKET_VOLTAGE_SET -
+ *       Trigger the reset_history property of a specified voltage sensor.
+ *       The packet's arguments specify the desired sensor and the field to
+ *       set.
+ *
+ * CPUCP_PACKET_CURRENT_SET -
+ *       Trigger the reset_history property of a specified current sensor.
+ *       The packet's arguments specify the desired sensor and the field to
+ *       set.
+ *
+ * CPUCP_PACKET_PCIE_THROUGHPUT_GET -
+ *       Get throughput of PCIe.
+ *       The packet's arguments specify the transaction direction (TX/RX).
+ *       The window measurement is 10[msec], and the return value is in KB/sec.
+ *
+ * CPUCP_PACKET_PCIE_REPLAY_CNT_GET
+ *       Replay count measures number of "replay" events, which is basicly
+ *       number of retries done by PCIe.
+ *
+ * CPUCP_PACKET_TOTAL_ENERGY_GET -
+ *       Total Energy is measurement of energy from the time FW Linux
+ *       is loaded. It is calculated by multiplying the average power
+ *       by time (passed from armcp start). The units are in MilliJouls.
+ *
+ * CPUCP_PACKET_PLL_INFO_GET -
+ *       Fetch frequencies of PLL from the required PLL IP.
+ *       The packet's arguments specify the device PLL type
+ *       Pll type is the PLL from device pll_index enum.
+ *       The result is composed of 4 outputs, each is 16-bit
+ *       frequency in MHz.
+ *
+ * CPUCP_PACKET_POWER_GET -
+ *       Fetch the present power consumption of the device (Current * Voltage).
+ *
+ * CPUCP_PACKET_NIC_PFC_SET -
+ *       Enable/Disable the NIC PFC feature. The packet's arguments specify the
+ *       NIC port, relevant lanes to configure and one bit indication for
+ *       enable/disable.
+ *
+ * CPUCP_PACKET_NIC_FAULT_GET -
+ *       Fetch the current indication for local/remote faults from the NIC MAC.
+ *       The result is 32-bit value of the relevant register.
+ *
+ * CPUCP_PACKET_NIC_LPBK_SET -
+ *       Enable/Disable the MAC loopback feature. The packet's arguments specify
+ *       the NIC port, relevant lanes to configure and one bit indication for
+ *       enable/disable.
+ *
+ * CPUCP_PACKET_NIC_MAC_INIT -
+ *       Configure the NIC MAC channels. The packet's arguments specify the
+ *       NIC port and the speed.
+ *
+ * CPUCP_PACKET_MSI_INFO_SET -
+ *       set the index number for each supported msi type going from
+ *       host to device
+ *
+ * CPUCP_PACKET_NIC_XPCS91_REGS_GET -
+ *       Fetch the un/correctable counters values from the NIC MAC.
+ *
+ * CPUCP_PACKET_NIC_STAT_REGS_GET -
+ *       Fetch various NIC MAC counters from the NIC STAT.
+ *
+ * CPUCP_PACKET_NIC_STAT_REGS_CLR -
+ *       Clear the various NIC MAC counters in the NIC STAT.
+ *
+ * CPUCP_PACKET_NIC_STAT_REGS_ALL_GET -
+ *       Fetch all NIC MAC counters from the NIC STAT.
+ *
+ * CPUCP_PACKET_IS_IDLE_CHECK -
+ *       Check if the device is IDLE in regard to the DMA/compute engines
+ *       and QMANs. The f/w will return a bitmask where each bit represents
+ *       a different engine or QMAN according to enum cpucp_idle_mask.
+ *       The bit will be 1 if the engine is NOT idle.
+ *
+ * CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET -
+ *       Fetch all HBM replaced-rows and prending to be replaced rows data.
+ *
+ * CPUCP_PACKET_HBM_PENDING_ROWS_STATUS -
+ *       Fetch status of HBM rows pending replacement and need a reboot to
+ *       be replaced.
+ *
+ * CPUCP_PACKET_POWER_SET -
+ *       Resets power history of device to 0
+ *
+ * CPUCP_PACKET_ENGINE_CORE_ASID_SET -
+ *       Packet to perform engine core ASID configuration
+ *
+ * CPUCP_PACKET_SEC_ATTEST_GET -
+ *       Get the attestaion data that is collected during various stages of the
+ *       boot sequence. the attestation data is also hashed with some unique
+ *       number (nonce) provided by the host to prevent replay attacks.
+ *       public key and certificate also provided as part of the FW response.
+ *
+ * CPUCP_PACKET_MONITOR_DUMP_GET -
+ *       Get monitors registers dump from the CpuCP kernel.
+ *       The CPU will put the registers dump in the a buffer allocated by the driver
+ *       which address is passed via the CpuCp packet. In addition, the host's driver
+ *       passes the max size it allows the CpuCP to write to the structure, to prevent
+ *       data corruption in case of mismatched driver/FW versions.
+ *       Obsolete.
+ *
+ * CPUCP_PACKET_GENERIC_PASSTHROUGH -
+ *      Generic opcode for all firmware info that is only passed to host
+ *      through the LKD, without getting parsed there.
+ *
+ * CPUCP_PACKET_ACTIVE_STATUS_SET -
+ *       LKD sends FW indication whether device is free or in use, this indication is reported
+ *       also to the BMC.
+ *
+ * CPUCP_PACKET_REGISTER_INTERRUPTS -
+ *       Packet to register interrupts indicating LKD is ready to receive events from FW.
+ *
+ * CPUCP_PACKET_SOFT_RESET -
+ *      Packet to perform soft-reset.
+ *
+ * CPUCP_PACKET_INTS_REGISTER -
+ *       Packet to inform FW that queues have been established and LKD is ready to receive
+ *       EQ events.
+ */
+
+enum cpucp_packet_id {
+	CPUCP_PACKET_DISABLE_PCI_ACCESS = 1,	/* internal */
+	CPUCP_PACKET_ENABLE_PCI_ACCESS,		/* internal */
+	CPUCP_PACKET_TEMPERATURE_GET,		/* sysfs */
+	CPUCP_PACKET_VOLTAGE_GET,		/* sysfs */
+	CPUCP_PACKET_CURRENT_GET,		/* sysfs */
+	CPUCP_PACKET_FAN_SPEED_GET,		/* sysfs */
+	CPUCP_PACKET_PWM_GET,			/* sysfs */
+	CPUCP_PACKET_PWM_SET,			/* sysfs */
+	CPUCP_PACKET_FREQUENCY_SET,		/* sysfs */
+	CPUCP_PACKET_FREQUENCY_GET,		/* sysfs */
+	CPUCP_PACKET_LED_SET,			/* debugfs */
+	CPUCP_PACKET_I2C_WR,			/* debugfs */
+	CPUCP_PACKET_I2C_RD,			/* debugfs */
+	CPUCP_PACKET_INFO_GET,			/* IOCTL */
+	CPUCP_PACKET_FLASH_PROGRAM_REMOVED,
+	CPUCP_PACKET_UNMASK_RAZWI_IRQ,		/* internal */
+	CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY,	/* internal */
+	CPUCP_PACKET_TEST,			/* internal */
+	CPUCP_PACKET_FREQUENCY_CURR_GET,	/* sysfs */
+	CPUCP_PACKET_MAX_POWER_GET,		/* sysfs */
+	CPUCP_PACKET_MAX_POWER_SET,		/* sysfs */
+	CPUCP_PACKET_EEPROM_DATA_GET,		/* sysfs */
+	CPUCP_PACKET_NIC_INFO_GET,		/* internal */
+	CPUCP_PACKET_TEMPERATURE_SET,		/* sysfs */
+	CPUCP_PACKET_VOLTAGE_SET,		/* sysfs */
+	CPUCP_PACKET_CURRENT_SET,		/* sysfs */
+	CPUCP_PACKET_PCIE_THROUGHPUT_GET,	/* internal */
+	CPUCP_PACKET_PCIE_REPLAY_CNT_GET,	/* internal */
+	CPUCP_PACKET_TOTAL_ENERGY_GET,		/* internal */
+	CPUCP_PACKET_PLL_INFO_GET,		/* internal */
+	CPUCP_PACKET_NIC_STATUS,		/* internal */
+	CPUCP_PACKET_POWER_GET,			/* internal */
+	CPUCP_PACKET_NIC_PFC_SET,		/* internal */
+	CPUCP_PACKET_NIC_FAULT_GET,		/* internal */
+	CPUCP_PACKET_NIC_LPBK_SET,		/* internal */
+	CPUCP_PACKET_NIC_MAC_CFG,		/* internal */
+	CPUCP_PACKET_MSI_INFO_SET,		/* internal */
+	CPUCP_PACKET_NIC_XPCS91_REGS_GET,	/* internal */
+	CPUCP_PACKET_NIC_STAT_REGS_GET,		/* internal */
+	CPUCP_PACKET_NIC_STAT_REGS_CLR,		/* internal */
+	CPUCP_PACKET_NIC_STAT_REGS_ALL_GET,	/* internal */
+	CPUCP_PACKET_IS_IDLE_CHECK,		/* internal */
+	CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET,/* internal */
+	CPUCP_PACKET_HBM_PENDING_ROWS_STATUS,	/* internal */
+	CPUCP_PACKET_POWER_SET,			/* internal */
+	CPUCP_PACKET_RESERVED,			/* not used */
+	CPUCP_PACKET_ENGINE_CORE_ASID_SET,	/* internal */
+	CPUCP_PACKET_RESERVED2,			/* not used */
+	CPUCP_PACKET_SEC_ATTEST_GET,		/* internal */
+	CPUCP_PACKET_RESERVED3,			/* not used */
+	CPUCP_PACKET_RESERVED4,			/* not used */
+	CPUCP_PACKET_MONITOR_DUMP_GET,		/* debugfs */
+	CPUCP_PACKET_RESERVED5,			/* not used */
+	CPUCP_PACKET_RESERVED6,			/* not used */
+	CPUCP_PACKET_RESERVED7,			/* not used */
+	CPUCP_PACKET_GENERIC_PASSTHROUGH,	/* IOCTL */
+	CPUCP_PACKET_RESERVED8,			/* not used */
+	CPUCP_PACKET_ACTIVE_STATUS_SET,		/* internal */
+	CPUCP_PACKET_RESERVED9,			/* not used */
+	CPUCP_PACKET_RESERVED10,		/* not used */
+	CPUCP_PACKET_RESERVED11,		/* not used */
+	CPUCP_PACKET_RESERVED12,		/* internal */
+	CPUCP_PACKET_RESERVED13,                /* internal */
+	CPUCP_PACKET_SOFT_RESET,                /* internal */
+	CPUCP_PACKET_INTS_REGISTER,             /* internal */
+	CPUCP_PACKET_ID_MAX                     /* must be last */
+};
+
+#define CPUCP_PACKET_FENCE_VAL	0xFE8CE7A5
+
+#define CPUCP_PKT_CTL_RC_SHIFT		12
+#define CPUCP_PKT_CTL_RC_MASK		0x0000F000
+
+#define CPUCP_PKT_CTL_OPCODE_SHIFT	16
+#define CPUCP_PKT_CTL_OPCODE_MASK	0x1FFF0000
+
+#define CPUCP_PKT_RES_PLL_OUT0_SHIFT	0
+#define CPUCP_PKT_RES_PLL_OUT0_MASK	0x000000000000FFFFull
+#define CPUCP_PKT_RES_PLL_OUT1_SHIFT	16
+#define CPUCP_PKT_RES_PLL_OUT1_MASK	0x00000000FFFF0000ull
+#define CPUCP_PKT_RES_PLL_OUT2_SHIFT	32
+#define CPUCP_PKT_RES_PLL_OUT2_MASK	0x0000FFFF00000000ull
+#define CPUCP_PKT_RES_PLL_OUT3_SHIFT	48
+#define CPUCP_PKT_RES_PLL_OUT3_MASK	0xFFFF000000000000ull
+
+#define CPUCP_PKT_RES_EEPROM_OUT0_SHIFT	0
+#define CPUCP_PKT_RES_EEPROM_OUT0_MASK	0x000000000000FFFFull
+#define CPUCP_PKT_RES_EEPROM_OUT1_SHIFT	16
+#define CPUCP_PKT_RES_EEPROM_OUT1_MASK	0x0000000000FF0000ull
+
+#define CPUCP_PKT_VAL_PFC_IN1_SHIFT	0
+#define CPUCP_PKT_VAL_PFC_IN1_MASK	0x0000000000000001ull
+#define CPUCP_PKT_VAL_PFC_IN2_SHIFT	1
+#define CPUCP_PKT_VAL_PFC_IN2_MASK	0x000000000000001Eull
+
+#define CPUCP_PKT_VAL_LPBK_IN1_SHIFT	0
+#define CPUCP_PKT_VAL_LPBK_IN1_MASK	0x0000000000000001ull
+#define CPUCP_PKT_VAL_LPBK_IN2_SHIFT	1
+#define CPUCP_PKT_VAL_LPBK_IN2_MASK	0x000000000000001Eull
+
+#define CPUCP_PKT_VAL_MAC_CNT_IN1_SHIFT	0
+#define CPUCP_PKT_VAL_MAC_CNT_IN1_MASK	0x0000000000000001ull
+#define CPUCP_PKT_VAL_MAC_CNT_IN2_SHIFT	1
+#define CPUCP_PKT_VAL_MAC_CNT_IN2_MASK	0x00000000FFFFFFFEull
+
+/* heartbeat status bits */
+#define CPUCP_PKT_HB_STATUS_EQ_FAULT_SHIFT		0
+#define CPUCP_PKT_HB_STATUS_EQ_FAULT_MASK		0x00000001
+
+struct cpucp_packet {
+	union {
+		__le64 value;	/* For SET packets */
+		__le64 result;	/* For GET packets */
+		__le64 addr;	/* For PQ */
+	};
+
+	__le32 ctl;
+
+	__le32 fence;		/* Signal to host that message is completed */
+
+	union {
+		struct {/* For temperature/current/voltage/fan/pwm get/set */
+			__le16 sensor_index;
+			__le16 type;
+		};
+
+		struct {	/* For I2C read/write */
+			__u8 i2c_bus;
+			__u8 i2c_addr;
+			__u8 i2c_reg;
+			/*
+			 * In legacy implemetations, i2c_len was not present,
+			 * was unused and just added as pad.
+			 * So if i2c_len is 0, it is treated as legacy
+			 * and r/w 1 Byte, else if i2c_len is specified,
+			 * its treated as new multibyte r/w support.
+			 */
+			__u8 i2c_len;
+		};
+
+		struct {/* For PLL info fetch */
+			__le16 pll_type;
+			/* TODO pll_reg is kept temporary before removal */
+			__le16 pll_reg;
+		};
+
+		/* For any general request */
+		__le32 index;
+
+		/* For frequency get/set */
+		__le32 pll_index;
+
+		/* For led set */
+		__le32 led_index;
+
+		/* For get CpuCP info/EEPROM data/NIC info */
+		__le32 data_max_size;
+
+		/*
+		 * For any general status bitmask. Shall be used whenever the
+		 * result cannot be used to hold general purpose data.
+		 */
+		__le32 status_mask;
+
+		/* random, used once number, for security packets */
+		__le32 nonce;
+	};
+
+	union {
+		/* For NIC requests */
+		__le32 port_index;
+
+		/* For Generic packet sub index */
+		__le32 pkt_subidx;
+	};
+};
+
+struct cpucp_unmask_irq_arr_packet {
+	struct cpucp_packet cpucp_pkt;
+	__le32 length;
+	__le32 irqs[];
+};
+
+struct cpucp_nic_status_packet {
+	struct cpucp_packet cpucp_pkt;
+	__le32 length;
+	__le32 data[];
+};
+
+struct cpucp_array_data_packet {
+	struct cpucp_packet cpucp_pkt;
+	__le32 length;
+	__le32 data[];
+};
+
+enum cpucp_led_index {
+	CPUCP_LED0_INDEX = 0,
+	CPUCP_LED1_INDEX,
+	CPUCP_LED2_INDEX,
+	CPUCP_LED_MAX_INDEX = CPUCP_LED2_INDEX
+};
+
+/*
+ * enum cpucp_packet_rc - Error return code
+ * @cpucp_packet_success	-> in case of success.
+ * @cpucp_packet_invalid	-> this is to support first generation platforms.
+ * @cpucp_packet_fault		-> in case of processing error like failing to
+ *                                 get device binding or semaphore etc.
+ * @cpucp_packet_invalid_pkt	-> when cpucp packet is un-supported.
+ * @cpucp_packet_invalid_params	-> when checking parameter like length of buffer
+ *				   or attribute value etc.
+ * @cpucp_packet_rc_max		-> It indicates size of enum so should be at last.
+ */
+enum cpucp_packet_rc {
+	cpucp_packet_success,
+	cpucp_packet_invalid,
+	cpucp_packet_fault,
+	cpucp_packet_invalid_pkt,
+	cpucp_packet_invalid_params,
+	cpucp_packet_rc_max
+};
+
+/*
+ * cpucp_temp_type should adhere to hwmon_temp_attributes
+ * defined in Linux kernel hwmon.h file
+ */
+enum cpucp_temp_type {
+	cpucp_temp_input,
+	cpucp_temp_min = 4,
+	cpucp_temp_min_hyst,
+	cpucp_temp_max = 6,
+	cpucp_temp_max_hyst,
+	cpucp_temp_crit,
+	cpucp_temp_crit_hyst,
+	cpucp_temp_offset = 19,
+	cpucp_temp_lowest = 21,
+	cpucp_temp_highest = 22,
+	cpucp_temp_reset_history = 23,
+	cpucp_temp_warn = 24,
+	cpucp_temp_max_crit = 25,
+	cpucp_temp_max_warn = 26,
+};
+
+enum cpucp_in_attributes {
+	cpucp_in_input,
+	cpucp_in_min,
+	cpucp_in_max,
+	cpucp_in_lowest = 6,
+	cpucp_in_highest = 7,
+	cpucp_in_reset_history,
+	cpucp_in_intr_alarm_a,
+	cpucp_in_intr_alarm_b,
+};
+
+enum cpucp_curr_attributes {
+	cpucp_curr_input,
+	cpucp_curr_min,
+	cpucp_curr_max,
+	cpucp_curr_lowest = 6,
+	cpucp_curr_highest = 7,
+	cpucp_curr_reset_history
+};
+
+enum cpucp_fan_attributes {
+	cpucp_fan_input,
+	cpucp_fan_min = 2,
+	cpucp_fan_max
+};
+
+enum cpucp_pwm_attributes {
+	cpucp_pwm_input,
+	cpucp_pwm_enable
+};
+
+enum cpucp_pcie_throughput_attributes {
+	cpucp_pcie_throughput_tx,
+	cpucp_pcie_throughput_rx
+};
+
+/* TODO temporary kept before removal */
+enum cpucp_pll_reg_attributes {
+	cpucp_pll_nr_reg,
+	cpucp_pll_nf_reg,
+	cpucp_pll_od_reg,
+	cpucp_pll_div_factor_reg,
+	cpucp_pll_div_sel_reg
+};
+
+/* TODO temporary kept before removal */
+enum cpucp_pll_type_attributes {
+	cpucp_pll_cpu,
+	cpucp_pll_pci,
+};
+
+/*
+ * cpucp_power_type aligns with hwmon_power_attributes
+ * defined in Linux kernel hwmon.h file
+ */
+enum cpucp_power_type {
+	CPUCP_POWER_INPUT = 8,
+	CPUCP_POWER_INPUT_HIGHEST = 9,
+	CPUCP_POWER_RESET_INPUT_HISTORY = 11
+};
+
+/*
+ * MSI type enumeration table for all ASICs and future SW versions.
+ * For future ASIC-LKD compatibility, we can only add new enumerations.
+ * at the end of the table (before CPUCP_NUM_OF_MSI_TYPES).
+ * Changing the order of entries or removing entries is not allowed.
+ */
+enum cpucp_msi_type {
+	CPUCP_EVENT_QUEUE_MSI_TYPE,
+	CPUCP_NIC_PORT1_MSI_TYPE,
+	CPUCP_NIC_PORT3_MSI_TYPE,
+	CPUCP_NIC_PORT5_MSI_TYPE,
+	CPUCP_NIC_PORT7_MSI_TYPE,
+	CPUCP_NIC_PORT9_MSI_TYPE,
+	CPUCP_NUM_OF_MSI_TYPES
+};
+
+/*
+ * PLL enumeration table used for all ASICs and future SW versions.
+ * For future ASIC-LKD compatibility, we can only add new enumerations.
+ * at the end of the table.
+ * Changing the order of entries or removing entries is not allowed.
+ */
+enum pll_index {
+	CPU_PLL = 0,
+	PCI_PLL = 1,
+	NIC_PLL = 2,
+	DMA_PLL = 3,
+	MESH_PLL = 4,
+	MME_PLL = 5,
+	TPC_PLL = 6,
+	IF_PLL = 7,
+	SRAM_PLL = 8,
+	NS_PLL = 9,
+	HBM_PLL = 10,
+	MSS_PLL = 11,
+	DDR_PLL = 12,
+	VID_PLL = 13,
+	BANK_PLL = 14,
+	MMU_PLL = 15,
+	IC_PLL = 16,
+	MC_PLL = 17,
+	EMMC_PLL = 18,
+	D2D_PLL = 19,
+	CS_PLL = 20,
+	C2C_PLL = 21,
+	NCH_PLL = 22,
+	C2M_PLL = 23,
+	PLL_MAX
+};
+
+enum rl_index {
+	TPC_RL = 0,
+	MME_RL,
+	EDMA_RL,
+};
+
+enum pvt_index {
+	PVT_SW,
+	PVT_SE,
+	PVT_NW,
+	PVT_NE
+};
+
+/* Event Queue Packets */
+
+struct eq_generic_event {
+	__le64 data[7];
+};
+
+/*
+ * CpuCP info
+ */
+
+#define CARD_NAME_MAX_LEN		16
+#define CPUCP_MAX_SENSORS		128
+#define CPUCP_MAX_NICS			128
+#define CPUCP_LANES_PER_NIC		4
+#define CPUCP_NIC_QSFP_EEPROM_MAX_LEN	1024
+#define CPUCP_MAX_NIC_LANES		(CPUCP_MAX_NICS * CPUCP_LANES_PER_NIC)
+#define CPUCP_NIC_MASK_ARR_LEN		((CPUCP_MAX_NICS + 63) / 64)
+#define CPUCP_NIC_POLARITY_ARR_LEN	((CPUCP_MAX_NIC_LANES + 63) / 64)
+#define CPUCP_HBM_ROW_REPLACE_MAX	32
+
+struct cpucp_sensor {
+	__le32 type;
+	__le32 flags;
+};
+
+/**
+ * struct cpucp_card_types - ASIC card type.
+ * @cpucp_card_type_pci: PCI card.
+ * @cpucp_card_type_pmc: PCI Mezzanine Card.
+ */
+enum cpucp_card_types {
+	cpucp_card_type_pci,
+	cpucp_card_type_pmc
+};
+
+#define CPUCP_SEC_CONF_ENABLED_SHIFT	0
+#define CPUCP_SEC_CONF_ENABLED_MASK	0x00000001
+
+#define CPUCP_SEC_CONF_FLASH_WP_SHIFT	1
+#define CPUCP_SEC_CONF_FLASH_WP_MASK	0x00000002
+
+#define CPUCP_SEC_CONF_EEPROM_WP_SHIFT	2
+#define CPUCP_SEC_CONF_EEPROM_WP_MASK	0x00000004
+
+/**
+ * struct cpucp_security_info - Security information.
+ * @config: configuration bit field
+ * @keys_num: number of stored keys
+ * @revoked_keys: revoked keys bit field
+ * @min_svn: minimal security version
+ */
+struct cpucp_security_info {
+	__u8 config;
+	__u8 keys_num;
+	__u8 revoked_keys;
+	__u8 min_svn;
+};
+
+/**
+ * struct cpucp_info - Info from CpuCP that is necessary to the host's driver
+ * @sensors: available sensors description.
+ * @kernel_version: CpuCP linux kernel version.
+ * @reserved: reserved field.
+ * @card_type: card configuration type.
+ * @card_location: in a server, each card has different connections topology
+ *                 depending on its location (relevant for PMC card type)
+ * @cpld_version: CPLD programmed F/W version.
+ * @infineon_version: Infineon main DC-DC version.
+ * @fuse_version: silicon production FUSE information.
+ * @thermal_version: thermald S/W version.
+ * @cpucp_version: CpuCP S/W version.
+ * @infineon_second_stage_version: Infineon 2nd stage DC-DC version.
+ * @dram_size: available DRAM size.
+ * @card_name: card name that will be displayed in HWMON subsystem on the host
+ * @tpc_binning_mask: TPC binning mask, 1 bit per TPC instance
+ *                    (0 = functional, 1 = binned)
+ * @decoder_binning_mask: Decoder binning mask, 1 bit per decoder instance
+ *                        (0 = functional, 1 = binned), maximum 1 per dcore
+ * @sram_binning: Categorize SRAM functionality
+ *                (0 = fully functional, 1 = lower-half is not functional,
+ *                 2 = upper-half is not functional)
+ * @sec_info: security information
+ * @pll_map: Bit map of supported PLLs for current ASIC version.
+ * @mme_binning_mask: MME binning mask,
+ *                    bits [0:6]   <==> dcore0 mme fma
+ *                    bits [7:13]  <==> dcore1 mme fma
+ *                    bits [14:20] <==> dcore0 mme ima
+ *                    bits [21:27] <==> dcore1 mme ima
+ *                    For each group, if the 6th bit is set then first 5 bits
+ *                    represent the col's idx [0-31], otherwise these bits are
+ *                    ignored, and col idx 32 is binned. 7th bit is don't care.
+ * @dram_binning_mask: DRAM binning mask, 1 bit per dram instance
+ *                     (0 = functional 1 = binned)
+ * @memory_repair_flag: eFuse flag indicating memory repair
+ * @edma_binning_mask: EDMA binning mask, 1 bit per EDMA instance
+ *                     (0 = functional 1 = binned)
+ * @xbar_binning_mask: Xbar binning mask, 1 bit per Xbar instance
+ *                     (0 = functional 1 = binned)
+ * @interposer_version: Interposer version programmed in eFuse
+ * @substrate_version: Substrate version programmed in eFuse
+ * @fw_hbm_region_size: Size in bytes of FW reserved region in HBM.
+ * @fw_os_version: Firmware OS Version
+ */
+struct cpucp_info {
+	struct cpucp_sensor sensors[CPUCP_MAX_SENSORS];
+	__u8 kernel_version[VERSION_MAX_LEN];
+	__le32 reserved;
+	__le32 card_type;
+	__le32 card_location;
+	__le32 cpld_version;
+	__le32 infineon_version;
+	__u8 fuse_version[VERSION_MAX_LEN];
+	__u8 thermal_version[VERSION_MAX_LEN];
+	__u8 cpucp_version[VERSION_MAX_LEN];
+	__le32 infineon_second_stage_version;
+	__le64 dram_size;
+	char card_name[CARD_NAME_MAX_LEN];
+	__le64 tpc_binning_mask;
+	__le64 decoder_binning_mask;
+	__u8 sram_binning;
+	__u8 dram_binning_mask;
+	__u8 memory_repair_flag;
+	__u8 edma_binning_mask;
+	__u8 xbar_binning_mask;
+	__u8 interposer_version;
+	__u8 substrate_version;
+	__u8 reserved2;
+	struct cpucp_security_info sec_info;
+	__le32 fw_hbm_region_size;
+	__u8 pll_map[PLL_MAP_LEN];
+	__le64 mme_binning_mask;
+	__u8 fw_os_version[VERSION_MAX_LEN];
+};
+
+struct cpucp_mac_addr {
+	__u8 mac_addr[ETH_ALEN];
+};
+
+enum cpucp_serdes_type {
+	TYPE_1_SERDES_TYPE,
+	TYPE_2_SERDES_TYPE,
+	HLS1_SERDES_TYPE,
+	HLS1H_SERDES_TYPE,
+	HLS2_SERDES_TYPE,
+	HLS2_TYPE_1_SERDES_TYPE,
+	MAX_NUM_SERDES_TYPE,		/* number of types */
+	UNKNOWN_SERDES_TYPE = 0xFFFF	/* serdes_type is u16 */
+};
+
+struct cpucp_nic_info {
+	struct cpucp_mac_addr mac_addrs[CPUCP_MAX_NICS];
+	__le64 link_mask[CPUCP_NIC_MASK_ARR_LEN];
+	__le64 pol_tx_mask[CPUCP_NIC_POLARITY_ARR_LEN];
+	__le64 pol_rx_mask[CPUCP_NIC_POLARITY_ARR_LEN];
+	__le64 link_ext_mask[CPUCP_NIC_MASK_ARR_LEN];
+	__u8 qsfp_eeprom[CPUCP_NIC_QSFP_EEPROM_MAX_LEN];
+	__le64 auto_neg_mask[CPUCP_NIC_MASK_ARR_LEN];
+	__le16 serdes_type; /* enum cpucp_serdes_type */
+	__le16 tx_swap_map[CPUCP_MAX_NICS];
+	__u8 reserved[6];
+};
+
+#define PAGE_DISCARD_MAX	64
+
+struct page_discard_info {
+	__u8 num_entries;
+	__u8 reserved[7];
+	__le32 mmu_page_idx[PAGE_DISCARD_MAX];
+};
+
+/*
+ * struct frac_val - fracture value represented by "integer.frac".
+ * @integer: the integer part of the fracture value;
+ * @frac: the fracture part of the fracture value.
+ */
+struct frac_val {
+	union {
+		struct {
+			__le16 integer;
+			__le16 frac;
+		};
+		__le32 val;
+	};
+};
+
+/*
+ * struct ser_val - the SER (symbol error rate) value is represented by "integer * 10 ^ -exp".
+ * @integer: the integer part of the SER value;
+ * @exp: the exponent part of the SER value.
+ */
+struct ser_val {
+	__le16 integer;
+	__le16 exp;
+};
+
+/*
+ * struct cpucp_nic_status - describes the status of a NIC port.
+ * @port: NIC port index.
+ * @bad_format_cnt: e.g. CRC.
+ * @responder_out_of_sequence_psn_cnt: e.g NAK.
+ * @high_ber_reinit_cnt: link reinit due to high BER.
+ * @correctable_err_cnt: e.g. bit-flip.
+ * @uncorrectable_err_cnt: e.g. MAC errors.
+ * @retraining_cnt: re-training counter.
+ * @up: is port up.
+ * @pcs_link: has PCS link.
+ * @phy_ready: is PHY ready.
+ * @auto_neg: is Autoneg enabled.
+ * @timeout_retransmission_cnt: timeout retransmission events.
+ * @high_ber_cnt: high ber events.
+ * @pre_fec_ser: pre FEC SER value.
+ * @post_fec_ser: post FEC SER value.
+ * @throughput: measured throughput.
+ * @latency: measured latency.
+ */
+struct cpucp_nic_status {
+	__le32 port;
+	__le32 bad_format_cnt;
+	__le32 responder_out_of_sequence_psn_cnt;
+	__le32 high_ber_reinit;
+	__le32 correctable_err_cnt;
+	__le32 uncorrectable_err_cnt;
+	__le32 retraining_cnt;
+	__u8 up;
+	__u8 pcs_link;
+	__u8 phy_ready;
+	__u8 auto_neg;
+	__le32 timeout_retransmission_cnt;
+	__le32 high_ber_cnt;
+	struct ser_val pre_fec_ser;
+	struct ser_val post_fec_ser;
+	struct frac_val bandwidth;
+	struct frac_val lat;
+};
+
+enum cpucp_hbm_row_replace_cause {
+	REPLACE_CAUSE_DOUBLE_ECC_ERR,
+	REPLACE_CAUSE_MULTI_SINGLE_ECC_ERR,
+};
+
+struct cpucp_hbm_row_info {
+	__u8 hbm_idx;
+	__u8 pc;
+	__u8 sid;
+	__u8 bank_idx;
+	__le16 row_addr;
+	__u8 replaced_row_cause; /* enum cpucp_hbm_row_replace_cause */
+	__u8 pad;
+};
+
+struct cpucp_hbm_row_replaced_rows_info {
+	__le16 num_replaced_rows;
+	__u8 pad[6];
+	struct cpucp_hbm_row_info replaced_rows[CPUCP_HBM_ROW_REPLACE_MAX];
+};
+
+enum cpu_reset_status {
+	CPU_RST_STATUS_NA = 0,
+	CPU_RST_STATUS_SOFT_RST_DONE = 1,
+};
+
+#define SEC_PCR_DATA_BUF_SZ	256
+#define SEC_PCR_QUOTE_BUF_SZ	510	/* (512 - 2) 2 bytes used for size */
+#define SEC_SIGNATURE_BUF_SZ	255	/* (256 - 1) 1 byte used for size */
+#define SEC_PUB_DATA_BUF_SZ	510	/* (512 - 2) 2 bytes used for size */
+#define SEC_CERTIFICATE_BUF_SZ	2046	/* (2048 - 2) 2 bytes used for size */
+
+/*
+ * struct cpucp_sec_attest_info - attestation report of the boot
+ * @pcr_data: raw values of the PCR registers
+ * @pcr_num_reg: number of PCR registers in the pcr_data array
+ * @pcr_reg_len: length of each PCR register in the pcr_data array (bytes)
+ * @nonce: number only used once. random number provided by host. this also
+ *	    passed to the quote command as a qualifying data.
+ * @pcr_quote_len: length of the attestation quote data (bytes)
+ * @pcr_quote: attestation report data structure
+ * @quote_sig_len: length of the attestation report signature (bytes)
+ * @quote_sig: signature structure of the attestation report
+ * @pub_data_len: length of the public data (bytes)
+ * @public_data: public key for the signed attestation
+ *		 (outPublic + name + qualifiedName)
+ * @certificate_len: length of the certificate (bytes)
+ * @certificate: certificate for the attestation signing key
+ */
+struct cpucp_sec_attest_info {
+	__u8 pcr_data[SEC_PCR_DATA_BUF_SZ];
+	__u8 pcr_num_reg;
+	__u8 pcr_reg_len;
+	__le16 pad0;
+	__le32 nonce;
+	__le16 pcr_quote_len;
+	__u8 pcr_quote[SEC_PCR_QUOTE_BUF_SZ];
+	__u8 quote_sig_len;
+	__u8 quote_sig[SEC_SIGNATURE_BUF_SZ];
+	__le16 pub_data_len;
+	__u8 public_data[SEC_PUB_DATA_BUF_SZ];
+	__le16 certificate_len;
+	__u8 certificate[SEC_CERTIFICATE_BUF_SZ];
+};
+
+/*
+ * struct cpucp_dev_info_signed - device information signed by a secured device
+ * @info: device information structure as defined above
+ * @nonce: number only used once. random number provided by host. this number is
+ *	   hashed and signed along with the device information.
+ * @info_sig_len: length of the attestation signature (bytes)
+ * @info_sig: signature of the info + nonce data.
+ * @pub_data_len: length of the public data (bytes)
+ * @public_data: public key info signed info data
+ *		 (outPublic + name + qualifiedName)
+ * @certificate_len: length of the certificate (bytes)
+ * @certificate: certificate for the signing key
+ */
+struct cpucp_dev_info_signed {
+	struct cpucp_info info;	/* assumed to be 64bit aligned */
+	__le32 nonce;
+	__le32 pad0;
+	__u8 info_sig_len;
+	__u8 info_sig[SEC_SIGNATURE_BUF_SZ];
+	__le16 pub_data_len;
+	__u8 public_data[SEC_PUB_DATA_BUF_SZ];
+	__le16 certificate_len;
+	__u8 certificate[SEC_CERTIFICATE_BUF_SZ];
+};
+
+#define DCORE_MON_REGS_SZ	512
+/*
+ * struct dcore_monitor_regs_data - DCORE monitor regs data.
+ * the structure follows sync manager block layout. Obsolete.
+ * @mon_pay_addrl: array of payload address low bits.
+ * @mon_pay_addrh: array of payload address high bits.
+ * @mon_pay_data: array of payload data.
+ * @mon_arm: array of monitor arm.
+ * @mon_status: array of monitor status.
+ */
+struct dcore_monitor_regs_data {
+	__le32 mon_pay_addrl[DCORE_MON_REGS_SZ];
+	__le32 mon_pay_addrh[DCORE_MON_REGS_SZ];
+	__le32 mon_pay_data[DCORE_MON_REGS_SZ];
+	__le32 mon_arm[DCORE_MON_REGS_SZ];
+	__le32 mon_status[DCORE_MON_REGS_SZ];
+};
+
+/* contains SM data for each SYNC_MNGR (Obsolete) */
+struct cpucp_monitor_dump {
+	struct dcore_monitor_regs_data sync_mngr_w_s;
+	struct dcore_monitor_regs_data sync_mngr_e_s;
+	struct dcore_monitor_regs_data sync_mngr_w_n;
+	struct dcore_monitor_regs_data sync_mngr_e_n;
+};
+
+/*
+ * The Type of the generic request (and other input arguments) will be fetched from user by reading
+ * from "pkt_subidx" field in struct cpucp_packet.
+ *
+ * HL_PASSTHROUGHT_VERSIONS	- Fetch all firmware versions.
+ */
+enum hl_passthrough_type {
+	HL_PASSTHROUGH_VERSIONS,
+};
+
+#endif /* CPUCP_IF_H */
diff --git a/include/linux/habanalabs/hl_boot_if.h b/include/linux/habanalabs/hl_boot_if.h
new file mode 100644
index 000000000000..7de8a5786a36
--- /dev/null
+++ b/include/linux/habanalabs/hl_boot_if.h
@@ -0,0 +1,790 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2018-2020 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef HL_BOOT_IF_H
+#define HL_BOOT_IF_H
+
+#define LKD_HARD_RESET_MAGIC		0xED7BD694 /* deprecated - do not use */
+#define HL_POWER9_HOST_MAGIC		0x1DA30009
+
+#define BOOT_FIT_SRAM_OFFSET		0x200000
+
+#define VERSION_MAX_LEN			128
+
+enum cpu_boot_err {
+	CPU_BOOT_ERR_DRAM_INIT_FAIL = 0,
+	CPU_BOOT_ERR_FIT_CORRUPTED = 1,
+	CPU_BOOT_ERR_TS_INIT_FAIL = 2,
+	CPU_BOOT_ERR_DRAM_SKIPPED = 3,
+	CPU_BOOT_ERR_BMC_WAIT_SKIPPED = 4,
+	CPU_BOOT_ERR_NIC_DATA_NOT_RDY = 5,
+	CPU_BOOT_ERR_NIC_FW_FAIL = 6,
+	CPU_BOOT_ERR_SECURITY_NOT_RDY = 7,
+	CPU_BOOT_ERR_SECURITY_FAIL = 8,
+	CPU_BOOT_ERR_EFUSE_FAIL = 9,
+	CPU_BOOT_ERR_PRI_IMG_VER_FAIL = 10,
+	CPU_BOOT_ERR_SEC_IMG_VER_FAIL = 11,
+	CPU_BOOT_ERR_PLL_FAIL = 12,
+	CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL = 13,
+	CPU_BOOT_ERR_BOOT_FW_CRIT_ERR = 18,
+	CPU_BOOT_ERR_BINNING_FAIL = 19,
+	CPU_BOOT_ERR_TPM_FAIL = 20,
+	CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL = 21,
+	CPU_BOOT_ERR_EEPROM_FAIL = 22,
+	CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL = 23,
+	CPU_BOOT_ERR_ENABLED = 31,
+	CPU_BOOT_ERR_SCND_EN = 63,
+	CPU_BOOT_ERR_LAST = 64 /* we have 2 registers of 32 bits */
+};
+
+/*
+ * Mask for fatal failures
+ * This mask contains all possible fatal failures, and a dynamic code
+ * will clear the non-relevant ones.
+ */
+#define CPU_BOOT_ERR_FATAL_MASK					\
+		((1 << CPU_BOOT_ERR_DRAM_INIT_FAIL) |		\
+		 (1 << CPU_BOOT_ERR_PLL_FAIL) |			\
+		 (1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL) |	\
+		 (1 << CPU_BOOT_ERR_BINNING_FAIL) |		\
+		 (1 << CPU_BOOT_ERR_DRAM_SKIPPED) |		\
+		 (1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) |	\
+		 (1 << CPU_BOOT_ERR_EEPROM_FAIL))
+
+/*
+ * CPU error bits in BOOT_ERROR registers
+ *
+ * CPU_BOOT_ERR0_DRAM_INIT_FAIL		DRAM initialization failed.
+ *					DRAM is not reliable to use.
+ *
+ * CPU_BOOT_ERR0_FIT_CORRUPTED		FIT data integrity verification of the
+ *					image provided by the host has failed.
+ *
+ * CPU_BOOT_ERR0_TS_INIT_FAIL		Thermal Sensor initialization failed.
+ *					Boot continues as usual, but keep in
+ *					mind this is a warning.
+ *
+ * CPU_BOOT_ERR0_DRAM_SKIPPED		DRAM initialization has been skipped.
+ *					Skipping DRAM initialization has been
+ *					requested (e.g. strap, command, etc.)
+ *					and FW skipped the DRAM initialization.
+ *					Host can initialize the DRAM.
+ *
+ * CPU_BOOT_ERR0_BMC_WAIT_SKIPPED	Waiting for BMC data will be skipped.
+ *					Meaning the BMC data might not be
+ *					available until reset.
+ *
+ * CPU_BOOT_ERR0_NIC_DATA_NOT_RDY	NIC data from BMC is not ready.
+ *					BMC has not provided the NIC data yet.
+ *					Once provided this bit will be cleared.
+ *
+ * CPU_BOOT_ERR0_NIC_FW_FAIL		NIC FW loading failed.
+ *					The NIC FW loading and initialization
+ *					failed. This means NICs are not usable.
+ *
+ * CPU_BOOT_ERR0_SECURITY_NOT_RDY	Chip security initialization has been
+ *					started, but is not ready yet - chip
+ *					cannot be accessed.
+ *
+ * CPU_BOOT_ERR0_SECURITY_FAIL		Security related tasks have failed.
+ *					The tasks are security init (root of
+ *					trust), boot authentication (chain of
+ *					trust), data packets authentication.
+ *
+ * CPU_BOOT_ERR0_EFUSE_FAIL		Reading from eFuse failed.
+ *					The PCI device ID might be wrong.
+ *
+ * CPU_BOOT_ERR0_PRI_IMG_VER_FAIL	Verification of primary image failed.
+ *					It mean that ppboot checksum
+ *					verification for the preboot primary
+ *					image has failed to match expected
+ *					checksum. Trying to program image again
+ *					might solve this.
+ *
+ * CPU_BOOT_ERR0_SEC_IMG_VER_FAIL	Verification of secondary image failed.
+ *					It mean that ppboot checksum
+ *					verification for the preboot secondary
+ *					image has failed to match expected
+ *					checksum. Trying to program image again
+ *					might solve this.
+ *
+ * CPU_BOOT_ERR0_PLL_FAIL		PLL settings failed, meaning that one
+ *					of the PLLs remains in REF_CLK
+ *
+ * CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL	Device is unusable and customer support
+ *					should be contacted.
+ *
+ * CPU_BOOT_ERR0_BOOT_FW_CRIT_ERR	Critical error was detected during
+ *					the execution of ppboot or preboot.
+ *					for example: stack overflow.
+ *
+ * CPU_BOOT_ERR0_BINNING_FAIL		Binning settings failed, meaning
+ *					malfunctioning components might still be
+ *					in use.
+ *
+ * CPU_BOOT_ERR0_TPM_FAIL		TPM verification flow failed.
+ *
+ * CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL	Failed to set threshold for tmperature
+ *					sensor.
+ *
+ * CPU_BOOT_ERR_EEPROM_FAIL		Failed reading EEPROM data. Defaults
+ *					are used.
+ *
+ * CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL	Failed scrubbing the Engines/ARCFarm
+ *					memories. Boot disabled until reset.
+ *
+ * CPU_BOOT_ERR0_ENABLED		Error registers enabled.
+ *					This is a main indication that the
+ *					running FW populates the error
+ *					registers. Meaning the error bits are
+ *					not garbage, but actual error statuses.
+ */
+#define CPU_BOOT_ERR0_DRAM_INIT_FAIL		(1 << CPU_BOOT_ERR_DRAM_INIT_FAIL)
+#define CPU_BOOT_ERR0_FIT_CORRUPTED		(1 << CPU_BOOT_ERR_FIT_CORRUPTED)
+#define CPU_BOOT_ERR0_TS_INIT_FAIL		(1 << CPU_BOOT_ERR_TS_INIT_FAIL)
+#define CPU_BOOT_ERR0_DRAM_SKIPPED		(1 << CPU_BOOT_ERR_DRAM_SKIPPED)
+#define CPU_BOOT_ERR0_BMC_WAIT_SKIPPED		(1 << CPU_BOOT_ERR_BMC_WAIT_SKIPPED)
+#define CPU_BOOT_ERR0_NIC_DATA_NOT_RDY		(1 << CPU_BOOT_ERR_NIC_DATA_NOT_RDY)
+#define CPU_BOOT_ERR0_NIC_FW_FAIL		(1 << CPU_BOOT_ERR_NIC_FW_FAIL)
+#define CPU_BOOT_ERR0_SECURITY_NOT_RDY		(1 << CPU_BOOT_ERR_SECURITY_NOT_RDY)
+#define CPU_BOOT_ERR0_SECURITY_FAIL		(1 << CPU_BOOT_ERR_SECURITY_FAIL)
+#define CPU_BOOT_ERR0_EFUSE_FAIL		(1 << CPU_BOOT_ERR_EFUSE_FAIL)
+#define CPU_BOOT_ERR0_PRI_IMG_VER_FAIL		(1 << CPU_BOOT_ERR_PRI_IMG_VER_FAIL)
+#define CPU_BOOT_ERR0_SEC_IMG_VER_FAIL		(1 << CPU_BOOT_ERR_SEC_IMG_VER_FAIL)
+#define CPU_BOOT_ERR0_PLL_FAIL			(1 << CPU_BOOT_ERR_PLL_FAIL)
+#define CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL	(1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL)
+#define CPU_BOOT_ERR0_BOOT_FW_CRIT_ERR		(1 << CPU_BOOT_ERR_BOOT_FW_CRIT_ERR)
+#define CPU_BOOT_ERR0_BINNING_FAIL		(1 << CPU_BOOT_ERR_BINNING_FAIL)
+#define CPU_BOOT_ERR0_TPM_FAIL			(1 << CPU_BOOT_ERR_TPM_FAIL)
+#define CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL	(1 << CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL)
+#define CPU_BOOT_ERR0_EEPROM_FAIL		(1 << CPU_BOOT_ERR_EEPROM_FAIL)
+#define CPU_BOOT_ERR0_ENG_ARC_MEM_SCRUB_FAIL	(1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL)
+#define CPU_BOOT_ERR0_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
+#define CPU_BOOT_ERR1_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
+
+enum cpu_boot_dev_sts {
+	CPU_BOOT_DEV_STS_SECURITY_EN = 0,
+	CPU_BOOT_DEV_STS_DEBUG_EN = 1,
+	CPU_BOOT_DEV_STS_WATCHDOG_EN = 2,
+	CPU_BOOT_DEV_STS_DRAM_INIT_EN = 3,
+	CPU_BOOT_DEV_STS_BMC_WAIT_EN = 4,
+	CPU_BOOT_DEV_STS_E2E_CRED_EN = 5,
+	CPU_BOOT_DEV_STS_HBM_CRED_EN = 6,
+	CPU_BOOT_DEV_STS_RL_EN = 7,
+	CPU_BOOT_DEV_STS_SRAM_SCR_EN = 8,
+	CPU_BOOT_DEV_STS_DRAM_SCR_EN = 9,
+	CPU_BOOT_DEV_STS_FW_HARD_RST_EN = 10,
+	CPU_BOOT_DEV_STS_PLL_INFO_EN = 11,
+	CPU_BOOT_DEV_STS_SP_SRAM_EN = 12,
+	CPU_BOOT_DEV_STS_CLK_GATE_EN = 13,
+	CPU_BOOT_DEV_STS_HBM_ECC_EN = 14,
+	CPU_BOOT_DEV_STS_PKT_PI_ACK_EN = 15,
+	CPU_BOOT_DEV_STS_FW_LD_COM_EN = 16,
+	CPU_BOOT_DEV_STS_FW_IATU_CONF_EN = 17,
+	CPU_BOOT_DEV_STS_FW_NIC_MAC_EN = 18,
+	CPU_BOOT_DEV_STS_DYN_PLL_EN = 19,
+	CPU_BOOT_DEV_STS_GIC_PRIVILEGED_EN = 20,
+	CPU_BOOT_DEV_STS_EQ_INDEX_EN = 21,
+	CPU_BOOT_DEV_STS_MULTI_IRQ_POLL_EN = 22,
+	CPU_BOOT_DEV_STS_FW_NIC_STAT_XPCS91_EN = 23,
+	CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN = 24,
+	CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN = 25,
+	CPU_BOOT_DEV_STS_MAP_HWMON_EN = 26,
+	CPU_BOOT_DEV_STS_ENABLED = 31,
+	CPU_BOOT_DEV_STS_SCND_EN = 63,
+	CPU_BOOT_DEV_STS_LAST = 64 /* we have 2 registers of 32 bits */
+};
+
+/*
+ * BOOT DEVICE STATUS bits in BOOT_DEVICE_STS registers
+ *
+ * CPU_BOOT_DEV_STS0_SECURITY_EN	Security is Enabled.
+ *					This is an indication for security
+ *					enabled in FW, which means that
+ *					all conditions for security are met:
+ *					device is indicated as security enabled,
+ *					registers are protected, and device
+ *					uses keys for image verification.
+ *					Initialized in: preboot
+ *
+ * CPU_BOOT_DEV_STS0_DEBUG_EN		Debug is enabled.
+ *					Enabled when JTAG or DEBUG is enabled
+ *					in FW.
+ *					Initialized in: preboot
+ *
+ * CPU_BOOT_DEV_STS0_WATCHDOG_EN	Watchdog is enabled.
+ *					Watchdog is enabled in FW.
+ *					Initialized in: preboot
+ *
+ * CPU_BOOT_DEV_STS0_DRAM_INIT_EN	DRAM initialization is enabled.
+ *					DRAM initialization has been done in FW.
+ *					Initialized in: u-boot
+ *
+ * CPU_BOOT_DEV_STS0_BMC_WAIT_EN	Waiting for BMC data enabled.
+ *					If set, it means that during boot,
+ *					FW waited for BMC data.
+ *					Initialized in: u-boot
+ *
+ * CPU_BOOT_DEV_STS0_E2E_CRED_EN	E2E credits initialized.
+ *					FW initialized E2E credits.
+ *					Initialized in: u-boot
+ *
+ * CPU_BOOT_DEV_STS0_HBM_CRED_EN	HBM credits initialized.
+ *					FW initialized HBM credits.
+ *					Initialized in: u-boot
+ *
+ * CPU_BOOT_DEV_STS0_RL_EN		Rate limiter initialized.
+ *					FW initialized rate limiter.
+ *					Initialized in: u-boot
+ *
+ * CPU_BOOT_DEV_STS0_SRAM_SCR_EN	SRAM scrambler enabled.
+ *					FW initialized SRAM scrambler.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_DRAM_SCR_EN	DRAM scrambler enabled.
+ *					FW initialized DRAM scrambler.
+ *					Initialized in: u-boot
+ *
+ * CPU_BOOT_DEV_STS0_FW_HARD_RST_EN	FW hard reset procedure is enabled.
+ *					FW has the hard reset procedure
+ *					implemented. This means that FW will
+ *					perform hard reset procedure on
+ *					receiving the halt-machine event.
+ *					Initialized in: preboot, u-boot, linux
+ *
+ * CPU_BOOT_DEV_STS0_PLL_INFO_EN	FW retrieval of PLL info is enabled.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_SP_SRAM_EN		SP SRAM is initialized and available
+ *					for use.
+ *					Initialized in: preboot
+ *
+ * CPU_BOOT_DEV_STS0_CLK_GATE_EN	Clock Gating enabled.
+ *					FW initialized Clock Gating.
+ *					Initialized in: preboot
+ *
+ * CPU_BOOT_DEV_STS0_HBM_ECC_EN		HBM ECC handling Enabled.
+ *					FW handles HBM ECC indications.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN	Packets ack value used in the armcpd
+ *					is set to the PI counter.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_FW_LD_COM_EN	Flexible FW loading communication
+ *					protocol is enabled.
+ *					Initialized in: preboot
+ *
+ * CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN	FW iATU configuration is enabled.
+ *					This bit if set, means the iATU has been
+ *					configured and is ready for use.
+ *					Initialized in: ppboot
+ *
+ * CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN	NIC MAC channels init is done by FW and
+ *					any access to them is done via the FW.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_DYN_PLL_EN		Dynamic PLL configuration is enabled.
+ *					FW sends to host a bitmap of supported
+ *					PLLs.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN	GIC access permission only from
+ *					previleged entity. FW sets this status
+ *					bit for host. If this bit is set then
+ *					GIC can not be accessed from host.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_EQ_INDEX_EN	Event Queue (EQ) index is a running
+ *					index for each new event sent to host.
+ *					This is used as a method in host to
+ *					identify that the waiting event in
+ *					queue is actually a new event which
+ *					was not served before.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN	Use multiple scratchpad interfaces to
+ *					prevent IRQs overriding each other.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN
+ *					NIC STAT and XPCS91 access is restricted
+ *					and is done via FW only.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN
+ *					NIC STAT get all is supported.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN
+ *					F/W checks if the device is idle by reading defined set
+ *					of registers. It returns a bitmask of all the engines,
+ *					where a bit is set if the engine is not idle.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_MAP_HWMON_EN
+ *					If set, means f/w supports proprietary
+ *					HWMON enum mapping to cpucp enums.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_ENABLED		Device status register enabled.
+ *					This is a main indication that the
+ *					running FW populates the device status
+ *					register. Meaning the device status
+ *					bits are not garbage, but actual
+ *					statuses.
+ *					Initialized in: preboot
+ *
+ */
+#define CPU_BOOT_DEV_STS0_SECURITY_EN		(1 << CPU_BOOT_DEV_STS_SECURITY_EN)
+#define CPU_BOOT_DEV_STS0_DEBUG_EN		(1 << CPU_BOOT_DEV_STS_DEBUG_EN)
+#define CPU_BOOT_DEV_STS0_WATCHDOG_EN		(1 << CPU_BOOT_DEV_STS_WATCHDOG_EN)
+#define CPU_BOOT_DEV_STS0_DRAM_INIT_EN		(1 << CPU_BOOT_DEV_STS_DRAM_INIT_EN)
+#define CPU_BOOT_DEV_STS0_BMC_WAIT_EN		(1 << CPU_BOOT_DEV_STS_BMC_WAIT_EN)
+#define CPU_BOOT_DEV_STS0_E2E_CRED_EN		(1 << CPU_BOOT_DEV_STS_E2E_CRED_EN)
+#define CPU_BOOT_DEV_STS0_HBM_CRED_EN		(1 << CPU_BOOT_DEV_STS_HBM_CRED_EN)
+#define CPU_BOOT_DEV_STS0_RL_EN			(1 << CPU_BOOT_DEV_STS_RL_EN)
+#define CPU_BOOT_DEV_STS0_SRAM_SCR_EN		(1 << CPU_BOOT_DEV_STS_SRAM_SCR_EN)
+#define CPU_BOOT_DEV_STS0_DRAM_SCR_EN		(1 << CPU_BOOT_DEV_STS_DRAM_SCR_EN)
+#define CPU_BOOT_DEV_STS0_FW_HARD_RST_EN	(1 << CPU_BOOT_DEV_STS_FW_HARD_RST_EN)
+#define CPU_BOOT_DEV_STS0_PLL_INFO_EN		(1 << CPU_BOOT_DEV_STS_PLL_INFO_EN)
+#define CPU_BOOT_DEV_STS0_SP_SRAM_EN		(1 << CPU_BOOT_DEV_STS_SP_SRAM_EN)
+#define CPU_BOOT_DEV_STS0_CLK_GATE_EN		(1 << CPU_BOOT_DEV_STS_CLK_GATE_EN)
+#define CPU_BOOT_DEV_STS0_HBM_ECC_EN		(1 << CPU_BOOT_DEV_STS_HBM_ECC_EN)
+#define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN		(1 << CPU_BOOT_DEV_STS_PKT_PI_ACK_EN)
+#define CPU_BOOT_DEV_STS0_FW_LD_COM_EN		(1 << CPU_BOOT_DEV_STS_FW_LD_COM_EN)
+#define CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN	(1 << CPU_BOOT_DEV_STS_FW_IATU_CONF_EN)
+#define CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN		(1 << CPU_BOOT_DEV_STS_FW_NIC_MAC_EN)
+#define CPU_BOOT_DEV_STS0_DYN_PLL_EN		(1 << CPU_BOOT_DEV_STS_DYN_PLL_EN)
+#define CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN	(1 << CPU_BOOT_DEV_STS_GIC_PRIVILEGED_EN)
+#define CPU_BOOT_DEV_STS0_EQ_INDEX_EN		(1 << CPU_BOOT_DEV_STS_EQ_INDEX_EN)
+#define CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN	(1 << CPU_BOOT_DEV_STS_MULTI_IRQ_POLL_EN)
+#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN	(1 << CPU_BOOT_DEV_STS_FW_NIC_STAT_XPCS91_EN)
+#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN	(1 << CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN)
+#define CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN	(1 << CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN)
+#define CPU_BOOT_DEV_STS0_MAP_HWMON_EN		(1 << CPU_BOOT_DEV_STS_MAP_HWMON_EN)
+#define CPU_BOOT_DEV_STS0_ENABLED		(1 << CPU_BOOT_DEV_STS_ENABLED)
+#define CPU_BOOT_DEV_STS1_ENABLED		(1 << CPU_BOOT_DEV_STS_ENABLED)
+
+enum cpu_boot_status {
+	CPU_BOOT_STATUS_NA = 0,		/* Default value after reset of chip */
+	CPU_BOOT_STATUS_IN_WFE = 1,
+	CPU_BOOT_STATUS_DRAM_RDY = 2,
+	CPU_BOOT_STATUS_SRAM_AVAIL = 3,
+	CPU_BOOT_STATUS_IN_BTL = 4,	/* BTL is H/W FSM */
+	CPU_BOOT_STATUS_IN_PREBOOT = 5,
+	CPU_BOOT_STATUS_IN_SPL,		/* deprecated - not reported */
+	CPU_BOOT_STATUS_IN_UBOOT = 7,
+	CPU_BOOT_STATUS_DRAM_INIT_FAIL,	/* deprecated - will be removed */
+	CPU_BOOT_STATUS_FIT_CORRUPTED,	/* deprecated - will be removed */
+	/* U-Boot console prompt activated, commands are not processed */
+	CPU_BOOT_STATUS_UBOOT_NOT_READY = 10,
+	/* Finished NICs init, reported after DRAM and NICs */
+	CPU_BOOT_STATUS_NIC_FW_RDY = 11,
+	CPU_BOOT_STATUS_TS_INIT_FAIL,	/* deprecated - will be removed */
+	CPU_BOOT_STATUS_DRAM_SKIPPED,	/* deprecated - will be removed */
+	CPU_BOOT_STATUS_BMC_WAITING_SKIPPED, /* deprecated - will be removed */
+	/* Last boot loader progress status, ready to receive commands */
+	CPU_BOOT_STATUS_READY_TO_BOOT = 15,
+	/* Internal Boot finished, ready for boot-fit */
+	CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT = 16,
+	/* Internal Security has been initialized, device can be accessed */
+	CPU_BOOT_STATUS_SECURITY_READY = 17,
+};
+
+enum kmd_msg {
+	KMD_MSG_NA = 0,
+	KMD_MSG_GOTO_WFE,
+	KMD_MSG_FIT_RDY,
+	KMD_MSG_SKIP_BMC,
+	RESERVED,
+	KMD_MSG_RST_DEV,
+	KMD_MSG_LAST
+};
+
+enum cpu_msg_status {
+	CPU_MSG_CLR = 0,
+	CPU_MSG_OK,
+	CPU_MSG_ERR,
+};
+
+/* communication registers mapping - consider ABI when changing */
+struct cpu_dyn_regs {
+	__le32 cpu_pq_base_addr_low;
+	__le32 cpu_pq_base_addr_high;
+	__le32 cpu_pq_length;
+	__le32 cpu_pq_init_status;
+	__le32 cpu_eq_base_addr_low;
+	__le32 cpu_eq_base_addr_high;
+	__le32 cpu_eq_length;
+	__le32 cpu_eq_ci;
+	__le32 cpu_cq_base_addr_low;
+	__le32 cpu_cq_base_addr_high;
+	__le32 cpu_cq_length;
+	__le32 cpu_pf_pq_pi;
+	__le32 cpu_boot_dev_sts0;
+	__le32 cpu_boot_dev_sts1;
+	__le32 cpu_boot_err0;
+	__le32 cpu_boot_err1;
+	__le32 cpu_boot_status;
+	__le32 fw_upd_sts;
+	__le32 fw_upd_cmd;
+	__le32 fw_upd_pending_sts;
+	__le32 fuse_ver_offset;
+	__le32 preboot_ver_offset;
+	__le32 uboot_ver_offset;
+	__le32 hw_state;
+	__le32 kmd_msg_to_cpu;
+	__le32 cpu_cmd_status_to_host;
+	__le32 gic_host_pi_upd_irq;
+	__le32 gic_tpc_qm_irq_ctrl;
+	__le32 gic_mme_qm_irq_ctrl;
+	__le32 gic_dma_qm_irq_ctrl;
+	__le32 gic_nic_qm_irq_ctrl;
+	__le32 gic_dma_core_irq_ctrl;
+	__le32 gic_host_halt_irq;
+	__le32 gic_host_ints_irq;
+	__le32 gic_host_soft_rst_irq;
+	__le32 gic_rot_qm_irq_ctrl;
+	__le32 cpu_rst_status;
+	__le32 eng_arc_irq_ctrl;
+	__le32 reserved1[20];		/* reserve for future use */
+};
+
+/* TODO: remove the desc magic after the code is updated to use message */
+/* HCDM - Habana Communications Descriptor Magic */
+#define HL_COMMS_DESC_MAGIC	0x4843444D
+#define HL_COMMS_DESC_VER	3
+
+/* HCMv - Habana Communications Message + header version */
+#define HL_COMMS_MSG_MAGIC_VALUE	0x48434D00
+#define HL_COMMS_MSG_MAGIC_MASK		0xFFFFFF00
+#define HL_COMMS_MSG_MAGIC_VER_MASK	0xFF
+
+#define HL_COMMS_MSG_MAGIC_VER(ver)	(HL_COMMS_MSG_MAGIC_VALUE |	\
+					((ver) & HL_COMMS_MSG_MAGIC_VER_MASK))
+#define HL_COMMS_MSG_MAGIC_V0		HL_COMMS_DESC_MAGIC
+#define HL_COMMS_MSG_MAGIC_V1		HL_COMMS_MSG_MAGIC_VER(1)
+#define HL_COMMS_MSG_MAGIC_V2		HL_COMMS_MSG_MAGIC_VER(2)
+#define HL_COMMS_MSG_MAGIC_V3		HL_COMMS_MSG_MAGIC_VER(3)
+
+#define HL_COMMS_MSG_MAGIC		HL_COMMS_MSG_MAGIC_V3
+
+#define HL_COMMS_MSG_MAGIC_VALIDATE_MAGIC(magic)			\
+		(((magic) & HL_COMMS_MSG_MAGIC_MASK) ==			\
+		HL_COMMS_MSG_MAGIC_VALUE)
+
+#define HL_COMMS_MSG_MAGIC_VALIDATE_VERSION(magic, ver)			\
+		(((magic) & HL_COMMS_MSG_MAGIC_VER_MASK) >=		\
+		((ver) & HL_COMMS_MSG_MAGIC_VER_MASK))
+
+#define HL_COMMS_MSG_MAGIC_VALIDATE(magic, ver)				\
+		(HL_COMMS_MSG_MAGIC_VALIDATE_MAGIC((magic)) &&		\
+		HL_COMMS_MSG_MAGIC_VALIDATE_VERSION((magic), (ver)))
+
+enum comms_msg_type {
+	HL_COMMS_DESC_TYPE = 0,
+	HL_COMMS_RESET_CAUSE_TYPE = 1,
+	HL_COMMS_FW_CFG_SKIP_TYPE = 2,
+	HL_COMMS_BINNING_CONF_TYPE = 3,
+};
+
+/*
+ * Binning information shared between LKD and FW
+ * @tpc_mask_l - TPC binning information lower 64 bit
+ * @dec_mask - Decoder binning information
+ * @dram_mask - DRAM binning information
+ * @edma_mask - EDMA binning information
+ * @mme_mask_l - MME binning information lower 32
+ * @mme_mask_h - MME binning information upper 32
+ * @rot_mask - Rotator binning information
+ * @xbar_mask - xBAR binning information
+ * @reserved - reserved field for future binning info w/o ABI change
+ * @tpc_mask_h - TPC binning information upper 64 bit
+ * @nic_mask - NIC binning information
+ */
+struct lkd_fw_binning_info {
+	__le64 tpc_mask_l;
+	__le32 dec_mask;
+	__le32 dram_mask;
+	__le32 edma_mask;
+	__le32 mme_mask_l;
+	__le32 mme_mask_h;
+	__le32 rot_mask;
+	__le32 xbar_mask;
+	__le32 reserved0;
+	__le64 tpc_mask_h;
+	__le64 nic_mask;
+	__le32 reserved1[8];
+};
+
+/* TODO: remove this struct after the code is updated to use message */
+/* this is the comms descriptor header - meta data */
+struct comms_desc_header {
+	__le32 magic;		/* magic for validation */
+	__le32 crc32;		/* CRC32 of the descriptor w/o header */
+	__le16 size;		/* size of the descriptor w/o header */
+	__u8 version;	/* descriptor version */
+	__u8 reserved[5];	/* pad to 64 bit */
+};
+
+/* this is the comms message header - meta data */
+struct comms_msg_header {
+	__le32 magic;		/* magic for validation */
+	__le32 crc32;		/* CRC32 of the message w/o header */
+	__le16 size;		/* size of the message w/o header */
+	__u8 version;	/* message payload version */
+	__u8 type;		/* message type */
+	__u8 reserved[4];	/* pad to 64 bit */
+};
+
+enum lkd_fw_ascii_msg_lvls {
+	LKD_FW_ASCII_MSG_ERR = 0,
+	LKD_FW_ASCII_MSG_WRN = 1,
+	LKD_FW_ASCII_MSG_INF = 2,
+	LKD_FW_ASCII_MSG_DBG = 3,
+};
+
+#define LKD_FW_ASCII_MSG_MAX_LEN	128
+#define LKD_FW_ASCII_MSG_MAX		4	/* consider ABI when changing */
+
+struct lkd_fw_ascii_msg {
+	__u8 valid;
+	__u8 msg_lvl;
+	__u8 reserved[6];
+	char msg[LKD_FW_ASCII_MSG_MAX_LEN];
+};
+
+/* this is the main FW descriptor - consider ABI when changing */
+struct lkd_fw_comms_desc {
+	struct comms_desc_header header;
+	struct cpu_dyn_regs cpu_dyn_regs;
+	char fuse_ver[VERSION_MAX_LEN];
+	char cur_fw_ver[VERSION_MAX_LEN];
+	/* can be used for 1 more version w/o ABI change */
+	char reserved0[VERSION_MAX_LEN];
+	__le64 img_addr;	/* address for next FW component load */
+	struct lkd_fw_binning_info binning_info;
+	struct lkd_fw_ascii_msg ascii_msg[LKD_FW_ASCII_MSG_MAX];
+	__le32 rsvd_mem_size_mb; /* reserved memory size [MB] for FW/SVE */
+	char reserved1[4];
+};
+
+enum comms_reset_cause {
+	HL_RESET_CAUSE_UNKNOWN = 0,
+	HL_RESET_CAUSE_HEARTBEAT = 1,
+	HL_RESET_CAUSE_TDR = 2,
+};
+
+/* TODO: remove define after struct name is aligned on all projects */
+#define lkd_msg_comms lkd_fw_comms_msg
+
+/* this is the comms message descriptor */
+struct lkd_fw_comms_msg {
+	struct comms_msg_header header;
+	/* union for future expantions of new messages */
+	union {
+		struct {
+			struct cpu_dyn_regs cpu_dyn_regs;
+			char fuse_ver[VERSION_MAX_LEN];
+			char cur_fw_ver[VERSION_MAX_LEN];
+			/* can be used for 1 more version w/o ABI change */
+			char reserved0[VERSION_MAX_LEN];
+			/* address for next FW component load */
+			__le64 img_addr;
+			struct lkd_fw_binning_info binning_info;
+			struct lkd_fw_ascii_msg ascii_msg[LKD_FW_ASCII_MSG_MAX];
+			/* reserved memory size [MB] for FW/SVE */
+			__le32 rsvd_mem_size_mb;
+			char reserved1[4];
+		};
+		struct {
+			__u8 reset_cause;
+		};
+		struct {
+			__u8 fw_cfg_skip; /* 1 - skip, 0 - don't skip */
+		};
+		struct lkd_fw_binning_info binning_conf;
+	};
+};
+
+/*
+ * LKD commands:
+ *
+ * COMMS_NOOP			Used to clear the command register and no actual
+ *				command is send.
+ *
+ * COMMS_CLR_STS		Clear status command - FW should clear the
+ *				status register. Used for synchronization
+ *				between the commands as part of the race free
+ *				protocol.
+ *
+ * COMMS_RST_STATE		Reset the current communication state which is
+ *				kept by FW for proper responses.
+ *				Should be used in the beginning of the
+ *				communication cycle to clean any leftovers from
+ *				previous communication attempts.
+ *
+ * COMMS_PREP_DESC		Prepare descriptor for setting up the
+ *				communication and other dynamic data:
+ *				struct lkd_fw_comms_desc.
+ *				This command has a parameter stating the next FW
+ *				component size, so the FW can actually prepare a
+ *				space for it and in the status response provide
+ *				the descriptor offset. The Offset of the next FW
+ *				data component is a part of the descriptor
+ *				structure.
+ *
+ * COMMS_DATA_RDY		The FW data has been uploaded and is ready for
+ *				validation.
+ *
+ * COMMS_EXEC			Execute the next FW component.
+ *
+ * COMMS_RST_DEV		Reset the device.
+ *
+ * COMMS_GOTO_WFE		Execute WFE command. Allowed only on non-secure
+ *				devices.
+ *
+ * COMMS_SKIP_BMC		Perform actions required for BMC-less servers.
+ *				Do not wait for BMC response.
+ *
+ * COMMS_PREP_DESC_ELBI		Same as COMMS_PREP_DESC only that the memory
+ *				space is allocated in a ELBI access only
+ *				address range.
+ *
+ */
+enum comms_cmd {
+	COMMS_NOOP = 0,
+	COMMS_CLR_STS = 1,
+	COMMS_RST_STATE = 2,
+	COMMS_PREP_DESC = 3,
+	COMMS_DATA_RDY = 4,
+	COMMS_EXEC = 5,
+	COMMS_RST_DEV = 6,
+	COMMS_GOTO_WFE = 7,
+	COMMS_SKIP_BMC = 8,
+	COMMS_PREP_DESC_ELBI = 10,
+	COMMS_INVLD_LAST
+};
+
+#define COMMS_COMMAND_SIZE_SHIFT	0
+#define COMMS_COMMAND_SIZE_MASK		0x1FFFFFF
+#define COMMS_COMMAND_CMD_SHIFT		27
+#define COMMS_COMMAND_CMD_MASK		0xF8000000
+
+/*
+ * LKD command to FW register structure
+ * @size	- FW component size
+ * @cmd		- command from enum comms_cmd
+ */
+struct comms_command {
+	union {		/* bit fields are only for FW use */
+		struct {
+			u32 size :25;		/* 32MB max. */
+			u32 reserved :2;
+			enum comms_cmd cmd :5;		/* 32 commands */
+		};
+		__le32 val;
+	};
+};
+
+/*
+ * FW status
+ *
+ * COMMS_STS_NOOP		Used to clear the status register and no actual
+ *				status is provided.
+ *
+ * COMMS_STS_ACK		Command has been received and recognized.
+ *
+ * COMMS_STS_OK			Command execution has finished successfully.
+ *
+ * COMMS_STS_ERR		Command execution was unsuccessful and resulted
+ *				in error.
+ *
+ * COMMS_STS_VALID_ERR		FW validation has failed.
+ *
+ * COMMS_STS_TIMEOUT_ERR	Command execution has timed out.
+ */
+enum comms_sts {
+	COMMS_STS_NOOP = 0,
+	COMMS_STS_ACK = 1,
+	COMMS_STS_OK = 2,
+	COMMS_STS_ERR = 3,
+	COMMS_STS_VALID_ERR = 4,
+	COMMS_STS_TIMEOUT_ERR = 5,
+	COMMS_STS_INVLD_LAST
+};
+
+/* RAM types for FW components loading - defines the base address */
+enum comms_ram_types {
+	COMMS_SRAM = 0,
+	COMMS_DRAM = 1,
+};
+
+#define COMMS_STATUS_OFFSET_SHIFT	0
+#define COMMS_STATUS_OFFSET_MASK	0x03FFFFFF
+#define COMMS_STATUS_OFFSET_ALIGN_SHIFT	2
+#define COMMS_STATUS_RAM_TYPE_SHIFT	26
+#define COMMS_STATUS_RAM_TYPE_MASK	0x0C000000
+#define COMMS_STATUS_STATUS_SHIFT	28
+#define COMMS_STATUS_STATUS_MASK	0xF0000000
+
+/*
+ * FW status to LKD register structure
+ * @offset	- an offset from the base of the ram_type shifted right by
+ *		  2 bits (always aligned to 32 bits).
+ *		  Allows a maximum addressable offset of 256MB from RAM base.
+ *		  Example: for real offset in RAM of 0x800000 (8MB), the value
+ *		  in offset field is (0x800000 >> 2) = 0x200000.
+ * @ram_type	- the RAM type that should be used for offset from
+ *		  enum comms_ram_types
+ * @status	- status from enum comms_sts
+ */
+struct comms_status {
+	union {		/* bit fields are only for FW use */
+		struct {
+			u32 offset :26;
+			enum comms_ram_types ram_type :2;
+			enum comms_sts status :4;	/* 16 statuses */
+		};
+		__le32 val;
+	};
+};
+
+#define NAME_MAX_LEN	32 /* bytes */
+struct hl_module_data {
+	__u8 name[NAME_MAX_LEN];
+	__u8 version[VERSION_MAX_LEN];
+};
+
+/**
+ * struct hl_component_versions - versions associated with hl component.
+ * @struct_size: size of all the struct (including dynamic size of modules).
+ * @modules_offset: offset of the modules field in this struct.
+ * @component: version of the component itself.
+ * @fw_os: Firmware OS Version.
+ * @comp_name: Name of the component.
+ * @modules_counter: number of set bits in modules_mask.
+ * @reserved: reserved for future use.
+ * @modules: versions of the component's modules. Elborated explanation in
+ *              struct cpucp_versions.
+ */
+struct hl_component_versions {
+	__le16 struct_size;
+	__le16 modules_offset;
+	__u8 component[VERSION_MAX_LEN];
+	__u8 fw_os[VERSION_MAX_LEN];
+	__u8 comp_name[NAME_MAX_LEN];
+	__u8 modules_counter;
+	__u8 reserved[3];
+	struct hl_module_data modules[];
+};
+
+/* Max size of fit size */
+#define HL_FW_VERSIONS_FIT_SIZE	4096
+
+#endif /* HL_BOOT_IF_H */
-- 
cgit v1.2.3


From 7c4130e6ddd709be2033a6635c91d445cb2baea5 Mon Sep 17 00:00:00 2001
From: farah kassabri <fkassabri@habana.ai>
Date: Tue, 8 Aug 2023 12:56:47 +0300
Subject: accel/habanalabs/gaudi2: handle eq health heartbeat check

Add mechanism for fw eq health check. this will be done using two flows:
using the heartbeat mechanism and raising a dedicated interrupt to
indicate an eq failure like EQ full.
This patch will add implementation for the eq heartbeat for gaudi2 asic.

More info about the heartbeat mechanism:
Expand the heartbeat mechanism to monitor a new event that
will be sent from FW upon receiving heartbeat message.
that way driver can know that the eq is working or not.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 include/linux/habanalabs/cpucp_if.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
index 4cdedb603ecb..a18fa81aad1f 100644
--- a/include/linux/habanalabs/cpucp_if.h
+++ b/include/linux/habanalabs/cpucp_if.h
@@ -33,6 +33,17 @@
 #define PLL_MAP_MAX_BITS	128
 #define PLL_MAP_LEN		(PLL_MAP_MAX_BITS / 8)
 
+enum eq_event_id {
+	EQ_EVENT_NIC_STS_REQUEST = 0,
+	EQ_EVENT_PWR_MODE_0,
+	EQ_EVENT_PWR_MODE_1,
+	EQ_EVENT_PWR_MODE_2,
+	EQ_EVENT_PWR_MODE_3,
+	EQ_EVENT_PWR_BRK_ENTRY,
+	EQ_EVENT_PWR_BRK_EXIT,
+	EQ_EVENT_HEARTBEAT,
+};
+
 /*
  * info of the pkt queue pointers in the first async occurrence
  */
@@ -1143,6 +1154,7 @@ struct cpucp_security_info {
  *                     (0 = functional 1 = binned)
  * @interposer_version: Interposer version programmed in eFuse
  * @substrate_version: Substrate version programmed in eFuse
+ * @eq_health_check_supported: eq health check feature supported in FW.
  * @fw_hbm_region_size: Size in bytes of FW reserved region in HBM.
  * @fw_os_version: Firmware OS Version
  */
@@ -1169,7 +1181,7 @@ struct cpucp_info {
 	__u8 xbar_binning_mask;
 	__u8 interposer_version;
 	__u8 substrate_version;
-	__u8 reserved2;
+	__u8 eq_health_check_supported;
 	struct cpucp_security_info sec_info;
 	__le32 fw_hbm_region_size;
 	__u8 pll_map[PLL_MAP_LEN];
-- 
cgit v1.2.3


From 764bfd138f359423b299b7bf3fcbabb56b981ef5 Mon Sep 17 00:00:00 2001
From: farah kassabri <fkassabri@habana.ai>
Date: Wed, 23 Aug 2023 12:36:25 +0300
Subject: accel/habanalabs/gaudi2: add eq health check using irq

This is the second patch for applying the eq health check mechanism
which will add support for the interrupt flow for gaudi2 asic.

More info about the interrupt mechanism:
set a dedicated msix for the eq error interrupt, and add
interrupt handler for it.
when FW detects some issue with EQ like EQ_FULL, it'll
raise that interrupt and driver should reset the device.
Driver will inform the FW which msix index to use through
the already existing handshake mechanism which will
send msix info message to fw.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 include/linux/habanalabs/cpucp_if.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
index a18fa81aad1f..84d74c4ee4d3 100644
--- a/include/linux/habanalabs/cpucp_if.h
+++ b/include/linux/habanalabs/cpucp_if.h
@@ -1004,6 +1004,7 @@ enum cpucp_msi_type {
 	CPUCP_NIC_PORT5_MSI_TYPE,
 	CPUCP_NIC_PORT7_MSI_TYPE,
 	CPUCP_NIC_PORT9_MSI_TYPE,
+	CPUCP_EVENT_QUEUE_ERR_MSI_TYPE,
 	CPUCP_NUM_OF_MSI_TYPES
 };
 
-- 
cgit v1.2.3


From 9dca13141332e69fd657873194e77a1960fc9ab2 Mon Sep 17 00:00:00 2001
From: Dafna Hirschfeld <dhirschfeld@habana.ai>
Date: Thu, 7 Sep 2023 14:43:01 +0300
Subject: accel/habanalabs: add fw status SHUTDOWN_PREP

update hl_boot_if.h from specs to include
CPU_BOOT_STATUS_FW_SHUTDOWN_PREP

Signed-off-by: Dafna Hirschfeld <dhirschfeld@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 include/linux/habanalabs/hl_boot_if.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/habanalabs/hl_boot_if.h b/include/linux/habanalabs/hl_boot_if.h
index 7de8a5786a36..93366d5621fd 100644
--- a/include/linux/habanalabs/hl_boot_if.h
+++ b/include/linux/habanalabs/hl_boot_if.h
@@ -394,6 +394,8 @@ enum cpu_boot_status {
 	CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT = 16,
 	/* Internal Security has been initialized, device can be accessed */
 	CPU_BOOT_STATUS_SECURITY_READY = 17,
+	/* FW component is preparing to shutdown and communication with host is not available */
+	CPU_BOOT_STATUS_FW_SHUTDOWN_PREP = 18,
 };
 
 enum kmd_msg {
-- 
cgit v1.2.3


From 7f1cd6fdd5872160da05098a84a94dee5e709e54 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Thu, 21 Sep 2023 14:52:01 +0300
Subject: accel/habanalabs: minor cosmetics update to cpucp_if.h

- Update copyright years
- Align comments

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Reviewed-by: Ofir Bitton <obitton@habana.ai>
---
 include/linux/habanalabs/cpucp_if.h | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
index 84d74c4ee4d3..86ea7c63a0d2 100644
--- a/include/linux/habanalabs/cpucp_if.h
+++ b/include/linux/habanalabs/cpucp_if.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0
  *
- * Copyright 2020-2022 HabanaLabs, Ltd.
+ * Copyright 2020-2023 HabanaLabs, Ltd.
  * All Rights Reserved.
  *
  */
@@ -668,18 +668,15 @@ enum pq_init_status {
  *       Obsolete.
  *
  * CPUCP_PACKET_GENERIC_PASSTHROUGH -
- *      Generic opcode for all firmware info that is only passed to host
- *      through the LKD, without getting parsed there.
+ *       Generic opcode for all firmware info that is only passed to host
+ *       through the LKD, without getting parsed there.
  *
  * CPUCP_PACKET_ACTIVE_STATUS_SET -
  *       LKD sends FW indication whether device is free or in use, this indication is reported
  *       also to the BMC.
  *
- * CPUCP_PACKET_REGISTER_INTERRUPTS -
- *       Packet to register interrupts indicating LKD is ready to receive events from FW.
- *
  * CPUCP_PACKET_SOFT_RESET -
- *      Packet to perform soft-reset.
+ *       Packet to perform soft-reset.
  *
  * CPUCP_PACKET_INTS_REGISTER -
  *       Packet to inform FW that queues have been established and LKD is ready to receive
@@ -750,9 +747,9 @@ enum cpucp_packet_id {
 	CPUCP_PACKET_RESERVED11,		/* not used */
 	CPUCP_PACKET_RESERVED12,		/* internal */
 	CPUCP_PACKET_RESERVED13,                /* internal */
-	CPUCP_PACKET_SOFT_RESET,                /* internal */
-	CPUCP_PACKET_INTS_REGISTER,             /* internal */
-	CPUCP_PACKET_ID_MAX                     /* must be last */
+	CPUCP_PACKET_SOFT_RESET,		/* internal */
+	CPUCP_PACKET_INTS_REGISTER,		/* internal */
+	CPUCP_PACKET_ID_MAX			/* must be last */
 };
 
 #define CPUCP_PACKET_FENCE_VAL	0xFE8CE7A5
-- 
cgit v1.2.3


From 74975b4f2836e3cc10eeb6c38a6da54311e1de5b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 26 Sep 2023 16:59:43 +0200
Subject: gpio: acpi: remove acpi_get_and_request_gpiod()

With no more users, we can remove acpi_get_and_request_gpiod().

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 include/linux/gpio/consumer.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 6cc345440a5b..db2dfbae8edb 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -606,8 +606,6 @@ void acpi_dev_remove_driver_gpios(struct acpi_device *adev);
 int devm_acpi_dev_add_driver_gpios(struct device *dev,
 				   const struct acpi_gpio_mapping *gpios);
 
-struct gpio_desc *acpi_get_and_request_gpiod(char *path, unsigned int pin, char *label);
-
 #else  /* CONFIG_GPIOLIB && CONFIG_ACPI */
 
 #include <linux/err.h>
@@ -625,12 +623,6 @@ static inline int devm_acpi_dev_add_driver_gpios(struct device *dev,
 	return -ENXIO;
 }
 
-static inline struct gpio_desc *acpi_get_and_request_gpiod(char *path, unsigned int pin,
-							   char *label)
-{
-	return ERR_PTR(-ENOSYS);
-}
-
 #endif /* CONFIG_GPIOLIB && CONFIG_ACPI */
 
 
-- 
cgit v1.2.3


From a0fddaa0b5a587cc8d185f8802fe7e48493c43ed Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 15 Sep 2023 08:22:37 -0700
Subject: rtc: Add API function to return alarm time bound by hardware limit

Add rtc_bound_alarmtime() to return the requested alarm timeout bound by
the maxmum alarm timeout that is supported by a given RTC.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230915152238.1144706-2-linux@roeck-us.net
---
 include/linux/rtc.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 4c0bcbeb1f00..5f8e438a0312 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -225,6 +225,23 @@ static inline bool is_leap_year(unsigned int year)
 	return (!(year % 4) && (year % 100)) || !(year % 400);
 }
 
+/**
+ * rtc_bound_alarmtime() - Return alarm time bound by rtc limit
+ * @rtc: Pointer to rtc device structure
+ * @requested: Requested alarm timeout
+ *
+ * Return: Alarm timeout bound by maximum alarm time supported by rtc.
+ */
+static inline ktime_t rtc_bound_alarmtime(struct rtc_device *rtc,
+					  ktime_t requested)
+{
+	if (rtc->alarm_offset_max &&
+	    rtc->alarm_offset_max * MSEC_PER_SEC < ktime_to_ms(requested))
+		return ms_to_ktime(rtc->alarm_offset_max * MSEC_PER_SEC);
+
+	return requested;
+}
+
 #define devm_rtc_register_device(device) \
 	__devm_rtc_register_device(THIS_MODULE, device)
 
-- 
cgit v1.2.3


From e346fb6d774abf1d9a87d39b1e3eef0b7397d154 Mon Sep 17 00:00:00 2001
From: Wedson Almeida Filho <walmeida@microsoft.com>
Date: Sat, 30 Sep 2023 02:00:05 -0300
Subject: xattr: make the xattr array itself const

As it is currently declared, the xattr_handler structs are const but the
array containing their pointers is not. This patch makes it so that fs
modules can place them in .rodata, which makes it harder for
accidental/malicious modifications at runtime.

Signed-off-by: Wedson Almeida Filho <walmeida@microsoft.com>
Link: https://lore.kernel.org/r/20230930050033.41174-2-wedsonaf@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4aeb3fa11927..bba22e25664d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1206,7 +1206,7 @@ struct super_block {
 #ifdef CONFIG_SECURITY
 	void                    *s_security;
 #endif
-	const struct xattr_handler **s_xattr;
+	const struct xattr_handler * const *s_xattr;
 #ifdef CONFIG_FS_ENCRYPTION
 	const struct fscrypt_operations	*s_cop;
 	struct fscrypt_keyring	*s_master_keys; /* master crypto keys in use */
-- 
cgit v1.2.3


From e01cc1e8c2ad73cebb980878ede5584e0f2688f7 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 25 Sep 2023 16:50:23 +0200
Subject: locking/atomic: Add generic support for sync_try_cmpxchg() and its
 fallback

Provide the generic sync_try_cmpxchg() function from the
raw_ prefixed version, also adding explicit instrumentation.

The patch amends existing scripts to generate sync_try_cmpxchg()
locking primitive and its raw_sync_try_cmpxchg() fallback, while
leaving existing macros from the try_cmpxchg() family unchanged.

The target can define its own arch_sync_try_cmpxchg() to override the
generic version of raw_sync_try_cmpxchg(). This allows the target
to generate more optimal assembly than the generic version.

Additionally, the patch renames two scripts to better reflect
whet they really do.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
---
 include/linux/atomic/atomic-arch-fallback.h | 15 ++++++++++++++-
 include/linux/atomic/atomic-instrumented.h  | 10 +++++++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h
index b83ef19da13d..5e95faa959c4 100644
--- a/include/linux/atomic/atomic-arch-fallback.h
+++ b/include/linux/atomic/atomic-arch-fallback.h
@@ -428,6 +428,19 @@ extern void raw_cmpxchg128_relaxed_not_implemented(void);
 
 #define raw_sync_cmpxchg arch_sync_cmpxchg
 
+#ifdef arch_sync_try_cmpxchg
+#define raw_sync_try_cmpxchg arch_sync_try_cmpxchg
+#else
+#define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \
+({ \
+	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
+	___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \
+	if (unlikely(___r != ___o)) \
+		*___op = ___r; \
+	likely(___r == ___o); \
+})
+#endif
+
 /**
  * raw_atomic_read() - atomic load with relaxed ordering
  * @v: pointer to atomic_t
@@ -4649,4 +4662,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v)
 }
 
 #endif /* _LINUX_ATOMIC_FALLBACK_H */
-// 2fdd6702823fa842f9cea57a002e6e4476ae780c
+// eec048affea735b8464f58e6d96992101f8f85f1
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index d401b406ef7c..54d7bbe0aeaa 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -4998,6 +4998,14 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 	raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
+#define sync_try_cmpxchg(ptr, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_mb(); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \
+})
+
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 1568f875fef72097413caab8339120c065a39aa4
+// 2cc4bc990fef44d3836ec108f11b610f3f438184
-- 
cgit v1.2.3


From 9ae5c00ea2e600a8b823f9b95606dd244f3096bf Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:38 +0100
Subject: sched/numa: Document vma_numab_state fields

Document the intended usage of the fields.

[ mingo: Reformatted to take less vertical space & tidied it up. ]

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231010083143.19593-2-mgorman@techsingularity.net
---
 include/linux/mm_types.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 36c5b43999e6..d7f042ec1f33 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -551,8 +551,29 @@ struct vma_lock {
 };
 
 struct vma_numab_state {
+	/*
+	 * Initialised as time in 'jiffies' after which VMA
+	 * should be scanned.  Delays first scan of new VMA by at
+	 * least sysctl_numa_balancing_scan_delay:
+	 */
 	unsigned long next_scan;
+
+	/*
+	 * Time in jiffies when access_pids[] is reset to
+	 * detect phase change behaviour:
+	 */
 	unsigned long next_pid_reset;
+
+	/*
+	 * Approximate tracking of PIDs that trapped a NUMA hinting
+	 * fault. May produce false positives due to hash collisions.
+	 *
+	 *   [0] Previous PID tracking
+	 *   [1] Current PID tracking
+	 *
+	 * Window moves after next_pid_reset has expired approximately
+	 * every VMA_PID_RESET_PERIOD jiffies:
+	 */
 	unsigned long access_pids[2];
 };
 
-- 
cgit v1.2.3


From f3a6c97940fbd25d6c84c2d5642338fc99a9b35b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:39 +0100
Subject: sched/numa: Rename vma_numab_state::access_pids[] => ::pids_active[],
 ::next_pid_reset => ::pids_active_reset

The access_pids[] field name is somewhat ambiguous as no PIDs are accessed.
Similarly, it's not clear that next_pid_reset is related to access_pids[].
Rename the fields to more accurately reflect their purpose.

[ mingo: Rename in the comments too. ]

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231010083143.19593-3-mgorman@techsingularity.net
---
 include/linux/mm.h       | 4 ++--
 include/linux/mm_types.h | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..19fc73b02c9f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1726,8 +1726,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 	unsigned int pid_bit;
 
 	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
-	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
-		__set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
+		__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d7f042ec1f33..e7571eca1131 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -559,10 +559,10 @@ struct vma_numab_state {
 	unsigned long next_scan;
 
 	/*
-	 * Time in jiffies when access_pids[] is reset to
+	 * Time in jiffies when pids_active[] is reset to
 	 * detect phase change behaviour:
 	 */
-	unsigned long next_pid_reset;
+	unsigned long pids_active_reset;
 
 	/*
 	 * Approximate tracking of PIDs that trapped a NUMA hinting
@@ -574,7 +574,7 @@ struct vma_numab_state {
 	 * Window moves after next_pid_reset has expired approximately
 	 * every VMA_PID_RESET_PERIOD jiffies:
 	 */
-	unsigned long access_pids[2];
+	unsigned long pids_active[2];
 };
 
 /*
-- 
cgit v1.2.3


From ed2da8b725b932b1e2b2f4835bb664d47ed03031 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:40 +0100
Subject: sched/numa: Trace decisions related to skipping VMAs

NUMA balancing skips or scans VMAs for a variety of reasons. In preparation
for completing scans of VMAs regardless of PID access, trace the reasons
why a VMA was skipped. In a later patch, the tracing will be used to track
if a VMA was forcibly scanned.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231010083143.19593-4-mgorman@techsingularity.net
---
 include/linux/sched/numa_balancing.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..c127a1509e2f 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -15,6 +15,14 @@
 #define TNF_FAULT_LOCAL	0x08
 #define TNF_MIGRATE_FAIL 0x10
 
+enum numa_vmaskip_reason {
+	NUMAB_SKIP_UNSUITABLE,
+	NUMAB_SKIP_SHARED_RO,
+	NUMAB_SKIP_INACCESSIBLE,
+	NUMAB_SKIP_SCAN_DELAY,
+	NUMAB_SKIP_PID_INACTIVE,
+};
+
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
-- 
cgit v1.2.3


From 57ec42b9a1b7e4db4a1c2aa4fcc4eefe6d31bcb8 Mon Sep 17 00:00:00 2001
From: Matt Johnston <matt@codeconstruct.com.au>
Date: Tue, 3 Oct 2023 15:53:39 +0800
Subject: i3c: Fix typo "Provisional ID" to "Provisioned ID"

The MIPI I3C spec refers to a Provisioned ID, since it is (sometimes)
provisioned at device manufacturing.

Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20231003075339.197099-1-matt@codeconstruct.com.au
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/device.h | 2 +-
 include/linux/i3c/master.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h
index 90fa83464f00..84ed77c04940 100644
--- a/include/linux/i3c/device.h
+++ b/include/linux/i3c/device.h
@@ -96,7 +96,7 @@ enum i3c_dcr {
 
 /**
  * struct i3c_device_info - I3C device information
- * @pid: Provisional ID
+ * @pid: Provisioned ID
  * @bcr: Bus Characteristic Register
  * @dcr: Device Characteristic Register
  * @static_addr: static/I2C address
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 0b52da4f2346..4fd6a777150f 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -166,7 +166,7 @@ struct i3c_device_ibi_info {
  *		 assigned a dynamic address by the master. Will be used during
  *		 bus initialization to assign it a specific dynamic address
  *		 before starting DAA (Dynamic Address Assignment)
- * @pid: I3C Provisional ID exposed by the device. This is a unique identifier
+ * @pid: I3C Provisioned ID exposed by the device. This is a unique identifier
  *	 that may be used to attach boardinfo to i3c_dev_desc when the device
  *	 does not have a static address
  * @of_node: optional DT node in case the device has been described in the DT
-- 
cgit v1.2.3


From 295d3c441226d004d1ed59c4fcf62d5dba18d9e1 Mon Sep 17 00:00:00 2001
From: Wedson Almeida Filho <walmeida@microsoft.com>
Date: Sat, 30 Sep 2023 02:00:33 -0300
Subject: net: move sockfs_xattr_handlers to .rodata

This makes it harder for accidental or malicious changes to
sockfs_xattr_handlers at runtime.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: netdev@vger.kernel.org
Signed-off-by: Wedson Almeida Filho <walmeida@microsoft.com>
Link: https://lore.kernel.org/r/20230930050033.41174-30-wedsonaf@gmail.com
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pseudo_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h
index eceda1d1407a..730f77381d55 100644
--- a/include/linux/pseudo_fs.h
+++ b/include/linux/pseudo_fs.h
@@ -5,7 +5,7 @@
 
 struct pseudo_fs_context {
 	const struct super_operations *ops;
-	const struct xattr_handler **xattr;
+	const struct xattr_handler * const *xattr;
 	const struct dentry_operations *dops;
 	unsigned long magic;
 };
-- 
cgit v1.2.3


From 909f4abd1097769d024c3a9c2e59c2fbe5d2d0c0 Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu@intel.com>
Date: Thu, 28 Sep 2023 00:15:23 -0700
Subject: iommu: Add new iommu op to create domains owned by userspace

Introduce a new iommu_domain op to create domains owned by userspace,
e.g. through IOMMUFD. These domains have a few different properties
compares to kernel owned domains:

 - They may be PAGING domains, but created with special parameters.
   For instance aperture size changes/number of levels, different
   IOPTE formats, or other things necessary to make a vIOMMU work

 - We have to track all the memory allocations with GFP_KERNEL_ACCOUNT
   to make the cgroup sandbox stronger

 - Device-specialty domains, such as NESTED domains can be created by
   IOMMUFD.

The new op clearly says the domain is being created by IOMMUFD, that the
domain is intended for userspace use, and it provides a way to pass user
flags or a driver specific uAPI structure to customize the created domain
to exactly what the vIOMMU userspace driver requires.

iommu drivers that cannot support VFIO/IOMMUFD should not support this
op. This includes any driver that cannot provide a fully functional PAGING
domain.

This new op for now is only supposed to be used by IOMMUFD, hence no
wrapper for it. IOMMUFD would call the callback directly. As for domain
free, IOMMUFD would use iommu_domain_free().

Link: https://lore.kernel.org/r/20230928071528.26258-2-yi.l.liu@intel.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Co-developed-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c50a769d569a..3861d66b65c1 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -234,7 +234,15 @@ struct iommu_iotlb_gather {
  *           op is allocated in the iommu driver and freed by the caller after
  *           use. The information type is one of enum iommu_hw_info_type defined
  *           in include/uapi/linux/iommufd.h.
- * @domain_alloc: allocate iommu domain
+ * @domain_alloc: allocate and return an iommu domain if success. Otherwise
+ *                NULL is returned. The domain is not fully initialized until
+ *                the caller iommu_domain_alloc() returns.
+ * @domain_alloc_user: Allocate an iommu domain corresponding to the input
+ *                     parameters as defined in include/uapi/linux/iommufd.h.
+ *                     Unlike @domain_alloc, it is called only by IOMMUFD and
+ *                     must fully initialize the new domain before return.
+ *                     Upon success, a domain is returned. Upon failure,
+ *                     ERR_PTR must be returned.
  * @probe_device: Add device to iommu driver handling
  * @release_device: Remove device from iommu driver handling
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
@@ -267,6 +275,7 @@ struct iommu_ops {
 
 	/* Domain allocation and freeing by the iommu driver */
 	struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type);
+	struct iommu_domain *(*domain_alloc_user)(struct device *dev, u32 flags);
 
 	struct iommu_device *(*probe_device)(struct device *dev);
 	void (*release_device)(struct device *dev);
-- 
cgit v1.2.3


From b7a5b537c55c088d891ae554103d1b281abef781 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:42 +0100
Subject: sched/numa: Complete scanning of partial VMAs regardless of PID
 activity

NUMA Balancing skips VMAs when the current task has not trapped a NUMA
fault within the VMA. If the VMA is skipped then mm->numa_scan_offset
advances and a task that is trapping faults within the VMA may never
fully update PTEs within the VMA.

Force tasks to update PTEs for partially scanned PTEs. The VMA will
be tagged for NUMA hints by some task but this removes some of the
benefit of tracking PID activity within a VMA. A follow-on patch
will mitigate this problem.

The test cases and machines evaluated did not trigger the corner case so
the performance results are neutral with only small changes within the
noise from normal test-to-test variance. However, the next patch makes
the corner case easier to trigger.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Raghavendra K T <raghavendra.kt@amd.com>
Link: https://lore.kernel.org/r/20231010083143.19593-6-mgorman@techsingularity.net
---
 include/linux/sched/numa_balancing.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index c127a1509e2f..7dcc0bdfddbb 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -21,6 +21,7 @@ enum numa_vmaskip_reason {
 	NUMAB_SKIP_INACCESSIBLE,
 	NUMAB_SKIP_SCAN_DELAY,
 	NUMAB_SKIP_PID_INACTIVE,
+	NUMAB_SKIP_IGNORE_PID,
 };
 
 #ifdef CONFIG_NUMA_BALANCING
-- 
cgit v1.2.3


From f169c62ff7cd1acf8bac8ae17bfeafa307d9e6fa Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:43 +0100
Subject: sched/numa: Complete scanning of inactive VMAs when there is no
 alternative

VMAs are skipped if there is no recent fault activity but this represents
a chicken-and-egg problem as there may be no fault activity if the PTEs
are never updated to trap NUMA hints. There is an indirect reliance on
scanning to be forced early in the lifetime of a task but this may fail
to detect changes in phase behaviour. Force inactive VMAs to be scanned
when all other eligible VMAs have been updated within the same scan
sequence.

Test results in general look good with some changes in performance, both
negative and positive, depending on whether the additional scanning and
faulting was beneficial or not to the workload. The autonuma benchmark
workload NUMA01_THREADLOCAL was picked for closer examination. The workload
creates two processes with numerous threads and thread-local storage that
is zero-filled in a loop. It exercises the corner case where unrelated
threads may skip VMAs that are thread-local to another thread and still
has some VMAs that inactive while the workload executes.

The VMA skipping activity frequency with and without the patch:

	6.6.0-rc2-sched-numabtrace-v1
	=============================
	    649 reason=scan_delay
	  9,094 reason=unsuitable
	 48,915 reason=shared_ro
	143,919 reason=inaccessible
	193,050 reason=pid_inactive

	6.6.0-rc2-sched-numabselective-v1
	=============================
	    146 reason=seq_completed
	    622 reason=ignore_pid_inactive

	    624 reason=scan_delay
	  6,570 reason=unsuitable
	 16,101 reason=shared_ro
	 27,608 reason=inaccessible
	 41,939 reason=pid_inactive

Note that with the patch applied, the PID activity is ignored
(ignore_pid_inactive) to ensure a VMA with some activity is completely
scanned. In addition, a small number of VMAs are scanned when no other
eligible VMA is available during a single scan window (seq_completed).
The number of times a VMA is skipped due to no PID activity from the
scanning task (pid_inactive) drops dramatically. It is expected that
this will increase the number of PTEs updated for NUMA hinting faults
as well as hinting faults but these represent PTEs that would otherwise
have been missed. The tradeoff is scan+fault overhead versus improving
locality due to migration.

On a 2-socket Cascade Lake test machine, the time to complete the
workload is as follows;

                                                 6.6.0-rc2              6.6.0-rc2
                                       sched-numabtrace-v1 sched-numabselective-v1
  Min       elsp-NUMA01_THREADLOCAL      174.22 (   0.00%)      117.64 (  32.48%)
  Amean     elsp-NUMA01_THREADLOCAL      175.68 (   0.00%)      123.34 *  29.79%*
  Stddev    elsp-NUMA01_THREADLOCAL        1.20 (   0.00%)        4.06 (-238.20%)
  CoeffVar  elsp-NUMA01_THREADLOCAL        0.68 (   0.00%)        3.29 (-381.70%)
  Max       elsp-NUMA01_THREADLOCAL      177.18 (   0.00%)      128.03 (  27.74%)

The time to complete the workload is reduced by almost 30%:

                     6.6.0-rc2   6.6.0-rc2
                  sched-numabtrace-v1 sched-numabselective-v1 /
  Duration User       91201.80    63506.64
  Duration System      2015.53     1819.78
  Duration Elapsed     1234.77      868.37

In this specific case, system CPU time was not increased but it's not
universally true.

From vmstat, the NUMA scanning and fault activity is as follows;

                                        6.6.0-rc2      6.6.0-rc2
                              sched-numabtrace-v1 sched-numabselective-v1
  Ops NUMA base-page range updates       64272.00    26374386.00
  Ops NUMA PTE updates                   36624.00       55538.00
  Ops NUMA PMD updates                      54.00       51404.00
  Ops NUMA hint faults                   15504.00       75786.00
  Ops NUMA hint local faults %           14860.00       56763.00
  Ops NUMA hint local percent               95.85          74.90
  Ops NUMA pages migrated                 1629.00     6469222.00

Both the number of PTE updates and hint faults is dramatically
increased. While this is superficially unfortunate, it represents
ranges that were simply skipped without the patch. As a result
of the scanning and hinting faults, many more pages were also
migrated but as the time to completion is reduced, the overhead
is offset by the gain.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Raghavendra K T <raghavendra.kt@amd.com>
Link: https://lore.kernel.org/r/20231010083143.19593-7-mgorman@techsingularity.net
---
 include/linux/mm_types.h             | 6 ++++++
 include/linux/sched/numa_balancing.h | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e7571eca1131..589f31ef2e84 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -575,6 +575,12 @@ struct vma_numab_state {
 	 * every VMA_PID_RESET_PERIOD jiffies:
 	 */
 	unsigned long pids_active[2];
+
+	/*
+	 * MM scan sequence ID when the VMA was last completely scanned.
+	 * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
+	 */
+	int prev_scan_seq;
 };
 
 /*
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 7dcc0bdfddbb..b69afb8630db 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -22,6 +22,7 @@ enum numa_vmaskip_reason {
 	NUMAB_SKIP_SCAN_DELAY,
 	NUMAB_SKIP_PID_INACTIVE,
 	NUMAB_SKIP_IGNORE_PID,
+	NUMAB_SKIP_SEQ_COMPLETED,
 };
 
 #ifdef CONFIG_NUMA_BALANCING
-- 
cgit v1.2.3


From 8527ca7735ef4cdad32c45853b0138f46ab2df58 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 8 Oct 2023 14:41:21 -0700
Subject: net: skbuff: fix kernel-doc typos

Correct punctuation and drop an extraneous word.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20231008214121.25940-1-rdunlap@infradead.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4174c4b82d13..97bfef071255 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1309,7 +1309,7 @@ struct sk_buff_fclones {
  *
  * Returns true if skb is a fast clone, and its clone is not freed.
  * Some drivers call skb_orphan() in their ndo_start_xmit(),
- * so we also check that this didnt happen.
+ * so we also check that didn't happen.
  */
 static inline bool skb_fclone_busy(const struct sock *sk,
 				   const struct sk_buff *skb)
@@ -2016,7 +2016,7 @@ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
  *	Copy shared buffers into a new sk_buff. We effectively do COW on
  *	packets to handle cases where we have a local reader and forward
  *	and a couple of other messy ones. The normal one is tcpdumping
- *	a packet thats being forwarded.
+ *	a packet that's being forwarded.
  */
 
 /**
-- 
cgit v1.2.3


From 5247dbf16cee4e83eb89e4d3b87bd5e79c5d1655 Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Mon, 9 Oct 2023 19:16:33 +0800
Subject: net/core: Introduce netdev_core_stats_inc()

Although there is a kfree_skb_reason() helper function that can be used to
find the reason why this skb is dropped, but most callers didn't increase
one of rx_dropped, tx_dropped, rx_nohandler and rx_otherhost_dropped.

For the users, people are more concerned about why the dropped in ip
is increasing.

Introduce netdev_core_stats_inc() for trace the caller of
dev_core_stats_*_inc().

Also, add __code to netdev_core_stats_alloc(), as it's called with small
probability. And add noinline make sure netdev_core_stats_inc was never
inlined.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Suggested-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e070a4540fba..11d704bfec9b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4002,32 +4002,19 @@ static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
 	return false;
 }
 
-struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev);
-
-static inline struct net_device_core_stats __percpu *dev_core_stats(struct net_device *dev)
-{
-	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
-	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
-
-	if (likely(p))
-		return p;
-
-	return netdev_core_stats_alloc(dev);
-}
+void netdev_core_stats_inc(struct net_device *dev, u32 offset);
 
 #define DEV_CORE_STATS_INC(FIELD)						\
 static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev)		\
 {										\
-	struct net_device_core_stats __percpu *p;				\
-										\
-	p = dev_core_stats(dev);						\
-	if (p)									\
-		this_cpu_inc(p->FIELD);						\
+	netdev_core_stats_inc(dev,						\
+			offsetof(struct net_device_core_stats, FIELD));		\
 }
 DEV_CORE_STATS_INC(rx_dropped)
 DEV_CORE_STATS_INC(tx_dropped)
 DEV_CORE_STATS_INC(rx_nohandler)
 DEV_CORE_STATS_INC(rx_otherhost_dropped)
+#undef DEV_CORE_STATS_INC
 
 static __always_inline int ____dev_forward_skb(struct net_device *dev,
 					       struct sk_buff *skb,
-- 
cgit v1.2.3


From acd6199f195d6de814ac4090ce0864a613b1580e Mon Sep 17 00:00:00 2001
From: Wentong Wu <wentong.wu@intel.com>
Date: Mon, 9 Oct 2023 14:33:22 +0800
Subject: usb: Add support for Intel LJCA device

Implements the USB part of Intel USB-I2C/GPIO/SPI adapter device
named "La Jolla Cove Adapter" (LJCA).

The communication between the various LJCA module drivers and the
hardware will be muxed/demuxed by this driver. Three modules (
I2C, GPIO, and SPI) are supported currently.

Each sub-module of LJCA device is identified by type field within
the LJCA message header.

The sub-modules of LJCA can use ljca_transfer() to issue a transfer
between host and hardware. And ljca_register_event_cb is exported
to LJCA sub-module drivers for hardware event subscription.

The minimum code in ASL that covers this board is
Scope (\_SB.PCI0.DWC3.RHUB.HS01)
    {
        Device (GPIO)
        {
            Name (_ADR, Zero)
            Name (_STA, 0x0F)
        }

        Device (I2C)
        {
            Name (_ADR, One)
            Name (_STA, 0x0F)
        }

        Device (SPI)
        {
            Name (_ADR, 0x02)
            Name (_STA, 0x0F)
        }
    }

Signed-off-by: Wentong Wu <wentong.wu@intel.com>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
Tested-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Oliver Neukum <oneukum@suse.com>
Link: https://lore.kernel.org/r/1696833205-16716-2-git-send-email-wentong.wu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/ljca.h | 145 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 include/linux/usb/ljca.h

(limited to 'include/linux')

diff --git a/include/linux/usb/ljca.h b/include/linux/usb/ljca.h
new file mode 100644
index 000000000000..47661feda96c
--- /dev/null
+++ b/include/linux/usb/ljca.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2023, Intel Corporation. All rights reserved.
+ */
+#ifndef _LINUX_USB_LJCA_H_
+#define _LINUX_USB_LJCA_H_
+
+#include <linux/auxiliary_bus.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#define LJCA_MAX_GPIO_NUM 64
+
+#define auxiliary_dev_to_ljca_client(auxiliary_dev)			\
+		container_of(auxiliary_dev, struct ljca_client, auxdev)
+
+struct ljca_adapter;
+
+/**
+ * typedef ljca_event_cb_t - event callback function signature
+ *
+ * @context: the execution context of who registered this callback
+ * @cmd: the command from device for this event
+ * @evt_data: the event data payload
+ * @len: the event data payload length
+ *
+ * The callback function is called in interrupt context and the data payload is
+ * only valid during the call. If the user needs later access of the data, it
+ * must copy it.
+ */
+typedef void (*ljca_event_cb_t)(void *context, u8 cmd, const void *evt_data, int len);
+
+/**
+ * struct ljca_client - represent a ljca client device
+ *
+ * @type: ljca client type
+ * @id: ljca client id within same client type
+ * @link: ljca client on the same ljca adapter
+ * @auxdev: auxiliary device object
+ * @adapter: ljca adapter the ljca client sit on
+ * @context: the execution context of the event callback
+ * @event_cb: ljca client driver register this callback to get
+ *	firmware asynchronous rx buffer pending notifications
+ * @event_cb_lock: spinlock to protect event callback
+ */
+struct ljca_client {
+	u8 type;
+	u8 id;
+	struct list_head link;
+	struct auxiliary_device auxdev;
+	struct ljca_adapter *adapter;
+
+	void *context;
+	ljca_event_cb_t event_cb;
+	/* lock to protect event_cb */
+	spinlock_t event_cb_lock;
+};
+
+/**
+ * struct ljca_gpio_info - ljca gpio client device info
+ *
+ * @num: ljca gpio client device pin number
+ * @valid_pin_map: ljca gpio client device valid pin mapping
+ */
+struct ljca_gpio_info {
+	unsigned int num;
+	DECLARE_BITMAP(valid_pin_map, LJCA_MAX_GPIO_NUM);
+};
+
+/**
+ * struct ljca_i2c_info - ljca i2c client device info
+ *
+ * @id: ljca i2c client device identification number
+ * @capacity: ljca i2c client device capacity
+ * @intr_pin: ljca i2c client device interrupt pin number if exists
+ */
+struct ljca_i2c_info {
+	u8 id;
+	u8 capacity;
+	u8 intr_pin;
+};
+
+/**
+ * struct ljca_spi_info - ljca spi client device info
+ *
+ * @id: ljca spi client device identification number
+ * @capacity: ljca spi client device capacity
+ */
+struct ljca_spi_info {
+	u8 id;
+	u8 capacity;
+};
+
+/**
+ * ljca_register_event_cb - register a callback function to receive events
+ *
+ * @client: ljca client device
+ * @event_cb: callback function
+ * @context: execution context of event callback
+ *
+ * Return: 0 in case of success, negative value in case of error
+ */
+int ljca_register_event_cb(struct ljca_client *client, ljca_event_cb_t event_cb, void *context);
+
+/**
+ * ljca_unregister_event_cb - unregister the callback function for an event
+ *
+ * @client: ljca client device
+ */
+void ljca_unregister_event_cb(struct ljca_client *client);
+
+/**
+ * ljca_transfer - issue a LJCA command and wait for a response
+ *
+ * @client: ljca client device
+ * @cmd: the command to be sent to the device
+ * @obuf: the buffer to be sent to the device; it can be NULL if the user
+ *	doesn't need to transmit data with this command
+ * @obuf_len: the size of the buffer to be sent to the device; it should
+ *	be 0 when obuf is NULL
+ * @ibuf: any data associated with the response will be copied here; it can be
+ *	NULL if the user doesn't need the response data
+ * @ibuf_len: must be initialized to the input buffer size
+ *
+ * Return: the actual length of response data for success, negative value for errors
+ */
+int ljca_transfer(struct ljca_client *client, u8 cmd, const u8 *obuf,
+		  u8 obuf_len, u8 *ibuf, u8 ibuf_len);
+
+/**
+ * ljca_transfer_noack - issue a LJCA command without a response
+ *
+ * @client: ljca client device
+ * @cmd: the command to be sent to the device
+ * @obuf: the buffer to be sent to the device; it can be NULL if the user
+ *	doesn't need to transmit data with this command
+ * @obuf_len: the size of the buffer to be sent to the device
+ *
+ * Return: 0 for success, negative value for errors
+ */
+int ljca_transfer_noack(struct ljca_client *client, u8 cmd, const u8 *obuf,
+			u8 obuf_len);
+
+#endif
-- 
cgit v1.2.3


From c4dd854f740c21ae8dd9903fc67969c5497cb14b Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 25 Sep 2023 17:28:39 +0100
Subject: cpu-hotplug: Provide prototypes for arch CPU registration

Provide common prototypes for arch_register_cpu() and
arch_unregister_cpu(). These are called by acpi_processor.c, with weak
versions, so the prototype for this is already set. It is generally not
necessary for function prototypes to be conditional on preprocessor macros.

Some architectures (e.g. Loongarch) are missing the prototype for this, and
rather than add it to Loongarch's asm/cpu.h, do the job once for everyone.

Since this covers everyone, remove the now unnecessary prototypes in
asm/cpu.h, and therefore remove the 'static' from one of ia64's
arch_register_cpu() definitions.

[ tglx: Bring back the ia64 part and remove the ACPI prototypes ]

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/E1qkoRr-0088Q8-Da@rmk-PC.armlinux.org.uk
---
 include/linux/cpu.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 0abd60a7987b..eb768a866fe3 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -80,6 +80,8 @@ extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
 				 const struct attribute_group **groups,
 				 const char *fmt, ...);
+extern int arch_register_cpu(int cpu);
+extern void arch_unregister_cpu(int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 extern void unregister_cpu(struct cpu *cpu);
 extern ssize_t arch_cpu_probe(const char *, size_t);
-- 
cgit v1.2.3


From 5987279373446e97206a7078b2229446ba871ea0 Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Thu, 5 Oct 2023 19:50:29 -0500
Subject: iio: event: add optional event label support

This adds a new optional field to struct iio_info to allow drivers to
specify a label for the event. This is useful for cases where there are
many events or the event attribute name is not descriptive enough or
where an event doesn't have any other attributes.

The implementation is based on the existing label support for channels.
So either all events of a device have a label attribute or none do.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Link: https://lore.kernel.org/r/20231005-ad2s1210-mainline-v4-12-ec00746840fc@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 7bfa1b9bc8a2..d0ce3b71106a 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -427,6 +427,8 @@ struct iio_trigger; /* forward declaration */
  * @write_event_config:	set if the event is enabled.
  * @read_event_value:	read a configuration value associated with the event.
  * @write_event_value:	write a configuration value for the event.
+ * @read_event_label:	function to request label name for a specified label,
+ *			for better event identification.
  * @validate_trigger:	function to validate the trigger when the
  *			current trigger gets changed.
  * @update_scan_mode:	function to configure device and scan buffer when
@@ -511,6 +513,12 @@ struct iio_info {
 				 enum iio_event_direction dir,
 				 enum iio_event_info info, int val, int val2);
 
+	int (*read_event_label)(struct iio_dev *indio_dev,
+				struct iio_chan_spec const *chan,
+				enum iio_event_type type,
+				enum iio_event_direction dir,
+				char *label);
+
 	int (*validate_trigger)(struct iio_dev *indio_dev,
 				struct iio_trigger *trig);
 	int (*update_scan_mode)(struct iio_dev *indio_dev,
-- 
cgit v1.2.3


From 21ca59b365c091d583f36ac753eaa8baf947be6f Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 28 Oct 2021 12:31:14 +0200
Subject: binfmt_misc: enable sandboxed mounts

Enable unprivileged sandboxes to create their own binfmt_misc mounts.
This is based on Laurent's work in [1] but has been significantly
reworked to fix various issues we identified in earlier versions.

While binfmt_misc can currently only be mounted in the initial user
namespace, binary types registered in this binfmt_misc instance are
available to all sandboxes (Either by having them installed in the
sandbox or by registering the binary type with the F flag causing the
interpreter to be opened right away). So binfmt_misc binary types are
already delegated to sandboxes implicitly.

However, while a sandbox has access to all registered binary types in
binfmt_misc a sandbox cannot currently register its own binary types
in binfmt_misc. This has prevented various use-cases some of which were
already outlined in [1] but we have a range of issues associated with
this (cf. [3]-[5] below which are just a small sample).

Extend binfmt_misc to be mountable in non-initial user namespaces.
Similar to other filesystem such as nfsd, mqueue, and sunrpc we use
keyed superblock management. The key determines whether we need to
create a new superblock or can reuse an already existing one. We use the
user namespace of the mount as key. This means a new binfmt_misc
superblock is created once per user namespace creation. Subsequent
mounts of binfmt_misc in the same user namespace will mount the same
binfmt_misc instance. We explicitly do not create a new binfmt_misc
superblock on every binfmt_misc mount as the semantics for
load_misc_binary() line up with the keying model. This also allows us to
retrieve the relevant binfmt_misc instance based on the caller's user
namespace which can be done in a simple (bounded to 32 levels) loop.

Similar to the current binfmt_misc semantics allowing access to the
binary types in the initial binfmt_misc instance we do allow sandboxes
access to their parent's binfmt_misc mounts if they do not have created
a separate binfmt_misc instance.

Overall, this will unblock the use-cases mentioned below and in general
will also allow to support and harden execution of another
architecture's binaries in tight sandboxes. For instance, using the
unshare binary it possible to start a chroot of another architecture and
configure the binfmt_misc interpreter without being root to run the
binaries in this chroot and without requiring the host to modify its
binary type handlers.

Henning had already posted a few experiments in the cover letter at [1].
But here's an additional example where an unprivileged container
registers qemu-user-static binary handlers for various binary types in
its separate binfmt_misc mount and is then seamlessly able to start
containers with a different architecture without affecting the host:

root    [lxc monitor] /var/snap/lxd/common/lxd/containers f1
1000000  \_ /sbin/init
1000000      \_ /lib/systemd/systemd-journald
1000000      \_ /lib/systemd/systemd-udevd
1000100      \_ /lib/systemd/systemd-networkd
1000101      \_ /lib/systemd/systemd-resolved
1000000      \_ /usr/sbin/cron -f
1000103      \_ /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-activation --syslog-only
1000000      \_ /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
1000104      \_ /usr/sbin/rsyslogd -n -iNONE
1000000      \_ /lib/systemd/systemd-logind
1000000      \_ /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220
1000107      \_ dnsmasq --conf-file=/dev/null -u lxc-dnsmasq --strict-order --bind-interfaces --pid-file=/run/lxc/dnsmasq.pid --liste
1000000      \_ [lxc monitor] /var/lib/lxc f1-s390x
1100000          \_ /usr/bin/qemu-s390x-static /sbin/init
1100000              \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-journald
1100000              \_ /usr/bin/qemu-s390x-static /usr/sbin/cron -f
1100103              \_ /usr/bin/qemu-s390x-static /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-ac
1100000              \_ /usr/bin/qemu-s390x-static /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
1100104              \_ /usr/bin/qemu-s390x-static /usr/sbin/rsyslogd -n -iNONE
1100000              \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-logind
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/0 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/1 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/2 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/3 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-udevd

[1]: https://lore.kernel.org/all/20191216091220.465626-1-laurent@vivier.eu
[2]: https://discuss.linuxcontainers.org/t/binfmt-misc-permission-denied
[3]: https://discuss.linuxcontainers.org/t/lxd-binfmt-support-for-qemu-static-interpreters
[4]: https://discuss.linuxcontainers.org/t/3-1-0-binfmt-support-service-in-unprivileged-guest-requires-write-access-on-hosts-proc-sys-fs-binfmt-misc
[5]: https://discuss.linuxcontainers.org/t/qemu-user-static-not-working-4-11

Link: https://lore.kernel.org/r/20191216091220.465626-2-laurent@vivier.eu (origin)
Link: https://lore.kernel.org/r/20211028103114.2849140-2-brauner@kernel.org (v1)
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Jann Horn <jannh@google.com>
Cc: Henning Schild <henning.schild@siemens.com>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Laurent Vivier <laurent@vivier.eu>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
/* v2 */
- Serge Hallyn <serge@hallyn.com>:
  - Use GFP_KERNEL_ACCOUNT for userspace triggered allocations when a
    new binary type handler is registered.
- Christian Brauner <christian.brauner@ubuntu.com>:
  - Switch authorship to me. I refused to do that earlier even though
    Laurent said I should do so because I think it's genuinely bad form.
    But by now I have changed so many things that it'd be unfair to
    blame Laurent for any potential bugs in here.
  - Add more comments that explain what's going on.
  - Rename functions while changing them to better reflect what they are
    doing to make the code easier to understand.
  - In the first version when a specific binary type handler was removed
    either through a write to the entry's file or all binary type
    handlers were removed by a write to the binfmt_misc mount's status
    file all cleanup work happened during inode eviction.
    That includes removal of the relevant entries from entry list. While
    that works fine I disliked that model after thinking about it for a
    bit. Because it means that there was a window were someone has
    already removed a or all binary handlers but they could still be
    safely reached from load_misc_binary() when it has managed to take
    the read_lock() on the entries list while inode eviction was already
    happening. Again, that perfectly benign but it's cleaner to remove
    the binary handler from the list immediately meaning that ones the
    write to then entry's file or the binfmt_misc status file returns
    the binary type cannot be executed anymore. That gives stronger
    guarantees to the user.
---
 include/linux/binfmts.h        | 10 ++++++++++
 include/linux/user_namespace.h |  8 ++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 8d51f69f9f5e..70f97f685bff 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -90,6 +90,16 @@ struct linux_binfmt {
 #endif
 } __randomize_layout;
 
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc {
+	struct list_head entries;
+	rwlock_t entries_lock;
+	bool enabled;
+} __randomize_layout;
+
+extern struct binfmt_misc init_binfmt_misc;
+#endif
+
 extern void __register_binfmt(struct linux_binfmt *fmt, int insert);
 
 /* Registration of default binfmt handlers */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 45f09bec02c4..6030a8235617 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -65,6 +65,10 @@ enum rlimit_type {
 	UCOUNT_RLIMIT_COUNTS,
 };
 
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc;
+#endif
+
 struct user_namespace {
 	struct uid_gid_map	uid_map;
 	struct uid_gid_map	gid_map;
@@ -102,6 +106,10 @@ struct user_namespace {
 	struct ucounts		*ucounts;
 	long ucount_max[UCOUNT_COUNTS];
 	long rlimit_max[UCOUNT_RLIMIT_COUNTS];
+
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+	struct binfmt_misc *binfmt_misc;
+#endif
 } __randomize_layout;
 
 struct ucounts {
-- 
cgit v1.2.3


From f6d7f050e258e3c71e310f5167c4d65bbefaeb31 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 10 Oct 2023 19:31:00 +0300
Subject: spi: Don't use flexible array in struct spi_message definition

The struct spi_message can be embedded into another structures.
With that the flexible array might be problematic as sparse
complains about it, although there is no real issue in the code
because when the message is embedded it doesn't use flexible array
member. That memeber is a private to spi_message_alloc() API, so
move it to that API in a form of an inherited data type.

Reported-by: Marc Kleine-Budde <mkl@pengutronix.de>
Fixes: 75e308ffc4f0 ("spi: Use struct_size() helper"))
Closes: https://lore.kernel.org/r/20231009-onshore-underage-c58415adfd92-mkl@pengutronix.de
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Marc Kleine-Budde <mkl@pengutronix.de>
Link: https://lore.kernel.org/r/20231010163100.89734-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 7f8b478fdeb3..487da1f6e4b7 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -1086,8 +1086,6 @@ struct spi_transfer {
  * @state: for use by whichever driver currently owns the message
  * @resources: for resource management when the SPI message is processed
  * @prepared: spi_prepare_message was called for the this message
- * @t: for use with spi_message_alloc() when message and transfers have
- *	been allocated together
  *
  * A @spi_message is used to execute an atomic sequence of data transfers,
  * each represented by a struct spi_transfer.  The sequence is "atomic"
@@ -1142,9 +1140,6 @@ struct spi_message {
 
 	/* List of spi_res resources when the SPI message is processed */
 	struct list_head        resources;
-
-	/* For embedding transfers into the memory of the message */
-	struct spi_transfer	t[];
 };
 
 static inline void spi_message_init_no_memset(struct spi_message *m)
@@ -1203,17 +1198,21 @@ struct spi_transfer *xfers, unsigned int num_xfers)
  */
 static inline struct spi_message *spi_message_alloc(unsigned ntrans, gfp_t flags)
 {
-	struct spi_message *m;
+	struct spi_message_with_transfers {
+		struct spi_message m;
+		struct spi_transfer t[];
+	} *mwt;
+	unsigned i;
+
+	mwt = kzalloc(struct_size(mwt, t, ntrans), flags);
+	if (!mwt)
+		return NULL;
 
-	m = kzalloc(struct_size(m, t, ntrans), flags);
-	if (m) {
-		unsigned i;
+	spi_message_init_no_memset(&mwt->m);
+	for (i = 0; i < ntrans; i++)
+		spi_message_add_tail(&mwt->t[i], &mwt->m);
 
-		spi_message_init_no_memset(m);
-		for (i = 0; i < ntrans; i++)
-			spi_message_add_tail(&m->t[i], m);
-	}
-	return m;
+	return &mwt->m;
 }
 
 static inline void spi_message_free(struct spi_message *m)
-- 
cgit v1.2.3


From 9d77eb52778499a97cab662aa96de4e2e4fa72d3 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 24 Aug 2023 16:39:08 +0200
Subject: nvme-keyring: register '.nvme' keyring

Register a '.nvme' keyring to hold keys for TLS and DH-HMAC-CHAP and
add a new config option NVME_KEYRING.
We need a separate keyring for NVMe as the configuration is done
via individual commands (eg for configfs), and the usual per-session
or per-process keyrings can't be used.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme-keyring.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 include/linux/nvme-keyring.h

(limited to 'include/linux')

diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h
new file mode 100644
index 000000000000..32bd264a71e6
--- /dev/null
+++ b/include/linux/nvme-keyring.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023 Hannes Reinecke, SUSE Labs
+ */
+
+#ifndef _NVME_KEYRING_H
+#define _NVME_KEYRING_H
+
+#ifdef CONFIG_NVME_KEYRING
+
+key_serial_t nvme_keyring_id(void);
+int nvme_keyring_init(void);
+void nvme_keyring_exit(void);
+
+#else
+
+static inline key_serial_t nvme_keyring_id(void)
+{
+	return 0;
+}
+static inline int nvme_keyring_init(void)
+{
+	return 0;
+}
+static inline void nvme_keyring_exit(void) {}
+
+#endif /* !CONFIG_NVME_KEYRING */
+#endif /* _NVME_KEYRING_H */
-- 
cgit v1.2.3


From 646f45b23218c7a97a84259d8eeb22dad5711fc8 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 24 Aug 2023 16:39:10 +0200
Subject: nvme: add TCP TSAS definitions

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 26dd3f859d9d..a7ba74babad7 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -108,6 +108,13 @@ enum {
 	NVMF_RDMA_CMS_RDMA_CM	= 1, /* Sockets based endpoint addressing */
 };
 
+/* TSAS SECTYPE for TCP transport */
+enum {
+	NVMF_TCP_SECTYPE_NONE = 0, /* No Security */
+	NVMF_TCP_SECTYPE_TLS12 = 1, /* TLSv1.2, NVMe-oF 1.1 and NVMe-TCP 3.6.1.1 */
+	NVMF_TCP_SECTYPE_TLS13 = 2, /* TLSv1.3, NVMe-oF 1.1 and NVMe-TCP 3.6.1.1 */
+};
+
 #define NVME_AQ_DEPTH		32
 #define NVME_NR_AEN_COMMANDS	1
 #define NVME_AQ_BLK_MQ_DEPTH	(NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)
@@ -1493,6 +1500,9 @@ struct nvmf_disc_rsp_page_entry {
 			__u16	pkey;
 			__u8	resv10[246];
 		} rdma;
+		struct tcp {
+			__u8	sectype;
+		} tcp;
 	} tsas;
 };
 
-- 
cgit v1.2.3


From a86062aac34d100a3117c0fff91ee1892ebfb460 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 24 Aug 2023 16:39:11 +0200
Subject: nvme-tcp: add definitions for TLS cipher suites

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme-tcp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h
index 57ebe1267f7f..e07e8978d691 100644
--- a/include/linux/nvme-tcp.h
+++ b/include/linux/nvme-tcp.h
@@ -18,6 +18,12 @@ enum nvme_tcp_pfv {
 	NVME_TCP_PFV_1_0 = 0x0,
 };
 
+enum nvme_tcp_tls_cipher {
+	NVME_TCP_TLS_CIPHER_INVALID     = 0,
+	NVME_TCP_TLS_CIPHER_SHA256      = 1,
+	NVME_TCP_TLS_CIPHER_SHA384      = 2,
+};
+
 enum nvme_tcp_fatal_error_status {
 	NVME_TCP_FES_INVALID_PDU_HDR		= 0x01,
 	NVME_TCP_FES_PDU_SEQ_ERR		= 0x02,
-- 
cgit v1.2.3


From 501cc6f4aca8dc0958c4d9716f0233ba7cff4830 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 24 Aug 2023 16:39:12 +0200
Subject: nvme-keyring: implement nvme_tls_psk_default()

Implement a function to select the preferred PSK for TLS.
A 'retained' PSK should be preferred over a 'generated' PSK,
and SHA-384 should be preferred to SHA-256.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme-keyring.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h
index 32bd264a71e6..4efea9dd967c 100644
--- a/include/linux/nvme-keyring.h
+++ b/include/linux/nvme-keyring.h
@@ -8,12 +8,20 @@
 
 #ifdef CONFIG_NVME_KEYRING
 
+key_serial_t nvme_tls_psk_default(struct key *keyring,
+		const char *hostnqn, const char *subnqn);
+
 key_serial_t nvme_keyring_id(void);
 int nvme_keyring_init(void);
 void nvme_keyring_exit(void);
 
 #else
 
+static inline key_serial_t nvme_tls_psk_default(struct key *keyring,
+		const char *hostnqn, const char *subnqn)
+{
+	return 0;
+}
 static inline key_serial_t nvme_keyring_id(void)
 {
 	return 0;
-- 
cgit v1.2.3


From 037c34318a479294cdb98dc8018edd5d191b68c0 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 24 Aug 2023 16:39:13 +0200
Subject: security/keys: export key_lookup()

For in-kernel consumers one cannot readily assign a user (eg when
running from a workqueue), so the normal key search permissions
cannot be applied.
This patch exports the 'key_lookup()' function for a simple lookup
of keys without checking for permissions.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/key.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index 938d7ecfb495..943a432da3ae 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -515,6 +515,7 @@ extern void key_init(void);
 #define key_init()			do { } while(0)
 #define key_free_user_ns(ns)		do { } while(0)
 #define key_remove_domain(d)		do { } while(0)
+#define key_lookup(k)			NULL
 
 #endif /* CONFIG_KEYS */
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3


From 39c6eed1f61594f737160e498d29673edbd9eefd Mon Sep 17 00:00:00 2001
From: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Date: Tue, 10 Oct 2023 12:42:36 +0200
Subject: x86/resctrl: Rename arch_has_sparse_bitmaps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename arch_has_sparse_bitmaps to arch_has_sparse_bitmasks to ensure
consistent terminology throughout resctrl.

Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Peter Newman <peternewman@google.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Babu Moger <babu.moger@amd.com>
Tested-by: Peter Newman <peternewman@google.com>
Link: https://lore.kernel.org/r/e330fcdae873ef1a831e707025a4b70fa346666e.1696934091.git.maciej.wieczor-retman@intel.com
---
 include/linux/resctrl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 8334eeacfec5..66942d7fba7f 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -94,7 +94,7 @@ struct rdt_domain {
  *			zero CBM.
  * @shareable_bits:	Bitmask of shareable resource with other
  *			executing entities
- * @arch_has_sparse_bitmaps:	True if a bitmap like f00f is valid.
+ * @arch_has_sparse_bitmasks:	True if a bitmask like f00f is valid.
  * @arch_has_per_cpu_cfg:	True if QOS_CFG register for this cache
  *				level has CPU scope.
  */
@@ -102,7 +102,7 @@ struct resctrl_cache {
 	unsigned int	cbm_len;
 	unsigned int	min_cbm_bits;
 	unsigned int	shareable_bits;
-	bool		arch_has_sparse_bitmaps;
+	bool		arch_has_sparse_bitmasks;
 	bool		arch_has_per_cpu_cfg;
 };
 
-- 
cgit v1.2.3


From fefba7d1ae198dcbf8b3b432de46a4e29f8dbd8c Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Wed, 11 Oct 2023 20:51:04 +0200
Subject: bpf: Propagate modified uaddrlen from cgroup sockaddr programs

As prep for adding unix socket support to the cgroup sockaddr hooks,
let's propagate the sockaddr length back to the caller after running
a bpf cgroup sockaddr hook program. While not important for AF_INET or
AF_INET6, the sockaddr length is important when working with AF_UNIX
sockaddrs as the size of the sockaddr cannot be determined just from the
address family or the sockaddr's contents.

__cgroup_bpf_run_filter_sock_addr() is modified to take the uaddrlen as
an input/output argument. After running the program, the modified sockaddr
length is stored in the uaddrlen pointer.

Signed-off-by: Daan De Meyer <daan.j.demeyer@gmail.com>
Link: https://lore.kernel.org/r/20231011185113.140426-3-daan.j.demeyer@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf-cgroup.h | 73 +++++++++++++++++++++++-----------------------
 include/linux/filter.h     |  1 +
 2 files changed, 38 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8506690dbb9c..31561e789715 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -120,6 +120,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
+				      int *uaddrlen,
 				      enum cgroup_bpf_attach_type atype,
 				      void *t_ctx,
 				      u32 *flags);
@@ -230,22 +231,22 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk)				       \
 	BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND)
 
-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype)				       \
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype)		       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))					       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
-							  NULL, NULL);	       \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+							  atype, NULL, NULL);  \
 	__ret;								       \
 })
 
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx)		       \
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx)	       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
-							  t_ctx, NULL);	       \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+							  atype, t_ctx, NULL); \
 		release_sock(sk);					       \
 	}								       \
 	__ret;								       \
@@ -256,14 +257,14 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
  * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check
  * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE).
  */
-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags)	       \
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, bind_flags) \
 ({									       \
 	u32 __flags = 0;						       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
-							  NULL, &__flags);     \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+							  atype, NULL, &__flags); \
 		release_sock(sk);					       \
 		if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE)	       \
 			*bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE;	       \
@@ -276,29 +277,29 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 	  cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) &&		       \
 	 (sk)->sk_prot->pre_connect)
 
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr)			       \
-	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT)
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen)			\
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT)
 
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr)			       \
-	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT)
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen)			\
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT)
 
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL)
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT, NULL)
 
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL)
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL)
 
-#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx)
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx)
 
-#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx)
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx)
 
-#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr)			\
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL)
+#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL)
 
-#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr)			\
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL)
+#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL)
 
 /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
  * fullsock and its parent fullsock cannot be traced by
@@ -477,24 +478,24 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 }
 
 #define cgroup_bpf_enabled(atype) (0)
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; })
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) ({ 0; })
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, flags) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; })
diff --git a/include/linux/filter.h b/include/linux/filter.h
index ff7ecc89d3dd..bcd2bc15ff56 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1335,6 +1335,7 @@ struct bpf_sock_addr_kern {
 	 */
 	u64 tmp_reg;
 	void *t_ctx;	/* Attach type specific context. */
+	u32 uaddrlen;
 };
 
 struct bpf_sock_ops_kern {
-- 
cgit v1.2.3


From 859051dd165ec6cc915f0f2114699021144fd249 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Wed, 11 Oct 2023 20:51:06 +0200
Subject: bpf: Implement cgroup sockaddr hooks for unix sockets

These hooks allows intercepting connect(), getsockname(),
getpeername(), sendmsg() and recvmsg() for unix sockets. The unix
socket hooks get write access to the address length because the
address length is not fixed when dealing with unix sockets and
needs to be modified when a unix socket address is modified by
the hook. Because abstract socket unix addresses start with a
NUL byte, we cannot recalculate the socket address in kernelspace
after running the hook by calculating the length of the unix socket
path using strlen().

These hooks can be used when users want to multiplex syscall to a
single unix socket to multiple different processes behind the scenes
by redirecting the connect() and other syscalls to process specific
sockets.

We do not implement support for intercepting bind() because when
using bind() with unix sockets with a pathname address, this creates
an inode in the filesystem which must be cleaned up. If we rewrite
the address, the user might try to clean up the wrong file, leaking
the socket in the filesystem where it is never cleaned up. Until we
figure out a solution for this (and a use case for intercepting bind()),
we opt to not allow rewriting the sockaddr in bind() calls.

We also implement recvmsg() support for connected streams so that
after a connect() that is modified by a sockaddr hook, any corresponding
recmvsg() on the connected socket can also be modified to make the
connected program think it is connected to the "intended" remote.

Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Daan De Meyer <daan.j.demeyer@gmail.com>
Link: https://lore.kernel.org/r/20231011185113.140426-5-daan.j.demeyer@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf-cgroup-defs.h |  5 +++++
 include/linux/bpf-cgroup.h      | 17 +++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index 7b121bd780eb..0985221d5478 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -28,19 +28,24 @@ enum cgroup_bpf_attach_type {
 	CGROUP_INET6_BIND,
 	CGROUP_INET4_CONNECT,
 	CGROUP_INET6_CONNECT,
+	CGROUP_UNIX_CONNECT,
 	CGROUP_INET4_POST_BIND,
 	CGROUP_INET6_POST_BIND,
 	CGROUP_UDP4_SENDMSG,
 	CGROUP_UDP6_SENDMSG,
+	CGROUP_UNIX_SENDMSG,
 	CGROUP_SYSCTL,
 	CGROUP_UDP4_RECVMSG,
 	CGROUP_UDP6_RECVMSG,
+	CGROUP_UNIX_RECVMSG,
 	CGROUP_GETSOCKOPT,
 	CGROUP_SETSOCKOPT,
 	CGROUP_INET4_GETPEERNAME,
 	CGROUP_INET6_GETPEERNAME,
+	CGROUP_UNIX_GETPEERNAME,
 	CGROUP_INET4_GETSOCKNAME,
 	CGROUP_INET6_GETSOCKNAME,
+	CGROUP_UNIX_GETSOCKNAME,
 	CGROUP_INET_SOCK_RELEASE,
 	CGROUP_LSM_START,
 	CGROUP_LSM_END = CGROUP_LSM_START + CGROUP_LSM_NUM - 1,
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 31561e789715..98b8cea904fe 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -48,19 +48,24 @@ to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type)
 	CGROUP_ATYPE(CGROUP_INET6_BIND);
 	CGROUP_ATYPE(CGROUP_INET4_CONNECT);
 	CGROUP_ATYPE(CGROUP_INET6_CONNECT);
+	CGROUP_ATYPE(CGROUP_UNIX_CONNECT);
 	CGROUP_ATYPE(CGROUP_INET4_POST_BIND);
 	CGROUP_ATYPE(CGROUP_INET6_POST_BIND);
 	CGROUP_ATYPE(CGROUP_UDP4_SENDMSG);
 	CGROUP_ATYPE(CGROUP_UDP6_SENDMSG);
+	CGROUP_ATYPE(CGROUP_UNIX_SENDMSG);
 	CGROUP_ATYPE(CGROUP_SYSCTL);
 	CGROUP_ATYPE(CGROUP_UDP4_RECVMSG);
 	CGROUP_ATYPE(CGROUP_UDP6_RECVMSG);
+	CGROUP_ATYPE(CGROUP_UNIX_RECVMSG);
 	CGROUP_ATYPE(CGROUP_GETSOCKOPT);
 	CGROUP_ATYPE(CGROUP_SETSOCKOPT);
 	CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME);
 	CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME);
+	CGROUP_ATYPE(CGROUP_UNIX_GETPEERNAME);
 	CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME);
 	CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME);
+	CGROUP_ATYPE(CGROUP_UNIX_GETSOCKNAME);
 	CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE);
 	default:
 		return CGROUP_BPF_ATTACH_TYPE_INVALID;
@@ -289,18 +294,27 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL)
 
+#define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_CONNECT, NULL)
+
 #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx)
 
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx)
 
+#define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_SENDMSG, t_ctx)
+
 #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL)
 
 #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL)
 
+#define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_RECVMSG, NULL)
+
 /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
  * fullsock and its parent fullsock cannot be traced by
  * sk_to_full_sk().
@@ -492,10 +506,13 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; })
-- 
cgit v1.2.3


From 0a779003213b589d7b3eef72f69e19a30f603ebc Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Mon, 9 Oct 2023 15:37:51 +0200
Subject: netdev: make napi_schedule return bool on NAPI successful schedule

Change napi_schedule to return a bool on NAPI successful schedule.
This might be useful for some driver to do additional steps after a
NAPI has been scheduled.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20231009133754.9834-2-ansuelsmth@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 11d704bfec9b..2874770a0a74 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -490,11 +490,18 @@ bool napi_schedule_prep(struct napi_struct *n);
  *
  * Schedule NAPI poll routine to be called if it is not already
  * running.
+ * Return true if we schedule a NAPI or false if not.
+ * Refer to napi_schedule_prep() for additional reason on why
+ * a NAPI might not be scheduled.
  */
-static inline void napi_schedule(struct napi_struct *n)
+static inline bool napi_schedule(struct napi_struct *n)
 {
-	if (napi_schedule_prep(n))
+	if (napi_schedule_prep(n)) {
 		__napi_schedule(n);
+		return true;
+	}
+
+	return false;
 }
 
 /**
-- 
cgit v1.2.3


From 73382e919f3d938554dadd01d95760f90d1c25c1 Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Mon, 9 Oct 2023 15:37:52 +0200
Subject: netdev: replace napi_reschedule with napi_schedule

Now that napi_schedule return a bool, we can drop napi_reschedule that
does the same exact function. The function comes from a very old commit
bfe13f54f502 ("ibm_emac: Convert to use napi_struct independent of struct
net_device") and the purpose is actually deprecated in favour of
different logic.

Convert every user of napi_reschedule to napi_schedule.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Acked-by: Jeff Johnson <quic_jjohnson@quicinc.com> # ath10k
Acked-by: Nick Child <nnac123@linux.ibm.com> # ibm
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de> # for can/dev/rx-offload.c
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20231009133754.9834-3-ansuelsmth@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2874770a0a74..ae553f886796 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -516,16 +516,6 @@ static inline void napi_schedule_irqoff(struct napi_struct *n)
 		__napi_schedule_irqoff(n);
 }
 
-/* Try to reschedule poll. Called by dev->poll() after napi_complete().  */
-static inline bool napi_reschedule(struct napi_struct *napi)
-{
-	if (napi_schedule_prep(napi)) {
-		__napi_schedule(napi);
-		return true;
-	}
-	return false;
-}
-
 /**
  * napi_complete_done - NAPI processing complete
  * @n: NAPI context
-- 
cgit v1.2.3


From b9f29205c0182a2059b4dfa2883db5ef423574d4 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 30 Sep 2023 15:14:27 -0700
Subject: iosys-map: fix kernel-doc typos

Correct spelling of "beginning".

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: dri-devel@lists.freedesktop.org
Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: https://patchwork.freedesktop.org/patch/msgid/20230930221428.18463-2-rdunlap@infradead.org
---
 include/linux/iosys-map.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iosys-map.h b/include/linux/iosys-map.h
index cb71aa616bd3..1b06d074ade0 100644
--- a/include/linux/iosys-map.h
+++ b/include/linux/iosys-map.h
@@ -426,7 +426,7 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * iosys_map_rd_field - Read a member from a struct in the iosys_map
  *
  * @map__:		The iosys_map structure
- * @struct_offset__:	Offset from the beggining of the map, where the struct
+ * @struct_offset__:	Offset from the beginning of the map, where the struct
  *			is located
  * @struct_type__:	The struct describing the layout of the mapping
  * @field__:		Member of the struct to read
@@ -494,7 +494,7 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * iosys_map_wr_field - Write to a member of a struct in the iosys_map
  *
  * @map__:		The iosys_map structure
- * @struct_offset__:	Offset from the beggining of the map, where the struct
+ * @struct_offset__:	Offset from the beginning of the map, where the struct
  *			is located
  * @struct_type__:	The struct describing the layout of the mapping
  * @field__:		Member of the struct to read
-- 
cgit v1.2.3


From 668706b10c9b8181a53bd8881a77bb81b328ab33 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 11 Oct 2023 23:26:35 +0300
Subject: gpiolib: provide gpio_device_find_by_fwnode()

One of the ways of looking up GPIO devices is using their fwnode.
Provide a helper for that to avoid every user implementing their
own matching function.

Reviewed-by: Dipen Patel <dipenp@nvidia.com>
Tested-by: Dipen Patel <dipenp@nvidia.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20231010151709.4104747-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index f8ad7f40100c..ae4162d3f1d3 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -611,6 +611,7 @@ struct gpio_chip *gpiochip_find(void *data,
 struct gpio_device *gpio_device_find(void *data,
 				int (*match)(struct gpio_chip *gc, void *data));
 struct gpio_device *gpio_device_find_by_label(const char *label);
+struct gpio_device *gpio_device_find_by_fwnode(const struct fwnode_handle *fwnode);
 
 struct gpio_device *gpio_device_get(struct gpio_device *gdev);
 void gpio_device_put(struct gpio_device *gdev);
-- 
cgit v1.2.3


From e001d1447cd4585d7f23a44ff668ba2bc624badb Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 12 Oct 2023 15:24:17 +0300
Subject: fs: factor out vfs_parse_monolithic_sep() helper

Factor out vfs_parse_monolithic_sep() from generic_parse_monolithic(),
so filesystems could use it with a custom option separator callback.

Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 include/linux/fs_context.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 96332db693d5..c13e99cbbf81 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -136,6 +136,8 @@ extern struct fs_context *vfs_dup_fs_context(struct fs_context *fc);
 extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param);
 extern int vfs_parse_fs_string(struct fs_context *fc, const char *key,
 			       const char *value, size_t v_size);
+int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
+			     char *(*sep)(char **));
 extern int generic_parse_monolithic(struct fs_context *fc, void *data);
 extern int vfs_get_tree(struct fs_context *fc);
 extern void put_fs_context(struct fs_context *fc);
-- 
cgit v1.2.3


From 13cc9ee8f8ed58e563294d87d74a62006be40f21 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 12 Oct 2023 13:09:02 -0400
Subject: cgroup: Fix incorrect css_set_rwsem reference in comment

Since commit f0d9a5f17575 ("cgroup: make css_set_rwsem a spinlock
and rename it to css_set_lock"), css_set_rwsem has been replaced by
css_set_lock. That commit, however, missed the css_set_rwsem reference
in include/linux/cgroup-defs.h. Fix that by changing it to css_set_lock
as well.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index f1b3151ac30b..265da00a1a8b 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -238,7 +238,7 @@ struct css_set {
 	 * Lists running through all tasks using this cgroup group.
 	 * mg_tasks lists tasks which belong to this cset but are in the
 	 * process of being migrated out or in.  Protected by
-	 * css_set_rwsem, but, during migration, once tasks are moved to
+	 * css_set_lock, but, during migration, once tasks are moved to
 	 * mg_tasks, it can be read safely while holding cgroup_mutex.
 	 */
 	struct list_head tasks;
-- 
cgit v1.2.3


From f06cc667f79909e9175460b167c277b7c64d3df0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 9 Oct 2023 23:04:25 +0200
Subject: perf: Optimize perf_cgroup_switch()

Namhyung reported that bd2756811766 ("perf: Rewrite core context handling")
regresses context switch overhead when perf-cgroup is in use together
with 'slow' PMUs like uncore.

Specifically, perf_cgroup_switch()'s perf_ctx_disable() /
ctx_sched_out() etc.. all iterate the full list of active PMUs for
that CPU, even if they don't have cgroup events.

Previously there was cgrp_cpuctx_list which linked the relevant PMUs
together, but that got lost in the rework. Instead of re-instruducing
a similar list, let the perf_event_pmu_context iteration skip those
that do not have cgroup events. This avoids growing multiple versions
of the perf_event_pmu_context iteration.

Measured performance (on a slightly different patch):

Before)

  $ taskset -c 0 ./perf bench sched pipe -l 10000 -G AAA,BBB
  # Running 'sched/pipe' benchmark:
  # Executed 10000 pipe operations between two processes

       Total time: 0.901 [sec]

        90.128700 usecs/op
            11095 ops/sec

After)

  $ taskset -c 0 ./perf bench sched pipe -l 10000 -G AAA,BBB
  # Running 'sched/pipe' benchmark:
  # Executed 10000 pipe operations between two processes

       Total time: 0.065 [sec]

         6.560100 usecs/op
           152436 ops/sec

Fixes: bd2756811766 ("perf: Rewrite core context handling")
Reported-by: Namhyung Kim <namhyung@kernel.org>
Debugged-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231009210425.GC6307@noisy.programming.kicks-ass.net
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f31f962a6445..0367d748fae0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -878,6 +878,7 @@ struct perf_event_pmu_context {
 	unsigned int			embedded : 1;
 
 	unsigned int			nr_events;
+	unsigned int			nr_cgroups;
 
 	atomic_t			refcount; /* event <-> epc */
 	struct rcu_head			rcu_head;
-- 
cgit v1.2.3


From f995443f01b4dbcce723539b99050ce69b319e58 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 12 Oct 2023 16:31:58 +0200
Subject: locking/seqlock: Simplify SEQCOUNT_LOCKNAME()

1. Kill the "lockmember" argument. It is always s->lock plus
   __seqprop_##lockname##_sequence() already uses s->lock and
   ignores "lockmember".

2. Kill the "lock_acquire" argument. __seqprop_##lockname##_sequence()
   can use the same "lockbase" prefix for _lock and _unlock.

Apart from line numbers, gcc -E outputs the same code.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/r/20231012143158.GA16133@redhat.com
---
 include/linux/seqlock.h | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index af518e4d0c6b..7e7109dbc3f5 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -191,11 +191,9 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
  * @lockname:		"LOCKNAME" part of seqcount_LOCKNAME_t
  * @locktype:		LOCKNAME canonical C data type
  * @preemptible:	preemptibility of above locktype
- * @lockmember:		argument for lockdep_assert_held()
- * @lockbase:		associated lock release function (prefix only)
- * @lock_acquire:	associated lock acquisition function (full call)
+ * @lockbase:		prefix for associated lock/unlock
  */
-#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember, lockbase, lock_acquire) \
+#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase)	\
 typedef struct seqcount_##lockname {					\
 	seqcount_t		seqcount;				\
 	__SEQ_LOCK(locktype	*lock);					\
@@ -216,7 +214,7 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)	\
 		return seq;						\
 									\
 	if (preemptible && unlikely(seq & 1)) {				\
-		__SEQ_LOCK(lock_acquire);				\
+		__SEQ_LOCK(lockbase##_lock(s->lock));			\
 		__SEQ_LOCK(lockbase##_unlock(s->lock));			\
 									\
 		/*							\
@@ -242,7 +240,7 @@ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s)	\
 static __always_inline void						\
 __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)		\
 {									\
-	__SEQ_LOCK(lockdep_assert_held(lockmember));			\
+	__SEQ_LOCK(lockdep_assert_held(s->lock));			\
 }
 
 /*
@@ -271,10 +269,10 @@ static inline void __seqprop_assert(const seqcount_t *s)
 
 #define __SEQ_RT	IS_ENABLED(CONFIG_PREEMPT_RT)
 
-SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    s->lock,        raw_spin, raw_spin_lock(s->lock))
-SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, s->lock,        spin,     spin_lock(s->lock))
-SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, s->lock,        read,     read_lock(s->lock))
-SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     s->lock,        mutex,    mutex_lock(s->lock))
+SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    raw_spin)
+SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, spin)
+SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, read)
+SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
 
 /*
  * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
-- 
cgit v1.2.3


From e6115c6f7a0ce3388cc60b69a284facf78b5dbfd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 12 Oct 2023 16:32:27 +0200
Subject: locking/seqlock: Change __seqprop() to return the function pointer

This simplifies the macro and makes it easy to add the new seqprop's
with 2 or more args.

Plus this way we do not lose the type info, the (void*) type cast is
no longer needed.

And the latter reveals the problem: a lot of seqcount_t helpers pass
the "const seqcount_t *s" argument to __seqprop_ptr(seqcount_t *s)
but (before this patch) "(void *)(s)" masked the problem.

So this patch changes __seqprop_ptr() and __seqprop_##lockname##_ptr()
to accept the "const LOCKNAME *s" argument. This is not nice either,
they need to drop the constness on return because these helpers are used
by both the readers and writers, but at least it is clear what's going on.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/r/20231012143227.GA16143@redhat.com
---
 include/linux/seqlock.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 7e7109dbc3f5..4b8dcd3a0d93 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -200,9 +200,9 @@ typedef struct seqcount_##lockname {					\
 } seqcount_##lockname##_t;						\
 									\
 static __always_inline seqcount_t *					\
-__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)			\
+__seqprop_##lockname##_ptr(const seqcount_##lockname##_t *s)		\
 {									\
-	return &s->seqcount;						\
+	return (void *)&s->seqcount; /* drop const */			\
 }									\
 									\
 static __always_inline unsigned						\
@@ -247,9 +247,9 @@ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)		\
  * __seqprop() for seqcount_t
  */
 
-static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
+static inline seqcount_t *__seqprop_ptr(const seqcount_t *s)
 {
-	return s;
+	return (void *)s; /* drop const */
 }
 
 static inline unsigned __seqprop_sequence(const seqcount_t *s)
@@ -292,19 +292,19 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
 #define SEQCNT_WW_MUTEX_ZERO(name, lock) 	SEQCOUNT_LOCKNAME_ZERO(name, lock)
 
 #define __seqprop_case(s, lockname, prop)				\
-	seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s))
+	seqcount_##lockname##_t: __seqprop_##lockname##_##prop
 
 #define __seqprop(s, prop) _Generic(*(s),				\
-	seqcount_t:		__seqprop_##prop((void *)(s)),		\
+	seqcount_t:		__seqprop_##prop,			\
 	__seqprop_case((s),	raw_spinlock,	prop),			\
 	__seqprop_case((s),	spinlock,	prop),			\
 	__seqprop_case((s),	rwlock,		prop),			\
 	__seqprop_case((s),	mutex,		prop))
 
-#define seqprop_ptr(s)			__seqprop(s, ptr)
-#define seqprop_sequence(s)		__seqprop(s, sequence)
-#define seqprop_preemptible(s)		__seqprop(s, preemptible)
-#define seqprop_assert(s)		__seqprop(s, assert)
+#define seqprop_ptr(s)			__seqprop(s, ptr)(s)
+#define seqprop_sequence(s)		__seqprop(s, sequence)(s)
+#define seqprop_preemptible(s)		__seqprop(s, preemptible)(s)
+#define seqprop_assert(s)		__seqprop(s, assert)(s)
 
 /**
  * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
-- 
cgit v1.2.3


From 7cbabed16464ad980c8d80b601559b36e1bc0b0c Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 31 Jul 2023 05:25:36 +0300
Subject: thunderbolt: Fix typo in enum tb_link_width kernel-doc

Typo trasmitters -> transmitters.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Gil Fine <gil.fine@linux.intel.com>
---
 include/linux/thunderbolt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 02333f47c994..6151c210d987 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -175,7 +175,7 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir);
  * enum tb_link_width - Thunderbolt/USB4 link width
  * @TB_LINK_WIDTH_SINGLE: Single lane link
  * @TB_LINK_WIDTH_DUAL: Dual lane symmetric link
- * @TB_LINK_WIDTH_ASYM_TX: Dual lane asymmetric Gen 4 link with 3 trasmitters
+ * @TB_LINK_WIDTH_ASYM_TX: Dual lane asymmetric Gen 4 link with 3 transmitters
  * @TB_LINK_WIDTH_ASYM_RX: Dual lane asymmetric Gen 4 link with 3 receivers
  */
 enum tb_link_width {
-- 
cgit v1.2.3


From 1559d14977b694570f010854b8192e6de034bc27 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 11 Oct 2023 15:02:02 +0200
Subject: gpiolib: provide gpio_device_to_device()

There are users in the kernel who need to retrieve the address of the
struct device backing the GPIO device. Currently they needlessly poke in
the internals of GPIOLIB. Add a dedicated getter function.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Peter Rosin <peda@axentia.se>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index ae4162d3f1d3..0e40098aa283 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -619,6 +619,8 @@ void gpio_device_put(struct gpio_device *gdev);
 DEFINE_FREE(gpio_device_put, struct gpio_device *,
 	    if (IS_ERR_OR_NULL(_T)) gpio_device_put(_T));
 
+struct device *gpio_device_to_device(struct gpio_device *gdev);
+
 bool gpiochip_line_is_irq(struct gpio_chip *gc, unsigned int offset);
 int gpiochip_reqres_irq(struct gpio_chip *gc, unsigned int offset);
 void gpiochip_relres_irq(struct gpio_chip *gc, unsigned int offset);
-- 
cgit v1.2.3


From 370232d096e3fe188e4596f77bc6560636bd40c1 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 11 Oct 2023 15:02:03 +0200
Subject: gpiolib: provide gpiod_to_gpio_device()

Accessing struct gpio_chip backing a GPIO device is only allowed for the
actual providers of that chip.

Similarly to how we introduced gpio_device_find() in order to replace
the abused gpiochip_find(), let's introduce a counterpart to
gpiod_to_chip() that returns a reference to the GPIO device owning the
descriptor. This is done in order to later remove gpiod_to_chip()
entirely.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Peter Rosin <peda@axentia.se>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 0e40098aa283..d231c4f31cb4 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -785,6 +785,7 @@ int gpiochip_lock_as_irq(struct gpio_chip *gc, unsigned int offset);
 void gpiochip_unlock_as_irq(struct gpio_chip *gc, unsigned int offset);
 
 struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc);
+struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc);
 
 #else /* CONFIG_GPIOLIB */
 
-- 
cgit v1.2.3


From 8c85a102fc4e5c0c942c10677fa43f7a19baa92f Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 5 Sep 2023 20:52:55 +0200
Subject: gpiolib: provide gpio_device_get_base()

Let's start adding getters for the opaque struct gpio_device. Start with
a function allowing to retrieve the base GPIO number.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index d231c4f31cb4..1d454dc944b3 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -787,6 +787,9 @@ void gpiochip_unlock_as_irq(struct gpio_chip *gc, unsigned int offset);
 struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc);
 struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc);
 
+/* struct gpio_device getters */
+int gpio_device_get_base(struct gpio_device *gdev);
+
 #else /* CONFIG_GPIOLIB */
 
 #include <linux/err.h>
-- 
cgit v1.2.3


From 384461abcab6602abc06c2dfb8fb99beeeaa12b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Fri, 4 Aug 2023 16:27:06 +0200
Subject: pwm: Manage owner assignment implicitly for drivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of requiring each driver to care for assigning the owner member
of struct pwm_ops, handle that implicitly using a macro. Note that the
owner member has to be moved to struct pwm_chip, as the ops structure
usually lives in read-only memory and so cannot be modified.

The upside is that new low level drivers cannot forget the assignment and
save one line each. The pwm-crc driver didn't assign .owner, that's not
a problem in practice though as the driver cannot be compiled as a
module.

Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com> # Intel LPSS
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com> # pwm-{bcm,brcm}*.c
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com> # sun4i
Acked-by: Andi Shyti <andi.shyti@kernel.org>
Acked-by: Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp> # pwm-visconti
Acked-by: Heiko Stuebner <heiko@sntech.de> # pwm-rockchip
Acked-by: Michael Walle <michael@walle.cc> # pwm-sl28cpld
Acked-by: Neil Armstrong <neil.armstrong@linaro.org> # pwm-meson
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20230804142707.412137-2-u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index d2f9f690a9c1..56e3b7a09824 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -267,7 +267,6 @@ struct pwm_capture {
  * @get_state: get the current PWM state. This function is only
  *	       called once per PWM device when the PWM chip is
  *	       registered.
- * @owner: helps prevent removal of modules exporting active PWMs
  */
 struct pwm_ops {
 	int (*request)(struct pwm_chip *chip, struct pwm_device *pwm);
@@ -278,13 +277,13 @@ struct pwm_ops {
 		     const struct pwm_state *state);
 	int (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm,
 			 struct pwm_state *state);
-	struct module *owner;
 };
 
 /**
  * struct pwm_chip - abstract a PWM controller
  * @dev: device providing the PWMs
  * @ops: callbacks for this PWM controller
+ * @owner: module providing this chip
  * @base: number of first PWM controlled by this chip
  * @npwm: number of PWMs controlled by this chip
  * @of_xlate: request a PWM device given a device tree PWM specifier
@@ -295,6 +294,7 @@ struct pwm_ops {
 struct pwm_chip {
 	struct device *dev;
 	const struct pwm_ops *ops;
+	struct module *owner;
 	int base;
 	unsigned int npwm;
 
@@ -386,10 +386,12 @@ int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result,
 int pwm_set_chip_data(struct pwm_device *pwm, void *data);
 void *pwm_get_chip_data(struct pwm_device *pwm);
 
-int pwmchip_add(struct pwm_chip *chip);
+int __pwmchip_add(struct pwm_chip *chip, struct module *owner);
+#define pwmchip_add(chip) __pwmchip_add(chip, THIS_MODULE)
 void pwmchip_remove(struct pwm_chip *chip);
 
-int devm_pwmchip_add(struct device *dev, struct pwm_chip *chip);
+int __devm_pwmchip_add(struct device *dev, struct pwm_chip *chip, struct module *owner);
+#define devm_pwmchip_add(dev, chip) __devm_pwmchip_add(dev, chip, THIS_MODULE)
 
 struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
 					 unsigned int index,
-- 
cgit v1.2.3


From a6e5654e0b8b53e2d0e316bc7cecb81dd8371f18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Wed, 5 Jul 2023 10:06:50 +0200
Subject: pwm: Drop pwm_[sg]et_chip_data()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The semantic of chip_data is a bit surprising as it's cleared when
pwm_put() is called. Also there is a big overlap with the standard driver
data.

All drivers were adapted to not make use of chip_data any more, so it can
go away.

Link: https://lore.kernel.org/r/20230705080650.2353391-9-u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 56e3b7a09824..e3b437587b32 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -71,7 +71,6 @@ struct pwm_state {
  * @hwpwm: per-chip relative index of the PWM device
  * @pwm: global index of the PWM device
  * @chip: PWM chip providing this PWM device
- * @chip_data: chip-private data associated with the PWM device
  * @args: PWM arguments
  * @state: last applied state
  * @last: last implemented state (for PWM_DEBUG)
@@ -82,7 +81,6 @@ struct pwm_device {
 	unsigned int hwpwm;
 	unsigned int pwm;
 	struct pwm_chip *chip;
-	void *chip_data;
 
 	struct pwm_args args;
 	struct pwm_state state;
@@ -383,8 +381,6 @@ static inline void pwm_disable(struct pwm_device *pwm)
 /* PWM provider APIs */
 int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result,
 		unsigned long timeout);
-int pwm_set_chip_data(struct pwm_device *pwm, void *data);
-void *pwm_get_chip_data(struct pwm_device *pwm);
 
 int __pwmchip_add(struct pwm_chip *chip, struct module *owner);
 #define pwmchip_add(chip) __pwmchip_add(chip, THIS_MODULE)
@@ -447,16 +443,6 @@ static inline int pwm_capture(struct pwm_device *pwm,
 	return -EINVAL;
 }
 
-static inline int pwm_set_chip_data(struct pwm_device *pwm, void *data)
-{
-	return -EINVAL;
-}
-
-static inline void *pwm_get_chip_data(struct pwm_device *pwm)
-{
-	return NULL;
-}
-
 static inline int pwmchip_add(struct pwm_chip *chip)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From 38985e8c278b82e6d4d62d4acd57c761cc23ce63 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Mon, 9 Oct 2023 13:06:08 +0300
Subject: net: Handle bulk delete policy in bridge driver

The merge commit 92716869375b ("Merge branch 'br-flush-filtering'")
added support for FDB flushing in bridge driver. The following patches
will extend VXLAN driver to support FDB flushing as well. The netlink
message for bulk delete is shared between the drivers. With the existing
implementation, there is no way to prevent user from flushing with
attributes that are not supported per driver. For example, when VNI will
be added, user will not get an error for flush FDB entries in bridge
with VNI, although this attribute is not relevant for bridge.

As preparation for support of FDB flush in VXLAN driver, move the policy
to be handled in bridge driver, later a new policy for VXLAN will be
added in VXLAN driver. Do not pass 'vid' as part of ndo_fdb_del_bulk(),
as this field is relevant only for bridge.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ae553f886796..1c7681263d30 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1287,9 +1287,7 @@ struct netdev_net_notifier {
  *		      struct net_device *dev,
  *		      const unsigned char *addr, u16 vid)
  *	Deletes the FDB entry from dev coresponding to addr.
- * int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[],
- *			   struct net_device *dev,
- *			   u16 vid,
+ * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev,
  *			   struct netlink_ext_ack *extack);
  * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
  *		       struct net_device *dev, struct net_device *filter_dev,
@@ -1564,10 +1562,8 @@ struct net_device_ops {
 					       struct net_device *dev,
 					       const unsigned char *addr,
 					       u16 vid, struct netlink_ext_ack *extack);
-	int			(*ndo_fdb_del_bulk)(struct ndmsg *ndm,
-						    struct nlattr *tb[],
+	int			(*ndo_fdb_del_bulk)(struct nlmsghdr *nlh,
 						    struct net_device *dev,
-						    u16 vid,
 						    struct netlink_ext_ack *extack);
 	int			(*ndo_fdb_dump)(struct sk_buff *skb,
 						struct netlink_callback *cb,
-- 
cgit v1.2.3


From 03adc61edad49e1bbecfb53f7ea5d78f398fe368 Mon Sep 17 00:00:00 2001
From: Dan Clash <daclash@linux.microsoft.com>
Date: Thu, 12 Oct 2023 14:55:18 -0700
Subject: audit,io_uring: io_uring openat triggers audit reference count
 underflow

An io_uring openat operation can update an audit reference count
from multiple threads resulting in the call trace below.

A call to io_uring_submit() with a single openat op with a flag of
IOSQE_ASYNC results in the following reference count updates.

These first part of the system call performs two increments that do not race.

do_syscall_64()
  __do_sys_io_uring_enter()
    io_submit_sqes()
      io_openat_prep()
        __io_openat_prep()
          getname()
            getname_flags()       /* update 1 (increment) */
              __audit_getname()   /* update 2 (increment) */

The openat op is queued to an io_uring worker thread which starts the
opportunity for a race.  The system call exit performs one decrement.

do_syscall_64()
  syscall_exit_to_user_mode()
    syscall_exit_to_user_mode_prepare()
      __audit_syscall_exit()
        audit_reset_context()
           putname()              /* update 3 (decrement) */

The io_uring worker thread performs one increment and two decrements.
These updates can race with the system call decrement.

io_wqe_worker()
  io_worker_handle_work()
    io_wq_submit_work()
      io_issue_sqe()
        io_openat()
          io_openat2()
            do_filp_open()
              path_openat()
                __audit_inode()   /* update 4 (increment) */
            putname()             /* update 5 (decrement) */
        __audit_uring_exit()
          audit_reset_context()
            putname()             /* update 6 (decrement) */

The fix is to change the refcnt member of struct audit_names
from int to atomic_t.

kernel BUG at fs/namei.c:262!
Call Trace:
...
 ? putname+0x68/0x70
 audit_reset_context.part.0.constprop.0+0xe1/0x300
 __audit_uring_exit+0xda/0x1c0
 io_issue_sqe+0x1f3/0x450
 ? lock_timer_base+0x3b/0xd0
 io_wq_submit_work+0x8d/0x2b0
 ? __try_to_del_timer_sync+0x67/0xa0
 io_worker_handle_work+0x17c/0x2b0
 io_wqe_worker+0x10a/0x350

Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/
Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
Signed-off-by: Dan Clash <daclash@linux.microsoft.com>
Link: https://lore.kernel.org/r/20231012215518.GA4048@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b528f063e8ff..4a40823c3c67 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2403,7 +2403,7 @@ struct audit_names;
 struct filename {
 	const char		*name;	/* pointer to actual string */
 	const __user char	*uptr;	/* original userland pointer */
-	int			refcnt;
+	atomic_t		refcnt;
 	struct audit_names	*aname;
 	const char		iname[];
 };
-- 
cgit v1.2.3


From 236334aeec0f93217cf9235f2004e61a0a1a5985 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Fri, 13 Oct 2023 08:39:16 +0000
Subject: bpf: Avoid unnecessary audit log for CPU security mitigations

Check cpu_mitigations_off() first to avoid calling capable() if it is off.
This can avoid unnecessary audit log.

Fixes: bc5bc309db45 ("bpf: Inherit system settings for CPU security mitigations")
Suggested-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/CAEf4Bza6UVUWqcWQ-66weZ-nMDr+TFU3Mtq=dumZFD-pSqU7Ow@mail.gmail.com/
Link: https://lore.kernel.org/bpf/20231013083916.4199-1-laoar.shao@gmail.com
---
 include/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 61bde4520f5c..f0891ba24cb1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2164,12 +2164,12 @@ static inline bool bpf_allow_uninit_stack(void)
 
 static inline bool bpf_bypass_spec_v1(void)
 {
-	return perfmon_capable() || cpu_mitigations_off();
+	return cpu_mitigations_off() || perfmon_capable();
 }
 
 static inline bool bpf_bypass_spec_v4(void)
 {
-	return perfmon_capable() || cpu_mitigations_off();
+	return cpu_mitigations_off() || perfmon_capable();
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
-- 
cgit v1.2.3


From 84aefafe6b294041b7fa0757414c4a29c1bdeea2 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 30 Sep 2023 15:14:26 -0700
Subject: clk: linux/clk-provider.h: fix kernel-doc warnings and typos

Fix spelling of "Structure".

Fix multiple kernel-doc warnings:

clk-provider.h:269: warning: Function parameter or member 'recalc_rate' not described in 'clk_ops'
clk-provider.h:468: warning: Function parameter or member 'parent_data' not described in 'clk_hw_register_fixed_rate_with_accuracy_parent_data'
clk-provider.h:468: warning: Excess function parameter 'parent_name' description in 'clk_hw_register_fixed_rate_with_accuracy_parent_data'
clk-provider.h:482: warning: Function parameter or member 'parent_data' not described in 'clk_hw_register_fixed_rate_parent_accuracy'
clk-provider.h:482: warning: Excess function parameter 'parent_name' description in 'clk_hw_register_fixed_rate_parent_accuracy'
clk-provider.h:687: warning: Function parameter or member 'flags' not described in 'clk_divider'
clk-provider.h:1164: warning: Function parameter or member 'flags' not described in 'clk_fractional_divider'
clk-provider.h:1164: warning: Function parameter or member 'approximation' not described in 'clk_fractional_divider'
clk-provider.h:1213: warning: Function parameter or member 'flags' not described in 'clk_multiplier'

Fixes: 9fba738a53dd ("clk: add duty cycle support")
Fixes: b2476490ef11 ("clk: introduce the common clock framework")
Fixes: 2d34f09e79c9 ("clk: fixed-rate: Add support for specifying parents via DT/pointers")
Fixes: f5290d8e4f0c ("clk: asm9260: use parent index to link the reference clock")
Fixes: 9d9f78ed9af0 ("clk: basic clock hardware types")
Fixes: e2d0e90fae82 ("clk: new basic clk type for fractional divider")
Fixes: f2e0a53271a4 ("clk: Add a basic multiplier clock")

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Michael Turquette <mturquette@baylibre.com>
Cc: Stephen Boyd <sboyd@kernel.org>
Cc: linux-clk@vger.kernel.org
Link: https://lore.kernel.org/r/20230930221428.18463-1-rdunlap@infradead.org
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk-provider.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index ec32ec58c59f..ace3a4ce2fc9 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -74,7 +74,7 @@ void clk_hw_forward_rate_request(const struct clk_hw *core,
 				 unsigned long parent_rate);
 
 /**
- * struct clk_duty - Struture encoding the duty cycle ratio of a clock
+ * struct clk_duty - Structure encoding the duty cycle ratio of a clock
  *
  * @num:	Numerator of the duty cycle ratio
  * @den:	Denominator of the duty cycle ratio
@@ -129,7 +129,7 @@ struct clk_duty {
  * @restore_context: Restore the context of the clock after a restoration
  *		of power.
  *
- * @recalc_rate	Recalculate the rate of this clock, by querying hardware. The
+ * @recalc_rate: Recalculate the rate of this clock, by querying hardware. The
  *		parent rate is an input parameter.  It is up to the caller to
  *		ensure that the prepare_mutex is held across this call. If the
  *		driver cannot figure out a rate for this clock, it must return
@@ -456,7 +456,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name,
  * clock with the clock framework
  * @dev: device that is registering this clock
  * @name: name of this clock
- * @parent_name: name of clock's parent
+ * @parent_data: name of clock's parent
  * @flags: framework-specific flags
  * @fixed_rate: non-adjustable clock rate
  * @fixed_accuracy: non-adjustable clock accuracy
@@ -471,7 +471,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name,
  * the clock framework
  * @dev: device that is registering this clock
  * @name: name of this clock
- * @parent_name: name of clock's parent
+ * @parent_data: name of clock's parent
  * @flags: framework-specific flags
  * @fixed_rate: non-adjustable clock rate
  */
@@ -649,7 +649,7 @@ struct clk_div_table {
  * Clock with an adjustable divider affecting its output frequency.  Implements
  * .recalc_rate, .set_rate and .round_rate
  *
- * Flags:
+ * @flags:
  * CLK_DIVIDER_ONE_BASED - by default the divisor is the value read from the
  *	register plus one.  If CLK_DIVIDER_ONE_BASED is set then the divider is
  *	the raw value read from the register, with the value of zero considered
@@ -1130,11 +1130,12 @@ struct clk_hw *clk_hw_register_fixed_factor_parent_hw(struct device *dev,
  * @mwidth:	width of the numerator bit field
  * @nshift:	shift to the denominator bit field
  * @nwidth:	width of the denominator bit field
+ * @approximation: clk driver's callback for calculating the divider clock
  * @lock:	register lock
  *
  * Clock with adjustable fractional divider affecting its output frequency.
  *
- * Flags:
+ * @flags:
  * CLK_FRAC_DIVIDER_ZERO_BASED - by default the numerator and denominator
  *	is the value read from the register. If CLK_FRAC_DIVIDER_ZERO_BASED
  *	is set then the numerator and denominator are both the value read
@@ -1191,7 +1192,7 @@ void clk_hw_unregister_fractional_divider(struct clk_hw *hw);
  * Clock with an adjustable multiplier affecting its output frequency.
  * Implements .recalc_rate, .set_rate and .round_rate
  *
- * Flags:
+ * @flags:
  * CLK_MULTIPLIER_ZERO_BYPASS - By default, the multiplier is the value read
  *	from the register, with 0 being a valid value effectively
  *	zeroing the output clock rate. If CLK_MULTIPLIER_ZERO_BYPASS is
-- 
cgit v1.2.3


From 787650cc335201a0489905c5504a9179470ebc51 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 13 Oct 2023 20:04:12 -0700
Subject: Input: Annotate struct ff_device with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for
array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ff_device.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20231006201739.work.350-kees@kernel.org
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/input.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index 49790c1bd2c4..de6503c0edb8 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -562,7 +562,7 @@ struct ff_device {
 
 	int max_effects;
 	struct ff_effect *effects;
-	struct file *effect_owners[];
+	struct file *effect_owners[] __counted_by(max_effects);
 };
 
 int input_ff_create(struct input_dev *dev, unsigned int max_effects);
-- 
cgit v1.2.3


From 5b90073defd1a52aa8120403d79f6e0fc10c87ee Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Sun, 8 Oct 2023 20:36:17 +0800
Subject: crypto: hisilicon/qm - alloc buffer to set and get xqc

If the temporarily applied memory is used to set or get the xqc
information, the driver releases the memory immediately after the
hardware mailbox operation time exceeds the driver waiting time.
However, the hardware does not cancel the operation, so the hardware
may write data to released memory.

Therefore, when the driver is bound to a device, the driver reserves
memory for the xqc configuration. The subsequent xqc configuration
uses the reserved memory to prevent hardware from accessing the
released memory.

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/hisi_acc_qm.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 34c64a02712c..44e0c44a2e20 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -292,6 +292,18 @@ struct qm_err_isolate {
 	struct list_head qm_hw_errs;
 };
 
+struct qm_rsv_buf {
+	struct qm_sqc *sqc;
+	struct qm_cqc *cqc;
+	struct qm_eqc *eqc;
+	struct qm_aeqc *aeqc;
+	dma_addr_t sqc_dma;
+	dma_addr_t cqc_dma;
+	dma_addr_t eqc_dma;
+	dma_addr_t aeqc_dma;
+	struct qm_dma qcdma;
+};
+
 struct hisi_qm {
 	enum qm_hw_ver ver;
 	enum qm_fun_type fun_type;
@@ -324,6 +336,7 @@ struct hisi_qm {
 	dma_addr_t cqc_dma;
 	dma_addr_t eqe_dma;
 	dma_addr_t aeqe_dma;
+	struct qm_rsv_buf xqc_buf;
 
 	struct hisi_qm_status status;
 	const struct hisi_qm_err_ini *err_ini;
-- 
cgit v1.2.3


From 886ee55eabac0d46faf8bc0b22207ca2740847ba Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 13 Oct 2023 10:15:46 +0200
Subject: locking/seqlock: Propagate 'const' pointers within read-only methods,
 remove forced type casts

Currently __seqprop_ptr() is an inline function that must chose to either
use 'const' or non-const seqcount related pointers - but this results in
the undesirable loss of 'const' propagation, via a forced type cast.

The easiest solution would be to turn the pointer wrappers into macros that
pass through whatever type is passed to them - but the clever maze of
seqlock API instantiation macros relies on the GCC CPP '##' macro
extension, which isn't recursive, so inline functions must be used here.

So create two wrapper variants instead: 'ptr' and 'const_ptr', and pick the
right one for the codepaths that are const: read_seqcount_begin() and
read_seqcount_retry().

This cleans up type handling and allows the removal of all type forcing.

No change in functionality.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/seqlock.h | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 4b8dcd3a0d93..80f21d2ca2aa 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -200,9 +200,15 @@ typedef struct seqcount_##lockname {					\
 } seqcount_##lockname##_t;						\
 									\
 static __always_inline seqcount_t *					\
-__seqprop_##lockname##_ptr(const seqcount_##lockname##_t *s)		\
+__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)			\
 {									\
-	return (void *)&s->seqcount; /* drop const */			\
+	return &s->seqcount;						\
+}									\
+									\
+static __always_inline const seqcount_t *				\
+__seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s)	\
+{									\
+	return &s->seqcount;						\
 }									\
 									\
 static __always_inline unsigned						\
@@ -247,9 +253,14 @@ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)		\
  * __seqprop() for seqcount_t
  */
 
-static inline seqcount_t *__seqprop_ptr(const seqcount_t *s)
+static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
+{
+	return s;
+}
+
+static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s)
 {
-	return (void *)s; /* drop const */
+	return s;
 }
 
 static inline unsigned __seqprop_sequence(const seqcount_t *s)
@@ -302,6 +313,7 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
 	__seqprop_case((s),	mutex,		prop))
 
 #define seqprop_ptr(s)			__seqprop(s, ptr)(s)
+#define seqprop_const_ptr(s)		__seqprop(s, const_ptr)(s)
 #define seqprop_sequence(s)		__seqprop(s, sequence)(s)
 #define seqprop_preemptible(s)		__seqprop(s, preemptible)(s)
 #define seqprop_assert(s)		__seqprop(s, assert)(s)
@@ -353,7 +365,7 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
  */
 #define read_seqcount_begin(s)						\
 ({									\
-	seqcount_lockdep_reader_access(seqprop_ptr(s));			\
+	seqcount_lockdep_reader_access(seqprop_const_ptr(s));		\
 	raw_read_seqcount_begin(s);					\
 })
 
@@ -419,7 +431,7 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
  * Return: true if a read section retry is required, else false
  */
 #define __read_seqcount_retry(s, start)					\
-	do___read_seqcount_retry(seqprop_ptr(s), start)
+	do___read_seqcount_retry(seqprop_const_ptr(s), start)
 
 static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
 {
@@ -439,7 +451,7 @@ static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
  * Return: true if a read section retry is required, else false
  */
 #define read_seqcount_retry(s, start)					\
-	do_read_seqcount_retry(seqprop_ptr(s), start)
+	do_read_seqcount_retry(seqprop_const_ptr(s), start)
 
 static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start)
 {
-- 
cgit v1.2.3


From 3f7f31fff2510272334f3d0374c432bdaa4f1536 Mon Sep 17 00:00:00 2001
From: Wei Zhang <weizhang@nvidia.com>
Date: Thu, 12 Oct 2023 12:27:36 -0700
Subject: net/mlx5: Parallelize vhca event handling

At present, mlx5 driver have a general purpose
event handler which not only handles vhca event
but also many other events. This incurs a huge
bottleneck because the event handler is
implemented by single threaded workqueue and all
events are forced to be handled in serial manner
even though application tries to create multiple
SFs simultaneously.

Introduce a dedicated vhca event handler which
manages SFs parallel creation.

Signed-off-by: Wei Zhang <weizhang@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 52e982bc0f50..7968c5ee85c4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -615,6 +615,7 @@ struct mlx5_priv {
 	int			adev_idx;
 	int			sw_vhca_id;
 	struct mlx5_events      *events;
+	struct mlx5_vhca_events *vhca_events;
 
 	struct mlx5_flow_steering *steering;
 	struct mlx5_mpfs        *mpfs;
-- 
cgit v1.2.3


From e534552c92a44690e48593f9567fe689545ded73 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Thu, 12 Oct 2023 12:27:39 -0700
Subject: net/mlx5: Refactor LAG peer device lookout bus logic to mlx5 devcom

LAG peer device lookout bus logic required the usage of global lock,
mlx5_intf_mutex.
As part of the effort to remove this global lock, refactor LAG peer
device lookout to use mlx5 devcom layer.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7968c5ee85c4..f60cdc9bd40f 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -624,6 +624,7 @@ struct mlx5_priv {
 	struct mlx5_lag		*lag;
 	u32			flags;
 	struct mlx5_devcom_dev	*devc;
+	struct mlx5_devcom_comp_dev *hca_devcom_comp;
 	struct mlx5_fw_reset	*fw_reset;
 	struct mlx5_core_roce	roce;
 	struct mlx5_fc_stats		fc_stats;
-- 
cgit v1.2.3


From 0d2d6bc7e74fee586f587d33484b797bb78f334c Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Thu, 12 Oct 2023 12:27:41 -0700
Subject: net/mlx5: Remove unused declaration

Commit 2ac9cfe78223 ("net/mlx5e: IPSec, Add Innova IPSec offload TX data path")
declared mlx5e_ipsec_inverse_table_init() but never implemented it.
Commit f52f2faee581 ("net/mlx5e: Introduce flow steering API")
declared mlx5e_fs_set_tc() but never implemented it.
Commit f2f3df550139 ("net/mlx5: EQ, Privatize eq_table and friends")
declared mlx5_eq_comp_cpumask() but never implemented it.
Commit cac1eb2cf2e3 ("net/mlx5: Lag, properly lock eswitch if needed")
removed mlx5_lag_update() but not its declaration.
Commit 35ba005d820b ("net/mlx5: DR, Set flex parser for TNL_MPLS dynamically")
removed mlx5dr_ste_build_tnl_mpls() but not its declaration.

Commit e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
declared but never implemented mlx5_alloc_cmd_mailbox_chain() and mlx5_free_cmd_mailbox_chain().
Commit 0cf53c124756 ("net/mlx5: FWPage, Use async events chain")
removed mlx5_core_req_pages_handler() but not its declaration.
Commit 938fe83c8dcb ("net/mlx5_core: New device capabilities handling")
removed mlx5_query_odp_caps() but not its declaration.
Commit f6a8a19bb11b ("RDMA/netdev: Hoist alloc_netdev_mqs out of the driver")
removed mlx5_rdma_netdev_alloc() but not its declaration.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f60cdc9bd40f..d2b8d4a74a30 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1043,10 +1043,6 @@ void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
 int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
 			     struct mlx5_frag_buf *buf, int node);
 void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf);
-struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
-						      gfp_t flags, int npages);
-void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
-				 struct mlx5_cmd_mailbox *head);
 int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in,
 			  int inlen);
 int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, u32 mkey);
@@ -1060,8 +1056,6 @@ void mlx5_pagealloc_start(struct mlx5_core_dev *dev);
 void mlx5_pagealloc_stop(struct mlx5_core_dev *dev);
 void mlx5_pages_debugfs_init(struct mlx5_core_dev *dev);
 void mlx5_pages_debugfs_cleanup(struct mlx5_core_dev *dev);
-void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id,
-				 s32 npages, bool ec_function);
 int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot);
 int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev);
 void mlx5_register_debugfs(void);
@@ -1101,8 +1095,6 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn,
 int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num);
 __be32 mlx5_core_get_terminate_scatter_list_mkey(struct mlx5_core_dev *dev);
 void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common);
-int mlx5_query_odp_caps(struct mlx5_core_dev *dev,
-			struct mlx5_odp_caps *odp_caps);
 
 int mlx5_init_rl_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
@@ -1204,12 +1196,6 @@ int mlx5_sriov_blocking_notifier_register(struct mlx5_core_dev *mdev,
 void mlx5_sriov_blocking_notifier_unregister(struct mlx5_core_dev *mdev,
 					     int vf_id,
 					     struct notifier_block *nb);
-#ifdef CONFIG_MLX5_CORE_IPOIB
-struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
-					  struct ib_device *ibdev,
-					  const char *name,
-					  void (*setup)(struct net_device *));
-#endif /* CONFIG_MLX5_CORE_IPOIB */
 int mlx5_rdma_rn_get_params(struct mlx5_core_dev *mdev,
 			    struct ib_device *device,
 			    struct rdma_netdev_alloc_params *params);
-- 
cgit v1.2.3


From 57f728d59f005dffdbb52a03531e480a71599bc5 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 31 Jul 2023 22:08:17 -0700
Subject: cpumask: kernel-doc cleanups and additions

Clean up some punctutation and abbreviations.
Add kernel-doc notation for one function and function return value
for 39 functions.

cpumask.h:
Fix some punctuation (plural vs. possessive).
Fix some abbreviations (ie. -> i.e., id -> ID).

Fix 35 warnings like this:
include/linux/cpumask.h:161: warning: No description found for return value of 'cpumask_first'

cpumask.c:
Add Return: value for 4 functions.
Add kernel-doc for cpumask_any_distribute().

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 113 +++++++++++++++++++++++++++++-------------------
 1 file changed, 69 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index f10fb87d49db..cfb545841a2c 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -4,7 +4,7 @@
 
 /*
  * Cpumasks provide a bitmap suitable for representing the
- * set of CPU's in a system, one bit position per CPU number.  In general,
+ * set of CPUs in a system, one bit position per CPU number.  In general,
  * only nr_cpu_ids (<= NR_CPUS) bits are valid.
  */
 #include <linux/kernel.h>
@@ -97,7 +97,7 @@ static inline void set_nr_cpu_ids(unsigned int nr)
  *
  *  If !CONFIG_HOTPLUG_CPU, present == possible, and active == online.
  *
- *  The cpu_possible_mask is fixed at boot time, as the set of CPU id's
+ *  The cpu_possible_mask is fixed at boot time, as the set of CPU IDs
  *  that it is possible might ever be plugged in at anytime during the
  *  life of that system boot.  The cpu_present_mask is dynamic(*),
  *  representing which CPUs are currently plugged in.  And
@@ -112,7 +112,7 @@ static inline void set_nr_cpu_ids(unsigned int nr)
  *      hotplug, it's a copy of cpu_possible_mask, hence fixed at boot.
  *
  * Subtleties:
- * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
+ * 1) UP ARCHes (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
  *    assumption that their single CPU is online.  The UP
  *    cpu_{online,possible,present}_masks are placebos.  Changing them
  *    will have no useful affect on the following num_*_cpus()
@@ -155,7 +155,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu)
  * cpumask_first - get the first cpu in a cpumask
  * @srcp: the cpumask pointer
  *
- * Returns >= nr_cpu_ids if no cpus set.
+ * Return: >= nr_cpu_ids if no cpus set.
  */
 static inline unsigned int cpumask_first(const struct cpumask *srcp)
 {
@@ -166,7 +166,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
  * cpumask_first_zero - get the first unset cpu in a cpumask
  * @srcp: the cpumask pointer
  *
- * Returns >= nr_cpu_ids if all cpus are set.
+ * Return: >= nr_cpu_ids if all cpus are set.
  */
 static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
 {
@@ -178,7 +178,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
  * @srcp1: the first input
  * @srcp2: the second input
  *
- * Returns >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
+ * Return: >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
  */
 static inline
 unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
@@ -190,7 +190,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask
  * cpumask_last - get the last CPU in a cpumask
  * @srcp:	- the cpumask pointer
  *
- * Returns	>= nr_cpumask_bits if no CPUs set.
+ * Return:	>= nr_cpumask_bits if no CPUs set.
  */
 static inline unsigned int cpumask_last(const struct cpumask *srcp)
 {
@@ -199,10 +199,10 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp)
 
 /**
  * cpumask_next - get the next cpu in a cpumask
- * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @n: the cpu prior to the place to search (i.e. return will be > @n)
  * @srcp: the cpumask pointer
  *
- * Returns >= nr_cpu_ids if no further cpus set.
+ * Return: >= nr_cpu_ids if no further cpus set.
  */
 static inline
 unsigned int cpumask_next(int n, const struct cpumask *srcp)
@@ -215,10 +215,10 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp)
 
 /**
  * cpumask_next_zero - get the next unset cpu in a cpumask
- * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @n: the cpu prior to the place to search (i.e. return will be > @n)
  * @srcp: the cpumask pointer
  *
- * Returns >= nr_cpu_ids if no further cpus unset.
+ * Return: >= nr_cpu_ids if no further cpus unset.
  */
 static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 {
@@ -254,11 +254,11 @@ unsigned int cpumask_any_distribute(const struct cpumask *srcp);
 
 /**
  * cpumask_next_and - get the next cpu in *src1p & *src2p
- * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @n: the cpu prior to the place to search (i.e. return will be > @n)
  * @src1p: the first cpumask pointer
  * @src2p: the second cpumask pointer
  *
- * Returns >= nr_cpu_ids if no further cpus set in both.
+ * Return: >= nr_cpu_ids if no further cpus set in both.
  */
 static inline
 unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
@@ -373,7 +373,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
  * @cpu: the cpu to ignore.
  *
  * Often used to find any cpu but smp_processor_id() in a mask.
- * Returns >= nr_cpu_ids if no cpus set.
+ * Return: >= nr_cpu_ids if no cpus set.
  */
 static inline
 unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
@@ -388,11 +388,11 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 }
 
 /**
- * cpumask_nth - get the first cpu in a cpumask
+ * cpumask_nth - get the Nth cpu in a cpumask
  * @srcp: the cpumask pointer
- * @cpu: the N'th cpu to find, starting from 0
+ * @cpu: the Nth cpu to find, starting from 0
  *
- * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ * Return: >= nr_cpu_ids if such cpu doesn't exist.
  */
 static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
 {
@@ -400,12 +400,12 @@ static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *s
 }
 
 /**
- * cpumask_nth_and - get the first cpu in 2 cpumasks
+ * cpumask_nth_and - get the Nth cpu in 2 cpumasks
  * @srcp1: the cpumask pointer
  * @srcp2: the cpumask pointer
- * @cpu: the N'th cpu to find, starting from 0
+ * @cpu: the Nth cpu to find, starting from 0
  *
- * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ * Return: >= nr_cpu_ids if such cpu doesn't exist.
  */
 static inline
 unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
@@ -416,12 +416,12 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
 }
 
 /**
- * cpumask_nth_andnot - get the first cpu set in 1st cpumask, and clear in 2nd.
+ * cpumask_nth_andnot - get the Nth cpu set in 1st cpumask, and clear in 2nd.
  * @srcp1: the cpumask pointer
  * @srcp2: the cpumask pointer
- * @cpu: the N'th cpu to find, starting from 0
+ * @cpu: the Nth cpu to find, starting from 0
  *
- * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ * Return: >= nr_cpu_ids if such cpu doesn't exist.
  */
 static inline
 unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
@@ -436,9 +436,9 @@ unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
  * @srcp1: the cpumask pointer
  * @srcp2: the cpumask pointer
  * @srcp3: the cpumask pointer
- * @cpu: the N'th cpu to find, starting from 0
+ * @cpu: the Nth cpu to find, starting from 0
  *
- * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ * Return: >= nr_cpu_ids if such cpu doesn't exist.
  */
 static __always_inline
 unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1,
@@ -497,7 +497,7 @@ static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
  * @cpu: cpu number (< nr_cpu_ids)
  * @cpumask: the cpumask pointer
  *
- * Returns true if @cpu is set in @cpumask, else returns false
+ * Return: true if @cpu is set in @cpumask, else returns false
  */
 static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
 {
@@ -509,9 +509,9 @@ static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpum
  * @cpu: cpu number (< nr_cpu_ids)
  * @cpumask: the cpumask pointer
  *
- * Returns true if @cpu is set in old bitmap of @cpumask, else returns false
- *
  * test_and_set_bit wrapper for cpumasks.
+ *
+ * Return: true if @cpu is set in old bitmap of @cpumask, else returns false
  */
 static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
 {
@@ -523,9 +523,9 @@ static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cp
  * @cpu: cpu number (< nr_cpu_ids)
  * @cpumask: the cpumask pointer
  *
- * Returns true if @cpu is set in old bitmap of @cpumask, else returns false
- *
  * test_and_clear_bit wrapper for cpumasks.
+ *
+ * Return: true if @cpu is set in old bitmap of @cpumask, else returns false
  */
 static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
 {
@@ -560,7 +560,7 @@ static inline void cpumask_clear(struct cpumask *dstp)
  * @src1p: the first input
  * @src2p: the second input
  *
- * If *@dstp is empty, returns false, else returns true
+ * Return: false if *@dstp is empty, else returns true
  */
 static inline bool cpumask_and(struct cpumask *dstp,
 			       const struct cpumask *src1p,
@@ -603,7 +603,7 @@ static inline void cpumask_xor(struct cpumask *dstp,
  * @src1p: the first input
  * @src2p: the second input
  *
- * If *@dstp is empty, returns false, else returns true
+ * Return: false if *@dstp is empty, else returns true
  */
 static inline bool cpumask_andnot(struct cpumask *dstp,
 				  const struct cpumask *src1p,
@@ -617,6 +617,8 @@ static inline bool cpumask_andnot(struct cpumask *dstp,
  * cpumask_equal - *src1p == *src2p
  * @src1p: the first input
  * @src2p: the second input
+ *
+ * Return: true if the cpumasks are equal, false if not
  */
 static inline bool cpumask_equal(const struct cpumask *src1p,
 				const struct cpumask *src2p)
@@ -630,6 +632,9 @@ static inline bool cpumask_equal(const struct cpumask *src1p,
  * @src1p: the first input
  * @src2p: the second input
  * @src3p: the third input
+ *
+ * Return: true if first cpumask ORed with second cpumask == third cpumask,
+ *	   otherwise false
  */
 static inline bool cpumask_or_equal(const struct cpumask *src1p,
 				    const struct cpumask *src2p,
@@ -643,6 +648,9 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p,
  * cpumask_intersects - (*src1p & *src2p) != 0
  * @src1p: the first input
  * @src2p: the second input
+ *
+ * Return: true if first cpumask ANDed with second cpumask is non-empty,
+ *	   otherwise false
  */
 static inline bool cpumask_intersects(const struct cpumask *src1p,
 				     const struct cpumask *src2p)
@@ -656,7 +664,7 @@ static inline bool cpumask_intersects(const struct cpumask *src1p,
  * @src1p: the first input
  * @src2p: the second input
  *
- * Returns true if *@src1p is a subset of *@src2p, else returns false
+ * Return: true if *@src1p is a subset of *@src2p, else returns false
  */
 static inline bool cpumask_subset(const struct cpumask *src1p,
 				 const struct cpumask *src2p)
@@ -668,6 +676,8 @@ static inline bool cpumask_subset(const struct cpumask *src1p,
 /**
  * cpumask_empty - *srcp == 0
  * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear.
+ *
+ * Return: true if srcp is empty (has no bits set), else false
  */
 static inline bool cpumask_empty(const struct cpumask *srcp)
 {
@@ -677,6 +687,8 @@ static inline bool cpumask_empty(const struct cpumask *srcp)
 /**
  * cpumask_full - *srcp == 0xFFFFFFFF...
  * @srcp: the cpumask to that all cpus < nr_cpu_ids are set.
+ *
+ * Return: true if srcp is full (has all bits set), else false
  */
 static inline bool cpumask_full(const struct cpumask *srcp)
 {
@@ -686,6 +698,8 @@ static inline bool cpumask_full(const struct cpumask *srcp)
 /**
  * cpumask_weight - Count of bits in *srcp
  * @srcp: the cpumask to count bits (< nr_cpu_ids) in.
+ *
+ * Return: count of bits set in *srcp
  */
 static inline unsigned int cpumask_weight(const struct cpumask *srcp)
 {
@@ -696,6 +710,8 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp)
  * cpumask_weight_and - Count of bits in (*srcp1 & *srcp2)
  * @srcp1: the cpumask to count bits (< nr_cpu_ids) in.
  * @srcp2: the cpumask to count bits (< nr_cpu_ids) in.
+ *
+ * Return: count of bits set in both *srcp1 and *srcp2
  */
 static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1,
 						const struct cpumask *srcp2)
@@ -744,7 +760,7 @@ static inline void cpumask_copy(struct cpumask *dstp,
  * cpumask_any - pick a "random" cpu from *srcp
  * @srcp: the input cpumask
  *
- * Returns >= nr_cpu_ids if no cpus set.
+ * Return: >= nr_cpu_ids if no cpus set.
  */
 #define cpumask_any(srcp) cpumask_first(srcp)
 
@@ -753,7 +769,7 @@ static inline void cpumask_copy(struct cpumask *dstp,
  * @mask1: the first input cpumask
  * @mask2: the second input cpumask
  *
- * Returns >= nr_cpu_ids if no cpus set.
+ * Return: >= nr_cpu_ids if no cpus set.
  */
 #define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2))
 
@@ -769,7 +785,7 @@ static inline void cpumask_copy(struct cpumask *dstp,
  * @len: the length of the buffer
  * @dstp: the cpumask to set.
  *
- * Returns -errno, or 0 for success.
+ * Return: -errno, or 0 for success.
  */
 static inline int cpumask_parse_user(const char __user *buf, int len,
 				     struct cpumask *dstp)
@@ -783,7 +799,7 @@ static inline int cpumask_parse_user(const char __user *buf, int len,
  * @len: the length of the buffer
  * @dstp: the cpumask to set.
  *
- * Returns -errno, or 0 for success.
+ * Return: -errno, or 0 for success.
  */
 static inline int cpumask_parselist_user(const char __user *buf, int len,
 				     struct cpumask *dstp)
@@ -797,7 +813,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len,
  * @buf: the buffer to extract from
  * @dstp: the cpumask to set.
  *
- * Returns -errno, or 0 for success.
+ * Return: -errno, or 0 for success.
  */
 static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
 {
@@ -809,7 +825,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
  * @buf: the buffer to extract from
  * @dstp: the cpumask to set.
  *
- * Returns -errno, or 0 for success.
+ * Return: -errno, or 0 for success.
  */
 static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
 {
@@ -817,7 +833,9 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
 }
 
 /**
- * cpumask_size - size to allocate for a 'struct cpumask' in bytes
+ * cpumask_size - calculate size to allocate for a 'struct cpumask' in bytes
+ *
+ * Return: size to allocate for a &struct cpumask in bytes
  */
 static inline unsigned int cpumask_size(void)
 {
@@ -831,7 +849,7 @@ static inline unsigned int cpumask_size(void)
  * little more difficult, we typedef cpumask_var_t to an array or a
  * pointer: doing &mask on an array is a noop, so it still works.
  *
- * ie.
+ * i.e.
  *	cpumask_var_t tmpmask;
  *	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
  *		return -ENOMEM;
@@ -887,6 +905,8 @@ bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
  * a nop returning a constant 1 (in <linux/cpumask.h>).
  *
  * See alloc_cpumask_var_node.
+ *
+ * Return: %true if allocation succeeded, %false if not
  */
 static inline
 bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
@@ -1025,7 +1045,7 @@ set_cpu_dying(unsigned int cpu, bool dying)
 }
 
 /**
- * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
+ * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask *
  * @bitmap: the bitmap
  *
  * There are a few places where cpumask_var_t isn't appropriate and
@@ -1068,6 +1088,8 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
  * interface gives only a momentary snapshot and is not protected against
  * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
  * region.
+ *
+ * Return: momentary snapshot of the number of online CPUs
  */
 static __always_inline unsigned int num_online_cpus(void)
 {
@@ -1160,7 +1182,7 @@ static inline bool cpu_dying(unsigned int cpu)
  * @mask: the cpumask to copy
  * @buf: the buffer to copy into
  *
- * Returns the length of the (null-terminated) @buf string, zero if
+ * Return: the length of the (null-terminated) @buf string, zero if
  * nothing is copied.
  */
 static inline ssize_t
@@ -1183,7 +1205,7 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
  * cpumask; Typically used by bin_attribute to export cpumask bitmask
  * ABI.
  *
- * Returns the length of how many bytes have been copied, excluding
+ * Return: the length of how many bytes have been copied, excluding
  * terminating '\0'.
  */
 static inline ssize_t
@@ -1204,6 +1226,9 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
  *
  * Everything is same with the above cpumap_print_bitmask_to_buf()
  * except the print format.
+ *
+ * Return: the length of how many bytes have been copied, excluding
+ * terminating '\0'.
  */
 static inline ssize_t
 cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
-- 
cgit v1.2.3


From 7733aa893847f021c674d0d30b723d892109369d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 17 Aug 2023 19:20:54 +0300
Subject: bitmap: Remove dead code, i.e. bitmap_copy_le()

Besides the fact it's not used anywhere it should be implemented
differently, i.e. via helpers from linux/byteorder/generic.h.
Yet the helpers themselves need to be introduced first.

Also note, the function lacks of the test cases, they must be provided.

Hence, drop the current dead code for good.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 03644237e1ef..1516ff979315 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -220,11 +220,6 @@ int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
 void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
 int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
 
-#ifdef __BIG_ENDIAN
-void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits);
-#else
-#define bitmap_copy_le bitmap_copy
-#endif
 int bitmap_print_to_pagebuf(bool list, char *buf,
 				   const unsigned long *maskp, int nmaskbits);
 
-- 
cgit v1.2.3


From aae06fc1b5a2e4b52f8504a1f12f9b8b98e80641 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 7 Oct 2023 16:35:10 -0700
Subject: lib/bitmap: split-out string-related operations to a separate files

lib/bitmap.c and corresponding include/linux/bitmap.h are intended to
hold functions related to operations on bitmaps, like bitmap_shift or
bitmap_set. Historically, some string-related operations like
bitmap_parse are also reside in lib/bitmap.c.

Now that the subsystem evolves, string-related bitmap operations became a
significant part of the file. Because they are quite different from the
other bitmap functions by nature, it's worth to split them to a separate
source/header files.

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
CC: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap-str.h | 16 ++++++++++++++++
 include/linux/bitmap.h     | 18 +-----------------
 2 files changed, 17 insertions(+), 17 deletions(-)
 create mode 100644 include/linux/bitmap-str.h

(limited to 'include/linux')

diff --git a/include/linux/bitmap-str.h b/include/linux/bitmap-str.h
new file mode 100644
index 000000000000..17caeca94cab
--- /dev/null
+++ b/include/linux/bitmap-str.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_BITMAP_STR_H
+#define __LINUX_BITMAP_STR_H
+
+int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits);
+int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits);
+extern int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp,
+					int nmaskbits, loff_t off, size_t count);
+extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp,
+					int nmaskbits, loff_t off, size_t count);
+int bitmap_parse(const char *buf, unsigned int buflen, unsigned long *dst, int nbits);
+int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits);
+int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
+					unsigned long *dst, int nbits);
+
+#endif /* __LINUX_BITMAP_STR_H */
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 1516ff979315..1cca950a54ae 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -10,6 +10,7 @@
 #include <linux/limits.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/bitmap-str.h>
 
 struct device;
 
@@ -200,14 +201,6 @@ bitmap_find_next_zero_area(unsigned long *map,
 					      align_mask, 0);
 }
 
-int bitmap_parse(const char *buf, unsigned int buflen,
-			unsigned long *dst, int nbits);
-int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
-			unsigned long *dst, int nbits);
-int bitmap_parselist(const char *buf, unsigned long *maskp,
-			int nmaskbits);
-int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
-			unsigned long *dst, int nbits);
 void bitmap_remap(unsigned long *dst, const unsigned long *src,
 		const unsigned long *old, const unsigned long *new, unsigned int nbits);
 int bitmap_bitremap(int oldbit,
@@ -220,15 +213,6 @@ int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
 void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
 int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
 
-int bitmap_print_to_pagebuf(bool list, char *buf,
-				   const unsigned long *maskp, int nmaskbits);
-
-extern int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp,
-				      int nmaskbits, loff_t off, size_t count);
-
-extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp,
-				      int nmaskbits, loff_t off, size_t count);
-
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
-- 
cgit v1.2.3


From 49dbe25adac42d3e06f65d1420946bec65896222 Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Tue, 10 Oct 2023 22:15:14 +0300
Subject: vsock: read from socket's error queue

This adds handling of MSG_ERRQUEUE input flag in receive call. This flag
is used to read socket's error queue instead of data queue. Possible
scenario of error queue usage is receiving completions for transmission
with MSG_ZEROCOPY flag. This patch also adds new defines: 'SOL_VSOCK'
and 'VSOCK_RECVERR'.

Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 39b74d83c7c4..cfcb7e2c3813 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -383,6 +383,7 @@ struct ucred {
 #define SOL_MPTCP	284
 #define SOL_MCTP	285
 #define SOL_SMC		286
+#define SOL_VSOCK	287
 
 /* IPX options */
 #define IPX_TYPE	1
-- 
cgit v1.2.3


From d7fbc0b7e846e9e0e70ae766d274b8720fbab412 Mon Sep 17 00:00:00 2001
From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Date: Wed, 11 Oct 2023 12:12:34 +0200
Subject: dpll: netlink/core: add support for pin-dpll signal phase
 offset/adjust

Add callback ops for pin-dpll phase measurement.
Add callback for pin signal phase adjustment.
Add min and max phase adjustment values to pin proprties.
Invoke callbacks in dpll_netlink.c when filling the pin details to
provide user with phase related attribute values.

Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dpll.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index bbc480cd2932..578fc5fa3750 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -68,6 +68,18 @@ struct dpll_pin_ops {
 	int (*prio_set)(const struct dpll_pin *pin, void *pin_priv,
 			const struct dpll_device *dpll, void *dpll_priv,
 			const u32 prio, struct netlink_ext_ack *extack);
+	int (*phase_offset_get)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_device *dpll, void *dpll_priv,
+				s64 *phase_offset,
+				struct netlink_ext_ack *extack);
+	int (*phase_adjust_get)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_device *dpll, void *dpll_priv,
+				s32 *phase_adjust,
+				struct netlink_ext_ack *extack);
+	int (*phase_adjust_set)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_device *dpll, void *dpll_priv,
+				const s32 phase_adjust,
+				struct netlink_ext_ack *extack);
 };
 
 struct dpll_pin_frequency {
@@ -91,6 +103,11 @@ struct dpll_pin_frequency {
 #define DPLL_PIN_FREQUENCY_DCF77 \
 	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_77_5_KHZ)
 
+struct dpll_pin_phase_adjust_range {
+	s32 min;
+	s32 max;
+};
+
 struct dpll_pin_properties {
 	const char *board_label;
 	const char *panel_label;
@@ -99,6 +116,7 @@ struct dpll_pin_properties {
 	unsigned long capabilities;
 	u32 freq_supported_num;
 	struct dpll_pin_frequency *freq_supported;
+	struct dpll_pin_phase_adjust_range phase_range;
 };
 
 #if IS_ENABLED(CONFIG_DPLL)
-- 
cgit v1.2.3


From fc8b2a619469378717e7270d2a4e1ef93c585f7a Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 11 Oct 2023 10:01:14 -0400
Subject: net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation

Syzbot reported two new paths to hit an internal WARNING using the
new virtio gso type VIRTIO_NET_HDR_GSO_UDP_L4.

    RIP: 0010:skb_checksum_help+0x4a2/0x600 net/core/dev.c:3260
    skb len=64521 gso_size=344
and

    RIP: 0010:skb_warn_bad_offload+0x118/0x240 net/core/dev.c:3262

Older virtio types have historically had loose restrictions, leading
to many entirely impractical fuzzer generated packets causing
problems deep in the kernel stack. Ideally, we would have had strict
validation for all types from the start.

New virtio types can have tighter validation. Limit UDP GSO packets
inserted via virtio to the same limits imposed by the UDP_SEGMENT
socket interface:

1. must use checksum offload
2. checksum offload matches UDP header
3. no more segments than UDP_MAX_SEGMENTS
4. UDP GSO does not take modifier flags, notably SKB_GSO_TCP_ECN

Fixes: 860b7f27b8f7 ("linux/virtio_net.h: Support USO offload in vnet header.")
Reported-by: syzbot+01cdbc31e9c0ae9b33ac@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/0000000000005039270605eb0b7f@google.com/
Reported-by: syzbot+c99d835ff081ca30f986@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/0000000000005426680605eb0b9f@google.com/
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 7b4dd69555e4..27cc1d464321 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -3,8 +3,8 @@
 #define _LINUX_VIRTIO_NET_H
 
 #include <linux/if_vlan.h>
+#include <linux/udp.h>
 #include <uapi/linux/tcp.h>
-#include <uapi/linux/udp.h>
 #include <uapi/linux/virtio_net.h>
 
 static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type)
@@ -151,9 +151,22 @@ retry:
 		unsigned int nh_off = p_off;
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-		/* UFO may not include transport header in gso_size. */
-		if (gso_type & SKB_GSO_UDP)
+		switch (gso_type & ~SKB_GSO_TCP_ECN) {
+		case SKB_GSO_UDP:
+			/* UFO may not include transport header in gso_size. */
 			nh_off -= thlen;
+			break;
+		case SKB_GSO_UDP_L4:
+			if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM))
+				return -EINVAL;
+			if (skb->csum_offset != offsetof(struct udphdr, check))
+				return -EINVAL;
+			if (skb->len - p_off > gso_size * UDP_MAX_SEGMENTS)
+				return -EINVAL;
+			if (gso_type != SKB_GSO_UDP_L4)
+				return -EINVAL;
+			break;
+		}
 
 		/* Kernel has a special handling for GSO_BY_FRAGS. */
 		if (gso_size == GSO_BY_FRAGS)
-- 
cgit v1.2.3


From 60c6946675fc06dd2fd2b7a4b6fd1c1f046f1056 Mon Sep 17 00:00:00 2001
From: Xabier Marquiegui <reibax@gmail.com>
Date: Thu, 12 Oct 2023 00:39:53 +0200
Subject: posix-clock: introduce posix_clock_context concept

Add the necessary structure to support custom private-data per
posix-clock user.

The previous implementation of posix-clock assumed all file open
instances need access to the same clock structure on private_data.

The need for individual data structures per file open instance has been
identified when developing support for multiple timestamp event queue
users for ptp_clock.

Signed-off-by: Xabier Marquiegui <reibax@gmail.com>
Suggested-by: Richard Cochran <richardcochran@gmail.com>
Suggested-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/posix-clock.h | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h
index 468328b1e1dd..ef8619f48920 100644
--- a/include/linux/posix-clock.h
+++ b/include/linux/posix-clock.h
@@ -14,6 +14,7 @@
 #include <linux/rwsem.h>
 
 struct posix_clock;
+struct posix_clock_context;
 
 /**
  * struct posix_clock_operations - functional interface to the clock
@@ -50,18 +51,18 @@ struct posix_clock_operations {
 	/*
 	 * Optional character device methods:
 	 */
-	long    (*ioctl)   (struct posix_clock *pc,
-			    unsigned int cmd, unsigned long arg);
+	long (*ioctl)(struct posix_clock_context *pccontext, unsigned int cmd,
+		      unsigned long arg);
 
-	int     (*open)    (struct posix_clock *pc, fmode_t f_mode);
+	int (*open)(struct posix_clock_context *pccontext, fmode_t f_mode);
 
-	__poll_t (*poll)   (struct posix_clock *pc,
-			    struct file *file, poll_table *wait);
+	__poll_t (*poll)(struct posix_clock_context *pccontext, struct file *file,
+			 poll_table *wait);
 
-	int     (*release) (struct posix_clock *pc);
+	int (*release)(struct posix_clock_context *pccontext);
 
-	ssize_t (*read)    (struct posix_clock *pc,
-			    uint flags, char __user *buf, size_t cnt);
+	ssize_t (*read)(struct posix_clock_context *pccontext, uint flags,
+			char __user *buf, size_t cnt);
 };
 
 /**
@@ -90,6 +91,24 @@ struct posix_clock {
 	bool zombie;
 };
 
+/**
+ * struct posix_clock_context - represents clock file operations context
+ *
+ * @clk:              Pointer to the clock
+ * @private_clkdata:  Pointer to user data
+ *
+ * Drivers should use struct posix_clock_context during specific character
+ * device file operation methods to access the posix clock.
+ *
+ * Drivers can store a private data structure during the open operation
+ * if they have specific information that is required in other file
+ * operations.
+ */
+struct posix_clock_context {
+	struct posix_clock *clk;
+	void *private_clkdata;
+};
+
 /**
  * posix_clock_register() - register a new clock
  * @clk:   Pointer to the clock. Caller must provide 'ops' field
-- 
cgit v1.2.3


From 058b4d9de86b3f77cd23fbd43a0f5ee4ea8e0aeb Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Wed, 11 Oct 2023 11:01:21 +0300
Subject: iommu: change iommu_map_sgtable to return signed values

The iommu_map_sgtable() function returns ssize_t and negative error
codes but it's declared as size_t instead.  I think that static checkers
would have complained if this caused a bug, but even though it doesn't
cause a bug, it's definitely worth fixing.

Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/06672b96-23fd-424c-8880-1626e7bf119c@moroto.mountain
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 64bd20142cbe..c28178f3690a 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1116,7 +1116,7 @@ static inline void iommu_free_global_pasid(ioasid_t pasid) {}
  * Creates a mapping at @iova for the buffer described by a scatterlist
  * stored in the given sg_table object in the provided IOMMU domain.
  */
-static inline size_t iommu_map_sgtable(struct iommu_domain *domain,
+static inline ssize_t iommu_map_sgtable(struct iommu_domain *domain,
 			unsigned long iova, struct sg_table *sgt, int prot)
 {
 	return iommu_map_sg(domain, iova, sgt->sgl, sgt->orig_nents, prot,
-- 
cgit v1.2.3


From f6ca3fb6978f94d95ee79f95085fc22e71ca17cc Mon Sep 17 00:00:00 2001
From: Rouven Czerwinski <r.czerwinski@pengutronix.de>
Date: Fri, 22 Sep 2023 16:17:16 +0200
Subject: mtd: rawnand: Ensure the nand chip supports cached reads

Both the JEDEC and ONFI specification say that read cache sequential
support is an optional command. This means that we not only need to
check whether the individual controller supports the command, we also
need to check the parameter pages for both ONFI and JEDEC NAND flashes
before enabling sequential cache reads.

This fixes support for NAND flashes which don't support enabling cache
reads, i.e. Samsung K9F4G08U0F or Toshiba TC58NVG0S3HTA00.

Sequential cache reads are now only available for ONFI and JEDEC
devices, if individual vendors implement this, it needs to be enabled
per vendor.

Tested on i.MX6Q with a Samsung NAND flash chip that doesn't support
sequential reads.

Fixes: 003fe4b9545b ("mtd: rawnand: Support for sequential cache reads")
Cc: stable@vger.kernel.org
Signed-off-by: Rouven Czerwinski <r.czerwinski@pengutronix.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20230922141717.35977-1-r.czerwinski@pengutronix.de
---
 include/linux/mtd/jedec.h   | 3 +++
 include/linux/mtd/onfi.h    | 1 +
 include/linux/mtd/rawnand.h | 2 ++
 3 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/jedec.h b/include/linux/mtd/jedec.h
index 0b6b59f7cfbd..56047a4e54c9 100644
--- a/include/linux/mtd/jedec.h
+++ b/include/linux/mtd/jedec.h
@@ -21,6 +21,9 @@ struct jedec_ecc_info {
 /* JEDEC features */
 #define JEDEC_FEATURE_16_BIT_BUS	(1 << 0)
 
+/* JEDEC Optional Commands */
+#define JEDEC_OPT_CMD_READ_CACHE	BIT(1)
+
 struct nand_jedec_params {
 	/* rev info and features block */
 	/* 'J' 'E' 'S' 'D'  */
diff --git a/include/linux/mtd/onfi.h b/include/linux/mtd/onfi.h
index a7376f9beddf..55ab2e4d62f9 100644
--- a/include/linux/mtd/onfi.h
+++ b/include/linux/mtd/onfi.h
@@ -55,6 +55,7 @@
 #define ONFI_SUBFEATURE_PARAM_LEN	4
 
 /* ONFI optional commands SET/GET FEATURES supported? */
+#define ONFI_OPT_CMD_READ_CACHE		BIT(1)
 #define ONFI_OPT_CMD_SET_GET_FEATURES	BIT(2)
 
 struct nand_onfi_params {
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 90a141ba2a5a..c29ace15a053 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -225,6 +225,7 @@ struct gpio_desc;
  * struct nand_parameters - NAND generic parameters from the parameter page
  * @model: Model name
  * @supports_set_get_features: The NAND chip supports setting/getting features
+ * @supports_read_cache: The NAND chip supports read cache operations
  * @set_feature_list: Bitmap of features that can be set
  * @get_feature_list: Bitmap of features that can be get
  * @onfi: ONFI specific parameters
@@ -233,6 +234,7 @@ struct nand_parameters {
 	/* Generic parameters */
 	const char *model;
 	bool supports_set_get_features;
+	bool supports_read_cache;
 	DECLARE_BITMAP(set_feature_list, ONFI_FEATURE_NUMBER);
 	DECLARE_BITMAP(get_feature_list, ONFI_FEATURE_NUMBER);
 
-- 
cgit v1.2.3


From f447318fb1d156b4b6da79266724c7ee347d1b59 Mon Sep 17 00:00:00 2001
From: Martin Kurbanov <mmkurbanov@salutedevices.com>
Date: Mon, 2 Oct 2023 17:04:58 +0300
Subject: mtd: spinand: add support for FORESEE F35SQA002G

Add support for FORESEE F35SQA002G SPI NAND.
Datasheet:
  https://www.longsys.com/uploads/LM-00006FORESEEF35SQA002GDatasheet_1650183701.pdf

Signed-off-by: Martin Kurbanov <mmkurbanov@salutedevices.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20231002140458.147605-1-mmkurbanov@salutedevices.com
---
 include/linux/mtd/spinand.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 3e285c09d16d..badb4c1ac079 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -263,6 +263,7 @@ struct spinand_manufacturer {
 extern const struct spinand_manufacturer alliancememory_spinand_manufacturer;
 extern const struct spinand_manufacturer ato_spinand_manufacturer;
 extern const struct spinand_manufacturer esmt_c8_spinand_manufacturer;
+extern const struct spinand_manufacturer foresee_spinand_manufacturer;
 extern const struct spinand_manufacturer gigadevice_spinand_manufacturer;
 extern const struct spinand_manufacturer macronix_spinand_manufacturer;
 extern const struct spinand_manufacturer micron_spinand_manufacturer;
-- 
cgit v1.2.3


From 0da28d5fc808dfcfbc910870b4b0277c1a7ccb6c Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Fri, 15 Sep 2023 10:53:27 +0200
Subject: drm: renesas: shmobile: Remove backlight support

Backlight support should be implemented by panels, not by the LCDC
driver.  As the feature is currently unused anyway, remove it.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
[geert: Cleanups]
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/69707650245bc2193d072f24723d4d5482ea590b.1694767209.git.geert+renesas@glider.be
---
 include/linux/platform_data/shmob_drm.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/shmob_drm.h b/include/linux/platform_data/shmob_drm.h
index d661399b217d..b6b5b6607fb5 100644
--- a/include/linux/platform_data/shmob_drm.h
+++ b/include/linux/platform_data/shmob_drm.h
@@ -40,13 +40,6 @@ enum shmob_drm_interface {
 	SHMOB_DRM_IFACE_SYS24,		/* 24bpp */
 };
 
-struct shmob_drm_backlight_data {
-	const char *name;
-	int max_brightness;
-	int (*get_brightness)(void);
-	int (*set_brightness)(int brightness);
-};
-
 struct shmob_drm_panel_data {
 	unsigned int width_mm;		/* Panel width in mm */
 	unsigned int height_mm;		/* Panel height in mm */
@@ -83,7 +76,6 @@ struct shmob_drm_platform_data {
 	enum shmob_drm_clk_source clk_source;
 	struct shmob_drm_interface_data iface;
 	struct shmob_drm_panel_data panel;
-	struct shmob_drm_backlight_data backlight;
 };
 
 #endif /* __SHMOB_DRM_H__ */
-- 
cgit v1.2.3


From 04ed052f3ab4b3c4c3e8451522ffaa84479bf0fb Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Fri, 15 Sep 2023 10:53:30 +0200
Subject: drm: renesas: shmobile: Remove support for SYS panels

SYS panels are not used, and have no defined DT bindings.  Remove their
support to avoid impeding DT support.  It can always be added back
later.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/4ccca2a5ac05c73ea9fd6e44b8bc443fd9d14e0d.1694767209.git.geert+renesas@glider.be
---
 include/linux/platform_data/shmob_drm.h | 25 -------------------------
 1 file changed, 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/shmob_drm.h b/include/linux/platform_data/shmob_drm.h
index b6b5b6607fb5..b728e24222d9 100644
--- a/include/linux/platform_data/shmob_drm.h
+++ b/include/linux/platform_data/shmob_drm.h
@@ -27,17 +27,6 @@ enum shmob_drm_interface {
 	SHMOB_DRM_IFACE_RGB18,		/* 18bpp */
 	SHMOB_DRM_IFACE_RGB24,		/* 24bpp */
 	SHMOB_DRM_IFACE_YUV422,		/* 16bpp */
-	SHMOB_DRM_IFACE_SYS8A,		/* 24bpp, 8:8:8 */
-	SHMOB_DRM_IFACE_SYS8B,		/* 18bpp, 8:8:2 */
-	SHMOB_DRM_IFACE_SYS8C,		/* 18bpp, 2:8:8 */
-	SHMOB_DRM_IFACE_SYS8D,		/* 16bpp, 8:8 */
-	SHMOB_DRM_IFACE_SYS9,		/* 18bpp, 9:9 */
-	SHMOB_DRM_IFACE_SYS12,		/* 24bpp, 12:12 */
-	SHMOB_DRM_IFACE_SYS16A,		/* 16bpp */
-	SHMOB_DRM_IFACE_SYS16B,		/* 18bpp, 16:2 */
-	SHMOB_DRM_IFACE_SYS16C,		/* 18bpp, 2:16 */
-	SHMOB_DRM_IFACE_SYS18,		/* 18bpp */
-	SHMOB_DRM_IFACE_SYS24,		/* 24bpp */
 };
 
 struct shmob_drm_panel_data {
@@ -46,19 +35,6 @@ struct shmob_drm_panel_data {
 	struct drm_mode_modeinfo mode;
 };
 
-struct shmob_drm_sys_interface_data {
-	unsigned int read_latch:6;
-	unsigned int read_setup:8;
-	unsigned int read_cycle:8;
-	unsigned int read_strobe:8;
-	unsigned int write_setup:8;
-	unsigned int write_cycle:8;
-	unsigned int write_strobe:8;
-	unsigned int cs_setup:3;
-	unsigned int vsync_active_high:1;
-	unsigned int vsync_dir_input:1;
-};
-
 #define SHMOB_DRM_IFACE_FL_DWPOL (1 << 0) /* Rising edge dot clock data latch */
 #define SHMOB_DRM_IFACE_FL_DIPOL (1 << 1) /* Active low display enable */
 #define SHMOB_DRM_IFACE_FL_DAPOL (1 << 2) /* Active low display data */
@@ -67,7 +43,6 @@ struct shmob_drm_sys_interface_data {
 
 struct shmob_drm_interface_data {
 	enum shmob_drm_interface interface;
-	struct shmob_drm_sys_interface_data sys;
 	unsigned int clk_div;
 	unsigned int flags;
 };
-- 
cgit v1.2.3


From 6a6ab0c7162b4b10ce74347e282e1bc81103a48f Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Fri, 15 Sep 2023 10:53:38 +0200
Subject: drm: renesas: shmobile: Use struct videomode in platform data

Replace the drm_mode_modeinfo field with videomode that includes more
signal polarity flags.  This simplifies driver handling of panel modes
and prepares for DT support.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
[geert: Simplify]
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/4312e56de424d94399c6105e7159317eae86c9d5.1694767209.git.geert+renesas@glider.be
---
 include/linux/platform_data/shmob_drm.h | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/shmob_drm.h b/include/linux/platform_data/shmob_drm.h
index b728e24222d9..f3cb19ff9f81 100644
--- a/include/linux/platform_data/shmob_drm.h
+++ b/include/linux/platform_data/shmob_drm.h
@@ -10,7 +10,7 @@
 #ifndef __SHMOB_DRM_H__
 #define __SHMOB_DRM_H__
 
-#include <drm/drm_mode.h>
+#include <video/videomode.h>
 
 enum shmob_drm_clk_source {
 	SHMOB_DRM_CLK_BUS,
@@ -32,19 +32,12 @@ enum shmob_drm_interface {
 struct shmob_drm_panel_data {
 	unsigned int width_mm;		/* Panel width in mm */
 	unsigned int height_mm;		/* Panel height in mm */
-	struct drm_mode_modeinfo mode;
+	struct videomode mode;
 };
 
-#define SHMOB_DRM_IFACE_FL_DWPOL (1 << 0) /* Rising edge dot clock data latch */
-#define SHMOB_DRM_IFACE_FL_DIPOL (1 << 1) /* Active low display enable */
-#define SHMOB_DRM_IFACE_FL_DAPOL (1 << 2) /* Active low display data */
-#define SHMOB_DRM_IFACE_FL_HSCNT (1 << 3) /* Disable HSYNC during VBLANK */
-#define SHMOB_DRM_IFACE_FL_DWCNT (1 << 4) /* Disable dotclock during blanking */
-
 struct shmob_drm_interface_data {
 	enum shmob_drm_interface interface;
 	unsigned int clk_div;
-	unsigned int flags;
 };
 
 struct shmob_drm_platform_data {
-- 
cgit v1.2.3


From fa32c6bc92813a76419d1c306e35394cae5028eb Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 15 Sep 2023 10:53:39 +0200
Subject: drm: renesas: shmobile: Use media bus formats in platform data

Replace the custom shmob_drm_interface enumeration values with standard
media bus formats.  This simplifies driver handling of bus formats and
prepares for DT support.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/0a15e5100ca30d14953c93550eb1d4c2e18de939.1694767209.git.geert+renesas@glider.be
---
 include/linux/platform_data/shmob_drm.h | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/shmob_drm.h b/include/linux/platform_data/shmob_drm.h
index f3cb19ff9f81..6c19d4fbbe39 100644
--- a/include/linux/platform_data/shmob_drm.h
+++ b/include/linux/platform_data/shmob_drm.h
@@ -18,17 +18,6 @@ enum shmob_drm_clk_source {
 	SHMOB_DRM_CLK_EXTERNAL,
 };
 
-enum shmob_drm_interface {
-	SHMOB_DRM_IFACE_RGB8,		/* 24bpp, 8:8:8 */
-	SHMOB_DRM_IFACE_RGB9,		/* 18bpp, 9:9 */
-	SHMOB_DRM_IFACE_RGB12A,		/* 24bpp, 12:12 */
-	SHMOB_DRM_IFACE_RGB12B,		/* 12bpp */
-	SHMOB_DRM_IFACE_RGB16,		/* 16bpp */
-	SHMOB_DRM_IFACE_RGB18,		/* 18bpp */
-	SHMOB_DRM_IFACE_RGB24,		/* 24bpp */
-	SHMOB_DRM_IFACE_YUV422,		/* 16bpp */
-};
-
 struct shmob_drm_panel_data {
 	unsigned int width_mm;		/* Panel width in mm */
 	unsigned int height_mm;		/* Panel height in mm */
@@ -36,7 +25,7 @@ struct shmob_drm_panel_data {
 };
 
 struct shmob_drm_interface_data {
-	enum shmob_drm_interface interface;
+	unsigned int bus_fmt;		/* MEDIA_BUS_FMT_* */
 	unsigned int clk_div;
 };
 
-- 
cgit v1.2.3


From 166b76a073be8e1ffdc2c4db60f3abbbc974a3a3 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:24 +0100
Subject: clocksource/drivers/arm_arch_timer: Initialize evtstrm after
 finalizing cpucaps

We attempt to initialize each CPU's arch_timer event stream in
arch_timer_evtstrm_enable(), which we call from the
arch_timer_starting_cpu() cpu hotplug callback which is registered early
in boot. As this is registered before we initialize the system cpucaps,
the test for ARM64_HAS_ECV will always be false for CPUs present at boot
time, and will only be taken into account for CPUs onlined late
(including those which are hotplugged out and in again).

Due to this, CPUs present and boot time may not use the intended divider
and scale factor to generate the event stream, and may differ from other
CPUs.

Correct this by only initializing the event stream after cpucaps have been
finalized, registering a separate CPU hotplug callback for the event stream
configuration. Since the caps must be finalized by this point, use
cpus_have_final_cap() to verify this.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/cpuhotplug.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 068f7738be22..bb4d0bcac81b 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -172,6 +172,7 @@ enum cpuhp_state {
 	CPUHP_AP_ARM_L2X0_STARTING,
 	CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING,
 	CPUHP_AP_ARM_ARCH_TIMER_STARTING,
+	CPUHP_AP_ARM_ARCH_TIMER_EVTSTRM_STARTING,
 	CPUHP_AP_ARM_GLOBAL_TIMER_STARTING,
 	CPUHP_AP_JCORE_TIMER_STARTING,
 	CPUHP_AP_ARM_TWD_STARTING,
-- 
cgit v1.2.3


From 20f3b8eafe0ba5d3c69d5011a9b07739e9645132 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:25 +0100
Subject: arm64/arm: xen: enlighten: Fix KPTI checks

When KPTI is in use, we cannot register a runstate region as XEN
requires that this is always a valid VA, which we cannot guarantee. Due
to this, xen_starting_cpu() must avoid registering each CPU's runstate
region, and xen_guest_init() must avoid setting up features that depend
upon it.

We tried to ensure that in commit:

  f88af7229f6f22ce (" xen/arm: do not setup the runstate info page if kpti is enabled")

... where we added checks for xen_kernel_unmapped_at_usr(), which wraps
arm64_kernel_unmapped_at_el0() on arm64 and is always false on 32-bit
arm.

Unfortunately, as xen_guest_init() is an early_initcall, this happens
before secondary CPUs are booted and arm64 has finalized the
ARM64_UNMAP_KERNEL_AT_EL0 cpucap which backs
arm64_kernel_unmapped_at_el0(), and so this can subsequently be set as
secondary CPUs are onlined. On a big.LITTLE system where the boot CPU
does not require KPTI but some secondary CPUs do, this will result in
xen_guest_init() intializing features that depend on the runstate
region, and xen_starting_cpu() registering the runstate region on some
CPUs before KPTI is subsequent enabled, resulting the the problems the
aforementioned commit tried to avoid.

Handle this more robsutly by deferring the initialization of the
runstate region until secondary CPUs have been initialized and the
ARM64_UNMAP_KERNEL_AT_EL0 cpucap has been finalized. The per-cpu work is
moved into a new hotplug starting function which is registered later
when we're certain that KPTI will not be used.

Fixes: f88af7229f6f ("xen/arm: do not setup the runstate info page if kpti is enabled")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Bertrand Marquis <bertrand.marquis@arm.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/cpuhotplug.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index bb4d0bcac81b..d06e31e03a5e 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -190,6 +190,7 @@ enum cpuhp_state {
 	/* Must be the last timer callback */
 	CPUHP_AP_DUMMY_TIMER_STARTING,
 	CPUHP_AP_ARM_XEN_STARTING,
+	CPUHP_AP_ARM_XEN_RUNSTATE_STARTING,
 	CPUHP_AP_ARM_CORESIGHT_STARTING,
 	CPUHP_AP_ARM_CORESIGHT_CTI_STARTING,
 	CPUHP_AP_ARM64_ISNDEP_STARTING,
-- 
cgit v1.2.3


From ada1682d60ac6017037305166d02eb0cd5ee50fa Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 4 Oct 2023 20:57:32 +0200
Subject: firmware: qcom: qseecom: add missing include guards

The qseecom header does not contain ifdef guards against multiple
inclusion. Add them.

Fixes: 00b1248606ba ("firmware: qcom_scm: Add support for Qualcomm Secure Execution Environment SCM interface")
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20231004185732.98621-1-brgl@bgdev.pl
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/firmware/qcom/qcom_qseecom.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/qcom/qcom_qseecom.h b/include/linux/firmware/qcom/qcom_qseecom.h
index b531547e1dc9..5c28298a98be 100644
--- a/include/linux/firmware/qcom/qcom_qseecom.h
+++ b/include/linux/firmware/qcom/qcom_qseecom.h
@@ -5,6 +5,10 @@
  *
  * Copyright (C) 2023 Maximilian Luz <luzmaximilian@gmail.com>
  */
+
+#ifndef __QCOM_QSEECOM_H
+#define __QCOM_QSEECOM_H
+
 #include <linux/auxiliary_bus.h>
 #include <linux/types.h>
 
@@ -44,3 +48,5 @@ static inline int qcom_qseecom_app_send(struct qseecom_client *client, void *req
 {
 	return qcom_scm_qseecom_app_send(client->app_id, req, req_size, rsp, rsp_size);
 }
+
+#endif /* __QCOM_QSEECOM_H */
-- 
cgit v1.2.3


From a8ecbc54165fca767e75a82372a7be3810c667cf Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sat, 14 Oct 2023 22:53:11 +0200
Subject: spi: Export acpi_spi_find_controller_by_adev()

Export acpi_spi_find_controller_by_adev() so that ACPI glue code which
wants to dynamically create a spi_device using acpi_spi_device_alloc() or
spi_new_device() on a controller, to which the code does not already have
a reference, can find the controller.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20231014205314.59333-2-hdegoede@redhat.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 487da1f6e4b7..86825c88b576 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -867,6 +867,7 @@ extern int devm_spi_register_controller(struct device *dev,
 extern void spi_unregister_controller(struct spi_controller *ctlr);
 
 #if IS_ENABLED(CONFIG_ACPI)
+extern struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev);
 extern struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr,
 						struct acpi_device *adev,
 						int index);
-- 
cgit v1.2.3


From 2dd10de8e6bcbacf85ad758b904543c294820c63 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Tue, 12 Sep 2023 17:53:18 -0400
Subject: lockd: introduce safe async lock op

This patch reverts mostly commit 40595cdc93ed ("nfs: block notification
on fs with its own ->lock") and introduces an EXPORT_OP_ASYNC_LOCK
export flag to signal that the "own ->lock" implementation supports
async lock requests. The only main user is DLM that is used by GFS2 and
OCFS2 filesystem. Those implement their own lock() implementation and
return FILE_LOCK_DEFERRED as return value. Since commit 40595cdc93ed
("nfs: block notification on fs with its own ->lock") the DLM
implementation were never updated. This patch should prepare for DLM
to set the EXPORT_OP_ASYNC_LOCK export flag and update the DLM
plock implementation regarding to it.

Acked-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/exportfs.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 11fbd0ee1370..6dd993240fcc 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -224,9 +224,23 @@ struct export_operations {
 						  atomic attribute updates
 						*/
 #define EXPORT_OP_FLUSH_ON_CLOSE	(0x20) /* fs flushes file data on close */
+#define EXPORT_OP_ASYNC_LOCK		(0x40) /* fs can do async lock request */
 	unsigned long	flags;
 };
 
+/**
+ * exportfs_lock_op_is_async() - export op supports async lock operation
+ * @export_ops:	the nfs export operations to check
+ *
+ * Returns true if the nfs export_operations structure has
+ * EXPORT_OP_ASYNC_LOCK in their flags set
+ */
+static inline bool
+exportfs_lock_op_is_async(const struct export_operations *export_ops)
+{
+	return export_ops->flags & EXPORT_OP_ASYNC_LOCK;
+}
+
 extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
 				    int *max_len, struct inode *parent,
 				    int flags);
-- 
cgit v1.2.3


From 6ed8cdf967f7e9fc96cd1c129719ef99db2f9afc Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Sep 2023 10:38:51 -0400
Subject: SUNRPC: Clean up bc_svc_process()

The test robot complained that, in some build configurations, the
@error variable in bc_svc_process's only caller is set but never
used. This happens because dprintk() is the only consumer of that
value.

 - Remove the dprintk() call sites in favor of the svc_process
   tracepoint
 - The @error variable and the return value of bc_svc_process() are
   now unused, so get rid of them.
 - The @serv parameter is set to rqstp->rq_serv by the only caller,
   and bc_svc_process() then uses it only to set rqstp->rq_serv. It
   can be removed.
 - Rename bc_svc_process() according to the convention that
   globally-visible RPC server functions have names that begin with
   "svc_"; and because it is globally-visible, give it a proper
   kdoc comment.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202308121314.HA8Rq2XG-lkp@intel.com/
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index dbf5b21feafe..0cdca5960171 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -413,8 +413,7 @@ struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
 void		   svc_process(struct svc_rqst *rqstp);
-int		   bc_svc_process(struct svc_serv *, struct rpc_rqst *,
-			struct svc_rqst *);
+void		   svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp);
 int		   svc_register(const struct svc_serv *, struct net *, const int,
 				const unsigned short, const unsigned short);
 
-- 
cgit v1.2.3


From 063ab935a48b3a2854f433957adbb2bde396ed22 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:38:58 -0400
Subject: SUNRPC: integrate back-channel processing with svc_recv()

Using svc_recv() for (NFSv4.1) back-channel handling means we have just
one mechanism for waking threads.

Also change kthread_freezable_should_stop() in nfs4_callback_svc() to
kthread_should_stop() as used elsewhere.
kthread_freezable_should_stop() effectively adds a try_to_freeze() call,
and svc_recv() already contains that at an appropriate place.

Signed-off-by: NeilBrown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 0cdca5960171..acbe1314febd 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -92,8 +92,6 @@ struct svc_serv {
 						 * that arrive over the same
 						 * connection */
 	spinlock_t		sv_cb_lock;	/* protects the svc_cb_list */
-	wait_queue_head_t	sv_cb_waitq;	/* sleep here if there are no
-						 * entries in the svc_cb_list */
 	bool			sv_bc_enabled;	/* service uses backchannel */
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 };
-- 
cgit v1.2.3


From fa341560ca7458f4396d5a0771cb5f2358d8535d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:04 -0400
Subject: SUNRPC: change how svc threads are asked to exit.

svc threads are currently stopped using kthread_stop().  This requires
identifying a specific thread.  However we don't care which thread
stops, just as long as one does.

So instead, set a flag in the svc_pool to say that a thread needs to
die, and have each thread check this flag instead of calling
kthread_should_stop().  The first thread to find and clear this flag
then moves towards exiting.

This removes an explicit dependency on sp_all_threads which will make a
future patch simpler.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/lockd/lockd.h |  2 +-
 include/linux/sunrpc/svc.h  | 26 +++++++++++++++++++++++++-
 2 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 0f016d69c996..9f565416d186 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -282,7 +282,7 @@ __be32		  nlmsvc_testlock(struct svc_rqst *, struct nlm_file *,
 			struct nlm_host *, struct nlm_lock *,
 			struct nlm_lock *, struct nlm_cookie *);
 __be32		  nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *);
-void		  nlmsvc_retry_blocked(void);
+void		  nlmsvc_retry_blocked(struct svc_rqst *rqstp);
 void		  nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *,
 					nlm_host_match_fn_t match);
 void		  nlmsvc_grant_reply(struct nlm_cookie *, __be32);
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index acbe1314febd..0ec691070e27 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -50,6 +50,8 @@ struct svc_pool {
 enum {
 	SP_TASK_PENDING,	/* still work to do even if no xprt is queued */
 	SP_CONGESTED,		/* all threads are busy, none idle */
+	SP_NEED_VICTIM,		/* One thread needs to agree to exit */
+	SP_VICTIM_REMAINS,	/* One thread needs to actually exit */
 };
 
 
@@ -259,7 +261,7 @@ enum {
 	RQ_DROPME,		/* drop current reply */
 	RQ_SPLICE_OK,		/* turned off in gss privacy to prevent
 				 * encrypting page cache pages */
-	RQ_VICTIM,		/* about to be shut down */
+	RQ_VICTIM,		/* Have agreed to shut down */
 	RQ_BUSY,		/* request is busy */
 	RQ_DATA,		/* request has data */
 };
@@ -299,6 +301,28 @@ static inline struct sockaddr *svc_daddr(const struct svc_rqst *rqst)
 	return (struct sockaddr *) &rqst->rq_daddr;
 }
 
+/**
+ * svc_thread_should_stop - check if this thread should stop
+ * @rqstp: the thread that might need to stop
+ *
+ * To stop an svc thread, the pool flags SP_NEED_VICTIM and SP_VICTIM_REMAINS
+ * are set.  The first thread which sees SP_NEED_VICTIM clears it, becoming
+ * the victim using this function.  It should then promptly call
+ * svc_exit_thread() to complete the process, clearing SP_VICTIM_REMAINS
+ * so the task waiting for a thread to exit can wake and continue.
+ *
+ * Return values:
+ *   %true: caller should invoke svc_exit_thread()
+ *   %false: caller should do nothing
+ */
+static inline bool svc_thread_should_stop(struct svc_rqst *rqstp)
+{
+	if (test_and_clear_bit(SP_NEED_VICTIM, &rqstp->rq_pool->sp_flags))
+		set_bit(RQ_VICTIM, &rqstp->rq_flags);
+
+	return test_bit(RQ_VICTIM, &rqstp->rq_flags);
+}
+
 struct svc_deferred_req {
 	u32			prot;	/* protocol (UDP or TCP) */
 	struct svc_xprt		*xprt;
-- 
cgit v1.2.3


From 5ff817b23534dd3942f881ab01dd5050505517aa Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:11 -0400
Subject: SUNRPC: add list of idle threads

Rather than searching a list of threads to find an idle one, having a
list of idle threads allows an idle thread to be found immediately.

This adds some spin_lock calls which is not ideal, but as the hold-time
is tiny it is still faster than searching a list.  A future patch will
remove them using llist.h.  This involves some subtlety and so is left
to a separate patch.

This removes the need for the RQ_BUSY flag.  The rqst is "busy"
precisely when it is not on the "idle" list.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 0ec691070e27..e9c34e99bc88 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -37,6 +37,7 @@ struct svc_pool {
 	struct list_head	sp_sockets;	/* pending sockets */
 	unsigned int		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
+	struct list_head	sp_idle_threads; /* idle server threads */
 
 	/* statistics on pool operation */
 	struct percpu_counter	sp_messages_arrived;
@@ -186,6 +187,7 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
  */
 struct svc_rqst {
 	struct list_head	rq_all;		/* all threads list */
+	struct list_head	rq_idle;	/* On the idle list */
 	struct rcu_head		rq_rcu_head;	/* for RCU deferred kfree */
 	struct svc_xprt *	rq_xprt;	/* transport ptr */
 
@@ -262,10 +264,31 @@ enum {
 	RQ_SPLICE_OK,		/* turned off in gss privacy to prevent
 				 * encrypting page cache pages */
 	RQ_VICTIM,		/* Have agreed to shut down */
-	RQ_BUSY,		/* request is busy */
 	RQ_DATA,		/* request has data */
 };
 
+/**
+ * svc_thread_set_busy - mark a thread as busy
+ * @rqstp: the thread which is now busy
+ *
+ * If rq_idle is "empty", the thread must be busy.
+ */
+static inline void svc_thread_set_busy(struct svc_rqst *rqstp)
+{
+	INIT_LIST_HEAD(&rqstp->rq_idle);
+}
+
+/**
+ * svc_thread_busy - check if a thread as busy
+ * @rqstp: the thread which might be busy
+ *
+ * If rq_idle is "empty", the thread must be busy.
+ */
+static inline bool svc_thread_busy(struct svc_rqst *rqstp)
+{
+	return list_empty(&rqstp->rq_idle);
+}
+
 #define SVC_NET(rqst) (rqst->rq_xprt ? rqst->rq_xprt->xpt_net : rqst->rq_bc_net)
 
 /*
-- 
cgit v1.2.3


From 2b65a226840c0e86db0e7926856a0a017b3390f2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:17 -0400
Subject: SUNRPC: discard SP_CONGESTED

We can tell if a pool is congested by checking if the idle list is
empty.  We don't need a separate flag.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index e9c34e99bc88..22b3018ebf62 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -50,7 +50,6 @@ struct svc_pool {
 /* bits for sp_flags */
 enum {
 	SP_TASK_PENDING,	/* still work to do even if no xprt is queued */
-	SP_CONGESTED,		/* all threads are busy, none idle */
 	SP_NEED_VICTIM,		/* One thread needs to agree to exit */
 	SP_VICTIM_REMAINS,	/* One thread needs to actually exit */
 };
-- 
cgit v1.2.3


From d6b3358a2813bb14791259a2227d9af1e7019ca0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:24 -0400
Subject: llist: add interface to check if a node is on a list.

With list.h lists, it is easy to test if a node is on a list, providing
it was initialised and that it is removed with list_del_init().

This patch provides similar functionality for llist.h lists.

 init_llist_node()
marks a node as being not-on-any-list be setting the ->next pointer to
the node itself.
 llist_on_list()
tests if the node is on any list.
 llist_del_first_init()
remove the first element from a llist, and marks it as being off-list.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/llist.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/llist.h b/include/linux/llist.h
index 85bda2d02d65..dcb91e3bac1c 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -73,6 +73,33 @@ static inline void init_llist_head(struct llist_head *list)
 	list->first = NULL;
 }
 
+/**
+ * init_llist_node - initialize lock-less list node
+ * @node:	the node to be initialised
+ *
+ * In cases where there is a need to test if a node is on
+ * a list or not, this initialises the node to clearly
+ * not be on any list.
+ */
+static inline void init_llist_node(struct llist_node *node)
+{
+	node->next = node;
+}
+
+/**
+ * llist_on_list - test if a lock-list list node is on a list
+ * @node:	the node to test
+ *
+ * When a node is on a list the ->next pointer will be NULL or
+ * some other node.  It can never point to itself.  We use that
+ * in init_llist_node() to record that a node is not on any list,
+ * and here to test whether it is on any list.
+ */
+static inline bool llist_on_list(const struct llist_node *node)
+{
+	return node->next != node;
+}
+
 /**
  * llist_entry - get the struct of this entry
  * @ptr:	the &struct llist_node pointer.
@@ -249,6 +276,21 @@ static inline struct llist_node *__llist_del_all(struct llist_head *head)
 
 extern struct llist_node *llist_del_first(struct llist_head *head);
 
+/**
+ * llist_del_first_init - delete first entry from lock-list and mark is as being off-list
+ * @head:	the head of lock-less list to delete from.
+ *
+ * This behave the same as llist_del_first() except that llist_init_node() is called
+ * on the returned node so that llist_on_list() will report false for the node.
+ */
+static inline struct llist_node *llist_del_first_init(struct llist_head *head)
+{
+	struct llist_node *n = llist_del_first(head);
+
+	if (n)
+		init_llist_node(n);
+	return n;
+}
 struct llist_node *llist_reverse_order(struct llist_node *head);
 
 #endif /* LLIST_H */
-- 
cgit v1.2.3


From 9bd4161c591710f152a8cd3ed85ea928c61e26ca Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:30 -0400
Subject: SUNRPC: change service idle list to be an llist

With an llist we don't need to take a lock to add a thread to the list,
though we still need a lock to remove it.  That will go in the next
patch.

Unlike double-linked lists, a thread cannot reliably remove itself from
the list.  Only the first thread can be removed, and that can change
asynchronously.  So some care is needed.

We already check if there is pending work to do, so we are unlikely to
add ourselves to the idle list and then want to remove ourselves again.

If we DO find something needs to be done after adding ourselves to the
list, we simply wake up the first thread on the list.  If that was us,
we successfully removed ourselves and can continue.  If it was some
other thread, they will do the work that needs to be done.  We can
safely sleep until woken.

We also remove the test on freezing() from rqst_should_sleep().  Instead
we set TASK_FREEZABLE before scheduling.  This makes is safe to
schedule() when a freeze is pending.  As we now loop waiting to be
removed from the idle queue, this is a cleaner way to handle freezing.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 22b3018ebf62..ad4572630335 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -37,7 +37,7 @@ struct svc_pool {
 	struct list_head	sp_sockets;	/* pending sockets */
 	unsigned int		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
-	struct list_head	sp_idle_threads; /* idle server threads */
+	struct llist_head	sp_idle_threads; /* idle server threads */
 
 	/* statistics on pool operation */
 	struct percpu_counter	sp_messages_arrived;
@@ -186,7 +186,7 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
  */
 struct svc_rqst {
 	struct list_head	rq_all;		/* all threads list */
-	struct list_head	rq_idle;	/* On the idle list */
+	struct llist_node	rq_idle;	/* On the idle list */
 	struct rcu_head		rq_rcu_head;	/* for RCU deferred kfree */
 	struct svc_xprt *	rq_xprt;	/* transport ptr */
 
@@ -266,26 +266,15 @@ enum {
 	RQ_DATA,		/* request has data */
 };
 
-/**
- * svc_thread_set_busy - mark a thread as busy
- * @rqstp: the thread which is now busy
- *
- * If rq_idle is "empty", the thread must be busy.
- */
-static inline void svc_thread_set_busy(struct svc_rqst *rqstp)
-{
-	INIT_LIST_HEAD(&rqstp->rq_idle);
-}
-
 /**
  * svc_thread_busy - check if a thread as busy
  * @rqstp: the thread which might be busy
  *
- * If rq_idle is "empty", the thread must be busy.
+ * A thread is only busy when it is not an the idle list.
  */
-static inline bool svc_thread_busy(struct svc_rqst *rqstp)
+static inline bool svc_thread_busy(const struct svc_rqst *rqstp)
 {
-	return list_empty(&rqstp->rq_idle);
+	return !llist_on_list(&rqstp->rq_idle);
 }
 
 #define SVC_NET(rqst) (rqst->rq_xprt ? rqst->rq_xprt->xpt_net : rqst->rq_bc_net)
-- 
cgit v1.2.3


From 8a3e5975ed11dd16d81dc501b514ab01986db94e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:37 -0400
Subject: llist: add llist_del_first_this()

llist_del_first_this() deletes a specific entry from an llist, providing
it is at the head of the list.  Multiple threads can call this
concurrently providing they each offer a different entry.

This can be uses for a set of worker threads which are on the llist when
they are idle.  The head can always be woken, and when it is woken it
can remove itself, and possibly wake the next if there is an excess of
work to do.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/llist.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/llist.h b/include/linux/llist.h
index dcb91e3bac1c..2c982ff7475a 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -291,6 +291,10 @@ static inline struct llist_node *llist_del_first_init(struct llist_head *head)
 		init_llist_node(n);
 	return n;
 }
+
+extern bool llist_del_first_this(struct llist_head *head,
+				 struct llist_node *this);
+
 struct llist_node *llist_reverse_order(struct llist_node *head);
 
 #endif /* LLIST_H */
-- 
cgit v1.2.3


From de9e82c355f2ae73c04aec84f73fc2657cf7dfdd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:43 -0400
Subject: lib: add light-weight queuing mechanism.

lwq is a FIFO single-linked queue that only requires a spinlock
for dequeueing, which happens in process context.  Enqueueing is atomic
with no spinlock and can happen in any context.

This is particularly useful when work items are queued from BH or IRQ
context, and when they are handled one at a time by dedicated threads.

Avoiding any locking when enqueueing means there is no need to disable
BH or interrupts, which is generally best avoided (particularly when
there are any RT tasks on the machine).

This solution is superior to using "list_head" links because we need
half as many pointers in the data structures, and because list_head
lists would need locking to add items to the queue.

This solution is superior to a bespoke solution as all locking and
container_of casting is integrated, so the interface is simple.

Despite the similar name, this solution meets a distinctly different
need to kfifo.  kfifo provides a fixed sized circular buffer to which
data can be added at one end and removed at the other, and does not
provide any locking.  lwq does not have any size limit and works with
data structures (objects?) rather than data (bytes).

A unit test for basic functionality, which runs at boot time, is included.

Signed-off-by: NeilBrown <neilb@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: David Gow <davidgow@google.com>
Cc: linux-kernel@vger.kernel.org
Message-Id: <20230911111333.4d1a872330e924a00acb905b@linux-foundation.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/lwq.h | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 include/linux/lwq.h

(limited to 'include/linux')

diff --git a/include/linux/lwq.h b/include/linux/lwq.h
new file mode 100644
index 000000000000..d081d5cf8e33
--- /dev/null
+++ b/include/linux/lwq.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef LWQ_H
+#define LWQ_H
+/*
+ * Light-weight single-linked queue built from llist
+ *
+ * Entries can be enqueued from any context with no locking.
+ * Entries can be dequeued from process context with integrated locking.
+ *
+ * This is particularly suitable when work items are queued in
+ * BH or IRQ context, and where work items are handled one at a time
+ * by dedicated threads.
+ */
+#include <linux/container_of.h>
+#include <linux/spinlock.h>
+#include <linux/llist.h>
+
+struct lwq_node {
+	struct llist_node node;
+};
+
+struct lwq {
+	spinlock_t		lock;
+	struct llist_node	*ready;		/* entries to be dequeued */
+	struct llist_head	new;		/* entries being enqueued */
+};
+
+/**
+ * lwq_init - initialise a lwq
+ * @q:	the lwq object
+ */
+static inline void lwq_init(struct lwq *q)
+{
+	spin_lock_init(&q->lock);
+	q->ready = NULL;
+	init_llist_head(&q->new);
+}
+
+/**
+ * lwq_empty - test if lwq contains any entry
+ * @q:	the lwq object
+ *
+ * This empty test contains an acquire barrier so that if a wakeup
+ * is sent when lwq_dequeue returns true, it is safe to go to sleep after
+ * a test on lwq_empty().
+ */
+static inline bool lwq_empty(struct lwq *q)
+{
+	/* acquire ensures ordering wrt lwq_enqueue() */
+	return smp_load_acquire(&q->ready) == NULL && llist_empty(&q->new);
+}
+
+struct llist_node *__lwq_dequeue(struct lwq *q);
+/**
+ * lwq_dequeue - dequeue first (oldest) entry from lwq
+ * @q:		the queue to dequeue from
+ * @type:	the type of object to return
+ * @member:	them member in returned object which is an lwq_node.
+ *
+ * Remove a single object from the lwq and return it.  This will take
+ * a spinlock and so must always be called in the same context, typcially
+ * process contet.
+ */
+#define lwq_dequeue(q, type, member)					\
+	({ struct llist_node *_n = __lwq_dequeue(q);			\
+	  _n ? container_of(_n, type, member.node) : NULL; })
+
+struct llist_node *lwq_dequeue_all(struct lwq *q);
+
+/**
+ * lwq_for_each_safe - iterate over detached queue allowing deletion
+ * @_n:		iterator variable
+ * @_t1:	temporary struct llist_node **
+ * @_t2:	temporary struct llist_node *
+ * @_l:		address of llist_node pointer from lwq_dequeue_all()
+ * @_member:	member in _n where lwq_node is found.
+ *
+ * Iterate over members in a dequeued list.  If the iterator variable
+ * is set to NULL, the iterator removes that entry from the queue.
+ */
+#define lwq_for_each_safe(_n, _t1, _t2, _l, _member)			\
+	for (_t1 = (_l);						\
+	     *(_t1) ? (_n = container_of(*(_t1), typeof(*(_n)), _member.node),\
+		       _t2 = ((*_t1)->next),				\
+		       true)						\
+	     : false;							\
+	     (_n) ? (_t1 = &(_n)->_member.node.next, 0)			\
+	     : ((*(_t1) = (_t2)),  0))
+
+/**
+ * lwq_enqueue - add a new item to the end of the queue
+ * @n	- the lwq_node embedded in the item to be added
+ * @q	- the lwq to append to.
+ *
+ * No locking is needed to append to the queue so this can
+ * be called from any context.
+ * Return %true is the list may have previously been empty.
+ */
+static inline bool lwq_enqueue(struct lwq_node *n, struct lwq *q)
+{
+	/* acquire enqures ordering wrt lwq_dequeue */
+	return llist_add(&n->node, &q->new) &&
+		smp_load_acquire(&q->ready) == NULL;
+}
+
+/**
+ * lwq_enqueue_batch - add a list of new items to the end of the queue
+ * @n	- the lwq_node embedded in the first item to be added
+ * @q	- the lwq to append to.
+ *
+ * No locking is needed to append to the queue so this can
+ * be called from any context.
+ * Return %true is the list may have previously been empty.
+ */
+static inline bool lwq_enqueue_batch(struct llist_node *n, struct lwq *q)
+{
+	struct llist_node *e = n;
+
+	/* acquire enqures ordering wrt lwq_dequeue */
+	return llist_add_batch(llist_reverse_order(n), e, &q->new) &&
+		smp_load_acquire(&q->ready) == NULL;
+}
+#endif /* LWQ_H */
-- 
cgit v1.2.3


From 5b80147e0c70181654e8e54eae99f69b2bf891b1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:56 -0400
Subject: SUNRPC: only have one thread waking up at a time

Currently if several items of work become available in quick succession,
that number of threads (if available) will be woken.  By the time some
of them wake up another thread that was already cache-warm might have
come along and completed the work.  Anecdotal evidence suggests as many
as 15% of wakes find nothing to do once they get to the point of
looking.

This patch changes svc_pool_wake_idle_thread() to wake the first thread
on the queue but NOT remove it.  Subsequent calls will wake the same
thread.  Once that thread starts it will dequeue itself and after
dequeueing some work to do, it will wake the next thread if there is more
work ready.  This results in a more orderly increase in the number of
busy threads.

As a bonus, this allows us to reduce locking around the idle queue.
svc_pool_wake_idle_thread() no longer needs to take a lock (beyond
rcu_read_lock()) as it doesn't manipulate the queue, it just looks at
the first item.

The thread itself can avoid locking by using the new
llist_del_first_this() interface.  This will safely remove the thread
itself if it is the head.  If it isn't the head, it will do nothing.
If multiple threads call this concurrently only one will succeed.  The
others will do nothing, so no corruption can result.

If a thread wakes up and finds that it cannot dequeue itself that means
either
- that it wasn't woken because it was the head of the queue.  Maybe the
  freezer woke it.  In that case it can go back to sleep (after trying
  to freeze of course).
- some other thread found there was nothing to do very recently, and
  placed itself on the head of the queue in front of this thread.
  It must check again after placing itself there, so it can be deemed to
  be responsible for any pending work, and this thread can go back to
  sleep until woken.

No code ever tests for busy threads any more.  Only each thread itself
cares if it is busy.  So svc_thread_busy() is no longer needed.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index ad4572630335..dafa362b4fdd 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -266,17 +266,6 @@ enum {
 	RQ_DATA,		/* request has data */
 };
 
-/**
- * svc_thread_busy - check if a thread as busy
- * @rqstp: the thread which might be busy
- *
- * A thread is only busy when it is not an the idle list.
- */
-static inline bool svc_thread_busy(const struct svc_rqst *rqstp)
-{
-	return !llist_on_list(&rqstp->rq_idle);
-}
-
 #define SVC_NET(rqst) (rqst->rq_xprt ? rqst->rq_xprt->xpt_net : rqst->rq_bc_net)
 
 /*
-- 
cgit v1.2.3


From 9a0e6accc0a8c3adf72f1b43be8019961b68663a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:40:02 -0400
Subject: SUNRPC: use lwq for sp_sockets - renamed to sp_xprts

lwq avoids using back pointers in lists, and uses less locking.
This introduces a new spinlock, but the other one will be removed in a
future patch.

For svc_clean_up_xprts(), we now dequeue the entire queue, walk it to
remove and process the xprts that need cleaning up, then re-enqueue the
remaining queue.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h      | 3 ++-
 include/linux/sunrpc/svc_xprt.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index dafa362b4fdd..7ff9fe785e49 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -17,6 +17,7 @@
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/auth.h>
 #include <linux/sunrpc/svcauth.h>
+#include <linux/lwq.h>
 #include <linux/wait.h>
 #include <linux/mm.h>
 #include <linux/pagevec.h>
@@ -34,7 +35,7 @@
 struct svc_pool {
 	unsigned int		sp_id;	    	/* pool id; also node id on NUMA */
 	spinlock_t		sp_lock;	/* protects all fields */
-	struct list_head	sp_sockets;	/* pending sockets */
+	struct lwq		sp_xprts;	/* pending transports */
 	unsigned int		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
 	struct llist_head	sp_idle_threads; /* idle server threads */
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index fa55d12dc765..8e20cd60e2e7 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -54,7 +54,7 @@ struct svc_xprt {
 	const struct svc_xprt_ops *xpt_ops;
 	struct kref		xpt_ref;
 	struct list_head	xpt_list;
-	struct list_head	xpt_ready;
+	struct lwq_node		xpt_ready;
 	unsigned long		xpt_flags;
 
 	struct svc_serv		*xpt_server;	/* service for transport */
-- 
cgit v1.2.3


From 2e8fc923fe476db8cab9b6458027eccb22f3b6e6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:40:09 -0400
Subject: SUNRPC: change sp_nrthreads to atomic_t

Using an atomic_t avoids the need to take a spinlock (which can soon be
removed).

Choosing a thread to kill needs to be careful as we cannot set the "die
now" bit atomically with the test on the count.  Instead we temporarily
increase the count.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 7ff9fe785e49..9d0fcd6148ae 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -36,7 +36,7 @@ struct svc_pool {
 	unsigned int		sp_id;	    	/* pool id; also node id on NUMA */
 	spinlock_t		sp_lock;	/* protects all fields */
 	struct lwq		sp_xprts;	/* pending transports */
-	unsigned int		sp_nrthreads;	/* # of threads in pool */
+	atomic_t		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
 	struct llist_head	sp_idle_threads; /* idle server threads */
 
-- 
cgit v1.2.3


From 580a25756a9f639180b29a508f3bdd24c50a936a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:40:15 -0400
Subject: SUNRPC: discard sp_lock

sp_lock is now only used to protect sp_all_threads.  This isn't needed
as sp_all_threads is only manipulated through svc_set_num_threads(),
which is already serialized.  Read-acccess only requires rcu_read_lock().
So no more locking is needed.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 9d0fcd6148ae..8ce1392c1a35 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -34,7 +34,6 @@
  */
 struct svc_pool {
 	unsigned int		sp_id;	    	/* pool id; also node id on NUMA */
-	spinlock_t		sp_lock;	/* protects all fields */
 	struct lwq		sp_xprts;	/* pending transports */
 	atomic_t		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
-- 
cgit v1.2.3


From 15d39883ee7dfc023d8a24f5d4b58100e1d04ad9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:40:22 -0400
Subject: SUNRPC: change the back-channel queue to lwq

This removes the need to store and update back-links in the list.
It also remove the need for the _bh version of spin_lock().

Signed-off-by: NeilBrown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h  | 3 +--
 include/linux/sunrpc/xprt.h | 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 8ce1392c1a35..c1feaf0d1542 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -90,10 +90,9 @@ struct svc_serv {
 	int			(*sv_threadfn)(void *data);
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
-	struct list_head	sv_cb_list;	/* queue for callback requests
+	struct lwq		sv_cb_list;	/* queue for callback requests
 						 * that arrive over the same
 						 * connection */
-	spinlock_t		sv_cb_lock;	/* protects the svc_cb_list */
 	bool			sv_bc_enabled;	/* service uses backchannel */
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 };
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 4ecc89301eb7..f85d3a0daca2 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -57,6 +57,7 @@ struct xprt_class;
 struct seq_file;
 struct svc_serv;
 struct net;
+#include <linux/lwq.h>
 
 /*
  * This describes a complete RPC request
@@ -121,7 +122,7 @@ struct rpc_rqst {
 	int			rq_ntrans;
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
-	struct list_head	rq_bc_list;	/* Callback service list */
+	struct lwq_node		rq_bc_list;	/* Callback service list */
 	unsigned long		rq_bc_pa_state;	/* Backchannel prealloc state */
 	struct list_head	rq_bc_pa_list;	/* Backchannel prealloc list */
 #endif /* CONFIG_SUNRPC_BACKCHANEL */
-- 
cgit v1.2.3


From bd9d6a3efa9709e653aafbeb859289feccb8e70c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 11 Sep 2023 14:49:46 +0200
Subject: NFSD: add rpc_status netlink support

Introduce rpc_status netlink support for NFSD in order to dump pending
RPC requests debugging information from userspace.

Closes: https://bugzilla.linux-nfs.org/show_bug.cgi?id=366
Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index c1feaf0d1542..b10f987509cc 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -251,6 +251,7 @@ struct svc_rqst {
 						 * net namespace
 						 */
 	void **			rq_lease_breaker; /* The v4 client breaking a lease */
+	unsigned int		rq_status_counter; /* RPC processing counter */
 };
 
 /* bits for rq_flags */
-- 
cgit v1.2.3


From 2929ba9b460c13625cb47ed4e2de134f966bdf17 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Mon, 18 Sep 2023 14:59:06 +0800
Subject: nfs: fix the typo of rfc number about xattr in NFSv4

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/nfs4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 730003c4f4af..b6fa923a3111 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -150,7 +150,7 @@ enum nfs_opnum4 {
 	OP_WRITE_SAME = 70,
 	OP_CLONE = 71,
 
-	/* xattr support (RFC8726) */
+	/* xattr support (RFC8276) */
 	OP_GETXATTR                = 72,
 	OP_SETXATTR                = 73,
 	OP_LISTXATTRS              = 74,
-- 
cgit v1.2.3


From 263453d9bb46ad42f03a0f86aafe580f1b0e9291 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Sep 2023 09:57:51 -0400
Subject: NFSD: Add nfsd4_encode_fattr4_change()

Refactor the encoder for FATTR4_CHANGE into a helper. In a
subsequent patch, this helper will be called from a bitmask loop.

The code is restructured a bit to use the modern xdr_stream flow,
and the encoded cinfo value is made const so that callers of the
encoders can be passed a const cinfo.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/iversion.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iversion.h b/include/linux/iversion.h
index f174ff1b59ee..8f972eaca2ed 100644
--- a/include/linux/iversion.h
+++ b/include/linux/iversion.h
@@ -256,7 +256,7 @@ inode_peek_iversion(const struct inode *inode)
  * For filesystems without any sort of change attribute, the best we can
  * do is fake one up from the ctime:
  */
-static inline u64 time_to_chattr(struct timespec64 *t)
+static inline u64 time_to_chattr(const struct timespec64 *t)
 {
 	u64 chattr = t->tv_sec;
 
-- 
cgit v1.2.3


From be46e695fb95c8acfb342f3182595aaad45301d5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Sep 2023 10:02:06 -0400
Subject: NFSD: Copy FATTR4 bit number definitions from RFCs

I'd like to convert nfsd4_encode_fattr() to rotate through the
attrmask using for_each_bit() instead of explicitly testing the
bitmask for each bit value. This means I need the bit numbers, as
defined in the specs, instead of our internal bitmask constants.

As a clean up, use the new spec-derived values to define the WORD#
bitmask constants.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/nfs4.h | 260 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 192 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index b6fa923a3111..c11c4db34639 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -389,79 +389,203 @@ enum lock_type4 {
 	NFS4_WRITEW_LT = 4
 };
 
+/*
+ * Symbol names and values are from RFC 7531 Section 2.
+ * "XDR Description of NFSv4.0"
+ */
+enum {
+	FATTR4_SUPPORTED_ATTRS		= 0,
+	FATTR4_TYPE			= 1,
+	FATTR4_FH_EXPIRE_TYPE		= 2,
+	FATTR4_CHANGE			= 3,
+	FATTR4_SIZE			= 4,
+	FATTR4_LINK_SUPPORT		= 5,
+	FATTR4_SYMLINK_SUPPORT		= 6,
+	FATTR4_NAMED_ATTR		= 7,
+	FATTR4_FSID			= 8,
+	FATTR4_UNIQUE_HANDLES		= 9,
+	FATTR4_LEASE_TIME		= 10,
+	FATTR4_RDATTR_ERROR		= 11,
+	FATTR4_ACL			= 12,
+	FATTR4_ACLSUPPORT		= 13,
+	FATTR4_ARCHIVE			= 14,
+	FATTR4_CANSETTIME		= 15,
+	FATTR4_CASE_INSENSITIVE		= 16,
+	FATTR4_CASE_PRESERVING		= 17,
+	FATTR4_CHOWN_RESTRICTED		= 18,
+	FATTR4_FILEHANDLE		= 19,
+	FATTR4_FILEID			= 20,
+	FATTR4_FILES_AVAIL		= 21,
+	FATTR4_FILES_FREE		= 22,
+	FATTR4_FILES_TOTAL		= 23,
+	FATTR4_FS_LOCATIONS		= 24,
+	FATTR4_HIDDEN			= 25,
+	FATTR4_HOMOGENEOUS		= 26,
+	FATTR4_MAXFILESIZE		= 27,
+	FATTR4_MAXLINK			= 28,
+	FATTR4_MAXNAME			= 29,
+	FATTR4_MAXREAD			= 30,
+	FATTR4_MAXWRITE			= 31,
+	FATTR4_MIMETYPE			= 32,
+	FATTR4_MODE			= 33,
+	FATTR4_NO_TRUNC			= 34,
+	FATTR4_NUMLINKS			= 35,
+	FATTR4_OWNER			= 36,
+	FATTR4_OWNER_GROUP		= 37,
+	FATTR4_QUOTA_AVAIL_HARD		= 38,
+	FATTR4_QUOTA_AVAIL_SOFT		= 39,
+	FATTR4_QUOTA_USED		= 40,
+	FATTR4_RAWDEV			= 41,
+	FATTR4_SPACE_AVAIL		= 42,
+	FATTR4_SPACE_FREE		= 43,
+	FATTR4_SPACE_TOTAL		= 44,
+	FATTR4_SPACE_USED		= 45,
+	FATTR4_SYSTEM			= 46,
+	FATTR4_TIME_ACCESS		= 47,
+	FATTR4_TIME_ACCESS_SET		= 48,
+	FATTR4_TIME_BACKUP		= 49,
+	FATTR4_TIME_CREATE		= 50,
+	FATTR4_TIME_DELTA		= 51,
+	FATTR4_TIME_METADATA		= 52,
+	FATTR4_TIME_MODIFY		= 53,
+	FATTR4_TIME_MODIFY_SET		= 54,
+	FATTR4_MOUNTED_ON_FILEID	= 55,
+};
+
+/*
+ * Symbol names and values are from RFC 5662 Section 2.
+ * "XDR Description of NFSv4.1"
+ */
+enum {
+	FATTR4_DIR_NOTIF_DELAY		= 56,
+	FATTR4_DIRENT_NOTIF_DELAY	= 57,
+	FATTR4_DACL			= 58,
+	FATTR4_SACL			= 59,
+	FATTR4_CHANGE_POLICY		= 60,
+	FATTR4_FS_STATUS		= 61,
+	FATTR4_FS_LAYOUT_TYPES		= 62,
+	FATTR4_LAYOUT_HINT		= 63,
+	FATTR4_LAYOUT_TYPES		= 64,
+	FATTR4_LAYOUT_BLKSIZE		= 65,
+	FATTR4_LAYOUT_ALIGNMENT		= 66,
+	FATTR4_FS_LOCATIONS_INFO	= 67,
+	FATTR4_MDSTHRESHOLD		= 68,
+	FATTR4_RETENTION_GET		= 69,
+	FATTR4_RETENTION_SET		= 70,
+	FATTR4_RETENTEVT_GET		= 71,
+	FATTR4_RETENTEVT_SET		= 72,
+	FATTR4_RETENTION_HOLD		= 73,
+	FATTR4_MODE_SET_MASKED		= 74,
+	FATTR4_SUPPATTR_EXCLCREAT	= 75,
+	FATTR4_FS_CHARSET_CAP		= 76,
+};
+
+/*
+ * Symbol names and values are from RFC 7863 Section 2.
+ * "XDR Description of NFSv4.2"
+ */
+enum {
+	FATTR4_CLONE_BLKSIZE		= 77,
+	FATTR4_SPACE_FREED		= 78,
+	FATTR4_CHANGE_ATTR_TYPE		= 79,
+	FATTR4_SEC_LABEL		= 80,
+};
+
+/*
+ * Symbol names and values are from RFC 8275 Section 5.
+ * "The mode_umask Attribute"
+ */
+enum {
+	FATTR4_MODE_UMASK		= 81,
+};
+
+/*
+ * Symbol names and values are from RFC 8276 Section 8.6.
+ * "Numeric Values Assigned to Protocol Extensions"
+ */
+enum {
+	FATTR4_XATTR_SUPPORT		= 82,
+};
+
+/*
+ * The following internal definitions enable processing the above
+ * attribute bits within 32-bit word boundaries.
+ */
 
 /* Mandatory Attributes */
-#define FATTR4_WORD0_SUPPORTED_ATTRS    (1UL << 0)
-#define FATTR4_WORD0_TYPE               (1UL << 1)
-#define FATTR4_WORD0_FH_EXPIRE_TYPE     (1UL << 2)
-#define FATTR4_WORD0_CHANGE             (1UL << 3)
-#define FATTR4_WORD0_SIZE               (1UL << 4)
-#define FATTR4_WORD0_LINK_SUPPORT       (1UL << 5)
-#define FATTR4_WORD0_SYMLINK_SUPPORT    (1UL << 6)
-#define FATTR4_WORD0_NAMED_ATTR         (1UL << 7)
-#define FATTR4_WORD0_FSID               (1UL << 8)
-#define FATTR4_WORD0_UNIQUE_HANDLES     (1UL << 9)
-#define FATTR4_WORD0_LEASE_TIME         (1UL << 10)
-#define FATTR4_WORD0_RDATTR_ERROR       (1UL << 11)
+#define FATTR4_WORD0_SUPPORTED_ATTRS    BIT(FATTR4_SUPPORTED_ATTRS)
+#define FATTR4_WORD0_TYPE               BIT(FATTR4_TYPE)
+#define FATTR4_WORD0_FH_EXPIRE_TYPE     BIT(FATTR4_FH_EXPIRE_TYPE)
+#define FATTR4_WORD0_CHANGE             BIT(FATTR4_CHANGE)
+#define FATTR4_WORD0_SIZE               BIT(FATTR4_SIZE)
+#define FATTR4_WORD0_LINK_SUPPORT       BIT(FATTR4_LINK_SUPPORT)
+#define FATTR4_WORD0_SYMLINK_SUPPORT    BIT(FATTR4_SYMLINK_SUPPORT)
+#define FATTR4_WORD0_NAMED_ATTR         BIT(FATTR4_NAMED_ATTR)
+#define FATTR4_WORD0_FSID               BIT(FATTR4_FSID)
+#define FATTR4_WORD0_UNIQUE_HANDLES     BIT(FATTR4_UNIQUE_HANDLES)
+#define FATTR4_WORD0_LEASE_TIME         BIT(FATTR4_LEASE_TIME)
+#define FATTR4_WORD0_RDATTR_ERROR       BIT(FATTR4_RDATTR_ERROR)
 /* Mandatory in NFSv4.1 */
-#define FATTR4_WORD2_SUPPATTR_EXCLCREAT (1UL << 11)
+#define FATTR4_WORD2_SUPPATTR_EXCLCREAT BIT(FATTR4_SUPPATTR_EXCLCREAT - 64)
 
 /* Recommended Attributes */
-#define FATTR4_WORD0_ACL                (1UL << 12)
-#define FATTR4_WORD0_ACLSUPPORT         (1UL << 13)
-#define FATTR4_WORD0_ARCHIVE            (1UL << 14)
-#define FATTR4_WORD0_CANSETTIME         (1UL << 15)
-#define FATTR4_WORD0_CASE_INSENSITIVE   (1UL << 16)
-#define FATTR4_WORD0_CASE_PRESERVING    (1UL << 17)
-#define FATTR4_WORD0_CHOWN_RESTRICTED   (1UL << 18)
-#define FATTR4_WORD0_FILEHANDLE         (1UL << 19)
-#define FATTR4_WORD0_FILEID             (1UL << 20)
-#define FATTR4_WORD0_FILES_AVAIL        (1UL << 21)
-#define FATTR4_WORD0_FILES_FREE         (1UL << 22)
-#define FATTR4_WORD0_FILES_TOTAL        (1UL << 23)
-#define FATTR4_WORD0_FS_LOCATIONS       (1UL << 24)
-#define FATTR4_WORD0_HIDDEN             (1UL << 25)
-#define FATTR4_WORD0_HOMOGENEOUS        (1UL << 26)
-#define FATTR4_WORD0_MAXFILESIZE        (1UL << 27)
-#define FATTR4_WORD0_MAXLINK            (1UL << 28)
-#define FATTR4_WORD0_MAXNAME            (1UL << 29)
-#define FATTR4_WORD0_MAXREAD            (1UL << 30)
-#define FATTR4_WORD0_MAXWRITE           (1UL << 31)
-#define FATTR4_WORD1_MIMETYPE           (1UL << 0)
-#define FATTR4_WORD1_MODE               (1UL << 1)
-#define FATTR4_WORD1_NO_TRUNC           (1UL << 2)
-#define FATTR4_WORD1_NUMLINKS           (1UL << 3)
-#define FATTR4_WORD1_OWNER              (1UL << 4)
-#define FATTR4_WORD1_OWNER_GROUP        (1UL << 5)
-#define FATTR4_WORD1_QUOTA_HARD         (1UL << 6)
-#define FATTR4_WORD1_QUOTA_SOFT         (1UL << 7)
-#define FATTR4_WORD1_QUOTA_USED         (1UL << 8)
-#define FATTR4_WORD1_RAWDEV             (1UL << 9)
-#define FATTR4_WORD1_SPACE_AVAIL        (1UL << 10)
-#define FATTR4_WORD1_SPACE_FREE         (1UL << 11)
-#define FATTR4_WORD1_SPACE_TOTAL        (1UL << 12)
-#define FATTR4_WORD1_SPACE_USED         (1UL << 13)
-#define FATTR4_WORD1_SYSTEM             (1UL << 14)
-#define FATTR4_WORD1_TIME_ACCESS        (1UL << 15)
-#define FATTR4_WORD1_TIME_ACCESS_SET    (1UL << 16)
-#define FATTR4_WORD1_TIME_BACKUP        (1UL << 17)
-#define FATTR4_WORD1_TIME_CREATE        (1UL << 18)
-#define FATTR4_WORD1_TIME_DELTA         (1UL << 19)
-#define FATTR4_WORD1_TIME_METADATA      (1UL << 20)
-#define FATTR4_WORD1_TIME_MODIFY        (1UL << 21)
-#define FATTR4_WORD1_TIME_MODIFY_SET    (1UL << 22)
-#define FATTR4_WORD1_MOUNTED_ON_FILEID  (1UL << 23)
-#define FATTR4_WORD1_DACL               (1UL << 26)
-#define FATTR4_WORD1_SACL               (1UL << 27)
-#define FATTR4_WORD1_FS_LAYOUT_TYPES    (1UL << 30)
-#define FATTR4_WORD2_LAYOUT_TYPES       (1UL << 0)
-#define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
-#define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
-#define FATTR4_WORD2_CLONE_BLKSIZE	(1UL << 13)
-#define FATTR4_WORD2_CHANGE_ATTR_TYPE	(1UL << 15)
-#define FATTR4_WORD2_SECURITY_LABEL     (1UL << 16)
-#define FATTR4_WORD2_MODE_UMASK		(1UL << 17)
-#define FATTR4_WORD2_XATTR_SUPPORT	(1UL << 18)
+#define FATTR4_WORD0_ACL                BIT(FATTR4_ACL)
+#define FATTR4_WORD0_ACLSUPPORT         BIT(FATTR4_ACLSUPPORT)
+#define FATTR4_WORD0_ARCHIVE            BIT(FATTR4_ARCHIVE)
+#define FATTR4_WORD0_CANSETTIME         BIT(FATTR4_CANSETTIME)
+#define FATTR4_WORD0_CASE_INSENSITIVE   BIT(FATTR4_CASE_INSENSITIVE)
+#define FATTR4_WORD0_CASE_PRESERVING    BIT(FATTR4_CASE_PRESERVING)
+#define FATTR4_WORD0_CHOWN_RESTRICTED   BIT(FATTR4_CHOWN_RESTRICTED)
+#define FATTR4_WORD0_FILEHANDLE         BIT(FATTR4_FILEHANDLE)
+#define FATTR4_WORD0_FILEID             BIT(FATTR4_FILEID)
+#define FATTR4_WORD0_FILES_AVAIL        BIT(FATTR4_FILES_AVAIL)
+#define FATTR4_WORD0_FILES_FREE         BIT(FATTR4_FILES_FREE)
+#define FATTR4_WORD0_FILES_TOTAL        BIT(FATTR4_FILES_TOTAL)
+#define FATTR4_WORD0_FS_LOCATIONS       BIT(FATTR4_FS_LOCATIONS)
+#define FATTR4_WORD0_HIDDEN             BIT(FATTR4_HIDDEN)
+#define FATTR4_WORD0_HOMOGENEOUS        BIT(FATTR4_HOMOGENEOUS)
+#define FATTR4_WORD0_MAXFILESIZE        BIT(FATTR4_MAXFILESIZE)
+#define FATTR4_WORD0_MAXLINK            BIT(FATTR4_MAXLINK)
+#define FATTR4_WORD0_MAXNAME            BIT(FATTR4_MAXNAME)
+#define FATTR4_WORD0_MAXREAD            BIT(FATTR4_MAXREAD)
+#define FATTR4_WORD0_MAXWRITE           BIT(FATTR4_MAXWRITE)
+
+#define FATTR4_WORD1_MIMETYPE           BIT(FATTR4_MIMETYPE - 32)
+#define FATTR4_WORD1_MODE               BIT(FATTR4_MODE	- 32)
+#define FATTR4_WORD1_NO_TRUNC           BIT(FATTR4_NO_TRUNC - 32)
+#define FATTR4_WORD1_NUMLINKS           BIT(FATTR4_NUMLINKS - 32)
+#define FATTR4_WORD1_OWNER              BIT(FATTR4_OWNER - 32)
+#define FATTR4_WORD1_OWNER_GROUP        BIT(FATTR4_OWNER_GROUP - 32)
+#define FATTR4_WORD1_QUOTA_HARD         BIT(FATTR4_QUOTA_AVAIL_HARD - 32)
+#define FATTR4_WORD1_QUOTA_SOFT         BIT(FATTR4_QUOTA_AVAIL_SOFT - 32)
+#define FATTR4_WORD1_QUOTA_USED         BIT(FATTR4_QUOTA_USED - 32)
+#define FATTR4_WORD1_RAWDEV             BIT(FATTR4_RAWDEV - 32)
+#define FATTR4_WORD1_SPACE_AVAIL        BIT(FATTR4_SPACE_AVAIL - 32)
+#define FATTR4_WORD1_SPACE_FREE         BIT(FATTR4_SPACE_FREE - 32)
+#define FATTR4_WORD1_SPACE_TOTAL        BIT(FATTR4_SPACE_TOTAL - 32)
+#define FATTR4_WORD1_SPACE_USED         BIT(FATTR4_SPACE_USED - 32)
+#define FATTR4_WORD1_SYSTEM             BIT(FATTR4_SYSTEM - 32)
+#define FATTR4_WORD1_TIME_ACCESS        BIT(FATTR4_TIME_ACCESS - 32)
+#define FATTR4_WORD1_TIME_ACCESS_SET    BIT(FATTR4_TIME_ACCESS_SET - 32)
+#define FATTR4_WORD1_TIME_BACKUP        BIT(FATTR4_TIME_BACKUP - 32)
+#define FATTR4_WORD1_TIME_CREATE        BIT(FATTR4_TIME_CREATE - 32)
+#define FATTR4_WORD1_TIME_DELTA         BIT(FATTR4_TIME_DELTA - 32)
+#define FATTR4_WORD1_TIME_METADATA      BIT(FATTR4_TIME_METADATA - 32)
+#define FATTR4_WORD1_TIME_MODIFY        BIT(FATTR4_TIME_MODIFY - 32)
+#define FATTR4_WORD1_TIME_MODIFY_SET    BIT(FATTR4_TIME_MODIFY_SET - 32)
+#define FATTR4_WORD1_MOUNTED_ON_FILEID  BIT(FATTR4_MOUNTED_ON_FILEID - 32)
+#define FATTR4_WORD1_DACL               BIT(FATTR4_DACL	- 32)
+#define FATTR4_WORD1_SACL               BIT(FATTR4_SACL	- 32)
+#define FATTR4_WORD1_FS_LAYOUT_TYPES    BIT(FATTR4_FS_LAYOUT_TYPES - 32)
+
+#define FATTR4_WORD2_LAYOUT_TYPES       BIT(FATTR4_LAYOUT_TYPES - 64)
+#define FATTR4_WORD2_LAYOUT_BLKSIZE     BIT(FATTR4_LAYOUT_BLKSIZE - 64)
+#define FATTR4_WORD2_MDSTHRESHOLD       BIT(FATTR4_MDSTHRESHOLD	- 64)
+#define FATTR4_WORD2_CLONE_BLKSIZE	BIT(FATTR4_CLONE_BLKSIZE - 64)
+#define FATTR4_WORD2_CHANGE_ATTR_TYPE	BIT(FATTR4_CHANGE_ATTR_TYPE - 64)
+#define FATTR4_WORD2_SECURITY_LABEL     BIT(FATTR4_SEC_LABEL - 64)
+#define FATTR4_WORD2_MODE_UMASK		BIT(FATTR4_MODE_UMASK - 64)
+#define FATTR4_WORD2_XATTR_SUPPORT	BIT(FATTR4_XATTR_SUPPORT - 64)
 
 /* MDS threshold bitmap bits */
 #define THRESHOLD_RD                    (1UL << 0)
-- 
cgit v1.2.3


From 59de2a56d127890cc610f3896d5fc31887c54ac2 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Wed, 11 Oct 2023 13:58:24 +0300
Subject: usb: typec: Link enumerated USB devices with Type-C partner

Adding functions that USB hub code can use to inform the
Type-C class about connected USB devices.

Once taken into use, it will allow the Type-C port drivers
to power off components that are not needed, for example if
USB2 device is enumerated, everything that is only relevant
for USB3 (retimers, etc.), can be powered off.

This will also create a symlink "typec" for the USB devices
pointing to the USB Type-C partner device.

Suggested-by: Benson Leung <bleung@chromium.org>
Tested-by: Benson Leung <bleung@chromium.org>
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20231011105825.320062-2-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/typec.h | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index 8fa781207970..a05d6f6f2536 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -202,6 +202,8 @@ struct typec_cable_desc {
  * @accessory: Audio, Debug or none.
  * @identity: Discover Identity command data
  * @pd_revision: USB Power Delivery Specification Revision if supported
+ * @attach: Notification about attached USB device
+ * @deattach: Notification about removed USB device
  *
  * Details about a partner that is attached to USB Type-C port. If @identity
  * member exists when partner is registered, a directory named "identity" is
@@ -217,6 +219,9 @@ struct typec_partner_desc {
 	enum typec_accessory	accessory;
 	struct usb_pd_identity	*identity;
 	u16			pd_revision; /* 0300H = "3.0" */
+
+	void (*attach)(struct typec_partner *partner, struct device *dev);
+	void (*deattach)(struct typec_partner *partner, struct device *dev);
 };
 
 /**
@@ -335,4 +340,36 @@ int typec_port_set_usb_power_delivery(struct typec_port *port, struct usb_power_
 int typec_partner_set_usb_power_delivery(struct typec_partner *partner,
 					 struct usb_power_delivery *pd);
 
+/**
+ * struct typec_connector - Representation of Type-C port for external drivers
+ * @attach: notification about device removal
+ * @deattach: notification about device removal
+ *
+ * Drivers that control the USB and other ports (DisplayPorts, etc.), that are
+ * connected to the Type-C connectors, can use these callbacks to inform the
+ * Type-C connector class about connections and disconnections. That information
+ * can then be used by the typec-port drivers to power on or off parts that are
+ * needed or not needed - as an example, in USB mode if USB2 device is
+ * enumerated, USB3 components (retimers, phys, and what have you) do not need
+ * to be powered on.
+ *
+ * The attached (enumerated) devices will be liked with the typec-partner device.
+ */
+struct typec_connector {
+	void (*attach)(struct typec_connector *con, struct device *dev);
+	void (*deattach)(struct typec_connector *con, struct device *dev);
+};
+
+static inline void typec_attach(struct typec_connector *con, struct device *dev)
+{
+	if (con && con->attach)
+		con->attach(con, dev);
+}
+
+static inline void typec_deattach(struct typec_connector *con, struct device *dev)
+{
+	if (con && con->deattach)
+		con->deattach(con, dev);
+}
+
 #endif /* __LINUX_USB_TYPEC_H */
-- 
cgit v1.2.3


From a5e80e18f268ea7c7a36bc4159de0deb3b5a2171 Mon Sep 17 00:00:00 2001
From: Zhiguo Niu <zhiguo.niu@unisoc.com>
Date: Mon, 16 Oct 2023 19:27:31 +0800
Subject: f2fs: fix error path of __f2fs_build_free_nids

If NAT is corrupted, let scan_nat_page() return EFSCORRUPTED, so that,
caller can set SBI_NEED_FSCK flag into checkpoint for later repair by
fsck.

Also, this patch introduces a new fscorrupted error flag, and in above
scenario, it will persist the error flag into superblock synchronously
to avoid it has no luck to trigger a checkpoint to record SBI_NEED_FSCK

Signed-off-by: Zhiguo Niu <zhiguo.niu@unisoc.com>
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/linux/f2fs_fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 07ed69c2840d..039fe0ce8d83 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -104,6 +104,7 @@ enum f2fs_error {
 	ERROR_CORRUPTED_VERITY_XATTR,
 	ERROR_CORRUPTED_XATTR,
 	ERROR_INVALID_NODE_REFERENCE,
+	ERROR_INCONSISTENT_NAT,
 	ERROR_MAX,
 };
 
-- 
cgit v1.2.3


From 65610453459f9048678a0daef89d592e412ec00a Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 21 Sep 2023 15:44:12 +0800
Subject: mm: memory: add vm_normal_folio_pmd()

Patch series "mm: convert numa balancing functions to use a folio", v2.

do_numa_pages() only handles non-compound pages, and only PMD-mapped THPs
are handled in do_huge_pmd_numa_page().  But a large, PTE-mapped folio
will be supported so let's convert more numa balancing functions to
use/take a folio in preparation for that, no functional change intended
for now.


This patch (of 6):

The new vm_normal_folio_pmd() wrapper is similar to vm_normal_folio(),
which allow them to completely replace the struct page variables with
struct folio variables.

Link: https://lkml.kernel.org/r/20230921074417.24004-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20230921074417.24004-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 126b54b45442..52c40b3d0813 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2327,6 +2327,8 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
+struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
+				  unsigned long addr, pmd_t pmd);
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 				pmd_t pmd);
 
-- 
cgit v1.2.3


From 75c70128a67311070115b90d826a229d4bbbb2b5 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 21 Sep 2023 15:44:16 +0800
Subject: mm: mempolicy: make mpol_misplaced() to take a folio

In preparation for large folio numa balancing, make mpol_misplaced() to
take a folio, no functional change intended.

Link: https://lkml.kernel.org/r/20230921074417.24004-6-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..6c2754d7bfed 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -174,7 +174,7 @@ extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
 /* Check if a vma is migratable */
 extern bool vma_migratable(struct vm_area_struct *vma);
 
-extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+int mpol_misplaced(struct folio *, struct vm_area_struct *, unsigned long);
 extern void mpol_put_task_policy(struct task_struct *);
 
 static inline bool mpol_is_preferred_many(struct mempolicy *pol)
@@ -278,7 +278,8 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
 }
 #endif
 
-static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+static inline int mpol_misplaced(struct folio *folio,
+				 struct vm_area_struct *vma,
 				 unsigned long address)
 {
 	return -1; /* no node preference */
-- 
cgit v1.2.3


From 8c9ae56dc73b5ae48a14000b96292bd4f2aeb710 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 21 Sep 2023 15:44:17 +0800
Subject: sched/numa, mm: make numa migrate functions to take a folio

The cpupid (or access time) is stored in the head page for THP, so it is
safely to make should_numa_migrate_memory() and numa_hint_fault_latency()
to take a folio.  This is in preparation for large folio numa balancing.

Link: https://lkml.kernel.org/r/20230921074417.24004-7-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/numa_balancing.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..06a9d35650f0 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -20,8 +20,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p, bool final);
-extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
-					int src_nid, int dst_cpu);
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
+				int src_nid, int dst_cpu);
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
@@ -38,7 +38,7 @@ static inline void task_numa_free(struct task_struct *p, bool final)
 {
 }
 static inline bool should_numa_migrate_memory(struct task_struct *p,
-				struct page *page, int src_nid, int dst_cpu)
+				struct folio *folio, int src_nid, int dst_cpu)
 {
 	return true;
 }
-- 
cgit v1.2.3


From 3c6f33b7273a7e2f2b2497b62c8400bd957b2fbe Mon Sep 17 00:00:00 2001
From: Stefan Roesch <shr@devkernel.io>
Date: Fri, 22 Sep 2023 14:11:40 -0700
Subject: mm/ksm: support fork/exec for prctl

Patch series "mm/ksm: add fork-exec support for prctl", v4.

A process can enable KSM with the prctl system call.  When the process is
forked the KSM flag is inherited by the child process.  However if the
process is executing an exec system call directly after the fork, the KSM
setting is cleared.  This patch series addresses this problem.

1) Change the mask in coredump.h for execing a new process
2) Add a new test case in ksm_functional_tests


This patch (of 2):

Today we have two ways to enable KSM:

1) madvise system call
   This allows to enable KSM for a memory region for a long time.

2) prctl system call
   This is a recent addition to enable KSM for the complete process.
   In addition when a process is forked, the KSM setting is inherited.

This change only affects the second case.

One of the use cases for (2) was to support the ability to enable
KSM for cgroups. This allows systemd to enable KSM for the seed
process. By enabling it in the seed process all child processes inherit
the setting.

This works correctly when the process is forked. However it doesn't
support fork/exec workflow.

From the previous cover letter:

....
Use case 3:
With the madvise call sharing opportunities are only enabled for the
current process: it is a workload-local decision. A considerable number
of sharing opportunities may exist across multiple workloads or jobs
(if they are part of the same security domain). Only a higler level
entity like a job scheduler or container can know for certain if its
running one or more instances of a job. That job scheduler however
doesn't have the necessary internal workload knowledge to make targeted
madvise calls.
....

In addition it can also be a bit surprising that fork keeps the KSM
setting and fork/exec does not.

Link: https://lkml.kernel.org/r/20230922211141.320789-1-shr@devkernel.io
Link: https://lkml.kernel.org/r/20230922211141.320789-2-shr@devkernel.io
Signed-off-by: Stefan Roesch <shr@devkernel.io>
Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process")
Reviewed-by: David Hildenbrand <david@redhat.com>
Reported-by: Carl Klemm <carl@uvos.xyz>
Tested-by: Carl Klemm <carl@uvos.xyz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/coredump.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 1b37fa8fc723..02f5090ffea2 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
 #define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
 #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
+#define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 #define MMF_OOM_REAP_QUEUED	25	/* mm was queued for oom_reaper */
 #define MMF_MULTIPROCESS	26	/* mm is shared between processes */
 /*
@@ -85,13 +86,15 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_HAS_MDWE		28
 #define MMF_HAS_MDWE_MASK	(1 << MMF_HAS_MDWE)
 
-#define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 
-#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
-				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
+#define MMF_HAS_MDWE_NO_INHERIT	29
 
-#define MMF_VM_MERGE_ANY	29
-#define MMF_HAS_MDWE_NO_INHERIT	30
+#define MMF_VM_MERGE_ANY	30
+#define MMF_VM_MERGE_ANY_MASK	(1 << MMF_VM_MERGE_ANY)
+
+#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
+				 MMF_VM_MERGE_ANY_MASK)
 
 static inline unsigned long mmf_init_flags(unsigned long flags)
 {
-- 
cgit v1.2.3


From a08c7193e4f18dc8508f2d07d0de2c5b94cb39a3 Mon Sep 17 00:00:00 2001
From: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Date: Tue, 26 Sep 2023 12:20:17 -0700
Subject: mm/filemap: remove hugetlb special casing in filemap.c

Remove special cased hugetlb handling code within the page cache by
changing the granularity of ->index to the base page size rather than the
huge page size.  The motivation of this patch is to reduce complexity
within the filemap code while also increasing performance by removing
branches that are evaluated on every page cache lookup.

To support the change in index, new wrappers for hugetlb page cache
interactions are added.  These wrappers perform the conversion to a linear
index which is now expected by the page cache for huge pages.

========================= PERFORMANCE ======================================

Perf was used to check the performance differences after the patch.
Overall the performance is similar to mainline with a very small larger
overhead that occurs in __filemap_add_folio() and
hugetlb_add_to_page_cache().  This is because of the larger overhead that
occurs in xa_load() and xa_store() as the xarray is now using more entries
to store hugetlb folios in the page cache.

Timing

aarch64
    2MB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-1 hugepages]# time fallocate -l 700GB test.txt
            real    1m49.568s
            user    0m0.000s
            sys     1m49.461s

        6.5-rc3:
            [root]# time fallocate -l 700GB test.txt
            real    1m47.495s
            user    0m0.000s
            sys     1m47.370s
    1GB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
            real    1m47.024s
            user    0m0.000s
            sys     1m46.921s

        6.5-rc3:
            [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
            real    1m44.551s
            user    0m0.000s
            sys     1m44.438s

x86
    2MB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-2 hugepages]# time fallocate -l 100GB test.txt
            real    0m22.383s
            user    0m0.000s
            sys     0m22.255s

        6.5-rc3:
            [opc@sidhakum-ol9-2 hugepages]$ time sudo fallocate -l 100GB /dev/hugepages/test.txt
            real    0m22.735s
            user    0m0.038s
            sys     0m22.567s

    1GB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-2 hugepages1GB]# time fallocate -l 100GB test.txt
            real    0m25.786s
            user    0m0.001s
            sys     0m25.589s

        6.5-rc3:
            [root@sidhakum-ol9-2 hugepages1G]# time fallocate -l 100GB test.txt
            real    0m33.454s
            user    0m0.001s
            sys     0m33.193s

aarch64:
    workload - fallocate a 700GB file backed by huge pages

    6.5-rc3 + this patch:
        2MB Page Size:
            --100.00%--__arm64_sys_fallocate
                          ksys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |
                          |--95.04%--__pi_clear_page
                          |
                          |--3.57%--clear_huge_page
                          |          |
                          |          |--2.63%--rcu_all_qs
                          |          |
                          |           --0.91%--__cond_resched
                          |
                           --0.67%--__cond_resched
            0.17%     0.00%             0  fallocate  [kernel.vmlinux]       [k] hugetlb_add_to_page_cache
            0.14%     0.10%            11  fallocate  [kernel.vmlinux]       [k] __filemap_add_folio

    6.5-rc3
        2MB Page Size:
                --100.00%--__arm64_sys_fallocate
                          ksys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |
                          |--94.91%--__pi_clear_page
                          |
                          |--4.11%--clear_huge_page
                          |          |
                          |          |--3.00%--rcu_all_qs
                          |          |
                          |           --1.10%--__cond_resched
                          |
                           --0.59%--__cond_resched
            0.08%     0.01%             1  fallocate  [kernel.kallsyms]  [k] hugetlb_add_to_page_cache
            0.05%     0.03%             3  fallocate  [kernel.kallsyms]  [k] __filemap_add_folio

x86
    workload - fallocate a 100GB file backed by huge pages

    6.5-rc3 + this patch:
        2MB Page Size:
            hugetlbfs_fallocate
            |
            --99.57%--clear_huge_page
                |
                --98.47%--clear_page_erms
                    |
                    --0.53%--asm_sysvec_apic_timer_interrupt

            0.04%     0.04%             1  fallocate  [kernel.kallsyms]     [k] xa_load
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] hugetlb_add_to_page_cache
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] __filemap_add_folio
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] xas_store

    6.5-rc3
        2MB Page Size:
                --99.93%--__x64_sys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |
                           --99.38%--clear_huge_page
                                     |
                                     |--98.40%--clear_page_erms
                                     |
                                      --0.59%--__cond_resched
            0.03%     0.03%             1  fallocate  [kernel.kallsyms]  [k] __filemap_add_folio

========================= TESTING ======================================

This patch passes libhugetlbfs tests and LTP hugetlb tests

********** TEST SUMMARY
*                      2M
*                      32-bit 64-bit
*     Total testcases:   110    113
*             Skipped:     0      0
*                PASS:   107    113
*                FAIL:     0      0
*    Killed by signal:     3      0
*   Bad configuration:     0      0
*       Expected FAIL:     0      0
*     Unexpected PASS:     0      0
*    Test not present:     0      0
* Strange test result:     0      0
**********

    Done executing testcases.
    LTP Version:  20220527-178-g2761a81c4

page migration was also tested using Mike Kravetz's test program.[8]

[dan.carpenter@linaro.org: fix an NULL vs IS_ERR() bug]
  Link: https://lkml.kernel.org/r/1772c296-1417-486f-8eef-171af2192681@moroto.mountain
Link: https://lkml.kernel.org/r/20230926192017.98183-1-sidhartha.kumar@oracle.com
Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reported-and-tested-by: syzbot+c225dea486da4d5592bd@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c225dea486da4d5592bd
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 12 ++++++++++++
 include/linux/pagemap.h | 32 ++------------------------------
 2 files changed, 14 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a30686e649f7..d935b0584f1d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -812,6 +812,12 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h)
 	return huge_page_size(h) / 512;
 }
 
+static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
+				struct address_space *mapping, pgoff_t idx)
+{
+	return filemap_lock_folio(mapping, idx << huge_page_order(h));
+}
+
 #include <asm/hugetlb.h>
 
 #ifndef is_hugepage_only_range
@@ -1008,6 +1014,12 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio
 	return NULL;
 }
 
+static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
+				struct address_space *mapping, pgoff_t idx)
+{
+	return NULL;
+}
+
 static inline int isolate_or_dissolve_huge_page(struct page *page,
 						struct list_head *list)
 {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 351c3b7f93a1..759b29d9a69a 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -789,9 +789,6 @@ static inline pgoff_t folio_next_index(struct folio *folio)
  */
 static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
 {
-	/* HugeTLBfs indexes the page cache in units of hpage_size */
-	if (folio_test_hugetlb(folio))
-		return &folio->page;
 	return folio_page(folio, index & (folio_nr_pages(folio) - 1));
 }
 
@@ -807,9 +804,6 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
  */
 static inline bool folio_contains(struct folio *folio, pgoff_t index)
 {
-	/* HugeTLBfs indexes the page cache in units of hpage_size */
-	if (folio_test_hugetlb(folio))
-		return folio->index == index;
 	return index - folio_index(folio) < folio_nr_pages(folio);
 }
 
@@ -867,10 +861,9 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping,
 }
 
 /*
- * Get index of the page within radix-tree (but not for hugetlb pages).
- * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
+ * Get the offset in PAGE_SIZE (even for hugetlb pages).
  */
-static inline pgoff_t page_to_index(struct page *page)
+static inline pgoff_t page_to_pgoff(struct page *page)
 {
 	struct page *head;
 
@@ -885,19 +878,6 @@ static inline pgoff_t page_to_index(struct page *page)
 	return head->index + page - head;
 }
 
-extern pgoff_t hugetlb_basepage_index(struct page *page);
-
-/*
- * Get the offset in PAGE_SIZE (even for hugetlb pages).
- * (TODO: hugetlb pages should have ->index in PAGE_SIZE)
- */
-static inline pgoff_t page_to_pgoff(struct page *page)
-{
-	if (unlikely(PageHuge(page)))
-		return hugetlb_basepage_index(page);
-	return page_to_index(page);
-}
-
 /*
  * Return byte-offset into filesystem object for page.
  */
@@ -934,24 +914,16 @@ static inline loff_t folio_file_pos(struct folio *folio)
 
 /*
  * Get the offset in PAGE_SIZE (even for hugetlb folios).
- * (TODO: hugetlb folios should have ->index in PAGE_SIZE)
  */
 static inline pgoff_t folio_pgoff(struct folio *folio)
 {
-	if (unlikely(folio_test_hugetlb(folio)))
-		return hugetlb_basepage_index(&folio->page);
 	return folio->index;
 }
 
-extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
-				     unsigned long address);
-
 static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 					unsigned long address)
 {
 	pgoff_t pgoff;
-	if (unlikely(is_vm_hugetlb_page(vma)))
-		return linear_hugepage_index(vma, address);
 	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
 	pgoff += vma->vm_pgoff;
 	return pgoff;
-- 
cgit v1.2.3


From 07a8bdd4120ced3490ef9adf51b8086af0aaa8e7 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 26 Sep 2023 14:06:25 +0800
Subject: memory tiering: add abstract distance calculation algorithms
 management

Patch series "memory tiering: calculate abstract distance based on ACPI
HMAT", v4.

We have the explicit memory tiers framework to manage systems with
multiple types of memory, e.g., DRAM in DIMM slots and CXL memory devices.
Where, same kind of memory devices will be grouped into memory types,
then put into memory tiers.  To describe the performance of a memory type,
abstract distance is defined.  Which is in direct proportion to the memory
latency and inversely proportional to the memory bandwidth.  To keep the
code as simple as possible, fixed abstract distance is used in dax/kmem to
describe slow memory such as Optane DCPMM.

To support more memory types, in this series, we added the abstract
distance calculation algorithm management mechanism, provided a algorithm
implementation based on ACPI HMAT, and used the general abstract distance
calculation interface in dax/kmem driver.  So, dax/kmem can support HBM
(high bandwidth memory) in addition to the original Optane DCPMM.


This patch (of 4):

The abstract distance may be calculated by various drivers, such as ACPI
HMAT, CXL CDAT, etc.  While it may be used by various code which hot-add
memory node, such as dax/kmem etc.  To decouple the algorithm users and
the providers, the abstract distance calculation algorithms management
mechanism is implemented in this patch.  It provides interface for the
providers to register the implementation, and interface for the users.

Multiple algorithm implementations can cooperate via calculating abstract
distance for different memory nodes.  The preference of algorithm
implementations can be specified via priority (notifier_block.priority).

Link: https://lkml.kernel.org/r/20230926060628.265989-1-ying.huang@intel.com
Link: https://lkml.kernel.org/r/20230926060628.265989-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Tested-by: Bharata B Rao <bharata@amd.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Rafael J Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory-tiers.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 4fa178b50784..fddc59990644 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -6,6 +6,7 @@
 #include <linux/nodemask.h>
 #include <linux/kref.h>
 #include <linux/mmzone.h>
+#include <linux/notifier.h>
 /*
  * Each tier cover a abstrace distance chunk size of 128
  */
@@ -36,6 +37,9 @@ struct memory_dev_type *alloc_memory_type(int adistance);
 void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+int register_mt_adistance_algorithm(struct notifier_block *nb);
+int unregister_mt_adistance_algorithm(struct notifier_block *nb);
+int mt_calc_adistance(int node, int *adist);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -97,5 +101,20 @@ static inline bool node_is_toptier(int node)
 {
 	return true;
 }
+
+static inline int register_mt_adistance_algorithm(struct notifier_block *nb)
+{
+	return 0;
+}
+
+static inline int unregister_mt_adistance_algorithm(struct notifier_block *nb)
+{
+	return 0;
+}
+
+static inline int mt_calc_adistance(int node, int *adist)
+{
+	return NOTIFY_DONE;
+}
 #endif	/* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
-- 
cgit v1.2.3


From 3718c02dbd4c88d47b5af003acdb3d1112604ea3 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 26 Sep 2023 14:06:27 +0800
Subject: acpi, hmat: calculate abstract distance with HMAT

A memory tiering abstract distance calculation algorithm based on ACPI
HMAT is implemented.  The basic idea is as follows.

The performance attributes of system default DRAM nodes are recorded as
the base line.  Whose abstract distance is MEMTIER_ADISTANCE_DRAM.  Then,
the ratio of the abstract distance of a memory node (target) to
MEMTIER_ADISTANCE_DRAM is scaled based on the ratio of the performance
attributes of the node to that of the default DRAM nodes.

The functions to record the read/write latency/bandwidth of the default
DRAM nodes and calculate abstract distance according to read/write
latency/bandwidth ratio will be used by CXL CDAT (Coherent Device
Attribute Table) and other memory device drivers.  So, they are put in
memory-tiers.c.

Link: https://lkml.kernel.org/r/20230926060628.265989-4-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Tested-by: Bharata B Rao <bharata@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Rafael J Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory-tiers.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index fddc59990644..59d15c6d3c0d 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -31,8 +31,11 @@ struct memory_dev_type {
 	struct kref kref;
 };
 
+struct node_hmem_attrs;
+
 #ifdef CONFIG_NUMA
 extern bool numa_demotion_enabled;
+extern struct memory_dev_type *default_dram_type;
 struct memory_dev_type *alloc_memory_type(int adistance);
 void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
@@ -40,6 +43,9 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype);
 int register_mt_adistance_algorithm(struct notifier_block *nb);
 int unregister_mt_adistance_algorithm(struct notifier_block *nb);
 int mt_calc_adistance(int node, int *adist);
+int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+			     const char *source);
+int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -64,6 +70,7 @@ static inline bool node_is_toptier(int node)
 #else
 
 #define numa_demotion_enabled	false
+#define default_dram_type	NULL
 /*
  * CONFIG_NUMA implementation returns non NULL error.
  */
@@ -116,5 +123,16 @@ static inline int mt_calc_adistance(int node, int *adist)
 {
 	return NOTIFY_DONE;
 }
+
+static inline int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+					   const char *source)
+{
+	return -EIO;
+}
+
+static inline int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
+{
+	return -EIO;
+}
 #endif	/* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
-- 
cgit v1.2.3


From 6bc2cfdf82d56863b7cf5e86e37a662b2ae5d47e Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 26 Sep 2023 14:06:28 +0800
Subject: dax, kmem: calculate abstract distance with general interface

Previously, a fixed abstract distance MEMTIER_DEFAULT_DAX_ADISTANCE is
used for slow memory type in kmem driver.  This limits the usage of kmem
driver, for example, it cannot be used for HBM (high bandwidth memory).

So, we use the general abstract distance calculation mechanism in kmem
drivers to get more accurate abstract distance on systems with proper
support.  The original MEMTIER_DEFAULT_DAX_ADISTANCE is used as fallback
only.

Now, multiple memory types may be managed by kmem.  These memory types are
put into the "kmem_memory_types" list and protected by
kmem_memory_type_lock.

Link: https://lkml.kernel.org/r/20230926060628.265989-5-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Tested-by: Bharata B Rao <bharata@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Rafael J Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory-tiers.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 59d15c6d3c0d..1e39d27bee41 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -24,6 +24,8 @@ struct memory_tier;
 struct memory_dev_type {
 	/* list of memory types that are part of same tier as this type */
 	struct list_head tier_sibling;
+	/* list of memory types that are managed by one driver */
+	struct list_head list;
 	/* abstract distance for this specific memory type */
 	int adistance;
 	/* Nodes of same abstract distance */
-- 
cgit v1.2.3


From 6cb42f91aa6dfd10fd847c469caebe63b35141ff Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 24 Sep 2023 19:38:17 -0700
Subject: bitmap: move bitmap_*_region() functions to bitmap.h

Now that bitmap_*_region() functions are implemented as thin wrappers
around others, it's worth to move them to the header, as it opens room
for compile-time optimizations.

CC: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
CC: Rasmus Villemoes <linux@rasmusvillemoes.dk>
CC: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap.h | 64 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 61 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 1cca950a54ae..99451431e4d6 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -6,6 +6,7 @@
 
 #include <linux/align.h>
 #include <linux/bitops.h>
+#include <linux/errno.h>
 #include <linux/find.h>
 #include <linux/limits.h>
 #include <linux/string.h>
@@ -209,9 +210,6 @@ void bitmap_onto(unsigned long *dst, const unsigned long *orig,
 		const unsigned long *relmap, unsigned int bits);
 void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 		unsigned int sz, unsigned int nbits);
-int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
-void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
-int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
 
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
@@ -497,6 +495,66 @@ static inline void bitmap_next_set_region(unsigned long *bitmap,
 	*re = find_next_zero_bit(bitmap, end, *rs + 1);
 }
 
+/**
+ * bitmap_release_region - release allocated bitmap region
+ *	@bitmap: array of unsigned longs corresponding to the bitmap
+ *	@pos: beginning of bit region to release
+ *	@order: region size (log base 2 of number of bits) to release
+ *
+ * This is the complement to __bitmap_find_free_region() and releases
+ * the found region (by clearing it in the bitmap).
+ */
+static inline void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order)
+{
+	bitmap_clear(bitmap, pos, BIT(order));
+}
+
+/**
+ * bitmap_allocate_region - allocate bitmap region
+ *	@bitmap: array of unsigned longs corresponding to the bitmap
+ *	@pos: beginning of bit region to allocate
+ *	@order: region size (log base 2 of number of bits) to allocate
+ *
+ * Allocate (set bits in) a specified region of a bitmap.
+ *
+ * Returns: 0 on success, or %-EBUSY if specified region wasn't
+ * free (not all bits were zero).
+ */
+static inline int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order)
+{
+	unsigned int len = BIT(order);
+
+	if (find_next_bit(bitmap, pos + len, pos) < pos + len)
+		return -EBUSY;
+	bitmap_set(bitmap, pos, len);
+	return 0;
+}
+
+/**
+ * bitmap_find_free_region - find a contiguous aligned mem region
+ *	@bitmap: array of unsigned longs corresponding to the bitmap
+ *	@bits: number of bits in the bitmap
+ *	@order: region size (log base 2 of number of bits) to find
+ *
+ * Find a region of free (zero) bits in a @bitmap of @bits bits and
+ * allocate them (set them to one).  Only consider regions of length
+ * a power (@order) of two, aligned to that power of two, which
+ * makes the search algorithm much faster.
+ *
+ * Returns: the bit offset in bitmap of the allocated region,
+ * or -errno on failure.
+ */
+static inline int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
+{
+	unsigned int pos, end;		/* scans bitmap by regions of size order */
+
+	for (pos = 0; (end = pos + BIT(order)) <= bits; pos = end) {
+		if (!bitmap_allocate_region(bitmap, pos, order))
+			return pos;
+	}
+	return -ENOMEM;
+}
+
 /**
  * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap.
  * @n: u64 value
-- 
cgit v1.2.3


From 90de47f020db086f7929e09f64efd0cf627d6869 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 13 Oct 2023 14:48:21 +0800
Subject: page_pool: fragment API support for 32-bit arch with 64-bit DMA

Currently page_pool_alloc_frag() is not supported in 32-bit
arch with 64-bit DMA because of the overlap issue between
pp_frag_count and dma_addr_upper in 'struct page' for those
arches, which seems to be quite common, see [1], which means
driver may need to handle it when using fragment API.

It is assumed that the combination of the above arch with an
address space >16TB does not exist, as all those arches have
64b equivalent, it seems logical to use the 64b version for a
system with a large address space. It is also assumed that dma
address is page aligned when we are dma mapping a page aligned
buffer, see [2].

That means we're storing 12 bits of 0 at the lower end for a
dma address, we can reuse those bits for the above arches to
support 32b+12b, which is 16TB of memory.

If we make a wrong assumption, a warning is emitted so that
user can report to us.

1. https://lore.kernel.org/all/20211117075652.58299-1-linyunsheng@huawei.com/
2. https://lore.kernel.org/all/20230818145145.4b357c89@kernel.org/

Tested-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Lorenzo Bianconi <lorenzo@kernel.org>
CC: Alexander Duyck <alexander.duyck@gmail.com>
CC: Liang Chen <liangchen.linux@gmail.com>
CC: Guillaume Tucker <guillaume.tucker@collabora.com>
CC: Matthew Wilcox <willy@infradead.org>
CC: Linux-MM <linux-mm@kvack.org>
Link: https://lore.kernel.org/r/20231013064827.61135-2-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mm_types.h | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 36c5b43999e6..74b49c4c7a52 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -125,18 +125,7 @@ struct page {
 			struct page_pool *pp;
 			unsigned long _pp_mapping_pad;
 			unsigned long dma_addr;
-			union {
-				/**
-				 * dma_addr_upper: might require a 64-bit
-				 * value on 32-bit architectures.
-				 */
-				unsigned long dma_addr_upper;
-				/**
-				 * For frag page support, not supported in
-				 * 32-bit architectures with 64-bit DMA.
-				 */
-				atomic_long_t pp_frag_count;
-			};
+			atomic_long_t pp_frag_count;
 		};
 		struct {	/* Tail pages of compound page */
 			unsigned long compound_head;	/* Bit zero is set */
-- 
cgit v1.2.3


From acfc788233263fee9413434d39d4201a8de592ba Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 9 Oct 2023 23:18:41 +0200
Subject: vgacon: remove screen_info dependency

The vga console driver is fairly self-contained, and only used by
architectures that explicitly initialize the screen_info settings.

Chance every instance that picks the vga console by setting conswitchp
to call a function instead, and pass a reference to the screen_info
there.

Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: Khalid Azzi <khalid@gonehiking.org>
Acked-by: Helge Deller <deller@gmx.de>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20231009211845.3136536-6-arnd@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/console.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 7de11c763eb3..5ff6f11c47b1 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -101,6 +101,13 @@ extern const struct consw dummy_con;	/* dummy console buffer */
 extern const struct consw vga_con;	/* VGA text console */
 extern const struct consw newport_con;	/* SGI Newport console  */
 
+struct screen_info;
+#ifdef CONFIG_VGA_CONSOLE
+void vgacon_register_screen(struct screen_info *si);
+#else
+static inline void vgacon_register_screen(struct screen_info *si) { }
+#endif
+
 int con_is_bound(const struct consw *csw);
 int do_unregister_con_driver(const struct consw *csw);
 int do_take_over_console(const struct consw *sw, int first, int last, int deflt);
-- 
cgit v1.2.3


From 545a4f89cad5bd349522d17558b3a4208648e20e Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 12 Oct 2023 09:42:56 +0300
Subject: printk: Check valid console index for preferred console

Let's check for valid console index values for preferred console to avoid
bogus console index numbers from kernel command line.

Let's also return an error for negative index numbers for the preferred
console. Unlike for device drivers, a negative index is not valid for the
preferred console.

Let's also constify idx while at it.

Signed-off-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20231012064300.50221-1-tony@atomide.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/console.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 5ff6f11c47b1..1be13c9695e0 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -347,7 +347,7 @@ enum con_flush_mode {
 	CONSOLE_REPLAY_ALL,
 };
 
-extern int add_preferred_console(char *name, int idx, char *options);
+extern int add_preferred_console(char *name, const short idx, char *options);
 extern void console_force_preferred_locked(struct console *con);
 extern void register_console(struct console *);
 extern int unregister_console(struct console *);
-- 
cgit v1.2.3


From 1e3c8526918403a4cebbd67bcd18443bf68df939 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 12 Oct 2023 09:42:57 +0300
Subject: printk: Constify name for add_preferred_console()

While adding a preferred console handling for serial_core for serial port
hardware based device addressing, Jiri suggested we constify name for
add_preferred_console(). The name gets copied anyways. This allows serial
core to add a preferred console using serial drv->dev_name without copying
it.

Note that constifying options causes changes all over the place because of
struct console for match().

Suggested-by: Jiri Slaby <jirislaby@kernel.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20231012064300.50221-2-tony@atomide.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/console.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 1be13c9695e0..eddf4c5279b5 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -347,7 +347,7 @@ enum con_flush_mode {
 	CONSOLE_REPLAY_ALL,
 };
 
-extern int add_preferred_console(char *name, const short idx, char *options);
+extern int add_preferred_console(const char *name, const short idx, char *options);
 extern void console_force_preferred_locked(struct console *con);
 extern void register_console(struct console *);
 extern int unregister_console(struct console *);
-- 
cgit v1.2.3


From c5b5831f3c168a1bf1172159c4956e721a0064cf Mon Sep 17 00:00:00 2001
From: Fabien Parent <fparent@baylibre.com>
Date: Mon, 18 Sep 2023 11:37:52 +0200
Subject: pmdomain: mediatek: Add support for MT8365

Add the needed board data to support MT8365 SoC.

Signed-off-by: Fabien Parent <fparent@baylibre.com>
Signed-off-by: Markus Schneider-Pargmann <msp@baylibre.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Alexandre Mergnat <amergnat@baylibre.com>
Tested-by: Alexandre Mergnat <amergnat@baylibre.com>
Link: https://lore.kernel.org/r/20230918093751.1188668-9-msp@baylibre.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/soc/mediatek/infracfg.h | 41 +++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h
index 07f67b3d8e97..6c6cccc848f4 100644
--- a/include/linux/soc/mediatek/infracfg.h
+++ b/include/linux/soc/mediatek/infracfg.h
@@ -2,6 +2,47 @@
 #ifndef __SOC_MEDIATEK_INFRACFG_H
 #define __SOC_MEDIATEK_INFRACFG_H
 
+#define MT8365_INFRA_TOPAXI_PROTECTEN_STA1				0x228
+#define MT8365_INFRA_TOPAXI_PROTECTEN_SET				0x2a0
+#define MT8365_INFRA_TOPAXI_PROTECTEN_CLR				0x2a4
+#define MT8365_INFRA_TOPAXI_PROTECTEN_MM_M0				BIT(1)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_MDMCU_M1				BIT(2)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_MMAPB_S				BIT(6)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_MM2INFRA_AXI_GALS_SLV_0		BIT(10)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_MM2INFRA_AXI_GALS_SLV_1		BIT(11)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_AP2CONN_AHB			BIT(13)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_CONN2INFRA_AHB			BIT(14)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_MFG_M0				BIT(21)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_INFRA2MFG			BIT(22)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_STA1				0x258
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_SET				0x2a8
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_CLR				0x2ac
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_APU2AP				BIT(2)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_MM2INFRA_AXI_GALS_MST_0	BIT(16)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_MM2INFRA_AXI_GALS_MST_1	BIT(17)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_CONN2INFRA_AXI_GALS_MST	BIT(18)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_CAM2MM_AXI_GALS_MST		BIT(19)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_APU_CBIP_GALS_MST		BIT(20)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_INFRA2CONN_AHB_GALS_SLV	BIT(21)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_PWRDNREQ_INFRA_GALS_ADB	BIT(24)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_PWRDNREQ_MP1_L2C_AFIFO		BIT(27)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_AUDIO_BUS_AUDIO_M		BIT(28)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_AUDIO_BUS_DSP_M		BIT(30)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_AUDIO_BUS_DSP_S		BIT(31)
+
+#define MT8365_INFRA_NAO_TOPAXI_SI0_STA					0x0
+#define MT8365_INFRA_NAO_TOPAXI_SI0_CTRL_UPDATED			BIT(24)
+#define MT8365_INFRA_NAO_TOPAXI_SI2_STA					0x28
+#define MT8365_INFRA_NAO_TOPAXI_SI2_CTRL_UPDATED			BIT(14)
+#define MT8365_INFRA_TOPAXI_SI0_CTL					0x200
+#define MT8365_INFRA_TOPAXI_SI0_WAY_EN_MMAPB_S				BIT(6)
+#define MT8365_INFRA_TOPAXI_SI2_CTL					0x234
+#define MT8365_INFRA_TOPAXI_SI2_WAY_EN_PERI_M1				BIT(5)
+
+#define MT8365_SMI_COMMON_CLAMP_EN			0x3c0
+#define MT8365_SMI_COMMON_CLAMP_EN_SET			0x3c4
+#define MT8365_SMI_COMMON_CLAMP_EN_CLR			0x3c8
+
 #define MT8195_TOP_AXI_PROT_EN_STA1                     0x228
 #define MT8195_TOP_AXI_PROT_EN_1_STA1                   0x258
 #define MT8195_TOP_AXI_PROT_EN_SET			0x2a0
-- 
cgit v1.2.3


From 0ac6486e5cbd1af682d3b52fcd3615247e13cb57 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Fri, 13 Oct 2023 12:06:24 +0800
Subject: i3c: Add support for bus enumeration & notification

This allows other drivers to be notified when new i3c busses are
attached, referring to a whole i3c bus as opposed to individual
devices.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/i3c/master.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 0b52da4f2346..db909ef79be4 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -24,6 +24,12 @@
 
 struct i2c_client;
 
+/* notifier actions. notifier call data is the struct i3c_bus */
+enum {
+	I3C_NOTIFY_BUS_ADD,
+	I3C_NOTIFY_BUS_REMOVE,
+};
+
 struct i3c_master_controller;
 struct i3c_bus;
 struct i3c_device;
@@ -652,4 +658,9 @@ void i3c_master_queue_ibi(struct i3c_dev_desc *dev, struct i3c_ibi_slot *slot);
 
 struct i3c_ibi_slot *i3c_master_get_free_ibi_slot(struct i3c_dev_desc *dev);
 
+void i3c_for_each_bus_locked(int (*fn)(struct i3c_bus *bus, void *data),
+			     void *data);
+int i3c_register_notifier(struct notifier_block *nb);
+int i3c_unregister_notifier(struct notifier_block *nb);
+
 #endif /* I3C_MASTER_H */
-- 
cgit v1.2.3


From 184fdf9fc7ae6ae7155768faa48fc609d1a24b7e Mon Sep 17 00:00:00 2001
From: Cuda-Chen <clh960524@gmail.com>
Date: Tue, 17 Oct 2023 13:37:03 +0800
Subject: locking/seqlock: Fix grammar in comment

The "neither writes before and after ..." for the description
of do_write_seqcount_end() should be "neither writes before nor after".

Signed-off-by: Cuda-Chen <clh960524@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231017053703.11312-1-clh960524@gmail.com
---
 include/linux/seqlock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 80f21d2ca2aa..e92f9d5577ba 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -584,7 +584,7 @@ static inline void do_write_seqcount_end(seqcount_t *s)
  * via WRITE_ONCE): a) to ensure the writes become visible to other threads
  * atomically, avoiding compiler optimizations; b) to document which writes are
  * meant to propagate to the reader critical section. This is necessary because
- * neither writes before and after the barrier are enclosed in a seq-writer
+ * neither writes before nor after the barrier are enclosed in a seq-writer
  * critical section that would ensure readers are aware of ongoing writes::
  *
  *	seqcount_t seq;
-- 
cgit v1.2.3


From 9a675ba55a96a45a9fb69e6a5c43f80c6682e541 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 16 Oct 2023 14:57:38 +0200
Subject: net, bpf: Add a warning if NAPI cb missed xdp_do_flush().
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A few drivers were missing a xdp_do_flush() invocation after
XDP_REDIRECT.

Add three helper functions each for one of the per-CPU lists. Return
true if the per-CPU list is non-empty and flush the list.

Add xdp_do_check_flushed() which invokes each helper functions and
creates a warning if one of the functions had a non-empty list.

Hide everything behind CONFIG_DEBUG_NET.

Suggested-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20231016125738.Yt79p1uF@linutronix.de
---
 include/linux/bpf.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d3c51a507508..b4b40b45962b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2478,6 +2478,9 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
 		     enum bpf_dynptr_type type, u32 offset, u32 size);
 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
+
+bool dev_check_flush(void);
+bool cpu_map_check_flush(void);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
-- 
cgit v1.2.3


From 96ff37ceb203426b1bcebbae42399686110b0130 Mon Sep 17 00:00:00 2001
From: Greg Joyce <gjoyce@linux.vnet.ibm.com>
Date: Wed, 4 Oct 2023 15:19:55 -0500
Subject: block:sed-opal: SED Opal keystore

Add read and write functions that allow SED Opal keys to stored
in a permanent keystore.

Signed-off-by: Greg Joyce <gjoyce@linux.vnet.ibm.com>
Reviewed-by: Jonathan Derrick <jonathan.derrick@linux.dev>
Link: https://lore.kernel.org/r/20231004201957.1451669-2-gjoyce@linux.vnet.ibm.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sed-opal-key.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 include/linux/sed-opal-key.h

(limited to 'include/linux')

diff --git a/include/linux/sed-opal-key.h b/include/linux/sed-opal-key.h
new file mode 100644
index 000000000000..0ca03054e8f6
--- /dev/null
+++ b/include/linux/sed-opal-key.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SED key operations.
+ *
+ * Copyright (C) 2023 IBM Corporation
+ *
+ * These are the accessor functions (read/write) for SED Opal
+ * keys. Specific keystores can provide overrides.
+ *
+ */
+
+#include <linux/kernel.h>
+
+#ifdef CONFIG_PSERIES_PLPKS_SED
+int sed_read_key(char *keyname, char *key, u_int *keylen);
+int sed_write_key(char *keyname, char *key, u_int keylen);
+#else
+static inline
+int sed_read_key(char *keyname, char *key, u_int *keylen) {
+	return -EOPNOTSUPP;
+}
+static inline
+int sed_write_key(char *keyname, char *key, u_int keylen) {
+	return -EOPNOTSUPP;
+}
+#endif
-- 
cgit v1.2.3


From 41efa431244f6498833ff8ee8dde28c4924c5479 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Tue, 17 Oct 2023 10:56:38 -0700
Subject: PCI/MSI: Provide stubs for IMS functions

The IMS related functions (pci_create_ims_domain(), pci_ims_alloc_irq(),
and pci_ims_free_irq()) are not declared when CONFIG_PCI_MSI is disabled.

Provide definitions of these functions for use when callers are compiled
with CONFIG_PCI_MSI disabled.

Fixes: 0194425af0c8 ("PCI/MSI: Provide IMS (Interrupt Message Store) support")
Fixes: c9e5bea27383 ("PCI/MSI: Provide pci_ims_alloc/free_irq()")
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/14ff656899a3757453f8584c1109d7a9b98fa258.1697564731.git.reinette.chatre@intel.com
---
 include/linux/pci.h | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8c7c2c3c6c65..b56417276042 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1624,6 +1624,8 @@ struct msix_entry {
 	u16	entry;	/* Driver uses to specify entry, OS writes */
 };
 
+struct msi_domain_template;
+
 #ifdef CONFIG_PCI_MSI
 int pci_msi_vec_count(struct pci_dev *dev);
 void pci_disable_msi(struct pci_dev *dev);
@@ -1656,6 +1658,11 @@ void pci_msix_free_irq(struct pci_dev *pdev, struct msi_map map);
 void pci_free_irq_vectors(struct pci_dev *dev);
 int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
 const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec);
+bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template,
+			   unsigned int hwsize, void *data);
+struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev, union msi_instance_cookie *icookie,
+				 const struct irq_affinity_desc *affdesc);
+void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map);
 
 #else
 static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; }
@@ -1719,6 +1726,25 @@ static inline const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev,
 {
 	return cpu_possible_mask;
 }
+
+static inline bool pci_create_ims_domain(struct pci_dev *pdev,
+					 const struct msi_domain_template *template,
+					 unsigned int hwsize, void *data)
+{ return false; }
+
+static inline struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev,
+					       union msi_instance_cookie *icookie,
+					       const struct irq_affinity_desc *affdesc)
+{
+	struct msi_map map = { .index = -ENOSYS, };
+
+	return map;
+}
+
+static inline void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map)
+{
+}
+
 #endif
 
 /**
@@ -2616,14 +2642,6 @@ static inline bool pci_is_thunderbolt_attached(struct pci_dev *pdev)
 void pci_uevent_ers(struct pci_dev *pdev, enum  pci_ers_result err_type);
 #endif
 
-struct msi_domain_template;
-
-bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template,
-			   unsigned int hwsize, void *data);
-struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev, union msi_instance_cookie *icookie,
-				 const struct irq_affinity_desc *affdesc);
-void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map);
-
 #include <linux/dma-mapping.h>
 
 #define pci_printk(level, pdev, fmt, arg...) \
-- 
cgit v1.2.3


From 3ebed3749f1767927229d568eea29daaf9f272d5 Mon Sep 17 00:00:00 2001
From: Mark O'Donovan <shiftee@posteo.net>
Date: Tue, 17 Oct 2023 17:09:17 +0000
Subject: nvme-auth: alloc nvme_dhchap_key as single buffer

Co-developed-by: Akash Appaiah <Akash.Appaiah@dell.com>
Signed-off-by: Akash Appaiah <Akash.Appaiah@dell.com>
Signed-off-by: Mark O'Donovan <shiftee@posteo.net>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme-auth.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h
index dcb8030062dd..a5ae9abe1ef6 100644
--- a/include/linux/nvme-auth.h
+++ b/include/linux/nvme-auth.h
@@ -9,9 +9,9 @@
 #include <crypto/kpp.h>
 
 struct nvme_dhchap_key {
-	u8 *key;
 	size_t len;
 	u8 hash;
+	u8 key[];
 };
 
 u32 nvme_auth_get_seqnum(void);
@@ -24,9 +24,11 @@ const char *nvme_auth_digest_name(u8 hmac_id);
 size_t nvme_auth_hmac_hash_len(u8 hmac_id);
 u8 nvme_auth_hmac_id(const char *hmac_name);
 
+u32 nvme_auth_key_struct_size(u32 key_len);
 struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
 					      u8 key_hash);
 void nvme_auth_free_key(struct nvme_dhchap_key *key);
+struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash);
 u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn);
 int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key);
 int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
-- 
cgit v1.2.3


From f047daed179a451657d1e66b5fe4030a593a000c Mon Sep 17 00:00:00 2001
From: Mark O'Donovan <shiftee@posteo.net>
Date: Tue, 17 Oct 2023 17:09:18 +0000
Subject: nvme-auth: use transformed key size to create resp

This does not change current behaviour as the driver currently
verifies that the secret size is the same size as the length of
the transformation hash.

Co-developed-by: Akash Appaiah <Akash.Appaiah@dell.com>
Signed-off-by: Akash Appaiah <Akash.Appaiah@dell.com>
Signed-off-by: Mark O'Donovan <shiftee@posteo.net>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme-auth.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h
index a5ae9abe1ef6..c1d0bc5d9624 100644
--- a/include/linux/nvme-auth.h
+++ b/include/linux/nvme-auth.h
@@ -29,7 +29,8 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
 					      u8 key_hash);
 void nvme_auth_free_key(struct nvme_dhchap_key *key);
 struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash);
-u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn);
+struct nvme_dhchap_key *nvme_auth_transform_key(
+				struct nvme_dhchap_key *key, char *nqn);
 int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key);
 int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
 				  u8 *challenge, u8 *aug, size_t hlen);
-- 
cgit v1.2.3


From b6f9774719e5601b32f47021b40fee446b356490 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 16 Oct 2023 16:42:53 +0100
Subject: net: phylink: provide mac_get_caps() method

Provide a new method, mac_get_caps() to get the MAC capabilities for
the specified interface mode. This is for MACs which have special
requirements, such as not supporting half-duplex in certain interface
modes, and will replace the validate() method.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/E1qsPk5-009wiX-G5@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phylink.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 2b886ea654bb..0798198a09ef 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -228,6 +228,7 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed);
 /**
  * struct phylink_mac_ops - MAC operations structure.
  * @validate: Validate and update the link configuration.
+ * @mac_get_caps: Get MAC capabilities for interface mode.
  * @mac_select_pcs: Select a PCS for the interface mode.
  * @mac_prepare: prepare for a major reconfiguration of the interface.
  * @mac_config: configure the MAC for the selected mode and state.
@@ -241,6 +242,8 @@ struct phylink_mac_ops {
 	void (*validate)(struct phylink_config *config,
 			 unsigned long *supported,
 			 struct phylink_link_state *state);
+	unsigned long (*mac_get_caps)(struct phylink_config *config,
+				      phy_interface_t interface);
 	struct phylink_pcs *(*mac_select_pcs)(struct phylink_config *config,
 					      phy_interface_t interface);
 	int (*mac_prepare)(struct phylink_config *config, unsigned int mode,
@@ -292,6 +295,18 @@ struct phylink_mac_ops {
  */
 void validate(struct phylink_config *config, unsigned long *supported,
 	      struct phylink_link_state *state);
+/**
+ * mac_get_caps: Get MAC capabilities for interface mode.
+ * @config: a pointer to a &struct phylink_config.
+ * @interface: PHY interface mode.
+ *
+ * Optional method. When not provided, config->mac_capabilities will be used.
+ * When implemented, this returns the MAC capabilities for the specified
+ * interface mode where there is some special handling required by the MAC
+ * driver (e.g. not supporting half-duplex in certain interface modes.)
+ */
+unsigned long mac_get_caps(struct phylink_config *config,
+			   phy_interface_t interface);
 /**
  * mac_select_pcs: Select a PCS for the interface mode.
  * @config: a pointer to a &struct phylink_config.
-- 
cgit v1.2.3


From da5f6b80ad6417f7cf50d273c1c0ecb885e511e8 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 16 Oct 2023 16:43:03 +0100
Subject: net: phylink: remove .validate() method

The MAC .validate() method is no longer used, so remove it from the
phylink_mac_ops structure, and remove the callsite in
phylink_validate_mac_and_pcs().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/E1qsPkF-009wij-QM@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phylink.h | 38 --------------------------------------
 1 file changed, 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 0798198a09ef..0cf559bae1ff 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -227,7 +227,6 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed);
 
 /**
  * struct phylink_mac_ops - MAC operations structure.
- * @validate: Validate and update the link configuration.
  * @mac_get_caps: Get MAC capabilities for interface mode.
  * @mac_select_pcs: Select a PCS for the interface mode.
  * @mac_prepare: prepare for a major reconfiguration of the interface.
@@ -239,9 +238,6 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed);
  * The individual methods are described more fully below.
  */
 struct phylink_mac_ops {
-	void (*validate)(struct phylink_config *config,
-			 unsigned long *supported,
-			 struct phylink_link_state *state);
 	unsigned long (*mac_get_caps)(struct phylink_config *config,
 				      phy_interface_t interface);
 	struct phylink_pcs *(*mac_select_pcs)(struct phylink_config *config,
@@ -261,40 +257,6 @@ struct phylink_mac_ops {
 };
 
 #if 0 /* For kernel-doc purposes only. */
-/**
- * validate - Validate and update the link configuration
- * @config: a pointer to a &struct phylink_config.
- * @supported: ethtool bitmask for supported link modes.
- * @state: a pointer to a &struct phylink_link_state.
- *
- * Clear bits in the @supported and @state->advertising masks that
- * are not supportable by the MAC.
- *
- * Note that the PHY may be able to transform from one connection
- * technology to another, so, eg, don't clear 1000BaseX just
- * because the MAC is unable to BaseX mode. This is more about
- * clearing unsupported speeds and duplex settings. The port modes
- * should not be cleared; phylink_set_port_modes() will help with this.
- *
- * When @config->supported_interfaces has been set, phylink will iterate
- * over the supported interfaces to determine the full capability of the
- * MAC. The validation function must not print errors if @state->interface
- * is set to an unexpected value.
- *
- * When @config->supported_interfaces is empty, phylink will call this
- * function with @state->interface set to %PHY_INTERFACE_MODE_NA, and
- * expects the MAC driver to return all supported link modes.
- *
- * If the @state->interface mode is not supported, then the @supported
- * mask must be cleared.
- *
- * This member is optional; if not set, the generic validator will be
- * used making use of @config->mac_capabilities and
- * @config->supported_interfaces to determine which link modes are
- * supported.
- */
-void validate(struct phylink_config *config, unsigned long *supported,
-	      struct phylink_link_state *state);
 /**
  * mac_get_caps: Get MAC capabilities for interface mode.
  * @config: a pointer to a &struct phylink_config.
-- 
cgit v1.2.3


From 743f6397623edc708eda76ecc20e70032166e573 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 16 Oct 2023 16:43:08 +0100
Subject: net: phylink: remove a bunch of unused validation methods

Remove exports for phylink_caps_to_linkmodes(),
phylink_get_capabilities(), phylink_validate_mask_caps() and
phylink_generic_validate(). Also, as phylink_generic_validate() is no
longer called, we can remove its implementation as well.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/E1qsPkK-009wip-W9@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phylink.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 0cf559bae1ff..875439ab45de 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -613,17 +613,6 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
 		 phy_interface_t interface, int speed, int duplex);
 #endif
 
-void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps);
-unsigned long phylink_get_capabilities(phy_interface_t interface,
-				       unsigned long mac_capabilities,
-				       int rate_matching);
-void phylink_validate_mask_caps(unsigned long *supported,
-				struct phylink_link_state *state,
-				unsigned long caps);
-void phylink_generic_validate(struct phylink_config *config,
-			      unsigned long *supported,
-			      struct phylink_link_state *state);
-
 struct phylink *phylink_create(struct phylink_config *,
 			       const struct fwnode_handle *,
 			       phy_interface_t,
-- 
cgit v1.2.3


From cf439721f66719c6aa73b5ea23320c2f8cba100f Mon Sep 17 00:00:00 2001
From: Alexander Usyskin <alexander.usyskin@intel.com>
Date: Wed, 11 Oct 2023 14:01:54 +0300
Subject: mei: bus: add send and recv api with timeout

Add variation of the send and recv functions on bus
that define timeout.
Caller can use such functions in flow that can stuck
to bail out and not to put down the whole system.

Signed-off-by: Alexander Usyskin <alexander.usyskin@intel.com>
Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Link: https://lore.kernel.org/r/20231011110157.247552-2-tomas.winkler@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mei_cl_bus.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index fd6e0620658d..b77cefd4f0c0 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -94,15 +94,23 @@ void mei_cldev_driver_unregister(struct mei_cl_driver *cldrv);
 
 ssize_t mei_cldev_send(struct mei_cl_device *cldev, const u8 *buf,
 		       size_t length);
+ssize_t mei_cldev_send_timeout(struct mei_cl_device *cldev, const u8 *buf,
+			       size_t length, unsigned long timeout);
 ssize_t mei_cldev_recv(struct mei_cl_device *cldev, u8 *buf, size_t length);
 ssize_t mei_cldev_recv_nonblock(struct mei_cl_device *cldev, u8 *buf,
 				size_t length);
+ssize_t mei_cldev_recv_timeout(struct mei_cl_device *cldev, u8 *buf, size_t length,
+			       unsigned long timeout);
 ssize_t mei_cldev_send_vtag(struct mei_cl_device *cldev, const u8 *buf,
 			    size_t length, u8 vtag);
+ssize_t mei_cldev_send_vtag_timeout(struct mei_cl_device *cldev, const u8 *buf,
+				    size_t length, u8 vtag, unsigned long timeout);
 ssize_t mei_cldev_recv_vtag(struct mei_cl_device *cldev, u8 *buf, size_t length,
 			    u8 *vtag);
 ssize_t mei_cldev_recv_nonblock_vtag(struct mei_cl_device *cldev, u8 *buf,
 				     size_t length, u8 *vtag);
+ssize_t mei_cldev_recv_vtag_timeout(struct mei_cl_device *cldev, u8 *buf, size_t length,
+			    u8 *vtag, unsigned long timeout);
 
 int mei_cldev_register_rx_cb(struct mei_cl_device *cldev, mei_cldev_cb_t rx_cb);
 int mei_cldev_register_notif_cb(struct mei_cl_device *cldev,
-- 
cgit v1.2.3


From ae4cb6bd5039a659d71632dbc2e176bf9227e294 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Wed, 11 Oct 2023 10:43:01 +0300
Subject: mei: docs: fix spelling errors

Fix spelling errors in the mei code base.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20231011074301.223879-4-tomas.winkler@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mei_cl_bus.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index b77cefd4f0c0..b38a56a13f39 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -31,11 +31,11 @@ typedef void (*mei_cldev_cb_t)(struct mei_cl_device *cldev);
  * @rx_work: async work to execute Rx event callback
  * @rx_cb: Drivers register this callback to get asynchronous ME
  *	Rx buffer pending notifications.
- * @notif_work: async work to execute FW notif event callback
+ * @notif_work: async work to execute FW notify event callback
  * @notif_cb: Drivers register this callback to get asynchronous ME
  *	FW notification pending notifications.
  *
- * @do_match: wheather device can be matched with a driver
+ * @do_match: whether the device can be matched with a driver
  * @is_added: device is already scanned
  * @priv_data: client private data
  */
-- 
cgit v1.2.3


From 8d8ae17eb0de1fcdff6e7ddee3b641a16eefe8f6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 16 Oct 2023 16:31:33 +0300
Subject: parport: Use kasprintf() instead of fixed buffer formatting

Improve readability and maintainability by replacing a hardcoded string
allocation and formatting by the use of the kasprintf() helper.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231016133135.1203643-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/parport.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/parport.h b/include/linux/parport.h
index 999eddd619b7..fff39bc30629 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -180,8 +180,6 @@ struct ieee1284_info {
 	struct semaphore irq;
 };
 
-#define PARPORT_NAME_MAX_LEN 15
-
 /* A parallel port */
 struct parport {
 	unsigned long base;	/* base address */
-- 
cgit v1.2.3


From e0d4593140b01b8da513a0c88c26da28b4906413 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 11 Oct 2023 09:59:38 +0200
Subject: netfilter: make nftables drops visible in net dropmonitor

net_dropmonitor blames core.c:nf_hook_slow.
Add NF_DROP_REASON() helper and use it in nft_do_chain().

The helper releases the skb, so exact drop location becomes
available. Calling code will observe the NF_STOLEN verdict
instead.

Adjust nf_hook_slow so we can embed an erro value wih
NF_STOLEN verdicts, just like we do for NF_DROP.

After this, drop in nftables can be pinpointed to a drop due
to a rule or the chain policy.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index d68644b7c299..80900d910992 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -22,6 +22,16 @@ static inline int NF_DROP_GETERR(int verdict)
 	return -(verdict >> NF_VERDICT_QBITS);
 }
 
+static __always_inline int
+NF_DROP_REASON(struct sk_buff *skb, enum skb_drop_reason reason, u32 err)
+{
+	BUILD_BUG_ON(err > 0xffff);
+
+	kfree_skb_reason(skb, reason);
+
+	return ((err << 16) | NF_STOLEN);
+}
+
 static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1,
 				   const union nf_inet_addr *a2)
 {
-- 
cgit v1.2.3


From 26c5334d344db7a9bffa1f5ce87d9045b5e90383 Mon Sep 17 00:00:00 2001
From: Paul Greenwalt <paul.greenwalt@intel.com>
Date: Sun, 15 Oct 2023 19:43:02 -0400
Subject: ethtool: Add forced speed to supported link modes maps

The need to map Ethtool forced speeds to Ethtool supported link modes is
common among drivers. To support this, add a common structure for forced
speed maps and a function to init them.  This is solution was originally
introduced in commit 1d4e4ecccb11 ("qede: populate supported link modes
maps on module init") for qede driver.

ethtool_forced_speed_maps_init() should be called during driver init
with an array of struct ethtool_forced_speed_map to populate the mapping.

Definitions for maps themselves are left in the driver code, as the sets
of supported link modes may vary between the devices.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Pawel Chmielewski <pawel.chmielewski@intel.com>
Signed-off-by: Paul Greenwalt <paul.greenwalt@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h  | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/linkmode.h | 29 +++++++++++++++--------------
 2 files changed, 52 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 62b61527bcc4..8e91e8b8a693 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -13,6 +13,7 @@
 #ifndef _LINUX_ETHTOOL_H
 #define _LINUX_ETHTOOL_H
 
+#include <linux/linkmode.h>
 #include <linux/bitmap.h>
 #include <linux/compat.h>
 #include <linux/if_ether.h>
@@ -1052,4 +1053,40 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add,
  * next string.
  */
 extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...);
+
+/* Link mode to forced speed capabilities maps */
+struct ethtool_forced_speed_map {
+	u32		speed;
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(caps);
+
+	const u32	*cap_arr;
+	u32		arr_size;
+};
+
+#define ETHTOOL_FORCED_SPEED_MAP(prefix, value)				\
+{									\
+	.speed		= SPEED_##value,				\
+	.cap_arr	= prefix##_##value,				\
+	.arr_size	= ARRAY_SIZE(prefix##_##value),			\
+}
+
+/**
+ * ethtool_forced_speed_maps_init
+ * @maps: Pointer to an array of Ethtool forced speed map
+ * @size: Array size
+ *
+ * Initialize an array of Ethtool forced speed map to Ethtool link modes. This
+ * should be called during driver module init.
+ */
+static inline void
+ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size)
+{
+	for (u32 i = 0; i < size; i++) {
+		struct ethtool_forced_speed_map *map = &maps[i];
+
+		linkmode_set_bit_array(map->cap_arr, map->arr_size, map->caps);
+		map->cap_arr = NULL;
+		map->arr_size = 0;
+	}
+}
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
index 15e0e0209da4..cd38f89553e6 100644
--- a/include/linux/linkmode.h
+++ b/include/linux/linkmode.h
@@ -2,6 +2,21 @@
 #define __LINKMODE_H
 
 #include <linux/bitmap.h>
+
+static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
+{
+	__set_bit(nr, addr);
+}
+
+static inline void linkmode_set_bit_array(const int *array, int array_size,
+					  unsigned long *addr)
+{
+	int i;
+
+	for (i = 0; i < array_size; i++)
+		linkmode_set_bit(array[i], addr);
+}
+
 #include <linux/ethtool.h>
 #include <uapi/linux/ethtool.h>
 
@@ -38,20 +53,6 @@ static inline int linkmode_andnot(unsigned long *dst, const unsigned long *src1,
 	return bitmap_andnot(dst, src1, src2,  __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
-static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
-{
-	__set_bit(nr, addr);
-}
-
-static inline void linkmode_set_bit_array(const int *array, int array_size,
-					  unsigned long *addr)
-{
-	int i;
-
-	for (i = 0; i < array_size; i++)
-		linkmode_set_bit(array[i], addr);
-}
-
 static inline void linkmode_clear_bit(int nr, volatile unsigned long *addr)
 {
 	__clear_bit(nr, addr);
-- 
cgit v1.2.3


From 265f3ed077036f053981f5eea0b5b43e7c5b39ff Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Sun, 24 Sep 2023 17:07:02 +0200
Subject: workqueue: Provide one lock class key per work_on_cpu() callsite

All callers of work_on_cpu() share the same lock class key for all the
functions queued. As a result the workqueue related locking scenario for
a function A may be spuriously accounted as an inversion against the
locking scenario of function B such as in the following model:

	long A(void *arg)
	{
		mutex_lock(&mutex);
		mutex_unlock(&mutex);
	}

	long B(void *arg)
	{
	}

	void launchA(void)
	{
		work_on_cpu(0, A, NULL);
	}

	void launchB(void)
	{
		mutex_lock(&mutex);
		work_on_cpu(1, B, NULL);
		mutex_unlock(&mutex);
	}

launchA and launchB running concurrently have no chance to deadlock.
However the above can be reported by lockdep as a possible locking
inversion because the works containing A() and B() are treated as
belonging to the same locking class.

The following shows an existing example of such a spurious lockdep splat:

	 ======================================================
	 WARNING: possible circular locking dependency detected
	 6.6.0-rc1-00065-g934ebd6e5359 #35409 Not tainted
	 ------------------------------------------------------
	 kworker/0:1/9 is trying to acquire lock:
	 ffffffff9bc72f30 (cpu_hotplug_lock){++++}-{0:0}, at: _cpu_down+0x57/0x2b0

	 but task is already holding lock:
	 ffff9e3bc0057e60 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_scheduled_works+0x216/0x500

	 which lock already depends on the new lock.

	 the existing dependency chain (in reverse order) is:

	 -> #2 ((work_completion)(&wfc.work)){+.+.}-{0:0}:
			__flush_work+0x83/0x4e0
			work_on_cpu+0x97/0xc0
			rcu_nocb_cpu_offload+0x62/0xb0
			rcu_nocb_toggle+0xd0/0x1d0
			kthread+0xe6/0x120
			ret_from_fork+0x2f/0x40
			ret_from_fork_asm+0x1b/0x30

	 -> #1 (rcu_state.barrier_mutex){+.+.}-{3:3}:
			__mutex_lock+0x81/0xc80
			rcu_nocb_cpu_deoffload+0x38/0xb0
			rcu_nocb_toggle+0x144/0x1d0
			kthread+0xe6/0x120
			ret_from_fork+0x2f/0x40
			ret_from_fork_asm+0x1b/0x30

	 -> #0 (cpu_hotplug_lock){++++}-{0:0}:
			__lock_acquire+0x1538/0x2500
			lock_acquire+0xbf/0x2a0
			percpu_down_write+0x31/0x200
			_cpu_down+0x57/0x2b0
			__cpu_down_maps_locked+0x10/0x20
			work_for_cpu_fn+0x15/0x20
			process_scheduled_works+0x2a7/0x500
			worker_thread+0x173/0x330
			kthread+0xe6/0x120
			ret_from_fork+0x2f/0x40
			ret_from_fork_asm+0x1b/0x30

	 other info that might help us debug this:

	 Chain exists of:
	   cpu_hotplug_lock --> rcu_state.barrier_mutex --> (work_completion)(&wfc.work)

	  Possible unsafe locking scenario:

			CPU0                    CPU1
			----                    ----
	   lock((work_completion)(&wfc.work));
									lock(rcu_state.barrier_mutex);
									lock((work_completion)(&wfc.work));
	   lock(cpu_hotplug_lock);

	  *** DEADLOCK ***

	 2 locks held by kworker/0:1/9:
	  #0: ffff900481068b38 ((wq_completion)events){+.+.}-{0:0}, at: process_scheduled_works+0x212/0x500
	  #1: ffff9e3bc0057e60 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_scheduled_works+0x216/0x500

	 stack backtrace:
	 CPU: 0 PID: 9 Comm: kworker/0:1 Not tainted 6.6.0-rc1-00065-g934ebd6e5359 #35409
	 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
	 Workqueue: events work_for_cpu_fn
	 Call Trace:
	 rcu-torture: rcu_torture_read_exit: Start of episode
	  <TASK>
	  dump_stack_lvl+0x4a/0x80
	  check_noncircular+0x132/0x150
	  __lock_acquire+0x1538/0x2500
	  lock_acquire+0xbf/0x2a0
	  ? _cpu_down+0x57/0x2b0
	  percpu_down_write+0x31/0x200
	  ? _cpu_down+0x57/0x2b0
	  _cpu_down+0x57/0x2b0
	  __cpu_down_maps_locked+0x10/0x20
	  work_for_cpu_fn+0x15/0x20
	  process_scheduled_works+0x2a7/0x500
	  worker_thread+0x173/0x330
	  ? __pfx_worker_thread+0x10/0x10
	  kthread+0xe6/0x120
	  ? __pfx_kthread+0x10/0x10
	  ret_from_fork+0x2f/0x40
	  ? __pfx_kthread+0x10/0x10
	  ret_from_fork_asm+0x1b/0x30
	  </TASK

Fix this with providing one lock class key per work_on_cpu() caller.

Reported-and-tested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 46 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 1c1d06804d45..24b1e5070f4d 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -274,18 +274,16 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
  * to generate better code.
  */
 #ifdef CONFIG_LOCKDEP
-#define __INIT_WORK(_work, _func, _onstack)				\
+#define __INIT_WORK_KEY(_work, _func, _onstack, _key)			\
 	do {								\
-		static struct lock_class_key __key;			\
-									\
 		__init_work((_work), _onstack);				\
 		(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
-		lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \
+		lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \
 		INIT_LIST_HEAD(&(_work)->entry);			\
 		(_work)->func = (_func);				\
 	} while (0)
 #else
-#define __INIT_WORK(_work, _func, _onstack)				\
+#define __INIT_WORK_KEY(_work, _func, _onstack, _key)			\
 	do {								\
 		__init_work((_work), _onstack);				\
 		(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
@@ -294,12 +292,22 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 	} while (0)
 #endif
 
+#define __INIT_WORK(_work, _func, _onstack)				\
+	do {								\
+		static __maybe_unused struct lock_class_key __key;	\
+									\
+		__INIT_WORK_KEY(_work, _func, _onstack, &__key);	\
+	} while (0)
+
 #define INIT_WORK(_work, _func)						\
 	__INIT_WORK((_work), (_func), 0)
 
 #define INIT_WORK_ONSTACK(_work, _func)					\
 	__INIT_WORK((_work), (_func), 1)
 
+#define INIT_WORK_ONSTACK_KEY(_work, _func, _key)			\
+	__INIT_WORK_KEY((_work), (_func), 1, _key)
+
 #define __INIT_DELAYED_WORK(_work, _func, _tflags)			\
 	do {								\
 		INIT_WORK(&(_work)->work, (_func));			\
@@ -693,8 +701,32 @@ static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
 	return fn(arg);
 }
 #else
-long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
-long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
+long work_on_cpu_key(int cpu, long (*fn)(void *),
+		     void *arg, struct lock_class_key *key);
+/*
+ * A new key is defined for each caller to make sure the work
+ * associated with the function doesn't share its locking class.
+ */
+#define work_on_cpu(_cpu, _fn, _arg)			\
+({							\
+	static struct lock_class_key __key;		\
+							\
+	work_on_cpu_key(_cpu, _fn, _arg, &__key);	\
+})
+
+long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
+			  void *arg, struct lock_class_key *key);
+
+/*
+ * A new key is defined for each caller to make sure the work
+ * associated with the function doesn't share its locking class.
+ */
+#define work_on_cpu_safe(_cpu, _fn, _arg)		\
+({							\
+	static struct lock_class_key __key;		\
+							\
+	work_on_cpu_safe_key(_cpu, _fn, _arg, &__key);	\
+})
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
-- 
cgit v1.2.3


From 077c212f0344ae4198b2b51af128a94b614ccdf4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 4 Oct 2023 14:52:37 -0400
Subject: fs: new accessor methods for atime and mtime

Recently, we converted the ctime accesses in the kernel to use new
accessor functions. Linus recently pointed out though that if we add
accessors for the atime and mtime, then that would allow us to
seamlessly change how these timestamps are stored in the inode.

Add new accessor functions for the atime and mtime that mirror the
accessors for the ctime.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20231004185239.80830-1-jlayton@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 85 +++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 72 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b528f063e8ff..79cc11139115 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1511,24 +1511,81 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
 struct timespec64 current_time(struct inode *inode);
 struct timespec64 inode_set_ctime_current(struct inode *inode);
 
-/**
- * inode_get_ctime - fetch the current ctime from the inode
- * @inode: inode from which to fetch ctime
- *
- * Grab the current ctime from the inode and return it.
- */
+static inline time64_t inode_get_atime_sec(const struct inode *inode)
+{
+	return inode->i_atime.tv_sec;
+}
+
+static inline long inode_get_atime_nsec(const struct inode *inode)
+{
+	return inode->i_atime.tv_nsec;
+}
+
+static inline struct timespec64 inode_get_atime(const struct inode *inode)
+{
+	return inode->i_atime;
+}
+
+static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode,
+						      struct timespec64 ts)
+{
+	inode->i_atime = ts;
+	return ts;
+}
+
+static inline struct timespec64 inode_set_atime(struct inode *inode,
+						time64_t sec, long nsec)
+{
+	struct timespec64 ts = { .tv_sec  = sec,
+				 .tv_nsec = nsec };
+	return inode_set_atime_to_ts(inode, ts);
+}
+
+static inline time64_t inode_get_mtime_sec(const struct inode *inode)
+{
+	return inode->i_mtime.tv_sec;
+}
+
+static inline long inode_get_mtime_nsec(const struct inode *inode)
+{
+	return inode->i_mtime.tv_nsec;
+}
+
+static inline struct timespec64 inode_get_mtime(const struct inode *inode)
+{
+	return inode->i_mtime;
+}
+
+static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode,
+						      struct timespec64 ts)
+{
+	inode->i_mtime = ts;
+	return ts;
+}
+
+static inline struct timespec64 inode_set_mtime(struct inode *inode,
+						time64_t sec, long nsec)
+{
+	struct timespec64 ts = { .tv_sec  = sec,
+				 .tv_nsec = nsec };
+	return inode_set_mtime_to_ts(inode, ts);
+}
+
+static inline time64_t inode_get_ctime_sec(const struct inode *inode)
+{
+	return inode->__i_ctime.tv_sec;
+}
+
+static inline long inode_get_ctime_nsec(const struct inode *inode)
+{
+	return inode->__i_ctime.tv_nsec;
+}
+
 static inline struct timespec64 inode_get_ctime(const struct inode *inode)
 {
 	return inode->__i_ctime;
 }
 
-/**
- * inode_set_ctime_to_ts - set the ctime in the inode
- * @inode: inode in which to set the ctime
- * @ts: value to set in the ctime field
- *
- * Set the ctime in @inode to @ts
- */
 static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
 						      struct timespec64 ts)
 {
@@ -1553,6 +1610,8 @@ static inline struct timespec64 inode_set_ctime(struct inode *inode,
 	return inode_set_ctime_to_ts(inode, ts);
 }
 
+struct timespec64 simple_inode_init_ts(struct inode *inode);
+
 /*
  * Snapshotting support.
  */
-- 
cgit v1.2.3


From 1f693269cbb4cd9cd1044299c14b2464590a5f28 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 4 Oct 2023 14:53:04 -0400
Subject: linux: convert to new timestamp accessors

Convert to using the new inode timestamp accessor functions.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20231004185347.80880-77-jlayton@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs_stack.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs_stack.h b/include/linux/fs_stack.h
index 010d39d0dc1c..2b1f74b24070 100644
--- a/include/linux/fs_stack.h
+++ b/include/linux/fs_stack.h
@@ -16,14 +16,14 @@ extern void fsstack_copy_inode_size(struct inode *dst, struct inode *src);
 static inline void fsstack_copy_attr_atime(struct inode *dest,
 					   const struct inode *src)
 {
-	dest->i_atime = src->i_atime;
+	inode_set_atime_to_ts(dest, inode_get_atime(src));
 }
 
 static inline void fsstack_copy_attr_times(struct inode *dest,
 					   const struct inode *src)
 {
-	dest->i_atime = src->i_atime;
-	dest->i_mtime = src->i_mtime;
+	inode_set_atime_to_ts(dest, inode_get_atime(src));
+	inode_set_mtime_to_ts(dest, inode_get_mtime(src));
 	inode_set_ctime_to_ts(dest, inode_get_ctime(src));
 }
 
-- 
cgit v1.2.3


From 12cd44023651666bd44baa36a5c999698890debb Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 29 Sep 2023 09:05:52 -0400
Subject: fs: rename inode i_atime and i_mtime fields

Rename these two fields to discourage direct access (and to help ensure
that we mop up any leftover direct accesses).

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 79cc11139115..cc29518517f7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -671,8 +671,8 @@ struct inode {
 	};
 	dev_t			i_rdev;
 	loff_t			i_size;
-	struct timespec64	i_atime;
-	struct timespec64	i_mtime;
+	struct timespec64	__i_atime;
+	struct timespec64	__i_mtime;
 	struct timespec64	__i_ctime; /* use inode_*_ctime accessors! */
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	unsigned short          i_bytes;
@@ -1513,23 +1513,23 @@ struct timespec64 inode_set_ctime_current(struct inode *inode);
 
 static inline time64_t inode_get_atime_sec(const struct inode *inode)
 {
-	return inode->i_atime.tv_sec;
+	return inode->__i_atime.tv_sec;
 }
 
 static inline long inode_get_atime_nsec(const struct inode *inode)
 {
-	return inode->i_atime.tv_nsec;
+	return inode->__i_atime.tv_nsec;
 }
 
 static inline struct timespec64 inode_get_atime(const struct inode *inode)
 {
-	return inode->i_atime;
+	return inode->__i_atime;
 }
 
 static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode,
 						      struct timespec64 ts)
 {
-	inode->i_atime = ts;
+	inode->__i_atime = ts;
 	return ts;
 }
 
@@ -1543,23 +1543,23 @@ static inline struct timespec64 inode_set_atime(struct inode *inode,
 
 static inline time64_t inode_get_mtime_sec(const struct inode *inode)
 {
-	return inode->i_mtime.tv_sec;
+	return inode->__i_mtime.tv_sec;
 }
 
 static inline long inode_get_mtime_nsec(const struct inode *inode)
 {
-	return inode->i_mtime.tv_nsec;
+	return inode->__i_mtime.tv_nsec;
 }
 
 static inline struct timespec64 inode_get_mtime(const struct inode *inode)
 {
-	return inode->i_mtime;
+	return inode->__i_mtime;
 }
 
 static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode,
 						      struct timespec64 ts)
 {
-	inode->i_mtime = ts;
+	inode->__i_mtime = ts;
 	return ts;
 }
 
-- 
cgit v1.2.3


From b4edb8d2d4647a71a246d91fe34ff25c2c5f3481 Mon Sep 17 00:00:00 2001
From: "wuqiang.matt" <wuqiang.matt@bytedance.com>
Date: Tue, 17 Oct 2023 21:56:50 +0800
Subject: lib: objpool added: ring-array based lockless MPMC

objpool is a scalable implementation of high performance queue for
object allocation and reclamation, such as kretprobe instances.

With leveraging percpu ring-array to mitigate hot spots of memory
contention, it delivers near-linear scalability for high parallel
scenarios. The objpool is best suited for the following cases:
1) Memory allocation or reclamation are prohibited or too expensive
2) Consumers are of different priorities, such as irqs and threads

Limitations:
1) Maximum objects (capacity) is fixed after objpool creation
2) All pre-allocated objects are managed in percpu ring array,
   which consumes more memory than linked lists

Link: https://lore.kernel.org/all/20231017135654.82270-2-wuqiang.matt@bytedance.com/

Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/objpool.h | 181 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 include/linux/objpool.h

(limited to 'include/linux')

diff --git a/include/linux/objpool.h b/include/linux/objpool.h
new file mode 100644
index 000000000000..15aff4a17f0c
--- /dev/null
+++ b/include/linux/objpool.h
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_OBJPOOL_H
+#define _LINUX_OBJPOOL_H
+
+#include <linux/types.h>
+#include <linux/refcount.h>
+
+/*
+ * objpool: ring-array based lockless MPMC queue
+ *
+ * Copyright: wuqiang.matt@bytedance.com,mhiramat@kernel.org
+ *
+ * objpool is a scalable implementation of high performance queue for
+ * object allocation and reclamation, such as kretprobe instances.
+ *
+ * With leveraging percpu ring-array to mitigate hot spots of memory
+ * contention, it delivers near-linear scalability for high parallel
+ * scenarios. The objpool is best suited for the following cases:
+ * 1) Memory allocation or reclamation are prohibited or too expensive
+ * 2) Consumers are of different priorities, such as irqs and threads
+ *
+ * Limitations:
+ * 1) Maximum objects (capacity) is fixed after objpool creation
+ * 2) All pre-allocated objects are managed in percpu ring array,
+ *    which consumes more memory than linked lists
+ */
+
+/**
+ * struct objpool_slot - percpu ring array of objpool
+ * @head: head sequence of the local ring array (to retrieve at)
+ * @tail: tail sequence of the local ring array (to append at)
+ * @last: the last sequence number marked as ready for retrieve
+ * @mask: bits mask for modulo capacity to compute array indexes
+ * @entries: object entries on this slot
+ *
+ * Represents a cpu-local array-based ring buffer, its size is specialized
+ * during initialization of object pool. The percpu objpool node is to be
+ * allocated from local memory for NUMA system, and to be kept compact in
+ * continuous memory: CPU assigned number of objects are stored just after
+ * the body of objpool_node.
+ *
+ * Real size of the ring array is far too smaller than the value range of
+ * head and tail, typed as uint32_t: [0, 2^32), so only lower bits (mask)
+ * of head and tail are used as the actual position in the ring array. In
+ * general the ring array is acting like a small sliding window, which is
+ * always moving forward in the loop of [0, 2^32).
+ */
+struct objpool_slot {
+	uint32_t            head;
+	uint32_t            tail;
+	uint32_t            last;
+	uint32_t            mask;
+	void               *entries[];
+} __packed;
+
+struct objpool_head;
+
+/*
+ * caller-specified callback for object initial setup, it's only called
+ * once for each object (just after the memory allocation of the object)
+ */
+typedef int (*objpool_init_obj_cb)(void *obj, void *context);
+
+/* caller-specified cleanup callback for objpool destruction */
+typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
+
+/**
+ * struct objpool_head - object pooling metadata
+ * @obj_size:   object size, aligned to sizeof(void *)
+ * @nr_objs:    total objs (to be pre-allocated with objpool)
+ * @nr_cpus:    local copy of nr_cpu_ids
+ * @capacity:   max objs can be managed by one objpool_slot
+ * @gfp:        gfp flags for kmalloc & vmalloc
+ * @ref:        refcount of objpool
+ * @flags:      flags for objpool management
+ * @cpu_slots:  pointer to the array of objpool_slot
+ * @release:    resource cleanup callback
+ * @context:    caller-provided context
+ */
+struct objpool_head {
+	int                     obj_size;
+	int                     nr_objs;
+	int                     nr_cpus;
+	int                     capacity;
+	gfp_t                   gfp;
+	refcount_t              ref;
+	unsigned long           flags;
+	struct objpool_slot   **cpu_slots;
+	objpool_fini_cb         release;
+	void                   *context;
+};
+
+#define OBJPOOL_NR_OBJECT_MAX	(1UL << 24) /* maximum numbers of total objects */
+#define OBJPOOL_OBJECT_SIZE_MAX	(1UL << 16) /* maximum size of an object */
+
+/**
+ * objpool_init() - initialize objpool and pre-allocated objects
+ * @pool:    the object pool to be initialized, declared by caller
+ * @nr_objs: total objects to be pre-allocated by this object pool
+ * @object_size: size of an object (should be > 0)
+ * @gfp:     flags for memory allocation (via kmalloc or vmalloc)
+ * @context: user context for object initialization callback
+ * @objinit: object initialization callback for extra setup
+ * @release: cleanup callback for extra cleanup task
+ *
+ * return value: 0 for success, otherwise error code
+ *
+ * All pre-allocated objects are to be zeroed after memory allocation.
+ * Caller could do extra initialization in objinit callback. objinit()
+ * will be called just after slot allocation and called only once for
+ * each object. After that the objpool won't touch any content of the
+ * objects. It's caller's duty to perform reinitialization after each
+ * pop (object allocation) or do clearance before each push (object
+ * reclamation).
+ */
+int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
+		 gfp_t gfp, void *context, objpool_init_obj_cb objinit,
+		 objpool_fini_cb release);
+
+/**
+ * objpool_pop() - allocate an object from objpool
+ * @pool: object pool
+ *
+ * return value: object ptr or NULL if failed
+ */
+void *objpool_pop(struct objpool_head *pool);
+
+/**
+ * objpool_push() - reclaim the object and return back to objpool
+ * @obj:  object ptr to be pushed to objpool
+ * @pool: object pool
+ *
+ * return: 0 or error code (it fails only when user tries to push
+ * the same object multiple times or wrong "objects" into objpool)
+ */
+int objpool_push(void *obj, struct objpool_head *pool);
+
+/**
+ * objpool_drop() - discard the object and deref objpool
+ * @obj:  object ptr to be discarded
+ * @pool: object pool
+ *
+ * return: 0 if objpool was released; -EAGAIN if there are still
+ *         outstanding objects
+ *
+ * objpool_drop is normally for the release of outstanding objects
+ * after objpool cleanup (objpool_fini). Thinking of this example:
+ * kretprobe is unregistered and objpool_fini() is called to release
+ * all remained objects, but there are still objects being used by
+ * unfinished kretprobes (like blockable function: sys_accept). So
+ * only when the last outstanding object is dropped could the whole
+ * objpool be released along with the call of objpool_drop()
+ */
+int objpool_drop(void *obj, struct objpool_head *pool);
+
+/**
+ * objpool_free() - release objpool forcely (all objects to be freed)
+ * @pool: object pool to be released
+ */
+void objpool_free(struct objpool_head *pool);
+
+/**
+ * objpool_fini() - deref object pool (also releasing unused objects)
+ * @pool: object pool to be dereferenced
+ *
+ * objpool_fini() will try to release all remained free objects and
+ * then drop an extra reference of the objpool. If all objects are
+ * already returned to objpool (so called synchronous use cases),
+ * the objpool itself will be freed together. But if there are still
+ * outstanding objects (so called asynchronous use cases, such like
+ * blockable kretprobe), the objpool won't be released until all
+ * the outstanding objects are dropped, but the caller must assure
+ * there are no concurrent objpool_push() on the fly. Normally RCU
+ * is being required to make sure all ongoing objpool_push() must
+ * be finished before calling objpool_fini(), so does test_objpool,
+ * kretprobe or rethook
+ */
+void objpool_fini(struct objpool_head *pool);
+
+#endif /* _LINUX_OBJPOOL_H */
-- 
cgit v1.2.3


From 4bbd9345565933823f38a419df65661f12adbe5e Mon Sep 17 00:00:00 2001
From: "wuqiang.matt" <wuqiang.matt@bytedance.com>
Date: Tue, 17 Oct 2023 21:56:52 +0800
Subject: kprobes: kretprobe scalability improvement

kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.

Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):

OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s

         1T       2T       4T       8T      16T      24T
   24150045 29317964 15446741 12494489 18287272 17708768
        32T      48T      64T      72T      96T     128T
   16200682 13737658 11645677 11269858 10470118  9931051

This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:

                  1T         2T         4T         8T        16T
native:     41186213   82336866  164250978  328662645  658810299
freelist:   24150045   29317964   15446741   12494489   18287272
objpool:    23926730   48010314   96125218  191782984  385091769
                 32T        48T        64T        96T       128T
native:   1330338351 1969957941 2512291791 2615754135 2671040914
freelist:   16200682   13737658   11645677   10470118    9931051
objpool:   764481096 1147149781 1456220214 1502109662 1579015050

Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:

OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s

                  1T         2T         4T         8T        16T
native: .   30066096   63569843  126194076  257447289  505800181
freelist:   16152090   11064397   11124068    7215768    5663013
objpool:    13997541   28032100   55726624  110099926  221498787
                 24T        32T        48T        64T        96T
native:    763305277 1015925192 1521075123 2033009392 3021013752
freelist:    5015810    4602893    3766792    3382478    2945292
objpool:   328192025  439439564  668534502  887401381 1319972072

Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/

Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/kprobes.h | 11 +++--------
 include/linux/rethook.h | 16 ++++------------
 2 files changed, 7 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 85a64cb95d75..365eb092e9c4 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -26,8 +26,7 @@
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
-#include <linux/refcount.h>
-#include <linux/freelist.h>
+#include <linux/objpool.h>
 #include <linux/rethook.h>
 #include <asm/kprobes.h>
 
@@ -141,7 +140,7 @@ static inline bool kprobe_ftrace(struct kprobe *p)
  */
 struct kretprobe_holder {
 	struct kretprobe	*rp;
-	refcount_t		ref;
+	struct objpool_head	pool;
 };
 
 struct kretprobe {
@@ -154,7 +153,6 @@ struct kretprobe {
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
 	struct rethook *rh;
 #else
-	struct freelist_head freelist;
 	struct kretprobe_holder *rph;
 #endif
 };
@@ -165,10 +163,7 @@ struct kretprobe_instance {
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
 	struct rethook_node node;
 #else
-	union {
-		struct freelist_node freelist;
-		struct rcu_head rcu;
-	};
+	struct rcu_head rcu;
 	struct llist_node llist;
 	struct kretprobe_holder *rph;
 	kprobe_opcode_t *ret_addr;
diff --git a/include/linux/rethook.h b/include/linux/rethook.h
index 26b6f3c81a76..ce69b2b7bc35 100644
--- a/include/linux/rethook.h
+++ b/include/linux/rethook.h
@@ -6,11 +6,10 @@
 #define _LINUX_RETHOOK_H
 
 #include <linux/compiler.h>
-#include <linux/freelist.h>
+#include <linux/objpool.h>
 #include <linux/kallsyms.h>
 #include <linux/llist.h>
 #include <linux/rcupdate.h>
-#include <linux/refcount.h>
 
 struct rethook_node;
 
@@ -30,14 +29,12 @@ typedef void (*rethook_handler_t) (struct rethook_node *, void *, unsigned long,
 struct rethook {
 	void			*data;
 	rethook_handler_t	handler;
-	struct freelist_head	pool;
-	refcount_t		ref;
+	struct objpool_head	pool;
 	struct rcu_head		rcu;
 };
 
 /**
  * struct rethook_node - The rethook shadow-stack entry node.
- * @freelist: The freelist, linked to struct rethook::pool.
  * @rcu: The rcu_head for deferred freeing.
  * @llist: The llist, linked to a struct task_struct::rethooks.
  * @rethook: The pointer to the struct rethook.
@@ -48,20 +45,16 @@ struct rethook {
  * on each entry of the shadow stack.
  */
 struct rethook_node {
-	union {
-		struct freelist_node freelist;
-		struct rcu_head      rcu;
-	};
+	struct rcu_head		rcu;
 	struct llist_node	llist;
 	struct rethook		*rethook;
 	unsigned long		ret_addr;
 	unsigned long		frame;
 };
 
-struct rethook *rethook_alloc(void *data, rethook_handler_t handler);
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler, int size, int num);
 void rethook_stop(struct rethook *rh);
 void rethook_free(struct rethook *rh);
-void rethook_add_node(struct rethook *rh, struct rethook_node *node);
 struct rethook_node *rethook_try_get(struct rethook *rh);
 void rethook_recycle(struct rethook_node *node);
 void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount);
@@ -98,4 +91,3 @@ void rethook_flush_task(struct task_struct *tk);
 #endif
 
 #endif
-
-- 
cgit v1.2.3


From f46e8939de80bf029787111bf488ee86355b111c Mon Sep 17 00:00:00 2001
From: "wuqiang.matt" <wuqiang.matt@bytedance.com>
Date: Tue, 17 Oct 2023 21:56:53 +0800
Subject: kprobes: freelist.h removed

This patch will remove freelist.h from kernel source tree, since the
only use cases (kretprobe and rethook) are converted to objpool.

Link: https://lore.kernel.org/all/20231017135654.82270-5-wuqiang.matt@bytedance.com/

Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/freelist.h | 129 -----------------------------------------------
 1 file changed, 129 deletions(-)
 delete mode 100644 include/linux/freelist.h

(limited to 'include/linux')

diff --git a/include/linux/freelist.h b/include/linux/freelist.h
deleted file mode 100644
index fc1842b96469..000000000000
--- a/include/linux/freelist.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */
-#ifndef FREELIST_H
-#define FREELIST_H
-
-#include <linux/atomic.h>
-
-/*
- * Copyright: cameron@moodycamel.com
- *
- * A simple CAS-based lock-free free list. Not the fastest thing in the world
- * under heavy contention, but simple and correct (assuming nodes are never
- * freed until after the free list is destroyed), and fairly speedy under low
- * contention.
- *
- * Adapted from: https://moodycamel.com/blog/2014/solving-the-aba-problem-for-lock-free-free-lists
- */
-
-struct freelist_node {
-	atomic_t		refs;
-	struct freelist_node	*next;
-};
-
-struct freelist_head {
-	struct freelist_node	*head;
-};
-
-#define REFS_ON_FREELIST 0x80000000
-#define REFS_MASK	 0x7FFFFFFF
-
-static inline void __freelist_add(struct freelist_node *node, struct freelist_head *list)
-{
-	/*
-	 * Since the refcount is zero, and nobody can increase it once it's
-	 * zero (except us, and we run only one copy of this method per node at
-	 * a time, i.e. the single thread case), then we know we can safely
-	 * change the next pointer of the node; however, once the refcount is
-	 * back above zero, then other threads could increase it (happens under
-	 * heavy contention, when the refcount goes to zero in between a load
-	 * and a refcount increment of a node in try_get, then back up to
-	 * something non-zero, then the refcount increment is done by the other
-	 * thread) -- so if the CAS to add the node to the actual list fails,
-	 * decrese the refcount and leave the add operation to the next thread
-	 * who puts the refcount back to zero (which could be us, hence the
-	 * loop).
-	 */
-	struct freelist_node *head = READ_ONCE(list->head);
-
-	for (;;) {
-		WRITE_ONCE(node->next, head);
-		atomic_set_release(&node->refs, 1);
-
-		if (!try_cmpxchg_release(&list->head, &head, node)) {
-			/*
-			 * Hmm, the add failed, but we can only try again when
-			 * the refcount goes back to zero.
-			 */
-			if (atomic_fetch_add_release(REFS_ON_FREELIST - 1, &node->refs) == 1)
-				continue;
-		}
-		return;
-	}
-}
-
-static inline void freelist_add(struct freelist_node *node, struct freelist_head *list)
-{
-	/*
-	 * We know that the should-be-on-freelist bit is 0 at this point, so
-	 * it's safe to set it using a fetch_add.
-	 */
-	if (!atomic_fetch_add_release(REFS_ON_FREELIST, &node->refs)) {
-		/*
-		 * Oh look! We were the last ones referencing this node, and we
-		 * know we want to add it to the free list, so let's do it!
-		 */
-		__freelist_add(node, list);
-	}
-}
-
-static inline struct freelist_node *freelist_try_get(struct freelist_head *list)
-{
-	struct freelist_node *prev, *next, *head = smp_load_acquire(&list->head);
-	unsigned int refs;
-
-	while (head) {
-		prev = head;
-		refs = atomic_read(&head->refs);
-		if ((refs & REFS_MASK) == 0 ||
-		    !atomic_try_cmpxchg_acquire(&head->refs, &refs, refs+1)) {
-			head = smp_load_acquire(&list->head);
-			continue;
-		}
-
-		/*
-		 * Good, reference count has been incremented (it wasn't at
-		 * zero), which means we can read the next and not worry about
-		 * it changing between now and the time we do the CAS.
-		 */
-		next = READ_ONCE(head->next);
-		if (try_cmpxchg_acquire(&list->head, &head, next)) {
-			/*
-			 * Yay, got the node. This means it was on the list,
-			 * which means should-be-on-freelist must be false no
-			 * matter the refcount (because nobody else knows it's
-			 * been taken off yet, it can't have been put back on).
-			 */
-			WARN_ON_ONCE(atomic_read(&head->refs) & REFS_ON_FREELIST);
-
-			/*
-			 * Decrease refcount twice, once for our ref, and once
-			 * for the list's ref.
-			 */
-			atomic_fetch_add(-2, &head->refs);
-
-			return head;
-		}
-
-		/*
-		 * OK, the head must have changed on us, but we still need to decrement
-		 * the refcount we increased.
-		 */
-		refs = atomic_fetch_add(-1, &prev->refs);
-		if (refs == REFS_ON_FREELIST + 1)
-			__freelist_add(prev, list);
-	}
-
-	return NULL;
-}
-
-#endif /* FREELIST_H */
-- 
cgit v1.2.3


From bf4916922c60f43efaa329744b3eef539aa6a2b2 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 5 Oct 2023 23:59:07 -0400
Subject: hugetlbfs: extend hugetlb_vma_lock to private VMAs

Extend the locking scheme used to protect shared hugetlb mappings from
truncate vs page fault races, in order to protect private hugetlb mappings
(with resv_map) against MADV_DONTNEED.

Add a read-write semaphore to the resv_map data structure, and use that
from the hugetlb_vma_(un)lock_* functions, in preparation for closing the
race between MADV_DONTNEED and page faults.

Link: https://lkml.kernel.org/r/20231006040020.3677377-3-riel@surriel.com
Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing")
Signed-off-by: Rik van Riel <riel@surriel.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a30686e649f7..065ec022cbb0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -60,6 +60,7 @@ struct resv_map {
 	long adds_in_progress;
 	struct list_head region_cache;
 	long region_cache_count;
+	struct rw_semaphore rw_sema;
 #ifdef CONFIG_CGROUP_HUGETLB
 	/*
 	 * On private mappings, the counter to uncharge reservations is stored
@@ -1233,6 +1234,11 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
 	return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
 }
 
+static inline bool __vma_private_lock(struct vm_area_struct *vma)
+{
+	return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data;
+}
+
 /*
  * Safe version of huge_pte_offset() to check the locks.  See comments
  * above huge_pte_offset().
-- 
cgit v1.2.3


From 2820b0f09be99f6406784b03a22dfc83e858449d Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 5 Oct 2023 23:59:08 -0400
Subject: hugetlbfs: close race between MADV_DONTNEED and page fault

Malloc libraries, like jemalloc and tcalloc, take decisions on when to
call madvise independently from the code in the main application.

This sometimes results in the application page faulting on an address,
right after the malloc library has shot down the backing memory with
MADV_DONTNEED.

Usually this is harmless, because we always have some 4kB pages sitting
around to satisfy a page fault.  However, with hugetlbfs systems often
allocate only the exact number of huge pages that the application wants.

Due to TLB batching, hugetlbfs MADV_DONTNEED will free pages outside of
any lock taken on the page fault path, which can open up the following
race condition:

       CPU 1                            CPU 2

       MADV_DONTNEED
       unmap page
       shoot down TLB entry
                                       page fault
                                       fail to allocate a huge page
                                       killed with SIGBUS
       free page

Fix that race by pulling the locking from __unmap_hugepage_final_range
into helper functions called from zap_page_range_single.  This ensures
page faults stay locked out of the MADV_DONTNEED VMA until the huge pages
have actually been freed.

Link: https://lkml.kernel.org/r/20231006040020.3677377-4-riel@surriel.com
Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing")
Signed-off-by: Rik van Riel <riel@surriel.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 065ec022cbb0..47d25a5e1933 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -139,7 +139,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
 void unmap_hugepage_range(struct vm_area_struct *,
 			  unsigned long, unsigned long, struct page *,
 			  zap_flags_t);
-void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+void __unmap_hugepage_range(struct mmu_gather *tlb,
 			  struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end,
 			  struct page *ref_page, zap_flags_t zap_flags);
@@ -246,6 +246,25 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 				unsigned long *start, unsigned long *end);
 
+extern void __hugetlb_zap_begin(struct vm_area_struct *vma,
+				unsigned long *begin, unsigned long *end);
+extern void __hugetlb_zap_end(struct vm_area_struct *vma,
+			      struct zap_details *details);
+
+static inline void hugetlb_zap_begin(struct vm_area_struct *vma,
+				     unsigned long *start, unsigned long *end)
+{
+	if (is_vm_hugetlb_page(vma))
+		__hugetlb_zap_begin(vma, start, end);
+}
+
+static inline void hugetlb_zap_end(struct vm_area_struct *vma,
+				   struct zap_details *details)
+{
+	if (is_vm_hugetlb_page(vma))
+		__hugetlb_zap_end(vma, details);
+}
+
 void hugetlb_vma_lock_read(struct vm_area_struct *vma);
 void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
 void hugetlb_vma_lock_write(struct vm_area_struct *vma);
@@ -297,6 +316,18 @@ static inline void adjust_range_if_pmd_sharing_possible(
 {
 }
 
+static inline void hugetlb_zap_begin(
+				struct vm_area_struct *vma,
+				unsigned long *start, unsigned long *end)
+{
+}
+
+static inline void hugetlb_zap_end(
+				struct vm_area_struct *vma,
+				struct zap_details *details)
+{
+}
+
 static inline struct page *hugetlb_follow_page_mask(
     struct vm_area_struct *vma, unsigned long address, unsigned int flags,
     unsigned int *page_mask)
@@ -442,7 +473,7 @@ static inline long hugetlb_change_protection(
 	return 0;
 }
 
-static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
 			struct vm_area_struct *vma, unsigned long start,
 			unsigned long end, struct page *ref_page,
 			zap_flags_t zap_flags)
-- 
cgit v1.2.3


From babddbfb7d7d70ae7f10fedd75a45d8ad75fdddf Mon Sep 17 00:00:00 2001
From: Haibo Li <haibo.li@mediatek.com>
Date: Mon, 9 Oct 2023 15:37:48 +0800
Subject: kasan: print the original fault addr when access invalid shadow

when the checked address is illegal,the corresponding shadow address from
kasan_mem_to_shadow may have no mapping in mmu table.  Access such shadow
address causes kernel oops.  Here is a sample about oops on arm64(VA
39bit) with KASAN_SW_TAGS and KASAN_OUTLINE on:

[ffffffb80aaaaaaa] pgd=000000005d3ce003, p4d=000000005d3ce003,
    pud=000000005d3ce003, pmd=0000000000000000
Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP
Modules linked in:
CPU: 3 PID: 100 Comm: sh Not tainted 6.6.0-rc1-dirty #43
Hardware name: linux,dummy-virt (DT)
pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : __hwasan_load8_noabort+0x5c/0x90
lr : do_ib_ob+0xf4/0x110
ffffffb80aaaaaaa is the shadow address for efffff80aaaaaaaa.
The problem is reading invalid shadow in kasan_check_range.

The generic kasan also has similar oops.

It only reports the shadow address which causes oops but not
the original address.

Commit 2f004eea0fc8("x86/kasan: Print original address on #GP")
introduce to kasan_non_canonical_hook but limit it to KASAN_INLINE.

This patch extends it to KASAN_OUTLINE mode.

Link: https://lkml.kernel.org/r/20231009073748.159228-1-haibo.li@mediatek.com
Fixes: 2f004eea0fc8("x86/kasan: Print original address on #GP")
Signed-off-by: Haibo Li <haibo.li@mediatek.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Haibo Li <haibo.li@mediatek.com>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 842623d708c2..6dbb5ded3cf9 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -466,10 +466,10 @@ static inline void kasan_free_module_shadow(const struct vm_struct *vm) {}
 
 #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
 
-#ifdef CONFIG_KASAN_INLINE
+#ifdef CONFIG_KASAN
 void kasan_non_canonical_hook(unsigned long addr);
-#else /* CONFIG_KASAN_INLINE */
+#else /* CONFIG_KASAN */
 static inline void kasan_non_canonical_hook(unsigned long addr) { }
-#endif /* CONFIG_KASAN_INLINE */
+#endif /* CONFIG_KASAN */
 
 #endif /* LINUX_KASAN_H */
-- 
cgit v1.2.3


From 17c17567fe510857b18fe01b7a88027600e76ac6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 16 Oct 2023 22:08:38 +0200
Subject: kasan: disable kasan_non_canonical_hook() for HW tags

On arm64, building with CONFIG_KASAN_HW_TAGS now causes a compile-time
error:

mm/kasan/report.c: In function 'kasan_non_canonical_hook':
mm/kasan/report.c:637:20: error: 'KASAN_SHADOW_OFFSET' undeclared (first use in this function)
  637 |         if (addr < KASAN_SHADOW_OFFSET)
      |                    ^~~~~~~~~~~~~~~~~~~
mm/kasan/report.c:637:20: note: each undeclared identifier is reported only once for each function it appears in
mm/kasan/report.c:640:77: error: expected expression before ';' token
  640 |         orig_addr = (addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT;

This was caused by removing the dependency on CONFIG_KASAN_INLINE that
used to prevent this from happening. Use the more specific dependency
on KASAN_SW_TAGS || KASAN_GENERIC to only ignore the function for hwasan
mode.

Link: https://lkml.kernel.org/r/20231016200925.984439-1-arnd@kernel.org
Fixes: 12ec6a919b0f ("kasan: print the original fault addr when access invalid shadow")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Haibo Li <haibo.li@mediatek.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 6dbb5ded3cf9..71fa9a40fb5a 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -466,10 +466,10 @@ static inline void kasan_free_module_shadow(const struct vm_struct *vm) {}
 
 #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
 
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 void kasan_non_canonical_hook(unsigned long addr);
-#else /* CONFIG_KASAN */
+#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 static inline void kasan_non_canonical_hook(unsigned long addr) { }
-#endif /* CONFIG_KASAN */
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
 #endif /* LINUX_KASAN_H */
-- 
cgit v1.2.3


From b7c67206594a56be2407ae4da54a114c90609e53 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:53:28 -0700
Subject: mm/memcg: annotate struct mem_cgroup_threshold_ary with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute.  Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct
mem_cgroup_threshold_ary.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Link: https://lkml.kernel.org/r/20230922175327.work.985-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 031102ac9311..c6029aeaa268 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -143,7 +143,7 @@ struct mem_cgroup_threshold_ary {
 	/* Size of entries[] */
 	unsigned int size;
 	/* Array of thresholds */
-	struct mem_cgroup_threshold entries[];
+	struct mem_cgroup_threshold entries[] __counted_by(size);
 };
 
 struct mem_cgroup_thresholds {
-- 
cgit v1.2.3


From d61ea1cb009532dcbd77a9d44b812704cec60146 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 21 Aug 2023 19:15:13 +0500
Subject: userfaultfd: UFFD_FEATURE_WP_ASYNC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Implement IOCTL to get and optionally clear info about
PTEs", v33.

*Motivation*
The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows
GetWriteWatch() and ResetWriteWatch() syscalls [1].  The GetWriteWatch()
retrieves the addresses of the pages that are written to in a region of
virtual memory.

This syscall is used in Windows applications and games etc.  This syscall
is being emulated in pretty slow manner in userspace.  Our purpose is to
enhance the kernel such that we translate it efficiently in a better way.
Currently some out of tree hack patches are being used to efficiently
emulate it in some kernels.  We intend to replace those with these
patches.  So the whole gaming on Linux can effectively get benefit from
this.  It means there would be tons of users of this code.

CRIU use case [2] was mentioned by Andrei and Danylo:
> Use cases for migrating sparse VMAs are binaries sanitized with ASAN,
> MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of
> shadow memory [4]. Being able to migrate such binaries allows to highly
> reduce the amount of work needed to identify and fix post-migration
> crashes, which happen constantly.

Andrei defines the following uses of this code:
* it is more granular and allows us to track changed pages more
  effectively. The current interface can clear dirty bits for the entire
  process only. In addition, reading info about pages is a separate
  operation. It means we must freeze the process to read information
  about all its pages, reset dirty bits, only then we can start dumping
  pages. The information about pages becomes more and more outdated,
  while we are processing pages. The new interface solves both these
  downsides. First, it allows us to read pte bits and clear the
  soft-dirty bit atomically. It means that CRIU will not need to freeze
  processes to pre-dump their memory. Second, it clears soft-dirty bits
  for a specified region of memory. It means CRIU will have actual info
  about pages to the moment of dumping them.
* The new interface has to be much faster because basic page filtering
  is happening in the kernel. With the old interface, we have to read
  pagemap for each page.

*Implementation Evolution (Short Summary)*
From the definition of GetWriteWatch(), we feel like kernel's soft-dirty
feature can be used under the hood with some additions like:
* reset soft-dirty flag for only a specific region of memory instead of
clearing the flag for the entire process
* get and clear soft-dirty flag for a specific region atomically

So we decided to use ioctl on pagemap file to read or/and reset soft-dirty
flag. But using soft-dirty flag, sometimes we get extra pages which weren't
even written. They had become soft-dirty because of VMA merging and
VM_SOFTDIRTY flag. This breaks the definition of GetWriteWatch(). We were
able to by-pass this short coming by ignoring VM_SOFTDIRTY until David
reported that mprotect etc messes up the soft-dirty flag while ignoring
VM_SOFTDIRTY [5]. This wasn't happening until [6] got introduced. We
discussed if we can revert these patches. But we could not reach to any
conclusion. So at this point, I made couple of tries to solve this whole
VM_SOFTDIRTY issue by correcting the soft-dirty implementation:
* [7] Correct the bug fixed wrongly back in 2014. It had potential to cause
regression. We left it behind.
* [8] Keep a list of soft-dirty part of a VMA across splits and merges. I
got the reply don't increase the size of the VMA by 8 bytes.

At this point, we left soft-dirty considering it is too much delicate and
userfaultfd [9] seemed like the only way forward. From there onward, we
have been basing soft-dirty emulation on userfaultfd wp feature where
kernel resolves the faults itself when WP_ASYNC feature is used. It was
straight forward to add WP_ASYNC feature in userfautlfd. Now we get only
those pages dirty or written-to which are really written in reality. (PS
There is another WP_UNPOPULATED userfautfd feature is required which is
needed to avoid pre-faulting memory before write-protecting [9].)

All the different masks were added on the request of CRIU devs to create
interface more generic and better.

[1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-getwritewatch
[2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com
[3] https://github.com/google/sanitizers
[4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit
[5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com
[6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/
[7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.com
[8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.com
[9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com
[10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com


This patch (of 6):

Add a new userfaultfd-wp feature UFFD_FEATURE_WP_ASYNC, that allows
userfaultfd wr-protect faults to be resolved by the kernel directly.

It can be used like a high accuracy version of soft-dirty, without vma
modifications during tracking, and also with ranged support by default
rather than for a whole mm when reset the protections due to existence of
ioctl(UFFDIO_WRITEPROTECT).

Several goals of such a dirty tracking interface:

1. All types of memory should be supported and tracable. This is nature
   for soft-dirty but should mention when the context is userfaultfd,
   because it used to only support anon/shmem/hugetlb. The problem is for
   a dirty tracking purpose these three types may not be enough, and it's
   legal to track anything e.g. any page cache writes from mmap.

2. Protections can be applied to partial of a memory range, without vma
   split/merge fuss.  The hope is that the tracking itself should not
   affect any vma layout change.  It also helps when reset happens because
   the reset will not need mmap write lock which can block the tracee.

3. Accuracy needs to be maintained.  This means we need pte markers to work
   on any type of VMA.

One could question that, the whole concept of async dirty tracking is not
really close to fundamentally what userfaultfd used to be: it's not "a
fault to be serviced by userspace" anymore. However, using userfaultfd-wp
here as a framework is convenient for us in at least:

1. VM_UFFD_WP vma flag, which has a very good name to suite something like
   this, so we don't need VM_YET_ANOTHER_SOFT_DIRTY. Just use a new
   feature bit to identify from a sync version of uffd-wp registration.

2. PTE markers logic can be leveraged across the whole kernel to maintain
   the uffd-wp bit as long as an arch supports, this also applies to this
   case where uffd-wp bit will be a hint to dirty information and it will
   not go lost easily (e.g. when some page cache ptes got zapped).

3. Reuse ioctl(UFFDIO_WRITEPROTECT) interface for either starting or
   resetting a range of memory, while there's no counterpart in the old
   soft-dirty world, hence if this is wanted in a new design we'll need a
   new interface otherwise.

We can somehow understand that commonality because uffd-wp was
fundamentally a similar idea of write-protecting pages just like
soft-dirty.

This implementation allows WP_ASYNC to imply WP_UNPOPULATED, because so
far WP_ASYNC seems to not usable if without WP_UNPOPULATE.  This also
gives us chance to modify impl of WP_ASYNC just in case it could be not
depending on WP_UNPOPULATED anymore in the future kernels.  It's also fine
to imply that because both features will rely on PTE_MARKER_UFFD_WP config
option, so they'll show up together (or both missing) in an UFFDIO_API
probe.

vma_can_userfault() now allows any VMA if the userfaultfd registration is
only about async uffd-wp.  So we can track dirty for all kinds of memory
including generic file systems (like XFS, EXT4 or BTRFS).

One trick worth mention in do_wp_page() is that we need to manually update
vmf->orig_pte here because it can be used later with a pte_same() check -
this path always has FAULT_FLAG_ORIG_PTE_VALID set in the flags.

The major defect of this approach of dirty tracking is we need to populate
the pgtables when tracking starts.  Soft-dirty doesn't do it like that.
It's unwanted in the case where the range of memory to track is huge and
unpopulated (e.g., tracking updates on a 10G file with mmap() on top,
without having any page cache installed yet).  One way to improve this is
to allow pte markers exist for larger than PTE level for PMD+.  That will
not change the interface if to implemented, so we can leave that for
later.

Link: https://lkml.kernel.org/r/20230821141518.870589-1-usama.anjum@collabora.com
Link: https://lkml.kernel.org/r/20230821141518.870589-2-usama.anjum@collabora.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Co-developed-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Miroslaw <emmir@google.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Paul Gofman <pgofman@codeweavers.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yun Zhou <yun.zhou@windriver.com>
Cc: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/userfaultfd_k.h | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index ac8c6854097c..c98df391bfd8 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -161,11 +161,22 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 }
 
 static inline bool vma_can_userfault(struct vm_area_struct *vma,
-				     unsigned long vm_flags)
+				     unsigned long vm_flags,
+				     bool wp_async)
 {
+	vm_flags &= __VM_UFFD_FLAGS;
+
 	if ((vm_flags & VM_UFFD_MINOR) &&
 	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
 		return false;
+
+	/*
+	 * If wp async enabled, and WP is the only mode enabled, allow any
+	 * memory type.
+	 */
+	if (wp_async && (vm_flags == VM_UFFD_WP))
+		return true;
+
 #ifndef CONFIG_PTE_MARKER_UFFD_WP
 	/*
 	 * If user requested uffd-wp but not enabled pte markers for
@@ -175,6 +186,8 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
 	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
 		return false;
 #endif
+
+	/* By default, allow any of anon|shmem|hugetlb */
 	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
 	    vma_is_shmem(vma);
 }
@@ -197,6 +210,7 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
 extern void userfaultfd_unmap_complete(struct mm_struct *mm,
 				       struct list_head *uf);
 extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
+extern bool userfaultfd_wp_async(struct vm_area_struct *vma);
 
 #else /* CONFIG_USERFAULTFD */
 
@@ -297,6 +311,11 @@ static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
 	return false;
 }
 
+static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
+{
+	return false;
+}
+
 #endif /* CONFIG_USERFAULTFD */
 
 static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
-- 
cgit v1.2.3


From 52526ca7fdb905a768a93f8faa418e9b988fc34b Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Mon, 21 Aug 2023 19:15:14 +0500
Subject: fs/proc/task_mmu: implement IOCTL to get and optionally clear info
 about PTEs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally
clear the info about page table entries. The following operations are
supported in this IOCTL:
- Scan the address range and get the memory ranges matching the provided
  criteria. This is performed when the output buffer is specified.
- Write-protect the pages. The PM_SCAN_WP_MATCHING is used to write-protect
  the pages of interest. The PM_SCAN_CHECK_WPASYNC aborts the operation if
  non-Async Write Protected pages are found. The ``PM_SCAN_WP_MATCHING``
  can be used with or without PM_SCAN_CHECK_WPASYNC.
- Both of those operations can be combined into one atomic operation where
  we can get and write protect the pages as well.

Following flags about pages are currently supported:
- PAGE_IS_WPALLOWED - Page has async-write-protection enabled
- PAGE_IS_WRITTEN - Page has been written to from the time it was write protected
- PAGE_IS_FILE - Page is file backed
- PAGE_IS_PRESENT - Page is present in the memory
- PAGE_IS_SWAPPED - Page is in swapped
- PAGE_IS_PFNZERO - Page has zero PFN
- PAGE_IS_HUGE - Page is THP or Hugetlb backed

This IOCTL can be extended to get information about more PTE bits. The
entire address range passed by user [start, end) is scanned until either
the user provided buffer is full or max_pages have been found.

[akpm@linux-foundation.org: update it for "mm: hugetlb: add huge page size param to set_huge_pte_at()"]
[akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n warning]
[arnd@arndb.de: hide unused pagemap_scan_backout_range() function]
  Link: https://lkml.kernel.org/r/20230927060257.2975412-1-arnd@kernel.org
[sfr@canb.auug.org.au: fix "fs/proc/task_mmu: hide unused pagemap_scan_backout_range() function"]
  Link: https://lkml.kernel.org/r/20230928092223.0625c6bf@canb.auug.org.au
Link: https://lkml.kernel.org/r/20230821141518.870589-3-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Miroslaw <emmir@google.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Paul Gofman <pgofman@codeweavers.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yun Zhou <yun.zhou@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h       | 1 +
 include/linux/userfaultfd_k.h | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e5b9f7e62eeb..205469aa0613 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -280,6 +280,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long cp_flags);
 
 bool is_hugetlb_entry_migration(pte_t pte);
+bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index c98df391bfd8..f2dc19f40d05 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -221,6 +221,13 @@ static inline vm_fault_t handle_userfault(struct vm_fault *vmf,
 	return VM_FAULT_SIGBUS;
 }
 
+static inline long uffd_wp_range(struct vm_area_struct *vma,
+				 unsigned long start, unsigned long len,
+				 bool enable_wp)
+{
+	return false;
+}
+
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 					struct vm_userfaultfd_ctx vm_ctx)
 {
-- 
cgit v1.2.3


From ee615d4585cfc305bf6c218a62123c3051f8b4a3 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 29 Sep 2023 20:25:38 -0700
Subject: shmem: shrink shmem_inode_info: dir_offsets in a union

Patch series "shmem,tmpfs: general maintenance".

Mostly just cosmetic mods in mm/shmem.c, but the last two enforcing the
"size=" limit better.  8/8 goes into percpu counter territory, and could
stand alone.


This patch (of 8):

Shave 32 bytes off (the 64-bit) shmem_inode_info.  There was a 4-byte
pahole after stop_eviction, better filled by fsflags.  And the 24-byte
dir_offsets can only be used by directories, whereas shrinklist and
swaplist only by shmem_mapping() inodes (regular files or long symlinks):
so put those into a union.  No change in mm/shmem.c is required for this.

Link: https://lkml.kernel.org/r/c7441dc6-f3bb-dd60-c670-9f5cbd9f266@google.com
Link: https://lkml.kernel.org/r/86ebb4b-c571-b9e8-27f5-cb82ec50357e@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Carlos Maiolino <cem@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 6b0c626620f5..2caa6b86106a 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -23,18 +23,22 @@ struct shmem_inode_info {
 	unsigned long		flags;
 	unsigned long		alloced;	/* data pages alloced to file */
 	unsigned long		swapped;	/* subtotal assigned to swap */
-	pgoff_t			fallocend;	/* highest fallocate endindex */
-	struct list_head        shrinklist;     /* shrinkable hpage inodes */
-	struct list_head	swaplist;	/* chain of maybes on swap */
+	union {
+	    struct offset_ctx	dir_offsets;	/* stable directory offsets */
+	    struct {
+		struct list_head shrinklist;	/* shrinkable hpage inodes */
+		struct list_head swaplist;	/* chain of maybes on swap */
+	    };
+	};
+	struct timespec64	i_crtime;	/* file creation time */
 	struct shared_policy	policy;		/* NUMA memory alloc policy */
 	struct simple_xattrs	xattrs;		/* list of xattrs */
+	pgoff_t			fallocend;	/* highest fallocate endindex */
+	unsigned int		fsflags;	/* for FS_IOC_[SG]ETFLAGS */
 	atomic_t		stop_eviction;	/* hold when working on inode */
-	struct timespec64	i_crtime;	/* file creation time */
-	unsigned int		fsflags;	/* flags for FS_IOC_[SG]ETFLAGS */
 #ifdef CONFIG_TMPFS_QUOTA
 	struct dquot		*i_dquot[MAXQUOTAS];
 #endif
-	struct offset_ctx	dir_offsets;	/* stable entry offsets */
 	struct inode		vfs_inode;
 };
 
-- 
cgit v1.2.3


From beb9868628445306958fd7b2da1cd369a4a381cc Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 29 Sep 2023 20:42:45 -0700
Subject: shmem,percpu_counter: add _limited_add(fbc, limit, amount)

Percpu counter's compare and add are separate functions: without locking
around them (which would defeat their purpose), it has been possible to
overflow the intended limit.  Imagine all the other CPUs fallocating tmpfs
huge pages to the limit, in between this CPU's compare and its add.

I have not seen reports of that happening; but tmpfs's recent addition of
dquot_alloc_block_nodirty() in between the compare and the add makes it
even more likely, and I'd be uncomfortable to leave it unfixed.

Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it.

I believe this implementation is correct, and slightly more efficient than
the combination of compare and add (taking the lock once rather than twice
when nearing full - the last 128MiB of a tmpfs volume on a machine with
128 CPUs and 4KiB pages); but it does beg for a better design - when
nearing full, there is no new batching, but the costly percpu counter sum
across CPUs still has to be done, while locked.

Follow __percpu_counter_sum()'s example, including cpu_dying_mask as well
as cpu_online_mask: but shouldn't __percpu_counter_compare() and
__percpu_counter_limited_add() then be adding a num_dying_cpus() to
num_online_cpus(), when they calculate the maximum which could be held
across CPUs?  But the times when it matters would be vanishingly rare.

Link: https://lkml.kernel.org/r/bb817848-2d19-bcc8-39ca-ea179af0f0b4@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Carlos Maiolino <cem@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu_counter.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index d01351b1526f..8cb7c071bd5c 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -57,6 +57,8 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
 			      s32 batch);
 s64 __percpu_counter_sum(struct percpu_counter *fbc);
 int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
+bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit,
+				  s64 amount, s32 batch);
 void percpu_counter_sync(struct percpu_counter *fbc);
 
 static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
@@ -69,6 +71,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 	percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
 }
 
+static inline bool
+percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
+{
+	return __percpu_counter_limited_add(fbc, limit, amount,
+					    percpu_counter_batch);
+}
+
 /*
  * With percpu_counter_add_local() and percpu_counter_sub_local(), counts
  * are accumulated in local per cpu counter and not in fbc->count until
@@ -185,6 +194,20 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 	local_irq_restore(flags);
 }
 
+static inline bool
+percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
+{
+	unsigned long flags;
+	s64 count;
+
+	local_irq_save(flags);
+	count = fbc->count + amount;
+	if (count <= limit)
+		fbc->count = count;
+	local_irq_restore(flags);
+	return count <= limit;
+}
+
 /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */
 static inline void
 percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
-- 
cgit v1.2.3


From 1431996bf9088ee59f8017637ab9a7f89909ae63 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 11 Oct 2023 21:40:09 -0700
Subject: percpu_counter: extend _limited_add() to negative amounts

Though tmpfs does not need it, percpu_counter_limited_add() can be twice
as useful if it works sensibly with negative amounts (subs) - typically
decrements towards a limit of 0 or nearby: as suggested by Dave Chinner.

And in the course of that reworking, skip the percpu counter sum if it is
already obvious that the limit would be passed: as suggested by Tim Chen.

Extend the comment above __percpu_counter_limited_add(), defining the
behaviour with positive and negative amounts, allowing negative limits,
but not bothering about overflow beyond S64_MAX.

Link: https://lkml.kernel.org/r/8f86083b-c452-95d4-365b-f16a2e4ebcd4@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Carlos Maiolino <cem@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Tim Chen <tim.c.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu_counter.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 8cb7c071bd5c..3a44dd1e33d2 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -198,14 +198,21 @@ static inline bool
 percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
 {
 	unsigned long flags;
+	bool good = false;
 	s64 count;
 
+	if (amount == 0)
+		return true;
+
 	local_irq_save(flags);
 	count = fbc->count + amount;
-	if (count <= limit)
+	if ((amount > 0 && count <= limit) ||
+	    (amount < 0 && count >= limit)) {
 		fbc->count = count;
+		good = true;
+	}
 	local_irq_restore(flags);
-	return count <= limit;
+	return good;
 }
 
 /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */
-- 
cgit v1.2.3


From 069686255c16a75b6a796e42df47f5af27b496a4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 2 Oct 2023 16:29:48 +0200
Subject: mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap()

Let's convert it to consume a folio.

[akpm@linux-foundation.org: fix kerneldoc]
Link: https://lkml.kernel.org/r/20231002142949.235104-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d22f4d21a11c..b26fe858fd44 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -189,7 +189,7 @@ typedef int __bitwise rmap_t;
 /*
  * rmap interfaces called when adding or removing pte of page
  */
-void page_move_anon_rmap(struct page *, struct vm_area_struct *);
+void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
 void page_add_anon_rmap(struct page *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
 void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
-- 
cgit v1.2.3


From c43cfa42541c04a3a94312e39ab81c41ba431277 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Tue, 3 Oct 2023 00:14:51 +0100
Subject: mm: make __access_remote_vm() static

Patch series "various improvements to the GUP interface", v2.

A series of fixes to simplify and improve the GUP interface with an eye to
providing groundwork to future improvements:-

* __access_remote_vm() and access_remote_vm() are functionally identical,
  so make the former static such that in future we can potentially change
  the external-facing implementation details of this function.

* Extend is_valid_gup_args() to cover the missing FOLL_TOUCH case, and
  simplify things by defining INTERNAL_GUP_FLAGS to check against.

* Adjust __get_user_pages_locked() to explicitly treat a failure to pin any
  pages as an error in all circumstances other than FOLL_NOWAIT being
  specified, bringing it in line with the nommu implementation of this
  function.

* (With many thanks to Arnd who suggested this in the first instance)
  Update get_user_page_vma_remote() to explicitly only return a page or an
  error, simplifying the interface and avoiding the questionable
  IS_ERR_OR_NULL() pattern.


This patch (of 4):

access_remote_vm() passes through parameters to __access_remote_vm()
directly, so remove the __access_remote_vm() function from mm.h and use
access_remote_vm() in the one caller that needs it (ptrace_access_vm()).

This allows future adjustments to the GUP-internal __access_remote_vm()
function while keeping the access_remote_vm() function stable.

Link: https://lkml.kernel.org/r/cover.1696288092.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/f7877c5039ce1c202a514a8aeeefc5cdd5e32d19.1696288092.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 52c40b3d0813..7b89f7bd420d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2415,8 +2415,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
 		void *buf, int len, unsigned int gup_flags);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		void *buf, int len, unsigned int gup_flags);
-extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
-			      void *buf, int len, unsigned int gup_flags);
 
 long get_user_pages_remote(struct mm_struct *mm,
 			   unsigned long start, unsigned long nr_pages,
-- 
cgit v1.2.3


From 6a1960b8a8773324d870fa32ba68ff3106523a95 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Tue, 3 Oct 2023 00:14:54 +0100
Subject: mm/gup: adapt get_user_page_vma_remote() to never return NULL

get_user_pages_remote() will never return 0 except in the case of
FOLL_NOWAIT being specified, which we explicitly disallow.

This simplifies error handling for the caller and avoids the awkwardness
of dealing with both errors and failing to pin.  Failing to pin here is an
error.

Link: https://lkml.kernel.org/r/00319ce292d27b3aae76a0eb220ce3f528187508.1696288092.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7b89f7bd420d..fa608cba041f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2425,6 +2425,9 @@ long pin_user_pages_remote(struct mm_struct *mm,
 			   unsigned int gup_flags, struct page **pages,
 			   int *locked);
 
+/*
+ * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
+ */
 static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
 						    unsigned long addr,
 						    int gup_flags,
@@ -2432,12 +2435,15 @@ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
 {
 	struct page *page;
 	struct vm_area_struct *vma;
-	int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
+	int got;
+
+	if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
+		return ERR_PTR(-EINVAL);
+
+	got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
 
 	if (got < 0)
 		return ERR_PTR(got);
-	if (got == 0)
-		return NULL;
 
 	vma = vma_lookup(mm, addr);
 	if (WARN_ON_ONCE(!vma)) {
-- 
cgit v1.2.3


From 0b237047d5a72ffe06c0bdf2f4536f669dcd31c9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 4 Oct 2023 17:53:03 +0100
Subject: mm: add folio_end_read()

Provide a function for filesystems to call when they have finished reading
an entire folio.

Link: https://lkml.kernel.org/r/20231004165317.1061855-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagemap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 759b29d9a69a..bcc1ea44b4e8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1101,6 +1101,7 @@ static inline void wait_on_page_locked(struct page *page)
 	folio_wait_locked(page_folio(page));
 }
 
+void folio_end_read(struct folio *folio, bool success);
 void wait_on_page_writeback(struct page *page);
 void folio_wait_writeback(struct folio *folio);
 int folio_wait_writeback_killable(struct folio *folio);
-- 
cgit v1.2.3


From 0410cd844ed0af3db3cb510d877d62c66d26e5cc Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 4 Oct 2023 17:53:15 +0100
Subject: mm: add folio_xor_flags_has_waiters()

Optimise folio_end_read() by setting the uptodate bit at the same time we
clear the unlock bit.  This saves at least one memory barrier and one
write-after-write hazard.

Link: https://lkml.kernel.org/r/20231004165317.1061855-16-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5c02720c53a5..a88e64acebfe 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -692,6 +692,25 @@ TESTPAGEFLAG_FALSE(Ksm, ksm)
 
 u64 stable_page_flags(struct page *page);
 
+/**
+ * folio_xor_flags_has_waiters - Change some folio flags.
+ * @folio: The folio.
+ * @mask: Bits set in this word will be changed.
+ *
+ * This must only be used for flags which are changed with the folio
+ * lock held.  For example, it is unsafe to use for PG_dirty as that
+ * can be set without the folio lock held.  It can also only be used
+ * on flags which are in the range 0-6 as some of the implementations
+ * only affect those bits.
+ *
+ * Return: Whether there are tasks waiting on the folio.
+ */
+static inline bool folio_xor_flags_has_waiters(struct folio *folio,
+		unsigned long mask)
+{
+	return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0));
+}
+
 /**
  * folio_test_uptodate - Is this folio up to date?
  * @folio: The folio.
-- 
cgit v1.2.3


From 59838b2566f6d03099743675a2e0f425813078c6 Mon Sep 17 00:00:00 2001
From: Frank van der Linden <fvdl@google.com>
Date: Wed, 4 Oct 2023 15:32:48 +0000
Subject: mm, hugetlb: remove HUGETLB_CGROUP_MIN_ORDER

Originally, hugetlb_cgroup was the only hugetlb user of tail page
structure fields.  So, the code defined and checked against
HUGETLB_CGROUP_MIN_ORDER to make sure pages weren't too small to use.

However, by now, tail page #2 is used to store hugetlb hwpoison and
subpool information as well.  In other words, without that tail page
hugetlb doesn't work.

Acknowledge this fact by getting rid of HUGETLB_CGROUP_MIN_ORDER and
checks against it.  Instead, just check for the minimum viable page order
at hstate creation time.

Link: https://lkml.kernel.org/r/20231004153248.3842997-1-fvdl@google.com
Signed-off-by: Frank van der Linden <fvdl@google.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb_cgroup.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 3d82d91f49ac..e5d64b8b59c2 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -22,13 +22,6 @@ struct resv_map;
 struct file_region;
 
 #ifdef CONFIG_CGROUP_HUGETLB
-/*
- * Minimum page order trackable by hugetlb cgroup.
- * At least 3 pages are necessary for all the tracking information.
- * The second tail page contains all of the hugetlb-specific fields.
- */
-#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__NR_USED_SUBPAGE)
-
 enum hugetlb_memory_event {
 	HUGETLB_MAX,
 	HUGETLB_NR_MEMORY_EVENTS,
@@ -68,8 +61,6 @@ static inline struct hugetlb_cgroup *
 __hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd)
 {
 	VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
-	if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER)
-		return NULL;
 	if (rsvd)
 		return folio->_hugetlb_cgroup_rsvd;
 	else
@@ -91,8 +82,6 @@ static inline void __set_hugetlb_cgroup(struct folio *folio,
 				       struct hugetlb_cgroup *h_cg, bool rsvd)
 {
 	VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
-	if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER)
-		return;
 	if (rsvd)
 		folio->_hugetlb_cgroup_rsvd = h_cg;
 	else
-- 
cgit v1.2.3


From 4b569387c0d566db288e7c3e1b484b43df797bdb Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Fri, 6 Oct 2023 11:46:26 -0700
Subject: memcontrol: add helpers for hugetlb memcg accounting

Patch series "hugetlb memcg accounting", v4.

Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory.  This has been observed in our production system.

For instance, here is one of our usecases: suppose there are two 32G
containers.  The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it.  The rest is anon, cache, slab, etc.  We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness.  But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etcetera fair.

What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max.  Similar procedure is done to other memory limits
(memory.low for e.g).  However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.

This patch series rectifies this issue by charging the memcg when the
hugetlb folio is allocated, and uncharging when the folio is freed.  In
addition, a new selftest is added to demonstrate and verify this new
behavior.


This patch (of 4):

This patch exposes charge committing and cancelling as parts of the memory
controller interface.  These functionalities are useful when the
try_charge() and commit_charge() stages have to be separated by other
actions in between (which can fail).  One such example is the new hugetlb
accounting behavior in the following patch.

The patch also adds a helper function to obtain a reference to the
current task's memcg.

Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c6029aeaa268..bea3d997b076 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -652,6 +652,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
 		page_counter_read(&memcg->memory);
 }
 
+void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
+
 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
 
 /**
@@ -703,6 +705,8 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 	__mem_cgroup_uncharge_list(page_list);
 }
 
+void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
+
 void mem_cgroup_migrate(struct folio *old, struct folio *new);
 
 /**
@@ -759,6 +763,8 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
+struct mem_cgroup *get_mem_cgroup_from_current(void);
+
 struct lruvec *folio_lruvec_lock(struct folio *folio);
 struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
@@ -1239,6 +1245,11 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
 	return false;
 }
 
+static inline void mem_cgroup_commit_charge(struct folio *folio,
+		struct mem_cgroup *memcg)
+{
+}
+
 static inline int mem_cgroup_charge(struct folio *folio,
 		struct mm_struct *mm, gfp_t gfp)
 {
@@ -1263,6 +1274,11 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 {
 }
 
+static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
+		unsigned int nr_pages)
+{
+}
+
 static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
 {
 }
@@ -1300,6 +1316,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return NULL;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+	return NULL;
+}
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
 {
-- 
cgit v1.2.3


From 85ce2c517ade0d51b7ad95f2e88be9bbe294379a Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Fri, 6 Oct 2023 11:46:27 -0700
Subject: memcontrol: only transfer the memcg data for migration

For most migration use cases, only transfer the memcg data from the old
folio to the new folio, and clear the old folio's memcg data.  No charging
and uncharging will be done.

This shaves off some work on the migration path, and avoids the temporary
double charging of a folio during its migration.

The only exception is replace_page_cache_folio(), which will use the old
mem_cgroup_migrate() (now renamed to mem_cgroup_replace_folio).  In that
context, the isolation of the old page isn't quite as thorough as with
migration, so we cannot use our new implementation directly.

This patch is the result of the following discussion on the new hugetlb
memcg accounting behavior:

https://lore.kernel.org/lkml/20231003171329.GB314430@monkey/

Link: https://lkml.kernel.org/r/20231006184629.155543-3-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bea3d997b076..a3fce570122e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -707,6 +707,8 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 
 void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
 
+void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
+
 void mem_cgroup_migrate(struct folio *old, struct folio *new);
 
 /**
@@ -1279,6 +1281,11 @@ static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
 {
 }
 
+static inline void mem_cgroup_replace_folio(struct folio *old,
+		struct folio *new)
+{
+}
+
 static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
 {
 }
-- 
cgit v1.2.3


From 8cba9576df601c384abd334a503c3f6e1e29eefb Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Fri, 6 Oct 2023 11:46:28 -0700
Subject: hugetlb: memcg: account hugetlb-backed memory in memory controller

Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory.  This has been observed in our production system.

For instance, here is one of our usecases: suppose there are two 32G
containers.  The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it.  The rest is anon, cache, slab, etc.  We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness.  But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etc.  fair.

What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max.  Similar procedure is done to other memory limits
(memory.low for e.g).  However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.

This patch rectifies this issue by charging the memcg when the hugetlb
folio is utilized, and uncharging when the folio is freed (analogous to
the hugetlb controller).  Note that we do not charge when the folio is
allocated to the hugetlb pool, because at this point it is not owned by
any memcg.

Some caveats to consider:
  * This feature is only available on cgroup v2.
  * There is no hugetlb pool management involved in the memory
    controller. As stated above, hugetlb folios are only charged towards
    the memory controller when it is used. Host overcommit management
    has to consider it when configuring hard limits.
  * Failure to charge towards the memcg results in SIGBUS. This could
    happen even if the hugetlb pool still has pages (but the cgroup
    limit is hit and reclaim attempt fails).
  * When this feature is enabled, hugetlb pages contribute to memory
    reclaim protection. low, min limits tuning must take into account
    hugetlb memory.
  * Hugetlb pages utilized while this option is not selected will not
    be tracked by the memory controller (even if cgroup v2 is remounted
    later on).

Link: https://lkml.kernel.org/r/20231006184629.155543-4-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cgroup-defs.h | 5 +++++
 include/linux/memcontrol.h  | 9 +++++++++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index f1b3151ac30b..8641f4320c98 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -115,6 +115,11 @@ enum {
 	 * Enable recursive subtree protection
 	 */
 	CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),
+
+	/*
+	 * Enable hugetlb accounting for the memory controller.
+	 */
+	 CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
 };
 
 /* cftype->flags */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a3fce570122e..6674c12725d5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -678,6 +678,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
 	return __mem_cgroup_charge(folio, mm, gfp);
 }
 
+int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
+		long nr_pages);
+
 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
 				  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
@@ -1258,6 +1261,12 @@ static inline int mem_cgroup_charge(struct folio *folio,
 	return 0;
 }
 
+static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
+		gfp_t gfp, long nr_pages)
+{
+	return 0;
+}
+
 static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
-- 
cgit v1.2.3


From f04eba134e598d4a5af2eeecff0562df5042cbfc Mon Sep 17 00:00:00 2001
From: Lucy Mielke <lucymielke@icloud.com>
Date: Fri, 6 Oct 2023 22:30:51 +0200
Subject: mm: add printf attribute to shrinker_debugfs_name_alloc

This fixes a compiler warning when compiling an allyesconfig with W=1:

mm/internal.h:1235:9: error: function might be a candidate for `gnu_printf'
format attribute [-Werror=suggest-attribute=format]

[akpm@linux-foundation.org: fix shrinker_alloc() as welll per Qi Zheng]
  Link: https://lkml.kernel.org/r/822387b7-4895-4e64-5806-0f56b5d6c447@bytedance.com
Link: https://lkml.kernel.org/r/ZSBue-3kM6gI6jCr@mainframe
Fixes: c42d50aefd17 ("mm: shrinker: add infrastructure for dynamically allocating shrinker")
Signed-off-by: Lucy Mielke <lucymielke@icloud.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index e4f93120e0ab..1a00be90d93a 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -131,6 +131,7 @@ struct shrinker {
  */
 #define SHRINKER_NONSLAB	BIT(4)
 
+__printf(2, 3)
 struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
 void shrinker_register(struct shrinker *shrinker);
 void shrinker_free(struct shrinker *shrinker);
-- 
cgit v1.2.3


From 37acade0ce8938f00d6979bd02b8043b5b7089ae Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 10 Oct 2023 04:58:29 +0100
Subject: sched: remove wait bookmarks

There are no users of wait bookmarks left, so simplify the wait
code by removing them.

Link: https://lkml.kernel.org/r/20231010035829.544242-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Benjamin Segall <bsegall@google.com>
Cc: Bin Lai <sclaibin@gmail.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/wait.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5ec7739400f4..3473b663176f 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -19,10 +19,9 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
 /* wait_queue_entry::flags */
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
-#define WQ_FLAG_BOOKMARK	0x04
-#define WQ_FLAG_CUSTOM		0x08
-#define WQ_FLAG_DONE		0x10
-#define WQ_FLAG_PRIORITY	0x20
+#define WQ_FLAG_CUSTOM		0x04
+#define WQ_FLAG_DONE		0x08
+#define WQ_FLAG_PRIORITY	0x10
 
 /*
  * A single wait-queue entry structure:
@@ -212,8 +211,6 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
 int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
 void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
-		unsigned int mode, void *key, wait_queue_entry_t *bookmark);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
-- 
cgit v1.2.3


From 3657fdc2451abf135c2d20949acf57d78cc50338 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Wed, 11 Oct 2023 18:04:27 +0100
Subject: mm: move vma_policy() and anon_vma_name() decls to mm_types.h

Patch series "Abstract vma_merge() and split_vma()", v4.

The vma_merge() interface is very confusing and its implementation has led
to numerous bugs as a result of that confusion.

In addition there is duplication both in invocation of vma_merge(), but
also in the common mprotect()-style pattern of attempting a merge, then if
this fails, splitting the portion of a VMA about to have its attributes
changed.

This pattern has been copy/pasted around the kernel in each instance where
such an operation has been required, each very slightly modified from the
last to make it even harder to decipher what is going on.

Simplify the whole thing by dividing the actual uses of vma_merge() and
split_vma() into specific and abstracted functions and de-duplicate the
vma_merge()/split_vma() pattern altogether.

Doing so also opens the door to changing how vma_merge() is implemented -
by knowing precisely what cases a caller is invoking rather than having a
central interface where anything might happen we can untangle the brittle
and confusing vma_merge() implementation into something more workable.

For mprotect()-like cases we introduce vma_modify() which performs the
vma_merge()/split_vma() pattern, returning a pointer to either the merged
or split VMA or an ERR_PTR(err) if the splits fail.

We provide a number of inline helper functions to make things even clearer:-

* vma_modify_flags()      - Prepare to modify the VMA's flags.
* vma_modify_flags_name() - Prepare to modify the VMA's flags/anon_vma_name
* vma_modify_policy()     - Prepare to modify the VMA's mempolicy.
* vma_modify_flags_uffd() - Prepare to modify the VMA's flags/uffd context.

For cases where a new VMA is attempted to be merged with adjacent VMAs we
add:-

* vma_merge_new_vma() - Prepare to merge a new VMA.
* vma_merge_extend()  - Prepare to extend the end of a new VMA.


This patch (of 5):

The vma_policy() define is a helper specifically for a VMA field so it
makes sense to host it in the memory management types header.

The anon_vma_name(), anon_vma_name_alloc() and anon_vma_name_free()
functions are a little out of place in mm_inline.h as they define external
functions, and so it makes sense to locate them in mm_types.h.

The purpose of these relocations is to make it possible to abstract static
inline wrappers which invoke both of these helpers.

Link: https://lkml.kernel.org/r/cover.1697043508.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/24bfc6c9e382fffbcb0ea8d424392c27d56cc8ca.1697043508.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h |  4 ----
 include/linux/mm_inline.h | 20 +-------------------
 include/linux/mm_types.h  | 27 +++++++++++++++++++++++++++
 3 files changed, 28 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 6c2754d7bfed..1d7f4ec2614c 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -89,8 +89,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
 	return pol;
 }
 
-#define vma_policy(vma) ((vma)->vm_policy)
-
 static inline void mpol_get(struct mempolicy *pol)
 {
 	if (pol)
@@ -222,8 +220,6 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 	return NULL;
 }
 
-#define vma_policy(vma) NULL
-
 static inline int
 vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
 {
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8148b30a9df1..9ae7def16cb2 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -4,6 +4,7 @@
 
 #include <linux/atomic.h>
 #include <linux/huge_mm.h>
+#include <linux/mm_types.h>
 #include <linux/swap.h>
 #include <linux/string.h>
 #include <linux/userfaultfd_k.h>
@@ -352,15 +353,6 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
 }
 
 #ifdef CONFIG_ANON_VMA_NAME
-/*
- * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
- * either keep holding the lock while using the returned pointer or it should
- * raise anon_vma_name refcount before releasing the lock.
- */
-extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
-extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
-extern void anon_vma_name_free(struct kref *kref);
-
 /* mmap_lock should be read-locked */
 static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
 {
@@ -415,16 +407,6 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
 }
 
 #else /* CONFIG_ANON_VMA_NAME */
-static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
-{
-	return NULL;
-}
-
-static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
-{
-	return NULL;
-}
-
 static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
 static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
 static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 36c5b43999e6..21eb56145f57 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -546,6 +546,27 @@ struct anon_vma_name {
 	char name[];
 };
 
+#ifdef CONFIG_ANON_VMA_NAME
+/*
+ * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
+ * either keep holding the lock while using the returned pointer or it should
+ * raise anon_vma_name refcount before releasing the lock.
+ */
+struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
+struct anon_vma_name *anon_vma_name_alloc(const char *name);
+void anon_vma_name_free(struct kref *kref);
+#else /* CONFIG_ANON_VMA_NAME */
+static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+
+static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
+{
+	return NULL;
+}
+#endif
+
 struct vma_lock {
 	struct rw_semaphore lock;
 };
@@ -662,6 +683,12 @@ struct vm_area_struct {
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 } __randomize_layout;
 
+#ifdef CONFIG_NUMA
+#define vma_policy(vma) ((vma)->vm_policy)
+#else
+#define vma_policy(vma) NULL
+#endif
+
 #ifdef CONFIG_SCHED_MM_CID
 struct mm_cid {
 	u64 time;
-- 
cgit v1.2.3


From 94d7d923395129b9248777e575c877e40007f9dc Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Wed, 11 Oct 2023 18:04:28 +0100
Subject: mm: abstract the vma_merge()/split_vma() pattern for mprotect() et
 al.

mprotect() and other functions which change VMA parameters over a range
each employ a pattern of:-

1. Attempt to merge the range with adjacent VMAs.
2. If this fails, and the range spans a subset of the VMA, split it
   accordingly.

This is open-coded and duplicated in each case. Also in each case most of
the parameters passed to vma_merge() remain the same.

Create a new function, vma_modify(), which abstracts this operation,
accepting only those parameters which can be changed.

To avoid the mess of invoking each function call with unnecessary
parameters, create inline wrapper functions for each of the modify
operations, parameterised only by what is required to perform the action.

We can also significantly simplify the logic - by returning the VMA if we
split (or merged VMA if we do not) we no longer need specific handling for
merge/split cases in any of the call sites.

Note that the userfaultfd_release() case works even though it does not
split VMAs - since start is set to vma->vm_start and end is set to
vma->vm_end, the split logic does not trigger.

In addition, since we calculate pgoff to be equal to vma->vm_pgoff + (start
- vma->vm_start) >> PAGE_SHIFT, and start - vma->vm_start will be 0 in this
instance, this invocation will remain unchanged.

We eliminate a VM_WARN_ON() in mprotect_fixup() as this simply asserts that
vma_merge() correctly ensures that flags remain the same, something that is
already checked in is_mergeable_vma() and elsewhere, and in any case is not
specific to mprotect().

Link: https://lkml.kernel.org/r/0dfa9368f37199a423674bf0ee312e8ea0619044.1697043508.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa608cba041f..895feb814620 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3251,6 +3251,66 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff,
 	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
+struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
+				  struct vm_area_struct *prev,
+				  struct vm_area_struct *vma,
+				  unsigned long start, unsigned long end,
+				  unsigned long vm_flags,
+				  struct mempolicy *policy,
+				  struct vm_userfaultfd_ctx uffd_ctx,
+				  struct anon_vma_name *anon_name);
+
+/* We are about to modify the VMA's flags. */
+static inline struct vm_area_struct
+*vma_modify_flags(struct vma_iterator *vmi,
+		  struct vm_area_struct *prev,
+		  struct vm_area_struct *vma,
+		  unsigned long start, unsigned long end,
+		  unsigned long new_flags)
+{
+	return vma_modify(vmi, prev, vma, start, end, new_flags,
+			  vma_policy(vma), vma->vm_userfaultfd_ctx,
+			  anon_vma_name(vma));
+}
+
+/* We are about to modify the VMA's flags and/or anon_name. */
+static inline struct vm_area_struct
+*vma_modify_flags_name(struct vma_iterator *vmi,
+		       struct vm_area_struct *prev,
+		       struct vm_area_struct *vma,
+		       unsigned long start,
+		       unsigned long end,
+		       unsigned long new_flags,
+		       struct anon_vma_name *new_name)
+{
+	return vma_modify(vmi, prev, vma, start, end, new_flags,
+			  vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
+}
+
+/* We are about to modify the VMA's memory policy. */
+static inline struct vm_area_struct
+*vma_modify_policy(struct vma_iterator *vmi,
+		   struct vm_area_struct *prev,
+		   struct vm_area_struct *vma,
+		   unsigned long start, unsigned long end,
+		   struct mempolicy *new_pol)
+{
+	return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
+			  new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+}
+
+/* We are about to modify the VMA's flags and/or uffd context. */
+static inline struct vm_area_struct
+*vma_modify_flags_uffd(struct vma_iterator *vmi,
+		       struct vm_area_struct *prev,
+		       struct vm_area_struct *vma,
+		       unsigned long start, unsigned long end,
+		       unsigned long new_flags,
+		       struct vm_userfaultfd_ctx new_ctx)
+{
+	return vma_modify(vmi, prev, vma, start, end, new_flags,
+			  vma_policy(vma), new_ctx, anon_vma_name(vma));
+}
 
 static inline int check_data_rlimit(unsigned long rlim,
 				    unsigned long new,
-- 
cgit v1.2.3


From adb20b0c785e9e8d5e4d4a07b9c3340d7de0e5dc Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Wed, 11 Oct 2023 18:04:29 +0100
Subject: mm: make vma_merge() and split_vma() internal

Now the common pattern of - attempting a merge via vma_merge() and should
this fail splitting VMAs via split_vma() - has been abstracted, the former
can be placed into mm/internal.h and the latter made static.

In addition, the split_vma() nommu variant also need not be exported.

Link: https://lkml.kernel.org/r/405f2be10e20c4e9fbcc9fe6b2dfea105f6642e0.1697043508.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 895feb814620..1bddb151cd5c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3235,16 +3235,7 @@ extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		      struct vm_area_struct *next);
 extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		       unsigned long start, unsigned long end, pgoff_t pgoff);
-extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi,
-	struct mm_struct *, struct vm_area_struct *prev, unsigned long addr,
-	unsigned long end, unsigned long vm_flags, struct anon_vma *,
-	struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx,
-	struct anon_vma_name *);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
-extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
-		       unsigned long addr, int new_below);
-extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
-			 unsigned long addr, int new_below);
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
-- 
cgit v1.2.3


From e8e17ee90eaf650c855adb0a3e5e965fd6692ff1 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Thu, 12 Oct 2023 18:04:28 +0100
Subject: mm: drop the assumption that VM_SHARED always implies writable

Patch series "permit write-sealed memfd read-only shared mappings", v4.

The man page for fcntl() describing memfd file seals states the following
about F_SEAL_WRITE:-

    Furthermore, trying to create new shared, writable memory-mappings via
    mmap(2) will also fail with EPERM.

With emphasis on 'writable'.  In turns out in fact that currently the
kernel simply disallows all new shared memory mappings for a memfd with
F_SEAL_WRITE applied, rendering this documentation inaccurate.

This matters because users are therefore unable to obtain a shared mapping
to a memfd after write sealing altogether, which limits their usefulness.
This was reported in the discussion thread [1] originating from a bug
report [2].

This is a product of both using the struct address_space->i_mmap_writable
atomic counter to determine whether writing may be permitted, and the
kernel adjusting this counter when any VM_SHARED mapping is performed and
more generally implicitly assuming VM_SHARED implies writable.

It seems sensible that we should only update this mapping if VM_MAYWRITE
is specified, i.e.  whether it is possible that this mapping could at any
point be written to.

If we do so then all we need to do to permit write seals to function as
documented is to clear VM_MAYWRITE when mapping read-only.  It turns out
this functionality already exists for F_SEAL_FUTURE_WRITE - we can
therefore simply adapt this logic to do the same for F_SEAL_WRITE.

We then hit a chicken and egg situation in mmap_region() where the check
for VM_MAYWRITE occurs before we are able to clear this flag.  To work
around this, perform this check after we invoke call_mmap(), with careful
consideration of error paths.

Thanks to Andy Lutomirski for the suggestion!

[1]:https://lore.kernel.org/all/20230324133646.16101dfa666f253c4715d965@linux-foundation.org/
[2]:https://bugzilla.kernel.org/show_bug.cgi?id=217238


This patch (of 3):

There is a general assumption that VMAs with the VM_SHARED flag set are
writable.  If the VM_MAYWRITE flag is not set, then this is simply not the
case.

Update those checks which affect the struct address_space->i_mmap_writable
field to explicitly test for this by introducing
[vma_]is_shared_maywrite() helper functions.

This remains entirely conservative, as the lack of VM_MAYWRITE guarantees
that the VMA cannot be written to.

Link: https://lkml.kernel.org/r/cover.1697116581.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/d978aefefa83ec42d18dfa964ad180dbcde34795.1697116581.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h |  4 ++--
 include/linux/mm.h | 11 +++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd539c9fef8e..5265186da0e2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -454,7 +454,7 @@ extern const struct address_space_operations empty_aops;
  *   It is also used to block modification of page cache contents through
  *   memory mappings.
  * @gfp_mask: Memory allocation flags to use for allocating pages.
- * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
  * @nr_thps: Number of THPs in the pagecache (non-shmem only).
  * @i_mmap: Tree of private and shared mappings.
  * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
@@ -557,7 +557,7 @@ static inline int mapping_mapped(struct address_space *mapping)
 
 /*
  * Might pages of this file have been modified in userspace?
- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap
+ * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
  * marks vma as VM_SHARED if it is shared, and the file was opened for
  * writing i.e. vma may be mprotected writable even if now readonly.
  *
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1bddb151cd5c..c3b5749ede9d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -937,6 +937,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_ACCESS_FLAGS;
 }
 
+static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+{
+	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
+		(VM_SHARED | VM_MAYWRITE);
+}
+
+static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+{
+	return is_shared_maywrite(vma->vm_flags);
+}
+
 static inline
 struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
 {
-- 
cgit v1.2.3


From 28464bbb2ddc199433383994bcb9600c8034afa1 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Thu, 12 Oct 2023 18:04:29 +0100
Subject: mm: update memfd seal write check to include F_SEAL_WRITE

The seal_check_future_write() function is called by shmem_mmap() or
hugetlbfs_file_mmap() to disallow any future writable mappings of an memfd
sealed this way.

The F_SEAL_WRITE flag is not checked here, as that is handled via the
mapping->i_mmap_writable mechanism and so any attempt at a mapping would
fail before this could be run.

However we intend to change this, meaning this check can be performed for
F_SEAL_WRITE mappings also.

The logic here is equally applicable to both flags, so update this
function to accommodate both and rename it accordingly.

Link: https://lkml.kernel.org/r/913628168ce6cce77df7d13a63970bae06a526e0.1697116581.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c3b5749ede9d..86e040e886e4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4076,25 +4076,26 @@ static inline void mem_dump_obj(void *object) {}
 #endif
 
 /**
- * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
+ * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
+ *                    handle them.
  * @seals: the seals to check
  * @vma: the vma to operate on
  *
- * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
- * the vma flags.  Return 0 if check pass, or <0 for errors.
+ * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
+ * check/handling on the vma flags.  Return 0 if check pass, or <0 for errors.
  */
-static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
+static inline int seal_check_write(int seals, struct vm_area_struct *vma)
 {
-	if (seals & F_SEAL_FUTURE_WRITE) {
+	if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
 		/*
 		 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
-		 * "future write" seal active.
+		 * write seals are active.
 		 */
 		if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
 			return -EPERM;
 
 		/*
-		 * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
+		 * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
 		 * MAP_SHARED and read-only, take care to not allow mprotect to
 		 * revert protections on such mappings. Do this only for shared
 		 * mappings. For private mappings, don't need to mask
-- 
cgit v1.2.3


From 5097a69d676f7e562240dbe81c21b1c62aaf5950 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 16 Sep 2023 21:21:31 +0300
Subject: extract and use FILE_LINE macro

Extract nifty FILE_LINE useful for printk style debugging:

	printk("%s\n", FILE_LINE);

It should not be used en mass probably because __FILE__ string literals
can be merged while FILE_LINE's won't. But for debugging it is what
the doctor ordered.

Don't add leading and trailing underscores, they're painful to type.
Trust me, I've tried both versions.

Link: https://lkml.kernel.org/r/ebf12ac4-5a61-4b12-b8b0-1253eb371332@p183
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fortify-string.h | 2 +-
 include/linux/stringify.h      | 2 ++
 include/linux/timer.h          | 3 +--
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index da51a83b2829..442ee9170259 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -643,7 +643,7 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size,
 				     __q_size_field, #op),		\
 		  #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \
 		  __fortify_size,					\
-		  "field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \
+		  "field \"" #p "\" at " FILE_LINE,			\
 		  __p_size_field);					\
 	__underlying_##op(p, q, __fortify_size);			\
 })
diff --git a/include/linux/stringify.h b/include/linux/stringify.h
index 841cec8ed525..0e84cbe65270 100644
--- a/include/linux/stringify.h
+++ b/include/linux/stringify.h
@@ -9,4 +9,6 @@
 #define __stringify_1(x...)	#x
 #define __stringify(x...)	__stringify_1(x)
 
+#define FILE_LINE	__FILE__ ":" __stringify(__LINE__)
+
 #endif	/* !__LINUX_STRINGIFY_H */
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 9162f275819a..26a545bb0153 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -77,8 +77,7 @@ struct timer_list {
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
 		.flags = (_flags),				\
-		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
-			__FILE__ ":" __stringify(__LINE__))	\
+		__TIMER_LOCKDEP_MAP_INITIALIZER(FILE_LINE)	\
 	}
 
 #define DEFINE_TIMER(_name, _function)				\
-- 
cgit v1.2.3


From 9bf2850c9170b52a6ab052085feced55effa78ef Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 16 Sep 2023 17:50:11 +0200
Subject: kstrtox: remove strtobool()

The conversion from strtobool() to kstrtobool() is completed.  So
strtobool() can now be removed.

Link: https://lkml.kernel.org/r/87e3cc2547df174cd5af1fadbf866be4ef9e8e45.1694878151.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kstrtox.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kstrtox.h b/include/linux/kstrtox.h
index 529974e22ea7..7fcf29a4e0de 100644
--- a/include/linux/kstrtox.h
+++ b/include/linux/kstrtox.h
@@ -147,9 +147,4 @@ extern long simple_strtol(const char *,char **,unsigned int);
 extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
 extern long long simple_strtoll(const char *,char **,unsigned int);
 
-static inline int strtobool(const char *s, bool *res)
-{
-	return kstrtobool(s, res);
-}
-
 #endif	/* _LINUX_KSTRTOX_H */
-- 
cgit v1.2.3


From 80fcac55385ccb710d33a20dc1caaef29bd5a921 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Mon, 18 Sep 2023 08:16:30 +0000
Subject: minmax: add umin(a, b) and umax(a, b)

Patch series "minmax: Relax type checks in min() and max()", v4.

The min() (etc) functions in minmax.h require that the arguments have
exactly the same types.

However when the type check fails, rather than look at the types and fix
the type of a variable/constant, everyone seems to jump on min_t().  In
reality min_t() ought to be rare - when something unusual is being done,
not normality.

The orginal min() (added in 2.4.9) replaced several inline functions and
included the type - so matched the implicit casting of the function call.
This was renamed min_t() in 2.4.10 and the current min() added.  There is
no actual indication that the conversion of negatve values to large
unsigned values has ever been an actual problem.

A quick grep shows 5734 min() and 4597 min_t().  Having the casts on
almost half of the calls shows that something is clearly wrong.

If the wrong type is picked (and it is far too easy to pick the type of
the result instead of the larger input) then significant bits can get
discarded.

Pretty much the worst example is in the derived clamp_val(), consider:
        unsigned char x = 200u;
        y = clamp_val(x, 10u, 300u);

I also suspect that many of the min_t(u16, ...) are actually wrong.  For
example copy_data() in printk_ringbuffer.c contains:

        data_size = min_t(u16, buf_size, len);

Here buf_size is 'unsigned int' and len 'u16', pass a 64k buffer (can you
prove that doesn't happen?) and no data is returned.  Apparantly it did -
and has since been fixed.

The only reason that most of the min_t() are 'fine' is that pretty much
all the values in the kernel are between 0 and INT_MAX.

Patch 1 adds umin(), this uses integer promotions to convert both
arguments to 'unsigned long long'.  It can be used to compare a signed
type that is known to contain a non-negative value with an unsigned type.
The compiler typically optimises it all away.  Added first so that it can
be referred to in patch 2.

Patch 2 replaces the 'same type' check with a 'same signedness' one.  This
makes min(unsigned_int_var, sizeof()) be ok.  The error message is also
improved and will contain the expanded form of both arguments (useful for
seeing how constants are defined).

Patch 3 just fixes some whitespace.

Patch 4 allows comparisons of 'unsigned char' and 'unsigned short' to
signed types.  The integer promotion rules convert them both to 'signed
int' prior to the comparison so they can never cause a negative value be
converted to a large positive one.

Patch 5 (rewritted for v4) allows comparisons of unsigned values against
non-negative constant integer expressions.  This makes
min(unsigned_int_var, 4) be ok.

The only common case that is still errored is the comparison of signed
values against unsigned constant integer expressions below __INT_MAX__.
Typcally min(int_val, sizeof (foo)), the real fix for this is casting the
constant: min(int_var, (int)sizeof (foo)).

With all the patches applied pretty much all the min_t() could be replaced
by min(), and most of the rest by umin().  However they all need careful
inspection due to code like:

        sz = min_t(unsigned char, sz - 1, LIM - 1) + 1;

which converts 0 to LIM.


This patch (of 6):

umin() and umax() can be used when min()/max() errors a signed v unsigned
compare when the signed value is known to be non-negative.

Unlike min_t(some_unsigned_type, a, b) umin() will never mask off high
bits if an inappropriate type is selected.

The '+ 0u + 0ul + 0ull' may look strange.
The '+ 0u' is needed for 'signed int' on 64bit systems.
The '+ 0ul' is needed for 'signed long' on 32bit systems.
The '+ 0ull' is needed for 'signed long long'.

Link: https://lkml.kernel.org/r/b97faef60ad24922b530241c5d7c933c@AcuMS.aculab.com
Link: https://lkml.kernel.org/r/41d93ca827a248698ec64bf57e0c05a5@AcuMS.aculab.com
Signed-off-by: David Laight <david.laight@aculab.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index ca69abd6151e..2a197f54fe05 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -76,6 +76,23 @@
  */
 #define max(x, y)	__careful_cmp(x, y, >)
 
+/**
+ * umin - return minimum of two non-negative values
+ *   Signed types are zero extended to match a larger unsigned type.
+ * @x: first value
+ * @y: second value
+ */
+#define umin(x, y)	\
+	__careful_cmp((x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull, <)
+
+/**
+ * umax - return maximum of two non-negative values
+ * @x: first value
+ * @y: second value
+ */
+#define umax(x, y)	\
+	__careful_cmp((x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull, >)
+
 /**
  * min3 - return minimum of three values
  * @x: first value
-- 
cgit v1.2.3


From d03eba99f5bf7cbc6e2fdde3b6fa36954ad58e09 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Mon, 18 Sep 2023 08:17:15 +0000
Subject: minmax: allow min()/max()/clamp() if the arguments have the same
 signedness.

The type-check in min()/max() is there to stop unexpected results if a
negative value gets converted to a large unsigned value.  However it also
rejects 'unsigned int' v 'unsigned long' compares which are common and
never problematc.

Replace the 'same type' check with a 'same signedness' check.

The new test isn't itself a compile time error, so use static_assert() to
report the error and give a meaningful error message.

Due to the way builtin_choose_expr() works detecting the error in the
'non-constant' side (where static_assert() can be used) also detects
errors when the arguments are constant.

Link: https://lkml.kernel.org/r/fe7e6c542e094bfca655abcd323c1c98@AcuMS.aculab.com
Signed-off-by: David Laight <david.laight@aculab.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 60 +++++++++++++++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 2a197f54fe05..8718fd71a793 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -12,9 +12,8 @@
  *
  * - avoid multiple evaluations of the arguments (so side-effects like
  *   "x++" happen only once) when non-constant.
- * - perform strict type-checking (to generate warnings instead of
- *   nasty runtime surprises). See the "unnecessary" pointer comparison
- *   in __typecheck().
+ * - perform signed v unsigned type-checking (to generate compile
+ *   errors instead of nasty runtime surprises).
  * - retain result as a constant expressions when called with only
  *   constant expressions (to avoid tripping VLA warnings in stack
  *   allocation usage).
@@ -22,23 +21,30 @@
 #define __typecheck(x, y) \
 	(!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
 
-#define __no_side_effects(x, y) \
-		(__is_constexpr(x) && __is_constexpr(y))
+/* is_signed_type() isn't a constexpr for pointer types */
+#define __is_signed(x) 								\
+	__builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))),	\
+		is_signed_type(typeof(x)), 0)
 
-#define __safe_cmp(x, y) \
-		(__typecheck(x, y) && __no_side_effects(x, y))
+#define __types_ok(x, y) \
+	(__is_signed(x) == __is_signed(y))
 
-#define __cmp(x, y, op)	((x) op (y) ? (x) : (y))
+#define __cmp_op_min <
+#define __cmp_op_max >
 
-#define __cmp_once(x, y, unique_x, unique_y, op) ({	\
+#define __cmp(op, x, y)	((x) __cmp_op_##op (y) ? (x) : (y))
+
+#define __cmp_once(op, x, y, unique_x, unique_y) ({	\
 		typeof(x) unique_x = (x);		\
 		typeof(y) unique_y = (y);		\
-		__cmp(unique_x, unique_y, op); })
+		static_assert(__types_ok(x, y),		\
+			#op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \
+		__cmp(op, unique_x, unique_y); })
 
-#define __careful_cmp(x, y, op) \
-	__builtin_choose_expr(__safe_cmp(x, y), \
-		__cmp(x, y, op), \
-		__cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op))
+#define __careful_cmp(op, x, y)					\
+	__builtin_choose_expr(__is_constexpr((x) - (y)),	\
+		__cmp(op, x, y),				\
+		__cmp_once(op, x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y)))
 
 #define __clamp(val, lo, hi)	\
 	((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val)))
@@ -47,17 +53,15 @@
 		typeof(val) unique_val = (val);				\
 		typeof(lo) unique_lo = (lo);				\
 		typeof(hi) unique_hi = (hi);				\
+		static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), 	\
+				(lo) <= (hi), true),					\
+			"clamp() low limit " #lo " greater than high limit " #hi);	\
+		static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error");	\
+		static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error");	\
 		__clamp(unique_val, unique_lo, unique_hi); })
 
-#define __clamp_input_check(lo, hi)					\
-        (BUILD_BUG_ON_ZERO(__builtin_choose_expr(			\
-                __is_constexpr((lo) > (hi)), (lo) > (hi), false)))
-
 #define __careful_clamp(val, lo, hi) ({					\
-	__clamp_input_check(lo, hi) +					\
-	__builtin_choose_expr(__typecheck(val, lo) && __typecheck(val, hi) && \
-			      __typecheck(hi, lo) && __is_constexpr(val) && \
-			      __is_constexpr(lo) && __is_constexpr(hi),	\
+	__builtin_choose_expr(__is_constexpr((val) - (lo) + (hi)),	\
 		__clamp(val, lo, hi),					\
 		__clamp_once(val, lo, hi, __UNIQUE_ID(__val),		\
 			     __UNIQUE_ID(__lo), __UNIQUE_ID(__hi))); })
@@ -67,14 +71,14 @@
  * @x: first value
  * @y: second value
  */
-#define min(x, y)	__careful_cmp(x, y, <)
+#define min(x, y)	__careful_cmp(min, x, y)
 
 /**
  * max - return maximum of two values of the same or compatible types
  * @x: first value
  * @y: second value
  */
-#define max(x, y)	__careful_cmp(x, y, >)
+#define max(x, y)	__careful_cmp(max, x, y)
 
 /**
  * umin - return minimum of two non-negative values
@@ -83,7 +87,7 @@
  * @y: second value
  */
 #define umin(x, y)	\
-	__careful_cmp((x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull, <)
+	__careful_cmp(min, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull)
 
 /**
  * umax - return maximum of two non-negative values
@@ -91,7 +95,7 @@
  * @y: second value
  */
 #define umax(x, y)	\
-	__careful_cmp((x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull, >)
+	__careful_cmp(max, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull)
 
 /**
  * min3 - return minimum of three values
@@ -143,7 +147,7 @@
  * @x: first value
  * @y: second value
  */
-#define min_t(type, x, y)	__careful_cmp((type)(x), (type)(y), <)
+#define min_t(type, x, y)	__careful_cmp(min, (type)(x), (type)(y))
 
 /**
  * max_t - return maximum of two values, using the specified type
@@ -151,7 +155,7 @@
  * @x: first value
  * @y: second value
  */
-#define max_t(type, x, y)	__careful_cmp((type)(x), (type)(y), >)
+#define max_t(type, x, y)	__careful_cmp(max, (type)(x), (type)(y))
 
 /*
  * Do not check the array parameter using __must_be_array().
-- 
cgit v1.2.3


From f4b84b2ff851f01d0fac619eadef47eb41648534 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Mon, 18 Sep 2023 08:17:57 +0000
Subject: minmax: fix indentation of __cmp_once() and __clamp_once()

Remove the extra indentation and align continuation markers.

Link: https://lkml.kernel.org/r/bed41317a05c498ea0209eafbcab45a5@AcuMS.aculab.com
Signed-off-by: David Laight <david.laight@aculab.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 8718fd71a793..c0e738eacefa 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -35,11 +35,11 @@
 #define __cmp(op, x, y)	((x) __cmp_op_##op (y) ? (x) : (y))
 
 #define __cmp_once(op, x, y, unique_x, unique_y) ({	\
-		typeof(x) unique_x = (x);		\
-		typeof(y) unique_y = (y);		\
-		static_assert(__types_ok(x, y),		\
-			#op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \
-		__cmp(op, unique_x, unique_y); })
+	typeof(x) unique_x = (x);			\
+	typeof(y) unique_y = (y);			\
+	static_assert(__types_ok(x, y),			\
+		#op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \
+	__cmp(op, unique_x, unique_y); })
 
 #define __careful_cmp(op, x, y)					\
 	__builtin_choose_expr(__is_constexpr((x) - (y)),	\
@@ -49,16 +49,16 @@
 #define __clamp(val, lo, hi)	\
 	((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val)))
 
-#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({	\
-		typeof(val) unique_val = (val);				\
-		typeof(lo) unique_lo = (lo);				\
-		typeof(hi) unique_hi = (hi);				\
-		static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), 	\
-				(lo) <= (hi), true),					\
-			"clamp() low limit " #lo " greater than high limit " #hi);	\
-		static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error");	\
-		static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error");	\
-		__clamp(unique_val, unique_lo, unique_hi); })
+#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({		\
+	typeof(val) unique_val = (val);						\
+	typeof(lo) unique_lo = (lo);						\
+	typeof(hi) unique_hi = (hi);						\
+	static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), 	\
+			(lo) <= (hi), true),					\
+		"clamp() low limit " #lo " greater than high limit " #hi);	\
+	static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error");	\
+	static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error");	\
+	__clamp(unique_val, unique_lo, unique_hi); })
 
 #define __careful_clamp(val, lo, hi) ({					\
 	__builtin_choose_expr(__is_constexpr((val) - (lo) + (hi)),	\
-- 
cgit v1.2.3


From 4ead534fba42fc4fd41163297528d2aa731cd121 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Mon, 18 Sep 2023 08:18:40 +0000
Subject: minmax: allow comparisons of 'int' against 'unsigned char/short'

Since 'unsigned char/short' get promoted to 'signed int' it is safe to
compare them against an 'int' value.

Link: https://lkml.kernel.org/r/8732ef5f809c47c28a7be47c938b28d4@AcuMS.aculab.com
Signed-off-by: David Laight <david.laight@aculab.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index c0e738eacefa..842c1db62ffe 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -26,8 +26,9 @@
 	__builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))),	\
 		is_signed_type(typeof(x)), 0)
 
-#define __types_ok(x, y) \
-	(__is_signed(x) == __is_signed(y))
+#define __types_ok(x, y) 			\
+	(__is_signed(x) == __is_signed(y) ||	\
+		__is_signed((x) + 0) == __is_signed((y) + 0))
 
 #define __cmp_op_min <
 #define __cmp_op_max >
-- 
cgit v1.2.3


From 867046cc7027703f60a46339ffde91a1970f2901 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Mon, 18 Sep 2023 08:19:25 +0000
Subject: minmax: relax check to allow comparison between unsigned arguments
 and signed constants

Allow (for example) min(unsigned_var, 20).

The opposite min(signed_var, 20u) is still errored.

Since a comparison between signed and unsigned never makes the unsigned
value negative it is only necessary to adjust the __types_ok() test.

Link: https://lkml.kernel.org/r/633b64e2f39e46bb8234809c5595b8c7@AcuMS.aculab.com
Signed-off-by: David Laight <david.laight@aculab.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 842c1db62ffe..2ec559284a9f 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -10,13 +10,18 @@
 /*
  * min()/max()/clamp() macros must accomplish three things:
  *
- * - avoid multiple evaluations of the arguments (so side-effects like
+ * - Avoid multiple evaluations of the arguments (so side-effects like
  *   "x++" happen only once) when non-constant.
- * - perform signed v unsigned type-checking (to generate compile
- *   errors instead of nasty runtime surprises).
- * - retain result as a constant expressions when called with only
+ * - Retain result as a constant expressions when called with only
  *   constant expressions (to avoid tripping VLA warnings in stack
  *   allocation usage).
+ * - Perform signed v unsigned type-checking (to generate compile
+ *   errors instead of nasty runtime surprises).
+ * - Unsigned char/short are always promoted to signed int and can be
+ *   compared against signed or unsigned arguments.
+ * - Unsigned arguments can be compared against non-negative signed constants.
+ * - Comparison of a signed argument against an unsigned constant fails
+ *   even if the constant is below __INT_MAX__ and could be cast to int.
  */
 #define __typecheck(x, y) \
 	(!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
@@ -26,9 +31,14 @@
 	__builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))),	\
 		is_signed_type(typeof(x)), 0)
 
-#define __types_ok(x, y) 			\
-	(__is_signed(x) == __is_signed(y) ||	\
-		__is_signed((x) + 0) == __is_signed((y) + 0))
+/* True for a non-negative signed int constant */
+#define __is_noneg_int(x)	\
+	(__builtin_choose_expr(__is_constexpr(x) && __is_signed(x), x, -1) >= 0)
+
+#define __types_ok(x, y) 					\
+	(__is_signed(x) == __is_signed(y) ||			\
+		__is_signed((x) + 0) == __is_signed((y) + 0) ||	\
+		__is_noneg_int(x) || __is_noneg_int(y))
 
 #define __cmp_op_min <
 #define __cmp_op_max >
-- 
cgit v1.2.3


From 598f0ac1500d7145c696de4d38797b1dd2c651de Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Thu, 5 Oct 2023 11:39:54 +0000
Subject: compiler.h: move __is_constexpr() to compiler.h

Prior to f747e6667ebb2 __is_constexpr() was in its only user minmax.h.
That commit moved it to const.h - but that file just defines ULL(x) and
UL(x) so that constants can be defined for .S and .c files.

So apart from the word 'const' it wasn't really a good location.  Instead
move the definition to compiler.h just before the similar

  is_signed_type() and is_unsigned_type().

This may not be a good long-term home, but the three definitions belong
together.

Link: https://lkml.kernel.org/r/2a6680bbe2e84459816a113730426782@AcuMS.aculab.com
Signed-off-by: David Laight <david.laight@aculab.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/compiler.h | 8 ++++++++
 include/linux/const.h    | 8 --------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 174099fdc485..bb1339c7057b 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -227,6 +227,14 @@ static inline void *offset_to_ptr(const int *off)
 /* &a[0] degrades to a pointer: a different type from an array */
 #define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
 
+/*
+ * This returns a constant expression while determining if an argument is
+ * a constant expression, most importantly without evaluating the argument.
+ * Glory to Martin Uecker <Martin.Uecker@med.uni-goettingen.de>
+ */
+#define __is_constexpr(x) \
+	(sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8)))
+
 /*
  * Whether 'type' is a signed type or an unsigned type. Supports scalar types,
  * bool and also pointer types.
diff --git a/include/linux/const.h b/include/linux/const.h
index 435ddd72d2c4..81b8aae5a855 100644
--- a/include/linux/const.h
+++ b/include/linux/const.h
@@ -3,12 +3,4 @@
 
 #include <vdso/const.h>
 
-/*
- * This returns a constant expression while determining if an argument is
- * a constant expression, most importantly without evaluating the argument.
- * Glory to Martin Uecker <Martin.Uecker@med.uni-goettingen.de>
- */
-#define __is_constexpr(x) \
-	(sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8)))
-
 #endif /* _LINUX_CONST_H */
-- 
cgit v1.2.3


From 0e108725f6cc5b3be9e607f89c9fbcbb236367b7 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 18 Oct 2023 10:53:58 -0700
Subject: string: Adjust strtomem() logic to allow for smaller sources

Arnd noticed we have a case where a shorter source string is being copied
into a destination byte array, but this results in a strnlen() call that
exceeds the size of the source. This is seen with -Wstringop-overread:

In file included from ../include/linux/uuid.h:11,
                 from ../include/linux/mod_devicetable.h:14,
                 from ../include/linux/cpufeature.h:12,
                 from ../arch/x86/coco/tdx/tdx.c:7:
../arch/x86/coco/tdx/tdx.c: In function 'tdx_panic.constprop':
../include/linux/string.h:284:9: error: 'strnlen' specified bound 64 exceeds source size 60 [-Werror=stringop-overread]
  284 |         memcpy_and_pad(dest, _dest_len, src, strnlen(src, _dest_len), pad); \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../arch/x86/coco/tdx/tdx.c:124:9: note: in expansion of macro 'strtomem_pad'
  124 |         strtomem_pad(message.str, msg, '\0');
      |         ^~~~~~~~~~~~

Use the smaller of the two buffer sizes when calling strnlen(). When
src length is unknown (SIZE_MAX), it is adjusted to use dest length,
which is what the original code did.

Reported-by: Arnd Bergmann <arnd@arndb.de>
Fixes: dfbafa70bde2 ("string: Introduce strtomem() and strtomem_pad()")
Tested-by: Arnd Bergmann <arnd@arndb.de>
Cc: Andy Shevchenko <andy@kernel.org>
Cc: linux-hardening@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/string.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index dbfc66400050..9e3cb6923b0e 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -277,10 +277,12 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
  */
 #define strtomem_pad(dest, src, pad)	do {				\
 	const size_t _dest_len = __builtin_object_size(dest, 1);	\
+	const size_t _src_len = __builtin_object_size(src, 1);		\
 									\
 	BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||		\
 		     _dest_len == (size_t)-1);				\
-	memcpy_and_pad(dest, _dest_len, src, strnlen(src, _dest_len), pad); \
+	memcpy_and_pad(dest, _dest_len, src,				\
+		       strnlen(src, min(_src_len, _dest_len)), pad);	\
 } while (0)
 
 /**
@@ -298,10 +300,11 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
  */
 #define strtomem(dest, src)	do {					\
 	const size_t _dest_len = __builtin_object_size(dest, 1);	\
+	const size_t _src_len = __builtin_object_size(src, 1);		\
 									\
 	BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||		\
 		     _dest_len == (size_t)-1);				\
-	memcpy(dest, src, min(_dest_len, strnlen(src, _dest_len)));	\
+	memcpy(dest, src, strnlen(src, min(_src_len, _dest_len)));	\
 } while (0)
 
 /**
-- 
cgit v1.2.3


From bdcb37a5d8de3253da48b120e3f10863696fb654 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 17 Oct 2023 16:33:58 +0200
Subject: buildid: reduce header file dependencies for module

The vmlinux decompressor code intentionally has only a limited set of
included header files, but this started running into a build failure
because of the bitmap logic needing linux/errno.h:

In file included from include/linux/cpumask.h:12,
                 from include/linux/mm_types_task.h:14,
                 from include/linux/mm_types.h:5,
                 from include/linux/buildid.h:5,
                 from include/linux/module.h:14,
                 from arch/arm/boot/compressed/../../../../lib/lz4/lz4_decompress.c:39,
                 from arch/arm/boot/compressed/../../../../lib/decompress_unlz4.c:10,
                 from arch/arm/boot/compressed/decompress.c:60:
include/linux/bitmap.h: In function 'bitmap_allocate_region':
include/linux/bitmap.h:527:25: error: 'EBUSY' undeclared (first use in this function)
  527 |                 return -EBUSY;
      |                         ^~~~~
include/linux/bitmap.h:527:25: note: each undeclared identifier is reported only once for each function it appears in
include/linux/bitmap.h: In function 'bitmap_find_free_region':
include/linux/bitmap.h:554:17: error: 'ENOMEM' undeclared (first use in this function)
  554 |         return -ENOMEM;
      |                 ^~~~~~

This is easily avoided by changing linux/buildid.h to no longer depend on
linux/mm_types.h, a header that pulls in a huge number of indirect dependencies.

Fixes: b9c957f554442 ("bitmap: move bitmap_*_region() functions to bitmap.h")
Fixes: bd7525dacd7e2 ("bpf: Move stack_map_get_build_id into lib")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/buildid.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index 3b7a0ff4642f..8a582d242f06 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -2,10 +2,11 @@
 #ifndef _LINUX_BUILDID_H
 #define _LINUX_BUILDID_H
 
-#include <linux/mm_types.h>
+#include <linux/types.h>
 
 #define BUILD_ID_SIZE_MAX 20
 
+struct vm_area_struct;
 int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
 		   __u32 *size);
 int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
-- 
cgit v1.2.3


From 32671e3799ca2e4590773fd0e63aaa4229e50c06 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 18 Oct 2023 13:56:54 +0200
Subject: perf: Disallow mis-matched inherited group reads

Because group consistency is non-atomic between parent (filedesc) and children
(inherited) events, it is possible for PERF_FORMAT_GROUP read() to try and sum
non-matching counter groups -- with non-sensical results.

Add group_generation to distinguish the case where a parent group removes and
adds an event and thus has the same number, but a different configuration of
events as inherited groups.

This became a problem when commit fa8c269353d5 ("perf/core: Invert
perf_read_group() loops") flipped the order of child_list and sibling_list.
Previously it would iterate the group (sibling_list) first, and for each
sibling traverse the child_list. In this order, only the group composition of
the parent is relevant. By flipping the order the group composition of the
child (inherited) events becomes an issue and the mis-match in group
composition becomes evident.

That said; even prior to this commit, while reading of a group that is not
equally inherited was not broken, it still made no sense.

(Ab)use ECHILD as error return to indicate issues with child process group
composition.

Fixes: fa8c269353d5 ("perf/core: Invert perf_read_group() loops")
Reported-by: Budimir Markovic <markovicbudimir@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231018115654.GK33217@noisy.programming.kicks-ass.net
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e85cd1c0eaf3..7b5406e3288d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -704,6 +704,7 @@ struct perf_event {
 	/* The cumulative AND of all event_caps for events in this group. */
 	int				group_caps;
 
+	unsigned int			group_generation;
 	struct perf_event		*group_leader;
 	/*
 	 * event->pmu will always point to pmu in which this event belongs.
-- 
cgit v1.2.3


From 9a6c782158f703fa65c7d52e2156b7d4e7cd41f5 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Fri, 22 Sep 2023 08:15:24 +0200
Subject: vlynq: remove bus driver

There are no users with a vlynq_driver in the Kernel tree. Also, only
the AR7 platform ever initialized a VLYNQ bus, but AR7 is going to be
removed from the Kernel. OpenWRT had some out-of-tree drivers which they
probably intended to upport, but AR7 devices are even there not
supported anymore because they are "stuck with Kernel
3.18" [1]. This code can go.

[1] https://openwrt.org/docs/techref/targets/ar7

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 include/linux/vlynq.h | 149 --------------------------------------------------
 1 file changed, 149 deletions(-)
 delete mode 100644 include/linux/vlynq.h

(limited to 'include/linux')

diff --git a/include/linux/vlynq.h b/include/linux/vlynq.h
deleted file mode 100644
index e9c0cd36c48a..000000000000
--- a/include/linux/vlynq.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (C) 2006, 2007 Eugene Konev <ejka@openwrt.org>
- */
-
-#ifndef __VLYNQ_H__
-#define __VLYNQ_H__
-
-#include <linux/device.h>
-#include <linux/types.h>
-
-struct module;
-
-#define VLYNQ_NUM_IRQS 32
-
-struct vlynq_mapping {
-	u32 size;
-	u32 offset;
-};
-
-enum vlynq_divisor {
-	vlynq_div_auto = 0,
-	vlynq_ldiv1,
-	vlynq_ldiv2,
-	vlynq_ldiv3,
-	vlynq_ldiv4,
-	vlynq_ldiv5,
-	vlynq_ldiv6,
-	vlynq_ldiv7,
-	vlynq_ldiv8,
-	vlynq_rdiv1,
-	vlynq_rdiv2,
-	vlynq_rdiv3,
-	vlynq_rdiv4,
-	vlynq_rdiv5,
-	vlynq_rdiv6,
-	vlynq_rdiv7,
-	vlynq_rdiv8,
-	vlynq_div_external
-};
-
-struct vlynq_device_id {
-	u32 id;
-	enum vlynq_divisor divisor;
-	unsigned long driver_data;
-};
-
-struct vlynq_regs;
-struct vlynq_device {
-	u32 id, dev_id;
-	int local_irq;
-	int remote_irq;
-	enum vlynq_divisor divisor;
-	u32 regs_start, regs_end;
-	u32 mem_start, mem_end;
-	u32 irq_start, irq_end;
-	int irq;
-	int enabled;
-	struct vlynq_regs *local;
-	struct vlynq_regs *remote;
-	struct device dev;
-};
-
-struct vlynq_driver {
-	char *name;
-	struct vlynq_device_id *id_table;
-	int (*probe)(struct vlynq_device *dev, struct vlynq_device_id *id);
-	void (*remove)(struct vlynq_device *dev);
-	struct device_driver driver;
-};
-
-struct plat_vlynq_ops {
-	int (*on)(struct vlynq_device *dev);
-	void (*off)(struct vlynq_device *dev);
-};
-
-static inline struct vlynq_driver *to_vlynq_driver(struct device_driver *drv)
-{
-	return container_of(drv, struct vlynq_driver, driver);
-}
-
-static inline struct vlynq_device *to_vlynq_device(struct device *device)
-{
-	return container_of(device, struct vlynq_device, dev);
-}
-
-extern struct bus_type vlynq_bus_type;
-
-extern int __vlynq_register_driver(struct vlynq_driver *driver,
-				   struct module *owner);
-
-static inline int vlynq_register_driver(struct vlynq_driver *driver)
-{
-	return __vlynq_register_driver(driver, THIS_MODULE);
-}
-
-static inline void *vlynq_get_drvdata(struct vlynq_device *dev)
-{
-	return dev_get_drvdata(&dev->dev);
-}
-
-static inline void vlynq_set_drvdata(struct vlynq_device *dev, void *data)
-{
-	dev_set_drvdata(&dev->dev, data);
-}
-
-static inline u32 vlynq_mem_start(struct vlynq_device *dev)
-{
-	return dev->mem_start;
-}
-
-static inline u32 vlynq_mem_end(struct vlynq_device *dev)
-{
-	return dev->mem_end;
-}
-
-static inline u32 vlynq_mem_len(struct vlynq_device *dev)
-{
-	return dev->mem_end - dev->mem_start + 1;
-}
-
-static inline int vlynq_virq_to_irq(struct vlynq_device *dev, int virq)
-{
-	int irq = dev->irq_start + virq;
-	if ((irq < dev->irq_start) || (irq > dev->irq_end))
-		return -EINVAL;
-
-	return irq;
-}
-
-static inline int vlynq_irq_to_virq(struct vlynq_device *dev, int irq)
-{
-	if ((irq < dev->irq_start) || (irq > dev->irq_end))
-		return -EINVAL;
-
-	return irq - dev->irq_start;
-}
-
-extern void vlynq_unregister_driver(struct vlynq_driver *driver);
-extern int vlynq_enable_device(struct vlynq_device *dev);
-extern void vlynq_disable_device(struct vlynq_device *dev);
-extern int vlynq_set_local_mapping(struct vlynq_device *dev, u32 tx_offset,
-				   struct vlynq_mapping *mapping);
-extern int vlynq_set_remote_mapping(struct vlynq_device *dev, u32 tx_offset,
-				    struct vlynq_mapping *mapping);
-extern int vlynq_set_local_irq(struct vlynq_device *dev, int virq);
-extern int vlynq_set_remote_irq(struct vlynq_device *dev, int virq);
-
-#endif /* __VLYNQ_H__ */
-- 
cgit v1.2.3


From 5aa8fd9cea2ee0d42c5d92c5eacf0a14bbc4c293 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 11 Sep 2023 20:25:50 -0400
Subject: fs: add a new SB_I_NOUMASK flag

SB_POSIXACL must be set when a filesystem supports POSIX ACLs, but NFSv4
also sets this flag to prevent the VFS from applying the umask on
newly-created files. NFSv4 doesn't support POSIX ACLs however, which
causes confusion when other subsystems try to test for them.

Add a new SB_I_NOUMASK flag that allows filesystems to opt-in to umask
stripping without advertising support for POSIX ACLs. Set the new flag
on NFSv4 instead of SB_POSIXACL.

Also, move mode_strip_umask to namei.h and convert init_mknod and
init_mkdir to use it.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Message-Id: <20230911-acl-fix-v3-1-b25315333f6c@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h    |  3 ++-
 include/linux/namei.h | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4aeb3fa11927..58dea591a341 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1119,7 +1119,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_NOATIME      BIT(10)	/* Do not update access times. */
 #define SB_NODIRATIME   BIT(11)	/* Do not update directory access times */
 #define SB_SILENT       BIT(15)
-#define SB_POSIXACL     BIT(16)	/* VFS does not apply the umask */
+#define SB_POSIXACL     BIT(16)	/* Supports POSIX ACLs */
 #define SB_INLINECRYPT  BIT(17)	/* Use blk-crypto for encrypted files */
 #define SB_KERNMOUNT    BIT(22)	/* this is a kern_mount call */
 #define SB_I_VERSION    BIT(23)	/* Update inode I_version field */
@@ -1166,6 +1166,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
 #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
 #define SB_I_RETIRED	0x00000800	/* superblock shouldn't be reused */
+#define SB_I_NOUMASK	0x00001000	/* VFS does not apply umask */
 
 /* Possible states of 'frozen' field */
 enum {
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 1463cbda4888..e3619920f9f0 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -92,6 +92,30 @@ extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
 
+/**
+ * mode_strip_umask - handle vfs umask stripping
+ * @dir:	parent directory of the new inode
+ * @mode:	mode of the new inode to be created in @dir
+ *
+ * In most filesystems, umask stripping depends on whether or not the
+ * filesystem supports POSIX ACLs. If the filesystem doesn't support it umask
+ * stripping is done directly in here. If the filesystem does support POSIX
+ * ACLs umask stripping is deferred until the filesystem calls
+ * posix_acl_create().
+ *
+ * Some filesystems (like NFSv4) also want to avoid umask stripping by the
+ * VFS, but don't support POSIX ACLs. Those filesystems can set SB_I_NOUMASK
+ * to get this effect without declaring that they support POSIX ACLs.
+ *
+ * Returns: mode
+ */
+static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode)
+{
+	if (!IS_POSIXACL(dir) && !(dir->i_sb->s_iflags & SB_I_NOUMASK))
+		mode &= ~current_umask();
+	return mode;
+}
+
 extern int __must_check nd_jump_link(const struct path *path);
 
 static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
-- 
cgit v1.2.3


From 61105aab4edb59bf8177f005eb2923fe5c4deb3c Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Thu, 21 Sep 2023 09:57:52 +0200
Subject: pipe: reduce padding in struct pipe_inode_info

This has no effect on 64 bit because there are 10 32-bit integers
surrounding the two bools, but on 32 bit architectures, this reduces
the struct size by 4 bytes by merging the two bools into one word.

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Message-Id: <20230921075755.1378787-1-max.kellermann@ionos.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pipe_fs_i.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 608a9eb86bff..598a411d7da2 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -62,9 +62,6 @@ struct pipe_inode_info {
 	unsigned int tail;
 	unsigned int max_usage;
 	unsigned int ring_size;
-#ifdef CONFIG_WATCH_QUEUE
-	bool note_loss;
-#endif
 	unsigned int nr_accounted;
 	unsigned int readers;
 	unsigned int writers;
@@ -72,6 +69,9 @@ struct pipe_inode_info {
 	unsigned int r_counter;
 	unsigned int w_counter;
 	bool poll_usage;
+#ifdef CONFIG_WATCH_QUEUE
+	bool note_loss;
+#endif
 	struct page *tmp_page;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;
-- 
cgit v1.2.3


From b4bd6b4bac8edd61eb8f7b836969d12c0c6af165 Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Thu, 21 Sep 2023 09:57:53 +0200
Subject: fs/pipe: move check to pipe_has_watch_queue()

This declutters the code by reducing the number of #ifdefs and makes
the watch_queue checks simpler.  This has no runtime effect; the
machine code is identical.

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Message-Id: <20230921075755.1378787-2-max.kellermann@ionos.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pipe_fs_i.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 598a411d7da2..8ff23bf5a819 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -124,6 +124,22 @@ struct pipe_buf_operations {
 	bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
 };
 
+/**
+ * pipe_has_watch_queue - Check whether the pipe is a watch_queue,
+ * i.e. it was created with O_NOTIFICATION_PIPE
+ * @pipe: The pipe to check
+ *
+ * Return: true if pipe is a watch queue, false otherwise.
+ */
+static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe)
+{
+#ifdef CONFIG_WATCH_QUEUE
+	return pipe->watch_queue != NULL;
+#else
+	return false;
+#endif
+}
+
 /**
  * pipe_empty - Return true if the pipe is empty
  * @head: The pipe ring head pointer
-- 
cgit v1.2.3


From 85fadf89e5708d74c4923e9ceeca4b9df0e000bb Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:54:08 -0700
Subject: watch_queue: Annotate struct watch_filter with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct watch_filter.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: David Howells <dhowells@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Siddh Raman Pant <code@siddh.me>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Qian Cai <cai@redhat.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Siddh Raman Pant <code@siddh.me>
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Message-Id: <20230922175407.work.754-kees@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/watch_queue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h
index 45cd42f55d49..429c7b6afead 100644
--- a/include/linux/watch_queue.h
+++ b/include/linux/watch_queue.h
@@ -32,7 +32,7 @@ struct watch_filter {
 		DECLARE_BITMAP(type_filter, WATCH_TYPE__NR);
 	};
 	u32			nr_filters;	/* Number of filters */
-	struct watch_type_filter filters[];
+	struct watch_type_filter filters[] __counted_by(nr_filters);
 };
 
 struct watch_queue {
-- 
cgit v1.2.3


From 0ede61d8589cc2d93aa78230d74ac58b5b8d0244 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 29 Sep 2023 08:45:59 +0200
Subject: file: convert to SLAB_TYPESAFE_BY_RCU

In recent discussions around some performance improvements in the file
handling area we discussed switching the file cache to rely on
SLAB_TYPESAFE_BY_RCU which allows us to get rid of call_rcu() based
freeing for files completely. This is a pretty sensitive change overall
but it might actually be worth doing.

The main downside is the subtlety. The other one is that we should
really wait for Jann's patch to land that enables KASAN to handle
SLAB_TYPESAFE_BY_RCU UAFs. Currently it doesn't but a patch for this
exists.

With SLAB_TYPESAFE_BY_RCU objects may be freed and reused multiple times
which requires a few changes. So it isn't sufficient anymore to just
acquire a reference to the file in question under rcu using
atomic_long_inc_not_zero() since the file might have already been
recycled and someone else might have bumped the reference.

In other words, callers might see reference count bumps from newer
users. For this reason it is necessary to verify that the pointer is the
same before and after the reference count increment. This pattern can be
seen in get_file_rcu() and __files_get_rcu().

In addition, it isn't possible to access or check fields in struct file
without first aqcuiring a reference on it. Not doing that was always
very dodgy and it was only usable for non-pointer data in struct file.
With SLAB_TYPESAFE_BY_RCU it is necessary that callers first acquire a
reference under rcu or they must hold the files_lock of the fdtable.
Failing to do either one of this is a bug.

Thanks to Jann for pointing out that we need to ensure memory ordering
between reallocations and pointer check by ensuring that all subsequent
loads have a dependency on the second load in get_file_rcu() and
providing a fixup that was folded into this patch.

Cc: Jann Horn <jannh@google.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fdtable.h | 17 +++--------------
 include/linux/fs.h      |  4 +++-
 2 files changed, 6 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e066816f3519..bc4c3287a65e 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -98,20 +98,9 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un
 	return files_lookup_fd_raw(files, fd);
 }
 
-static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd)
-{
-	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
-			   "suspicious rcu_dereference_check() usage");
-	return files_lookup_fd_raw(files, fd);
-}
-
-static inline struct file *lookup_fd_rcu(unsigned int fd)
-{
-	return files_lookup_fd_rcu(current->files, fd);
-}
-
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd);
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd);
+struct file *lookup_fdget_rcu(unsigned int fd);
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd);
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd);
 
 struct task_struct;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 58dea591a341..ceafc40cc25f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1042,7 +1042,9 @@ static inline struct file *get_file(struct file *f)
 	atomic_long_inc(&f->f_count);
 	return f;
 }
-#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count)
+
+struct file *get_file_rcu(struct file __rcu **f);
+
 #define file_count(x)	atomic_long_read(&(x)->f_count)
 
 #define	MAX_NON_LFS	((1UL<<31) - 1)
-- 
cgit v1.2.3


From 95e93d17cb113a3b097774874572b69c058acab5 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Wed, 4 Oct 2023 13:19:15 +0200
Subject: vfs: predict the error in retry_estale as unlikely

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20231004111916.728135-2-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/namei.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/namei.h b/include/linux/namei.h
index e3619920f9f0..3100371b5e32 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -136,7 +136,7 @@ static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
 static inline bool
 retry_estale(const long error, const unsigned int flags)
 {
-	return error == -ESTALE && !(flags & LOOKUP_REVAL);
+	return unlikely(error == -ESTALE && !(flags & LOOKUP_REVAL));
 }
 
 #endif /* _LINUX_NAMEI_H */
-- 
cgit v1.2.3


From 08582d678fcf11fc86188f0b92239d3d49667d8e Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 9 Oct 2023 18:37:11 +0300
Subject: fs: create helper file_user_path() for user displayed mapped file
 path

Overlayfs uses backing files with "fake" overlayfs f_path and "real"
underlying f_inode, in order to use underlying inode aops for mapped
files and to display the overlayfs path in /proc/<pid>/maps.

In preparation for storing the overlayfs "fake" path instead of the
underlying "real" path in struct backing_file, define a noop helper
file_user_path() that returns f_path for now.

Use the new helper in procfs and kernel logs whenever a path of a
mapped file is displayed to users.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231009153712.1566422-3-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ceafc40cc25f..057a3bbd4d27 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2513,6 +2513,20 @@ static inline const struct path *file_real_path(struct file *f)
 	return &f->f_path;
 }
 
+/*
+ * file_user_path - get the path to display for memory mapped file
+ *
+ * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
+ * stored in ->vm_file is a backing file whose f_inode is on the underlying
+ * filesystem.  When the mapped file path is displayed to user (e.g. via
+ * /proc/<pid>/maps), this helper should be used to get the path to display
+ * to the user, which is the path of the fd that user has requested to map.
+ */
+static inline const struct path *file_user_path(struct file *f)
+{
+	return &f->f_path;
+}
+
 static inline struct file *file_clone_open(struct file *file)
 {
 	return dentry_open(&file->f_path, file->f_flags, file->f_cred);
-- 
cgit v1.2.3


From def3ae83da02f87005210fa3d448c5dd37ba4105 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 9 Oct 2023 18:37:12 +0300
Subject: fs: store real path instead of fake path in backing file f_path

A backing file struct stores two path's, one "real" path that is referring
to f_inode and one "fake" path, which should be displayed to users in
/proc/<pid>/maps.

There is a lot more potential code that needs to know the "real" path, then
code that needs to know the "fake" path.

Instead of code having to request the "real" path with file_real_path(),
store the "real" path in f_path and require code that needs to know the
"fake" path request it with file_user_path().
Replace the file_real_path() helper with a simple const accessor f_path().

After this change, file_dentry() is not expected to observe any files
with overlayfs f_path and real f_inode, so the call to ->d_real() should
not be needed.  Leave the ->d_real() call for now and add an assertion
in ovl_d_real() to catch if we made wrong assumptions.

Suggested-by: Miklos Szeredi <miklos@szeredi.hu>
Link: https://lore.kernel.org/r/CAJfpegtt48eXhhjDFA1ojcHPNKj3Go6joryCPtEFAKpocyBsnw@mail.gmail.com/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231009153712.1566422-4-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h       | 22 ++++------------------
 include/linux/fsnotify.h |  3 +--
 2 files changed, 5 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 057a3bbd4d27..b37dccfb4534 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2492,26 +2492,10 @@ struct file *dentry_open(const struct path *path, int flags,
 			 const struct cred *creds);
 struct file *dentry_create(const struct path *path, int flags, umode_t mode,
 			   const struct cred *cred);
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
 			       const struct path *real_path,
 			       const struct cred *cred);
-struct path *backing_file_real_path(struct file *f);
-
-/*
- * file_real_path - get the path corresponding to f_inode
- *
- * When opening a backing file for a stackable filesystem (e.g.,
- * overlayfs) f_path may be on the stackable filesystem and f_inode on
- * the underlying filesystem.  When the path associated with f_inode is
- * needed, this helper should be used instead of accessing f_path
- * directly.
-*/
-static inline const struct path *file_real_path(struct file *f)
-{
-	if (unlikely(f->f_mode & FMODE_BACKING))
-		return backing_file_real_path(f);
-	return &f->f_path;
-}
+struct path *backing_file_user_path(struct file *f);
 
 /*
  * file_user_path - get the path to display for memory mapped file
@@ -2524,6 +2508,8 @@ static inline const struct path *file_real_path(struct file *f)
  */
 static inline const struct path *file_user_path(struct file *f)
 {
+	if (unlikely(f->f_mode & FMODE_BACKING))
+		return backing_file_user_path(f);
 	return &f->f_path;
 }
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index ed48e4f1e755..bcb6609b54b3 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -96,8 +96,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
 	if (file->f_mode & FMODE_NONOTIFY)
 		return 0;
 
-	/* Overlayfs internal files have fake f_path */
-	path = file_real_path(file);
+	path = &file->f_path;
 	return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
 }
 
-- 
cgit v1.2.3


From e4e8b47a34a432c3f65534d12d5c132b6639da71 Mon Sep 17 00:00:00 2001
From: Max Kellermann <mk@cm4all.com>
Date: Mon, 15 Jan 2018 18:30:46 +0100
Subject: fs: fix umask on NFS with CONFIG_FS_POSIX_ACL=n

Make IS_POSIXACL() return false if POSIX ACL support is disabled.

Never skip applying the umask in namei.c and never bother to do any
ACL specific checks if the filesystem falsely indicates it has ACLs
enabled when the feature is completely disabled in the kernel.

This fixes a problem where the umask is always ignored in the NFS
client when compiled without CONFIG_FS_POSIX_ACL.  This is a 4 year
old regression caused by commit 013cdf1088d723 which itself was not
completely wrong, but failed to consider all the side effects by
misdesigned VFS code.

Prior to that commit, there were two places where the umask could be
applied, for example when creating a directory:

 1. in the VFS layer in SYSCALL_DEFINE3(mkdirat), but only if
    !IS_POSIXACL()

 2. again (unconditionally) in nfs3_proc_mkdir()

The first one does not apply, because even without
CONFIG_FS_POSIX_ACL, the NFS client sets SB_POSIXACL in
nfs_fill_super().

After that commit, (2.) was replaced by:

 2b. in posix_acl_create(), called by nfs3_proc_mkdir()

There's one branch in posix_acl_create() which applies the umask;
however, without CONFIG_FS_POSIX_ACL, posix_acl_create() is an empty
dummy function which does not apply the umask.

The approach chosen by this patch is to make IS_POSIXACL() always
return false when POSIX ACL support is disabled, so the umask always
gets applied by the VFS layer.  This is consistent with the (regular)
behavior of posix_acl_create(): that function returns early if
IS_POSIXACL() is false, before applying the umask.

Therefore, posix_acl_create() is responsible for applying the umask if
there is ACL support enabled in the file system (SB_POSIXACL), and the
VFS layer is responsible for all other cases (no SB_POSIXACL or no
CONFIG_FS_POSIX_ACL).

Signed-off-by: Max Kellermann <mk@cm4all.com>
Link: https://lore.kernel.org/r/151603744662.29035.4910161264124875658.stgit@rabbit.intern.cm-ag
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b37dccfb4534..cb8bfa1f8ecb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2113,7 +2113,12 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
 #define IS_NOQUOTA(inode)	((inode)->i_flags & S_NOQUOTA)
 #define IS_APPEND(inode)	((inode)->i_flags & S_APPEND)
 #define IS_IMMUTABLE(inode)	((inode)->i_flags & S_IMMUTABLE)
+
+#ifdef CONFIG_FS_POSIX_ACL
 #define IS_POSIXACL(inode)	__IS_FLG(inode, SB_POSIXACL)
+#else
+#define IS_POSIXACL(inode)	0
+#endif
 
 #define IS_DEADDIR(inode)	((inode)->i_flags & S_DEAD)
 #define IS_NOCMTIME(inode)	((inode)->i_flags & S_NOCMTIME)
-- 
cgit v1.2.3


From 1dbfe73bd648bdc3a2d8c9e5b23ae70e3333b592 Mon Sep 17 00:00:00 2001
From: Johannes Zink <j.zink@pengutronix.de>
Date: Wed, 18 Oct 2023 09:09:55 +0200
Subject: net: stmmac: intel: remove unnecessary field struct
 plat_stmmacenet_data::ext_snapshot_num

Do not store bitmask for enabling AUX_SNAPSHOT0. The previous commit
("net: stmmac: fix PPS capture input index") takes care of calculating
the proper bit mask from the request data's extts.index field, which is
0 if not explicitly specified otherwise.

Signed-off-by: Johannes Zink <j.zink@pengutronix.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/stmmac.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index c0079a7574ae..0b4658a7eceb 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -303,7 +303,6 @@ struct plat_stmmacenet_data {
 	unsigned int eee_usecs_rate;
 	struct pci_dev *pdev;
 	int int_snapshot_num;
-	int ext_snapshot_num;
 	int msi_mac_vec;
 	int msi_wol_vec;
 	int msi_lpi_vec;
-- 
cgit v1.2.3


From 7f3eb2174512fe6c9c0f062e96eccb0d3cc6d5cd Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Wed, 18 Oct 2023 14:35:47 +0200
Subject: net: introduce napi_is_scheduled helper

We currently have napi_if_scheduled_mark_missed that can be used to
check if napi is scheduled but that does more thing than simply checking
it and return a bool. Some driver already implement custom function to
check if napi is scheduled.

Drop these custom function and introduce napi_is_scheduled that simply
check if napi is scheduled atomically.

Update any driver and code that implement a similar check and instead
use this new helper.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1c7681263d30..b8bf669212cc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -482,6 +482,29 @@ static inline bool napi_prefer_busy_poll(struct napi_struct *n)
 	return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
 }
 
+/**
+ * napi_is_scheduled - test if NAPI is scheduled
+ * @n: NAPI context
+ *
+ * This check is "best-effort". With no locking implemented,
+ * a NAPI can be scheduled or terminate right after this check
+ * and produce not precise results.
+ *
+ * NAPI_STATE_SCHED is an internal state, napi_is_scheduled
+ * should not be used normally and napi_schedule should be
+ * used instead.
+ *
+ * Use only if the driver really needs to check if a NAPI
+ * is scheduled for example in the context of delayed timer
+ * that can be skipped if a NAPI is already scheduled.
+ *
+ * Return True if NAPI is scheduled, False otherwise.
+ */
+static inline bool napi_is_scheduled(struct napi_struct *n)
+{
+	return test_bit(NAPI_STATE_SCHED, &n->state);
+}
+
 bool napi_schedule_prep(struct napi_struct *n);
 
 /**
-- 
cgit v1.2.3


From 8c8d2d9670e813d623d8a2cbc881cb57344f4d37 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Fri, 17 Mar 2017 16:35:23 -0800
Subject: bcache: move closures to lib/

Prep work for bcachefs - being a fork of bcache it also uses closures

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Acked-by: Coly Li <colyli@suse.de>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
---
 include/linux/closure.h | 377 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 377 insertions(+)
 create mode 100644 include/linux/closure.h

(limited to 'include/linux')

diff --git a/include/linux/closure.h b/include/linux/closure.h
new file mode 100644
index 000000000000..0ec9e7bc8d97
--- /dev/null
+++ b/include/linux/closure.h
@@ -0,0 +1,377 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CLOSURE_H
+#define _LINUX_CLOSURE_H
+
+#include <linux/llist.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/workqueue.h>
+
+/*
+ * Closure is perhaps the most overused and abused term in computer science, but
+ * since I've been unable to come up with anything better you're stuck with it
+ * again.
+ *
+ * What are closures?
+ *
+ * They embed a refcount. The basic idea is they count "things that are in
+ * progress" - in flight bios, some other thread that's doing something else -
+ * anything you might want to wait on.
+ *
+ * The refcount may be manipulated with closure_get() and closure_put().
+ * closure_put() is where many of the interesting things happen, when it causes
+ * the refcount to go to 0.
+ *
+ * Closures can be used to wait on things both synchronously and asynchronously,
+ * and synchronous and asynchronous use can be mixed without restriction. To
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
+ * refcount hits 1.
+ *
+ * To wait asynchronously, use
+ *   continue_at(cl, next_function, workqueue);
+ *
+ * passing it, as you might expect, the function to run when nothing is pending
+ * and the workqueue to run that function out of.
+ *
+ * continue_at() also, critically, requires a 'return' immediately following the
+ * location where this macro is referenced, to return to the calling function.
+ * There's good reason for this.
+ *
+ * To use safely closures asynchronously, they must always have a refcount while
+ * they are running owned by the thread that is running them. Otherwise, suppose
+ * you submit some bios and wish to have a function run when they all complete:
+ *
+ * foo_endio(struct bio *bio)
+ * {
+ *	closure_put(cl);
+ * }
+ *
+ * closure_init(cl);
+ *
+ * do_stuff();
+ * closure_get(cl);
+ * bio1->bi_endio = foo_endio;
+ * bio_submit(bio1);
+ *
+ * do_more_stuff();
+ * closure_get(cl);
+ * bio2->bi_endio = foo_endio;
+ * bio_submit(bio2);
+ *
+ * continue_at(cl, complete_some_read, system_wq);
+ *
+ * If closure's refcount started at 0, complete_some_read() could run before the
+ * second bio was submitted - which is almost always not what you want! More
+ * importantly, it wouldn't be possible to say whether the original thread or
+ * complete_some_read()'s thread owned the closure - and whatever state it was
+ * associated with!
+ *
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
+ * closure_fn is run, the refcount will be reset to 1 first.
+ *
+ * Then, the rule is - if you got the refcount with closure_get(), release it
+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
+ * on a closure because you called closure_init() or you were run out of a
+ * closure - _always_ use continue_at(). Doing so consistently will help
+ * eliminate an entire class of particularly pernicious races.
+ *
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
+ * need for specifying the condition - you just want to wait until someone runs
+ * closure_wake_up() on the appropriate wait list. In that case, just use
+ * closure_wait(). It will return either true or false, depending on whether the
+ * closure was already on a wait list or not - a closure can only be on one wait
+ * list at a time.
+ *
+ * Parents:
+ *
+ * closure_init() takes two arguments - it takes the closure to initialize, and
+ * a (possibly null) parent.
+ *
+ * If parent is non null, the new closure will have a refcount for its lifetime;
+ * a closure is considered to be "finished" when its refcount hits 0 and the
+ * function to run is null. Hence
+ *
+ * continue_at(cl, NULL, NULL);
+ *
+ * returns up the (spaghetti) stack of closures, precisely like normal return
+ * returns up the C stack. continue_at() with non null fn is better thought of
+ * as doing a tail call.
+ *
+ * All this implies that a closure should typically be embedded in a particular
+ * struct (which its refcount will normally control the lifetime of), and that
+ * struct can very much be thought of as a stack frame.
+ */
+
+struct closure;
+struct closure_syncer;
+typedef void (closure_fn) (struct closure *);
+extern struct dentry *bcache_debug;
+
+struct closure_waitlist {
+	struct llist_head	list;
+};
+
+enum closure_state {
+	/*
+	 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
+	 * the thread that owns the closure, and cleared by the thread that's
+	 * waking up the closure.
+	 *
+	 * The rest are for debugging and don't affect behaviour:
+	 *
+	 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
+	 * closure_init() and when closure_put() runs then next function), and
+	 * must be cleared before remaining hits 0. Primarily to help guard
+	 * against incorrect usage and accidentally transferring references.
+	 * continue_at() and closure_return() clear it for you, if you're doing
+	 * something unusual you can use closure_set_dead() which also helps
+	 * annotate where references are being transferred.
+	 */
+
+	CLOSURE_BITS_START	= (1U << 26),
+	CLOSURE_DESTRUCTOR	= (1U << 26),
+	CLOSURE_WAITING		= (1U << 28),
+	CLOSURE_RUNNING		= (1U << 30),
+};
+
+#define CLOSURE_GUARD_MASK					\
+	((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
+
+#define CLOSURE_REMAINING_MASK		(CLOSURE_BITS_START - 1)
+#define CLOSURE_REMAINING_INITIALIZER	(1|CLOSURE_RUNNING)
+
+struct closure {
+	union {
+		struct {
+			struct workqueue_struct *wq;
+			struct closure_syncer	*s;
+			struct llist_node	list;
+			closure_fn		*fn;
+		};
+		struct work_struct	work;
+	};
+
+	struct closure		*parent;
+
+	atomic_t		remaining;
+
+#ifdef CONFIG_DEBUG_CLOSURES
+#define CLOSURE_MAGIC_DEAD	0xc054dead
+#define CLOSURE_MAGIC_ALIVE	0xc054a11e
+
+	unsigned int		magic;
+	struct list_head	all;
+	unsigned long		ip;
+	unsigned long		waiting_on;
+#endif
+};
+
+void closure_sub(struct closure *cl, int v);
+void closure_put(struct closure *cl);
+void __closure_wake_up(struct closure_waitlist *list);
+bool closure_wait(struct closure_waitlist *list, struct closure *cl);
+void __closure_sync(struct closure *cl);
+
+/**
+ * closure_sync - sleep until a closure a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+static inline void closure_sync(struct closure *cl)
+{
+	if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
+		__closure_sync(cl);
+}
+
+#ifdef CONFIG_DEBUG_CLOSURES
+
+void closure_debug_create(struct closure *cl);
+void closure_debug_destroy(struct closure *cl);
+
+#else
+
+static inline void closure_debug_create(struct closure *cl) {}
+static inline void closure_debug_destroy(struct closure *cl) {}
+
+#endif
+
+static inline void closure_set_ip(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->ip = _THIS_IP_;
+#endif
+}
+
+static inline void closure_set_ret_ip(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->ip = _RET_IP_;
+#endif
+}
+
+static inline void closure_set_waiting(struct closure *cl, unsigned long f)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->waiting_on = f;
+#endif
+}
+
+static inline void closure_set_stopped(struct closure *cl)
+{
+	atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+}
+
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
+				  struct workqueue_struct *wq)
+{
+	closure_set_ip(cl);
+	cl->fn = fn;
+	cl->wq = wq;
+	/* between atomic_dec() in closure_put() */
+	smp_mb__before_atomic();
+}
+
+static inline void closure_queue(struct closure *cl)
+{
+	struct workqueue_struct *wq = cl->wq;
+	/**
+	 * Changes made to closure, work_struct, or a couple of other structs
+	 * may cause work.func not pointing to the right location.
+	 */
+	BUILD_BUG_ON(offsetof(struct closure, fn)
+		     != offsetof(struct work_struct, func));
+
+	if (wq) {
+		INIT_WORK(&cl->work, cl->work.func);
+		BUG_ON(!queue_work(wq, &cl->work));
+	} else
+		cl->fn(cl);
+}
+
+/**
+ * closure_get - increment a closure's refcount
+ */
+static inline void closure_get(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	BUG_ON((atomic_inc_return(&cl->remaining) &
+		CLOSURE_REMAINING_MASK) <= 1);
+#else
+	atomic_inc(&cl->remaining);
+#endif
+}
+
+/**
+ * closure_init - Initialize a closure, setting the refcount to 1
+ * @cl:		closure to initialize
+ * @parent:	parent of the new closure. cl will take a refcount on it for its
+ *		lifetime; may be NULL.
+ */
+static inline void closure_init(struct closure *cl, struct closure *parent)
+{
+	cl->fn = NULL;
+	cl->parent = parent;
+	if (parent)
+		closure_get(parent);
+
+	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+
+	closure_debug_create(cl);
+	closure_set_ip(cl);
+}
+
+static inline void closure_init_stack(struct closure *cl)
+{
+	memset(cl, 0, sizeof(struct closure));
+	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+}
+
+/**
+ * closure_wake_up - wake up all closures on a wait list,
+ *		     with memory barrier
+ */
+static inline void closure_wake_up(struct closure_waitlist *list)
+{
+	/* Memory barrier for the wait list */
+	smp_mb();
+	__closure_wake_up(list);
+}
+
+/**
+ * continue_at - jump to another function with barrier
+ *
+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have
+ * been dropped with closure_put()), it will resume execution at @fn running out
+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
+ *
+ * This is because after calling continue_at() you no longer have a ref on @cl,
+ * and whatever @cl owns may be freed out from under you - a running closure fn
+ * has a ref on its own closure which continue_at() drops.
+ *
+ * Note you are expected to immediately return after using this macro.
+ */
+#define continue_at(_cl, _fn, _wq)					\
+do {									\
+	set_closure_fn(_cl, _fn, _wq);					\
+	closure_sub(_cl, CLOSURE_RUNNING + 1);				\
+} while (0)
+
+/**
+ * closure_return - finish execution of a closure
+ *
+ * This is used to indicate that @cl is finished: when all outstanding refs on
+ * @cl have been dropped @cl's ref on its parent closure (as passed to
+ * closure_init()) will be dropped, if one was specified - thus this can be
+ * thought of as returning to the parent closure.
+ */
+#define closure_return(_cl)	continue_at((_cl), NULL, NULL)
+
+/**
+ * continue_at_nobarrier - jump to another function without barrier
+ *
+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if
+ * @wq is NULL).
+ *
+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
+ * thus it's not safe to touch anything protected by @cl after a
+ * continue_at_nobarrier().
+ */
+#define continue_at_nobarrier(_cl, _fn, _wq)				\
+do {									\
+	set_closure_fn(_cl, _fn, _wq);					\
+	closure_queue(_cl);						\
+} while (0)
+
+/**
+ * closure_return_with_destructor - finish execution of a closure,
+ *				    with destructor
+ *
+ * Works like closure_return(), except @destructor will be called when all
+ * outstanding refs on @cl have been dropped; @destructor may be used to safely
+ * free the memory occupied by @cl, and it is called with the ref on the parent
+ * closure still held - so @destructor could safely return an item to a
+ * freelist protected by @cl's parent.
+ */
+#define closure_return_with_destructor(_cl, _destructor)		\
+do {									\
+	set_closure_fn(_cl, _destructor, NULL);				\
+	closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1);	\
+} while (0)
+
+/**
+ * closure_call - execute @fn out of a new, uninitialized closure
+ *
+ * Typically used when running out of one closure, and we want to run @fn
+ * asynchronously out of a new closure - @parent will then wait for @cl to
+ * finish.
+ */
+static inline void closure_call(struct closure *cl, closure_fn fn,
+				struct workqueue_struct *wq,
+				struct closure *parent)
+{
+	closure_init(cl, parent);
+	continue_at_nobarrier(cl, fn, wq);
+}
+
+#endif /* _LINUX_CLOSURE_H */
-- 
cgit v1.2.3


From ced58fc7ab9f2d4b3d8aaeb859321a61aa884f66 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Sat, 9 Dec 2017 12:42:44 -0500
Subject: closures: closure_wait_event()

Like wait_event() - except, because it uses closures and closure
waitlists it doesn't have the restriction on modifying task state inside
the condition check, like wait_event() does.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Acked-by: Coly Li <colyli@suse.de>
---
 include/linux/closure.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/closure.h b/include/linux/closure.h
index 0ec9e7bc8d97..36b4a83f9b77 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -374,4 +374,26 @@ static inline void closure_call(struct closure *cl, closure_fn fn,
 	continue_at_nobarrier(cl, fn, wq);
 }
 
+#define __closure_wait_event(waitlist, _cond)				\
+do {									\
+	struct closure cl;						\
+									\
+	closure_init_stack(&cl);					\
+									\
+	while (1) {							\
+		closure_wait(waitlist, &cl);				\
+		if (_cond)						\
+			break;						\
+		closure_sync(&cl);					\
+	}								\
+	closure_wake_up(waitlist);					\
+	closure_sync(&cl);						\
+} while (0)
+
+#define closure_wait_event(waitlist, _cond)				\
+do {									\
+	if (!(_cond))							\
+		__closure_wait_event(waitlist, _cond);			\
+} while (0)
+
 #endif /* _LINUX_CLOSURE_H */
-- 
cgit v1.2.3


From 48b7935722b8403fec0469ba3dafc5a0af08225d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 4 Mar 2023 02:39:39 -0500
Subject: closures: closure_nr_remaining()

Factor out a new helper, which returns the number of events outstanding.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/closure.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/closure.h b/include/linux/closure.h
index 36b4a83f9b77..722a586bb224 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -172,6 +172,11 @@ void __closure_wake_up(struct closure_waitlist *list);
 bool closure_wait(struct closure_waitlist *list, struct closure *cl);
 void __closure_sync(struct closure *cl);
 
+static inline unsigned closure_nr_remaining(struct closure *cl)
+{
+	return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK;
+}
+
 /**
  * closure_sync - sleep until a closure a closure has nothing left to wait on
  *
@@ -180,7 +185,7 @@ void __closure_sync(struct closure *cl);
  */
 static inline void closure_sync(struct closure *cl)
 {
-	if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
+	if (closure_nr_remaining(cl) != 1)
 		__closure_sync(cl);
 }
 
-- 
cgit v1.2.3


From 9492261ff2460252cf2d8de89cdf854c7e2b28a0 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Fri, 12 Feb 2021 20:11:25 -0500
Subject: lib/generic-radix-tree.c: Don't overflow in peek()

When we started spreading new inode numbers throughout most of the 64
bit inode space, that triggered some corner case bugs, in particular
some integer overflows related to the radix tree code. Oops.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 include/linux/generic-radix-tree.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 107613f7d792..f6cd0f909d9f 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -38,6 +38,7 @@
 
 #include <asm/page.h>
 #include <linux/bug.h>
+#include <linux/limits.h>
 #include <linux/log2.h>
 #include <linux/math.h>
 #include <linux/types.h>
@@ -184,6 +185,12 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
 static inline void __genradix_iter_advance(struct genradix_iter *iter,
 					   size_t obj_size)
 {
+	if (iter->offset + obj_size < iter->offset) {
+		iter->offset	= SIZE_MAX;
+		iter->pos	= SIZE_MAX;
+		return;
+	}
+
 	iter->offset += obj_size;
 
 	if (!is_power_of_2(obj_size) &&
-- 
cgit v1.2.3


From 73badee4280ce30a927b82eedc919fb23ee549ae Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 12 Sep 2023 01:17:22 -0400
Subject: lib/generic-radix-tree.c: Add peek_prev()

This patch adds genradix_peek_prev(), genradix_iter_rewind(), and
genradix_for_each_reverse(), for iterating backwards over a generic
radix tree.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/generic-radix-tree.h | 61 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index f6cd0f909d9f..847413164738 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -117,6 +117,11 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
 
 #define __genradix_cast(_radix)		(typeof((_radix)->type[0]) *)
 #define __genradix_obj_size(_radix)	sizeof((_radix)->type[0])
+#define __genradix_objs_per_page(_radix)			\
+	(PAGE_SIZE / sizeof((_radix)->type[0]))
+#define __genradix_page_remainder(_radix)			\
+	(PAGE_SIZE % sizeof((_radix)->type[0]))
+
 #define __genradix_idx_to_offset(_radix, _idx)			\
 	__idx_to_offset(_idx, __genradix_obj_size(_radix))
 
@@ -180,7 +185,25 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
 #define genradix_iter_peek(_iter, _radix)			\
 	(__genradix_cast(_radix)				\
 	 __genradix_iter_peek(_iter, &(_radix)->tree,		\
-			      PAGE_SIZE / __genradix_obj_size(_radix)))
+			__genradix_objs_per_page(_radix)))
+
+void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *,
+				size_t, size_t);
+
+/**
+ * genradix_iter_peek_prev - get first entry at or below iterator's current
+ *			     position
+ * @_iter:	a genradix_iter
+ * @_radix:	genradix being iterated over
+ *
+ * If no more entries exist at or below @_iter's current position, returns NULL
+ */
+#define genradix_iter_peek_prev(_iter, _radix)			\
+	(__genradix_cast(_radix)				\
+	 __genradix_iter_peek_prev(_iter, &(_radix)->tree,	\
+			__genradix_objs_per_page(_radix),	\
+			__genradix_obj_size(_radix) +		\
+			__genradix_page_remainder(_radix)))
 
 static inline void __genradix_iter_advance(struct genradix_iter *iter,
 					   size_t obj_size)
@@ -203,6 +226,25 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
 #define genradix_iter_advance(_iter, _radix)			\
 	__genradix_iter_advance(_iter, __genradix_obj_size(_radix))
 
+static inline void __genradix_iter_rewind(struct genradix_iter *iter,
+					  size_t obj_size)
+{
+	if (iter->offset == 0 ||
+	    iter->offset == SIZE_MAX) {
+		iter->offset = SIZE_MAX;
+		return;
+	}
+
+	if ((iter->offset & (PAGE_SIZE - 1)) == 0)
+		iter->offset -= PAGE_SIZE % obj_size;
+
+	iter->offset -= obj_size;
+	iter->pos--;
+}
+
+#define genradix_iter_rewind(_iter, _radix)			\
+	__genradix_iter_rewind(_iter, __genradix_obj_size(_radix))
+
 #define genradix_for_each_from(_radix, _iter, _p, _start)	\
 	for (_iter = genradix_iter_init(_radix, _start);	\
 	     (_p = genradix_iter_peek(&_iter, _radix)) != NULL;	\
@@ -220,6 +262,23 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
 #define genradix_for_each(_radix, _iter, _p)			\
 	genradix_for_each_from(_radix, _iter, _p, 0)
 
+#define genradix_last_pos(_radix)				\
+	(SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1)
+
+/**
+ * genradix_for_each_reverse - iterate over entry in a genradix, reverse order
+ * @_radix:	genradix to iterate over
+ * @_iter:	a genradix_iter to track current position
+ * @_p:		pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each_reverse(_radix, _iter, _p)		\
+	for (_iter = genradix_iter_init(_radix,	genradix_last_pos(_radix));\
+	     (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\
+	     genradix_iter_rewind(&_iter, _radix))
+
 int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
 
 /**
-- 
cgit v1.2.3


From a615f67e1a426f35366b8398c11f31c148e7df48 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 16 Oct 2023 06:47:39 -0700
Subject: bpf: Add sockptr support for getsockopt

The whole network stack uses sockptr, and while it doesn't move to
something more modern, let's use sockptr in getsockptr BPF hooks, so, it
could be used by other callers.

The main motivation for this change is to use it in the io_uring
{g,s}etsockopt(), which will use a userspace pointer for *optval, but, a
kernel value for optlen.

Link: https://lore.kernel.org/all/ZSArfLaaGcfd8LH8@gmail.com/

Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20231016134750.1381153-2-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bpf-cgroup.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8506690dbb9c..f5b4fb6ed8c6 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -139,9 +139,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
 				       int *optname, char __user *optval,
 				       int *optlen, char **kernel_optval);
+
 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
-				       int optname, char __user *optval,
-				       int __user *optlen, int max_optlen,
+				       int optname, sockptr_t optval,
+				       sockptr_t optlen, int max_optlen,
 				       int retval);
 
 int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
-- 
cgit v1.2.3


From 3f31e0d14d44ad491a81b7c1f83f32fbc300a867 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 16 Oct 2023 06:47:40 -0700
Subject: bpf: Add sockptr support for setsockopt

The whole network stack uses sockptr, and while it doesn't move to
something more modern, let's use sockptr in setsockptr BPF hooks, so, it
could be used by other callers.

The main motivation for this change is to use it in the io_uring
{g,s}etsockopt(), which will use a userspace pointer for *optval, but, a
kernel value for optlen.

Link: https://lore.kernel.org/all/ZSArfLaaGcfd8LH8@gmail.com/

Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20231016134750.1381153-3-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bpf-cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index f5b4fb6ed8c6..cecfe8c99f28 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -137,7 +137,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
-				       int *optname, char __user *optval,
+				       int *optname, sockptr_t optval,
 				       int *optlen, char **kernel_optval);
 
 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
-- 
cgit v1.2.3


From 0b05b0cd78c92371fdde6333d006f39eaf9e0860 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 16 Oct 2023 06:47:42 -0700
Subject: net/socket: Break down __sys_getsockopt

Split __sys_getsockopt() into two functions by removing the core
logic into a sub-function (do_sock_getsockopt()). This will avoid
code duplication when doing the same operation in other callers, for
instance.

do_sock_getsockopt() will be called by io_uring getsockopt() command
operation in the following patch.

The same was done for the setsockopt pair.

Suggested-by: Martin KaFai Lau <martin.lau@linux.dev>
Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20231016134750.1381153-5-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bpf-cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index cecfe8c99f28..ffaca1ab5e8d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -378,7 +378,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT))			       \
-		get_user(__ret, optlen);				       \
+		copy_from_sockptr(&__ret, optlen, sizeof(int));		       \
 	__ret;								       \
 })
 
-- 
cgit v1.2.3


From 5fea44a6e05b86bf49019fbbf2ab30098d03e0dc Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 16 Oct 2023 06:47:43 -0700
Subject: io_uring/cmd: Pass compat mode in issue_flags

Create a new flag to track if the operation is running compat mode.
This basically check the context->compat and pass it to the issue_flags,
so, it could be queried later in the callbacks.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Link: https://lore.kernel.org/r/20231016134750.1381153-6-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index b4391e0a9bc8..aefb73eeeebf 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -23,6 +23,7 @@ enum io_uring_cmd_flags {
 
 	/* set when uring wants to cancel a previously issued command */
 	IO_URING_F_CANCEL		= (1 << 11),
+	IO_URING_F_COMPAT		= (1 << 12),
 };
 
 /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
-- 
cgit v1.2.3


From 6da88306811b40a207c94c9da9faf07bdb20776e Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Wed, 18 Oct 2023 14:17:39 +0800
Subject: cgroup: Prepare for using css_task_iter_*() in BPF

This patch makes some preparations for using css_task_iter_*() in BPF
Program.

1. Flags CSS_TASK_ITER_* are #define-s and it's not easy for bpf prog to
use them. Convert them to enum so bpf prog can take them from vmlinux.h.

2. In the next patch we will add css_task_iter_*() in common kfuncs which
is not safe. Since css_task_iter_*() does spin_unlock_irq() which might
screw up irq flags depending on the context where bpf prog is running.
So we should use irqsave/irqrestore here and the switching is harmless.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20231018061746.111364-2-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/cgroup.h | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b307013b9c6c..0ef0af66080e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -40,13 +40,11 @@ struct kernel_clone_args;
 #define CGROUP_WEIGHT_DFL		100
 #define CGROUP_WEIGHT_MAX		10000
 
-/* walk only threadgroup leaders */
-#define CSS_TASK_ITER_PROCS		(1U << 0)
-/* walk all threaded css_sets in the domain */
-#define CSS_TASK_ITER_THREADED		(1U << 1)
-
-/* internal flags */
-#define CSS_TASK_ITER_SKIPPED		(1U << 16)
+enum {
+	CSS_TASK_ITER_PROCS    = (1U << 0),  /* walk only threadgroup leaders */
+	CSS_TASK_ITER_THREADED = (1U << 1),  /* walk all threaded css_sets in the domain */
+	CSS_TASK_ITER_SKIPPED  = (1U << 16), /* internal flags */
+};
 
 /* a css_task_iter should be treated as an opaque object */
 struct css_task_iter {
-- 
cgit v1.2.3


From dfab99df147b0d364f0c199f832ff2aedfb2265a Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Wed, 18 Oct 2023 14:17:43 +0800
Subject: bpf: teach the verifier to enforce css_iter and task_iter in RCU CS

css_iter and task_iter should be used in rcu section. Specifically, in
sleepable progs explicit bpf_rcu_read_lock() is needed before use these
iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
use them directly.

This patch adds a new a KF flag KF_RCU_PROTECTED for bpf_iter_task_new and
bpf_iter_css_new. It means the kfunc should be used in RCU CS. We check
whether we are in rcu cs before we want to invoke this kfunc. If the rcu
protection is guaranteed, we would let st->type = PTR_TO_STACK | MEM_RCU.
Once user do rcu_unlock during the iteration, state MEM_RCU of regs would
be cleared. is_iter_reg_valid_init() will reject if reg->type is UNTRUSTED.

It is worth noting that currently, bpf_rcu_read_unlock does not
clear the state of the STACK_ITER reg, since bpf_for_each_spilled_reg
only considers STACK_SPILL. This patch also let bpf_for_each_spilled_reg
search STACK_ITER.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231018061746.111364-6-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 19 +++++++++++--------
 include/linux/btf.h          |  1 +
 2 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 94ec766432f5..e67cd45a85be 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -386,19 +386,18 @@ struct bpf_verifier_state {
 	u32 jmp_history_cnt;
 };
 
-#define bpf_get_spilled_reg(slot, frame)				\
+#define bpf_get_spilled_reg(slot, frame, mask)				\
 	(((slot < frame->allocated_stack / BPF_REG_SIZE) &&		\
-	  (frame->stack[slot].slot_type[0] == STACK_SPILL))		\
+	  ((1 << frame->stack[slot].slot_type[0]) & (mask))) \
 	 ? &frame->stack[slot].spilled_ptr : NULL)
 
 /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */
-#define bpf_for_each_spilled_reg(iter, frame, reg)			\
-	for (iter = 0, reg = bpf_get_spilled_reg(iter, frame);		\
+#define bpf_for_each_spilled_reg(iter, frame, reg, mask)			\
+	for (iter = 0, reg = bpf_get_spilled_reg(iter, frame, mask);		\
 	     iter < frame->allocated_stack / BPF_REG_SIZE;		\
-	     iter++, reg = bpf_get_spilled_reg(iter, frame))
+	     iter++, reg = bpf_get_spilled_reg(iter, frame, mask))
 
-/* Invoke __expr over regsiters in __vst, setting __state and __reg */
-#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr)   \
+#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr)   \
 	({                                                               \
 		struct bpf_verifier_state *___vstate = __vst;            \
 		int ___i, ___j;                                          \
@@ -410,7 +409,7 @@ struct bpf_verifier_state {
 				__reg = &___regs[___j];                  \
 				(void)(__expr);                          \
 			}                                                \
-			bpf_for_each_spilled_reg(___j, __state, __reg) { \
+			bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \
 				if (!__reg)                              \
 					continue;                        \
 				(void)(__expr);                          \
@@ -418,6 +417,10 @@ struct bpf_verifier_state {
 		}                                                        \
 	})
 
+/* Invoke __expr over regsiters in __vst, setting __state and __reg */
+#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \
+	bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr)
+
 /* linked list of verifier states used to prune search */
 struct bpf_verifier_state_list {
 	struct bpf_verifier_state state;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 928113a80a95..c2231c64d60b 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -74,6 +74,7 @@
 #define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
 #define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
 #define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */
+#define KF_RCU_PROTECTED (1 << 11) /* kfunc should be protected by rcu cs when they are invoked */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
-- 
cgit v1.2.3


From 70e6f7e2b98575621019aa40ac616be58ff984e0 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 25 Sep 2023 20:13:29 -0700
Subject: configfs-tsm: Introduce a shared ABI for attestation reports

One of the common operations of a TSM (Trusted Security Module) is to
provide a way for a TVM (confidential computing guest execution
environment) to take a measurement of its launch state, sign it and
submit it to a verifying party. Upon successful attestation that
verifies the integrity of the TVM additional secrets may be deployed.
The concept is common across TSMs, but the implementations are
unfortunately vendor specific. While the industry grapples with a common
definition of this attestation format [1], Linux need not make this
problem worse by defining a new ABI per TSM that wants to perform a
similar operation. The current momentum has been to invent new ioctl-ABI
per TSM per function which at best is an abdication of the kernel's
responsibility to make common infrastructure concepts share common ABI.

The proposal, targeted to conceptually work with TDX, SEV-SNP, COVE if
not more, is to define a configfs interface to retrieve the TSM-specific
blob.

    report=/sys/kernel/config/tsm/report/report0
    mkdir $report
    dd if=binary_userdata_plus_nonce > $report/inblob
    hexdump $report/outblob

This approach later allows for the standardization of the attestation
blob format without needing to invent a new ABI. Once standardization
happens the standard format can be emitted by $report/outblob and
indicated by $report/provider, or a new attribute like
"$report/tcg_coco_report" can emit the standard format alongside the
vendor format.

Review of previous iterations of this interface identified that there is
a need to scale report generation for multiple container environments
[2]. Configfs enables a model where each container can bind mount one or
more report generation item instances. Still, within a container only a
single thread can be manipulating a given configuration instance at a
time. A 'generation' count is provided to detect conflicts between
multiple threads racing to configure a report instance.

The SEV-SNP concepts of "extended reports" and "privilege levels" are
optionally enabled by selecting 'tsm_report_ext_type' at register_tsm()
time. The expectation is that those concepts are generic enough that
they may be adopted by other TSM implementations. In other words,
configfs-tsm aims to address a superset of TSM specific functionality
with a common ABI where attributes may appear, or not appear, based on
the set of concepts the implementation supports.

Link: http://lore.kernel.org/r/64961c3baf8ce_142af829436@dwillia2-xfh.jf.intel.com.notmuch [1]
Link: http://lore.kernel.org/r/57f3a05e-8fcd-4656-beea-56bb8365ae64@linux.microsoft.com [2]
Cc: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Cc: Dionna Amalie Glaze <dionnaglaze@google.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Peter Gonda <pgonda@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Samuel Ortiz <sameo@rivosinc.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Tested-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/tsm.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 include/linux/tsm.h

(limited to 'include/linux')

diff --git a/include/linux/tsm.h b/include/linux/tsm.h
new file mode 100644
index 000000000000..de8324a2223c
--- /dev/null
+++ b/include/linux/tsm.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TSM_H
+#define __TSM_H
+
+#include <linux/sizes.h>
+#include <linux/types.h>
+
+#define TSM_INBLOB_MAX 64
+#define TSM_OUTBLOB_MAX SZ_32K
+
+/*
+ * Privilege level is a nested permission concept to allow confidential
+ * guests to partition address space, 4-levels are supported.
+ */
+#define TSM_PRIVLEVEL_MAX 3
+
+/**
+ * struct tsm_desc - option descriptor for generating tsm report blobs
+ * @privlevel: optional privilege level to associate with @outblob
+ * @inblob_len: sizeof @inblob
+ * @inblob: arbitrary input data
+ */
+struct tsm_desc {
+	unsigned int privlevel;
+	size_t inblob_len;
+	u8 inblob[TSM_INBLOB_MAX];
+};
+
+/**
+ * struct tsm_report - track state of report generation relative to options
+ * @desc: input parameters to @report_new()
+ * @outblob_len: sizeof(@outblob)
+ * @outblob: generated evidence to provider to the attestation agent
+ * @auxblob_len: sizeof(@auxblob)
+ * @auxblob: (optional) auxiliary data to the report (e.g. certificate data)
+ */
+struct tsm_report {
+	struct tsm_desc desc;
+	size_t outblob_len;
+	u8 *outblob;
+	size_t auxblob_len;
+	u8 *auxblob;
+};
+
+/**
+ * struct tsm_ops - attributes and operations for tsm instances
+ * @name: tsm id reflected in /sys/kernel/config/tsm/report/$report/provider
+ * @privlevel_floor: convey base privlevel for nested scenarios
+ * @report_new: Populate @report with the report blob and auxblob
+ * (optional), return 0 on successful population, or -errno otherwise
+ *
+ * Implementation specific ops, only one is expected to be registered at
+ * a time i.e. only one of "sev-guest", "tdx-guest", etc.
+ */
+struct tsm_ops {
+	const char *name;
+	const unsigned int privlevel_floor;
+	int (*report_new)(struct tsm_report *report, void *data);
+};
+
+extern const struct config_item_type tsm_report_default_type;
+
+/* publish @privlevel, @privlevel_floor, and @auxblob attributes */
+extern const struct config_item_type tsm_report_extra_type;
+
+int tsm_register(const struct tsm_ops *ops, void *priv,
+		 const struct config_item_type *type);
+int tsm_unregister(const struct tsm_ops *ops);
+#endif /* __TSM_H */
-- 
cgit v1.2.3


From a67d74a4b163878a3c0537033ed1b20db92ebfc5 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 19 Jul 2023 18:14:13 -0700
Subject: mm/slab: Add __free() support for kvfree

Allow for the declaration of variables that trigger kvfree() when they
go out of scope. The check for NULL and call to kvfree() can be elided
by the compiler in most cases, otherwise without the NULL check an
unnecessary call to kvfree() may be emitted. Peter proposed a comment
for this detail [1].

Link: http://lore.kernel.org/r/20230816103102.GF980931@hirez.programming.kicks-ass.net [1]
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Pankaj Gupta <pankaj.gupta@amd.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Tested-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/slab.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 8228d1276a2f..df4c2d45bb86 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -763,6 +763,8 @@ static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t fla
 extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
 		      __realloc_size(3);
 extern void kvfree(const void *addr);
+DEFINE_FREE(kvfree, void *, if (_T) kvfree(_T))
+
 extern void kvfree_sensitive(const void *addr, size_t len);
 
 unsigned int kmem_cache_size(struct kmem_cache *s);
-- 
cgit v1.2.3


From 16ab7cb5825fc3425c16ad2c6e53d827f382d7c6 Mon Sep 17 00:00:00 2001
From: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Date: Tue, 10 Oct 2023 22:22:38 +0100
Subject: crypto: pkcs7 - remove sha1 support

Removes support for sha1 signed kernel modules, importing sha1 signed
x.509 certificates.

rsa-pkcs1pad keeps sha1 padding support, which seems to be used by
virtio driver.

sha1 remains available as there are many drivers and subsystems using
it. Note only hmac(sha1) with secret keys remains cryptographically
secure.

In the kernel there are filesystems, IMA, tpm/pcr that appear to be
using sha1. Maybe they can all start to be slowly upgraded to
something else i.e. blake3, ParallelHash, SHAKE256 as needed.

Signed-off-by: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/oid_registry.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 4d04fa5d1eec..8b79e55cfcec 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -17,12 +17,10 @@
  *	  build_OID_registry.pl to generate the data for look_up_OID().
  */
 enum OID {
-	OID_id_dsa_with_sha1,		/* 1.2.840.10030.4.3 */
 	OID_id_dsa,			/* 1.2.840.10040.4.1 */
 	OID_id_ecPublicKey,		/* 1.2.840.10045.2.1 */
 	OID_id_prime192v1,		/* 1.2.840.10045.3.1.1 */
 	OID_id_prime256v1,		/* 1.2.840.10045.3.1.7 */
-	OID_id_ecdsa_with_sha1,		/* 1.2.840.10045.4.1 */
 	OID_id_ecdsa_with_sha224,	/* 1.2.840.10045.4.3.1 */
 	OID_id_ecdsa_with_sha256,	/* 1.2.840.10045.4.3.2 */
 	OID_id_ecdsa_with_sha384,	/* 1.2.840.10045.4.3.3 */
@@ -30,7 +28,6 @@ enum OID {
 
 	/* PKCS#1 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-1(1)} */
 	OID_rsaEncryption,		/* 1.2.840.113549.1.1.1 */
-	OID_sha1WithRSAEncryption,	/* 1.2.840.113549.1.1.5 */
 	OID_sha256WithRSAEncryption,	/* 1.2.840.113549.1.1.11 */
 	OID_sha384WithRSAEncryption,	/* 1.2.840.113549.1.1.12 */
 	OID_sha512WithRSAEncryption,	/* 1.2.840.113549.1.1.13 */
@@ -67,7 +64,6 @@ enum OID {
 	OID_PKU2U,			/* 1.3.5.1.5.2.7 */
 	OID_Scram,			/* 1.3.6.1.5.5.14 */
 	OID_certAuthInfoAccess,		/* 1.3.6.1.5.5.7.1.1 */
-	OID_sha1,			/* 1.3.14.3.2.26 */
 	OID_id_ansip384r1,		/* 1.3.132.0.34 */
 	OID_sha256,			/* 2.16.840.1.101.3.4.2.1 */
 	OID_sha384,			/* 2.16.840.1.101.3.4.2.2 */
-- 
cgit v1.2.3


From 5acab6eb592387191c1bb745ba9b815e1e076db5 Mon Sep 17 00:00:00 2001
From: Longfang Liu <liulongfang@huawei.com>
Date: Fri, 13 Oct 2023 11:49:57 +0800
Subject: crypto: hisilicon/qm - fix EQ/AEQ interrupt issue

During hisilicon accelerator live migration operation. In order to
prevent the problem of EQ/AEQ interrupt loss. Migration driver will
trigger an EQ/AEQ doorbell at the end of the migration.

This operation may cause double interruption of EQ/AEQ events.
To ensure that the EQ/AEQ interrupt processing function is normal.
The interrupt handling functionality of EQ/AEQ needs to be updated.
Used to handle repeated interrupts event.

Fixes: b0eed085903e ("hisi_acc_vfio_pci: Add support for VFIO live migration")
Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/hisi_acc_qm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 44e0c44a2e20..ddc7ebb70523 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -276,6 +276,7 @@ struct hisi_qm_poll_data {
 	struct hisi_qm *qm;
 	struct work_struct work;
 	u16 *qp_finish_id;
+	u16 eqe_num;
 };
 
 /**
-- 
cgit v1.2.3


From eeb9f34df065f42f0c9195b322ba6df420c9fc92 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Mon, 4 Sep 2023 22:04:53 -0700
Subject: x86/srso: Fix unret validation dependencies

CONFIG_CPU_SRSO isn't dependent on CONFIG_CPU_UNRET_ENTRY (AMD
Retbleed), so the two features are independently configurable.  Fix
several issues for the (presumably rare) case where CONFIG_CPU_SRSO is
enabled but CONFIG_CPU_UNRET_ENTRY isn't.

Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation")
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/299fb7740174d0f2335e91c58af0e9c242b4bac1.1693889988.git.jpoimboe@kernel.org
---
 include/linux/objtool.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 03f82c2c2ebf..b5440e7da55b 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -130,7 +130,8 @@
  * it will be ignored.
  */
 .macro VALIDATE_UNRET_BEGIN
-#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY)
+#if defined(CONFIG_NOINSTR_VALIDATION) && \
+	(defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO))
 .Lhere_\@:
 	.pushsection .discard.validate_unret
 	.long	.Lhere_\@ - .
-- 
cgit v1.2.3


From f600bb612b06adf70ccdefbd3294c71275b650c2 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Thu, 19 Oct 2023 13:14:58 +0200
Subject: net: dsa: microchip: ksz8: Enable MIIM PHY Control reg access

Provide access to MIIM PHY Control register (Reg. 31) through
ksz8_r_phy_ctrl() and ksz8_w_phy_ctrl() functions. Necessary for
upcoming micrel.c patch to address forced link mode configuration.

Closes: https://lore.kernel.org/oe-kbuild-all/202310112224.iYgvjBUy-lkp@intel.com/
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/micrel_phy.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 4e27ca7c49de..591bf5b5e8dc 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -64,6 +64,10 @@
 #define KSZ886X_BMCR_DISABLE_TRANSMIT		BIT(1)
 #define KSZ886X_BMCR_DISABLE_LED		BIT(0)
 
+/* PHY Special Control/Status Register (Reg 31) */
 #define KSZ886X_CTRL_MDIX_STAT			BIT(4)
+#define KSZ886X_CTRL_FORCE_LINK			BIT(3)
+#define KSZ886X_CTRL_PWRSAVE			BIT(2)
+#define KSZ886X_CTRL_REMOTE_LOOPBACK		BIT(1)
 
 #endif /* _MICREL_PHY_H */
-- 
cgit v1.2.3


From 20c6e05bd33deaa6fa890252d7ffc5ad54a0942c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 19 Oct 2023 08:28:15 -0700
Subject: ethtool: untangle the linkmode and ethtool headers

Commit 26c5334d344d ("ethtool: Add forced speed to supported link
modes maps") added a dependency between ethtool.h and linkmode.h.
The dependency in the opposite direction already exists so the
new code was inserted in an awkward place.

The reason for ethtool.h to include linkmode.h, is that
ethtool_forced_speed_maps_init() is a static inline helper.
That's not really necessary.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Paul Greenwalt <paul.greenwalt@intel.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h  | 22 ++--------------------
 include/linux/linkmode.h | 29 ++++++++++++++---------------
 2 files changed, 16 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 8e91e8b8a693..226a36ed5aa1 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -13,7 +13,6 @@
 #ifndef _LINUX_ETHTOOL_H
 #define _LINUX_ETHTOOL_H
 
-#include <linux/linkmode.h>
 #include <linux/bitmap.h>
 #include <linux/compat.h>
 #include <linux/if_ether.h>
@@ -1070,23 +1069,6 @@ struct ethtool_forced_speed_map {
 	.arr_size	= ARRAY_SIZE(prefix##_##value),			\
 }
 
-/**
- * ethtool_forced_speed_maps_init
- * @maps: Pointer to an array of Ethtool forced speed map
- * @size: Array size
- *
- * Initialize an array of Ethtool forced speed map to Ethtool link modes. This
- * should be called during driver module init.
- */
-static inline void
-ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size)
-{
-	for (u32 i = 0; i < size; i++) {
-		struct ethtool_forced_speed_map *map = &maps[i];
-
-		linkmode_set_bit_array(map->cap_arr, map->arr_size, map->caps);
-		map->cap_arr = NULL;
-		map->arr_size = 0;
-	}
-}
+void
+ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size);
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
index cd38f89553e6..7303b4bc2ce0 100644
--- a/include/linux/linkmode.h
+++ b/include/linux/linkmode.h
@@ -2,21 +2,6 @@
 #define __LINKMODE_H
 
 #include <linux/bitmap.h>
-
-static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
-{
-	__set_bit(nr, addr);
-}
-
-static inline void linkmode_set_bit_array(const int *array, int array_size,
-					  unsigned long *addr)
-{
-	int i;
-
-	for (i = 0; i < array_size; i++)
-		linkmode_set_bit(array[i], addr);
-}
-
 #include <linux/ethtool.h>
 #include <uapi/linux/ethtool.h>
 
@@ -53,6 +38,11 @@ static inline int linkmode_andnot(unsigned long *dst, const unsigned long *src1,
 	return bitmap_andnot(dst, src1, src2,  __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
+static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
+{
+	__set_bit(nr, addr);
+}
+
 static inline void linkmode_clear_bit(int nr, volatile unsigned long *addr)
 {
 	__clear_bit(nr, addr);
@@ -72,6 +62,15 @@ static inline int linkmode_test_bit(int nr, const volatile unsigned long *addr)
 	return test_bit(nr, addr);
 }
 
+static inline void linkmode_set_bit_array(const int *array, int array_size,
+					  unsigned long *addr)
+{
+	int i;
+
+	for (i = 0; i < array_size; i++)
+		linkmode_set_bit(array[i], addr);
+}
+
 static inline int linkmode_equal(const unsigned long *src1,
 				 const unsigned long *src2)
 {
-- 
cgit v1.2.3


From d0ed46b60396cfa7e0056f55e1ce0b43c7db57b6 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 20 Oct 2023 04:35:45 +0100
Subject: tracing: Move readpos from seq_buf to trace_seq

To make seq_buf more lightweight as a string buf, move the readpos member
from seq_buf to its container, trace_seq.  That puts the responsibility
of maintaining the readpos entirely in the tracing code.  If some future
users want to package up the readpos with a seq_buf, we can define a
new struct then.

Link: https://lore.kernel.org/linux-trace-kernel/20231020033545.2587554-2-willy@infradead.org

Cc: Kees Cook <keescook@chromium.org>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h   | 5 +----
 include/linux/trace_seq.h | 2 ++
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 515d7fcb9634..a0fb013cebdf 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -14,19 +14,16 @@
  * @buffer:	pointer to the buffer
  * @size:	size of the buffer
  * @len:	the amount of data inside the buffer
- * @readpos:	The next position to read in the buffer.
  */
 struct seq_buf {
 	char			*buffer;
 	size_t			size;
 	size_t			len;
-	loff_t			readpos;
 };
 
 static inline void seq_buf_clear(struct seq_buf *s)
 {
 	s->len = 0;
-	s->readpos = 0;
 }
 
 static inline void
@@ -143,7 +140,7 @@ extern __printf(2, 0)
 int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args);
 extern int seq_buf_print_seq(struct seq_file *m, struct seq_buf *s);
 extern int seq_buf_to_user(struct seq_buf *s, char __user *ubuf,
-			   int cnt);
+			   size_t start, int cnt);
 extern int seq_buf_puts(struct seq_buf *s, const char *str);
 extern int seq_buf_putc(struct seq_buf *s, unsigned char c);
 extern int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len);
diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 6be92bf559fe..3691e0e76a1a 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -14,6 +14,7 @@
 struct trace_seq {
 	char			buffer[PAGE_SIZE];
 	struct seq_buf		seq;
+	size_t			readpos;
 	int			full;
 };
 
@@ -22,6 +23,7 @@ trace_seq_init(struct trace_seq *s)
 {
 	seq_buf_init(&s->seq, s->buffer, PAGE_SIZE);
 	s->full = 0;
+	s->readpos = 0;
 }
 
 /**
-- 
cgit v1.2.3


From 8c35b1f472533b0df1b8f1f1afcaf4395cdb2256 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 12 Oct 2023 20:34:50 +0200
Subject: thermal: core: Pass trip pointer to governor throttle callback

Modify the governor .throttle() callback definition so that it takes a
trip pointer instead of a trip index as its second argument, adjust the
governors accordingly and update the core code invoking .throttle().

This causes the governors to become independent of the representation
of the list of trips in the thermal zone structure.

This change is not expected to alter the general functionality.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 include/linux/thermal.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index c8600e313909..cee814d5d1ac 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -199,7 +199,8 @@ struct thermal_governor {
 	char name[THERMAL_NAME_LENGTH];
 	int (*bind_to_tz)(struct thermal_zone_device *tz);
 	void (*unbind_from_tz)(struct thermal_zone_device *tz);
-	int (*throttle)(struct thermal_zone_device *tz, int trip);
+	int (*throttle)(struct thermal_zone_device *tz,
+			const struct thermal_trip *trip);
 	struct list_head	governor_list;
 };
 
-- 
cgit v1.2.3


From 845e31e1101fc8533be52aff42d8f1ff48636024 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Fri, 20 Oct 2023 14:38:49 -0600
Subject: seq_buf: fix a misleading comment

The comment for seq_buf_has_overflowed() says that an overflow condition is
marked by len == size, but that's not what the code is testing.  Make the
comment match reality.

Link: https://lkml.kernel.org/r/87pm19kp0m.fsf@meer.lwn.net

Fixes: 8cd709ae7658a ("tracing: Have seq_buf use full buffer")
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index a0fb013cebdf..8483e4b2d0d2 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -36,7 +36,7 @@ seq_buf_init(struct seq_buf *s, char *buf, unsigned int size)
 
 /*
  * seq_buf have a buffer that might overflow. When this happens
- * the len and size are set to be equal.
+ * len is set to be greater than size.
  */
 static inline bool
 seq_buf_has_overflowed(struct seq_buf *s)
-- 
cgit v1.2.3


From b460bc8302f222d346f0c15bba980eb8c36d6278 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:31:57 +0800
Subject: mm/percpu.c: introduce pcpu_alloc_size()

Introduce pcpu_alloc_size() to get the size of the dynamic per-cpu
area. It will be used by bpf memory allocator in the following patches.
BPF memory allocator maintains per-cpu area caches for multiple area
sizes and its free API only has the to-be-freed per-cpu pointer, so it
needs the size of dynamic per-cpu area to select the corresponding cache
when bpf program frees the dynamic per-cpu pointer.

Acked-by: Dennis Zhou <dennis@kernel.org>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/percpu.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 68fac2e7cbe6..8c677f185901 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -132,6 +132,7 @@ extern void __init setup_per_cpu_areas(void);
 extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
 extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
 extern void free_percpu(void __percpu *__pdata);
+extern size_t pcpu_alloc_size(void __percpu *__pdata);
 
 DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T))
 
-- 
cgit v1.2.3


From 3f2189e4f77b7a3e979d143dc4ff586488c7e8a5 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:31:59 +0800
Subject: bpf: Use pcpu_alloc_size() in bpf_mem_free{_rcu}()

For bpf_global_percpu_ma, the pointer passed to bpf_mem_free_rcu() is
allocated by kmalloc() and its size is fixed (16-bytes on x86-64). So
no matter which cache allocates the dynamic per-cpu area, on x86-64
cache[2] will always be used to free the per-cpu area.

Fix the unbalance by checking whether the bpf memory allocator is
per-cpu or not and use pcpu_alloc_size() instead of ksize() to
find the correct cache for per-cpu free.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-5-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index d644bbb298af..bb1223b21308 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -11,6 +11,7 @@ struct bpf_mem_caches;
 struct bpf_mem_alloc {
 	struct bpf_mem_caches __percpu *caches;
 	struct bpf_mem_cache __percpu *cache;
+	bool percpu;
 	struct work_struct work;
 };
 
-- 
cgit v1.2.3


From e581a3461de3f129cfe888a67d9f31086328271f Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:32:00 +0800
Subject: bpf: Move the declaration of __bpf_obj_drop_impl() to bpf.h

both syscall.c and helpers.c have the declaration of
__bpf_obj_drop_impl(), so just move it to a common header file.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-6-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4b40b45962b..ebd412179771 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2058,6 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
 
 struct bpf_map *bpf_map_get(u32 ufd);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
-- 
cgit v1.2.3


From e383a45902337356d9ccad797094a27c6b2150f9 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:32:01 +0800
Subject: bpf: Use bpf_global_percpu_ma for per-cpu kptr in
 __bpf_obj_drop_impl()

The following warning was reported when running "./test_progs -t
test_bpf_ma/percpu_free_through_map_free":

  ------------[ cut here ]------------
  WARNING: CPU: 1 PID: 68 at kernel/bpf/memalloc.c:342
  CPU: 1 PID: 68 Comm: kworker/u16:2 Not tainted 6.6.0-rc2+ #222
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
  Workqueue: events_unbound bpf_map_free_deferred
  RIP: 0010:bpf_mem_refill+0x21c/0x2a0
  ......
  Call Trace:
   <IRQ>
   ? bpf_mem_refill+0x21c/0x2a0
   irq_work_single+0x27/0x70
   irq_work_run_list+0x2a/0x40
   irq_work_run+0x18/0x40
   __sysvec_irq_work+0x1c/0xc0
   sysvec_irq_work+0x73/0x90
   </IRQ>
   <TASK>
   asm_sysvec_irq_work+0x1b/0x20
  RIP: 0010:unit_free+0x50/0x80
   ......
   bpf_mem_free+0x46/0x60
   __bpf_obj_drop_impl+0x40/0x90
   bpf_obj_free_fields+0x17d/0x1a0
   array_map_free+0x6b/0x170
   bpf_map_free_deferred+0x54/0xa0
   process_scheduled_works+0xba/0x370
   worker_thread+0x16d/0x2e0
   kthread+0x105/0x140
   ret_from_fork+0x39/0x60
   ret_from_fork_asm+0x1b/0x30
   </TASK>
  ---[ end trace 0000000000000000 ]---

The reason is simple: __bpf_obj_drop_impl() does not know the freeing
field is a per-cpu pointer and it uses bpf_global_ma to free the
pointer. Because bpf_global_ma is not a per-cpu allocator, so ksize() is
used to select the corresponding cache. The bpf_mem_cache with 16-bytes
unit_size will always be selected to do the unmatched free and it will
trigger the warning in free_bulk() eventually.

Because per-cpu kptr doesn't support list or rb-tree now, so fix the
problem by only checking whether or not the type of kptr is per-cpu in
bpf_obj_free_fields(), and using bpf_global_percpu_ma to these kptrs.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-7-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ebd412179771..b4825d3cdb29 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2058,7 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
 
 struct bpf_map *bpf_map_get(u32 ufd);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
-- 
cgit v1.2.3


From 1fa05877588a02fdad995385eaafea4798022cd4 Mon Sep 17 00:00:00 2001
From: Umang Jain <umang.jain@ideasonboard.com>
Date: Thu, 19 Oct 2023 14:31:26 +0530
Subject: staging: vc04_services: Support module autoloading using
 MODULE_DEVICE_TABLE

VC04 has now a independent bus vchiq_bus to register its devices.
However, the module auto-loading for bcm2835-audio and bcm2835-camera
currently happens through MODULE_ALIAS() macro specified explicitly.

The correct way to auto-load a module, is when the alias is picked
out from MODULE_DEVICE_TABLE(). In order to get there, we need to
introduce  vchiq_device_id and add relevant entries in file2alias.c
infrastructure so that aliases can be generated. This patch targets
adding vchiq_device_id and do_vchiq_entry, in order to
generate those alias using the /script/mod/file2alias.c.

Going forward the MODULE_ALIAS() from bcm2835-camera and bcm2835-audio
will be dropped, in favour of MODULE_DEVICE_TABLE being used there.

The alias format for vchiq_bus devices will be "vchiq:<dev_name>".
Adjust the vchiq_bus_uevent() to reflect that.

Signed-off-by: Umang Jain <umang.jain@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Reviewed-by: Kieran Bingham <kieran.bingham@ideasonboard.com>
Link: https://lore.kernel.org/r/20231019090128.430297-2-umang.jain@ideasonboard.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mod_devicetable.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index b0678b093cb2..700cbf34c3b8 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -946,4 +946,8 @@ struct cdx_device_id {
 	__u32 override_only;
 };
 
+struct vchiq_device_id {
+	char name[32];
+};
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
-- 
cgit v1.2.3


From 2978cc1f285390c1bd4d9bfc665747adc6e4b19c Mon Sep 17 00:00:00 2001
From: Tomer Maimon <tmaimon77@gmail.com>
Date: Tue, 17 Oct 2023 22:59:01 +0300
Subject: usb: chipidea: add CI_HDRC_FORCE_VBUS_ACTIVE_ALWAYS flag

Adding CI_HDRC_FORCE_VBUS_ACTIVE_ALWAYS flag to modify the vbus_active
parameter to active in case the ChipIdea USB IP role is device-only and
there is no otgsc register.

Signed-off-by: Tomer Maimon <tmaimon77@gmail.com>
Acked-by: Peter Chen <peter.chen@kernel.org>
Link: https://lore.kernel.org/r/20231017195903.1665260-2-tmaimon77@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/chipidea.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index 0b4f2d5faa08..5a7f96684ea2 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -64,6 +64,7 @@ struct ci_hdrc_platform_data {
 #define CI_HDRC_PMQOS			BIT(15)
 #define CI_HDRC_PHY_VBUS_CONTROL	BIT(16)
 #define CI_HDRC_HAS_PORTSC_PEC_MISSED	BIT(17)
+#define CI_HDRC_FORCE_VBUS_ACTIVE_ALWAYS	BIT(18)
 	enum usb_dr_mode	dr_mode;
 #define CI_HDRC_CONTROLLER_RESET_EVENT		0
 #define CI_HDRC_CONTROLLER_STOPPED_EVENT	1
-- 
cgit v1.2.3


From 2cc3b37f5b6df8189d55d0e812d9658ce256dfec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Fri, 20 Oct 2023 11:55:41 +0100
Subject: nvmem: add explicit config option to read old syntax fixed OF cells
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Binding for fixed NVMEM cells defined directly as NVMEM device subnodes
has been deprecated. It has been replaced by the "fixed-layout" NVMEM
layout binding.

New syntax is meant to be clearer and should help avoiding imprecise
bindings.

NVMEM subsystem already supports the new binding. It should be a good
idea to limit support for old syntax to existing drivers that actually
support & use it (we can't break backward compatibility!). That way we
additionally encourage new bindings & drivers to ignore deprecated
binding.

It wasn't clear (to me) if rtc and w1 code actually uses old syntax
fixed cells. I enabled them to don't risk any breakage.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
[for meson-{efuse,mx-efuse}.c]
Acked-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
[for mtk-efuse.c, nvmem/core.c, nvmem-provider.h]
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
[MT8192, MT8195 Chromebooks]
Tested-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
[for microchip-otpc.c]
Reviewed-by: Claudiu Beznea <claudiu.beznea@microchip.com>
[SAMA7G5-EK]
Tested-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231020105545.216052-3-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/nvmem-provider.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index dae26295e6be..1b81adebdb8b 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -82,6 +82,7 @@ struct nvmem_cell_info {
  * @owner:	Pointer to exporter module. Used for refcounting.
  * @cells:	Optional array of pre-defined NVMEM cells.
  * @ncells:	Number of elements in cells.
+ * @add_legacy_fixed_of_cells:	Read fixed NVMEM cells from old OF syntax.
  * @keepout:	Optional array of keepout ranges (sorted ascending by start).
  * @nkeepout:	Number of elements in the keepout array.
  * @type:	Type of the nvmem storage
@@ -112,6 +113,7 @@ struct nvmem_config {
 	struct module		*owner;
 	const struct nvmem_cell_info	*cells;
 	int			ncells;
+	bool			add_legacy_fixed_of_cells;
 	const struct nvmem_keepout *keepout;
 	unsigned int		nkeepout;
 	enum nvmem_type		type;
-- 
cgit v1.2.3


From 85e95ca7cc48c23f772387b069d794f69116192b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Sat, 13 Nov 2021 19:49:14 -0500
Subject: bcachefs: Update export_operations for snapshots

When support for snapshots was merged, export operations weren't
updated yet. This patch adds new filehandle types for bcachefs that
include the subvolume ID and updates export operations for subvolumes -
and also .get_parent, support for which was added just prior to
snapshots.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 include/linux/exportfs.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 11fbd0ee1370..f49a7d31167e 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -98,6 +98,12 @@ enum fid_type {
 	 */
 	FILEID_FAT_WITH_PARENT = 0x72,
 
+	/*
+	 * 64 bit inode number, 32 bit subvolume, 32 bit generation number:
+	 */
+	FILEID_BCACHEFS_WITHOUT_PARENT = 0x80,
+	FILEID_BCACHEFS_WITH_PARENT = 0x81,
+
 	/*
 	 * 128 bit child FID (struct lu_fid)
 	 * 128 bit parent FID (struct lu_fid)
-- 
cgit v1.2.3


From 3d44de9a10ea2b1658dfaed8ea6d3d7b6e0defbb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:45 +0000
Subject: tcp: add RTAX_FEATURE_TCP_USEC_TS

This new dst feature flag will be used to allow TCP to use usec
based timestamps instead of msec ones.

ip route .... feature tcp_usec_ts

Also document that RTAX_FEATURE_SACK and RTAX_FEATURE_TIMESTAMP
are unused.

RTAX_FEATURE_ALLFRAG is also going away soon.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index e15452df9804..04a0e647ef74 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -576,4 +576,9 @@ void tcp_sock_set_quickack(struct sock *sk, int val);
 int tcp_sock_set_syncnt(struct sock *sk, int val);
 int tcp_sock_set_user_timeout(struct sock *sk, int val);
 
+static inline bool dst_tcp_usec_ts(const struct dst_entry *dst)
+{
+	return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS);
+}
+
 #endif	/* _LINUX_TCP_H */
-- 
cgit v1.2.3


From 614e8316aa4cafba3e204cb8ee48bd12b92f3d93 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:47 +0000
Subject: tcp: add support for usec resolution in TCP TS values

Back in 2015, Van Jacobson suggested to use usec resolution in TCP TS values.
This has been implemented in our private kernels.

Goals were :

1) better observability of delays in networking stacks.
2) better disambiguation of events based on TSval/ecr values.
3) building block for congestion control modules needing usec resolution.

Back then we implemented a schem based on private SYN options
to negotiate the feature.

For upstream submission, we chose to use a route attribute,
because this feature is probably going to be used in private
networks [1] [2].

ip route add 10/8 ... features tcp_usec_ts

Note that RFC 7323 recommends a
  "timestamp clock frequency in the range 1 ms to 1 sec per tick.",
but also mentions
  "the maximum acceptable clock frequency is one tick every 59 ns."

[1] Unfortunately RFC 7323 5.5 (Outdated Timestamps) suggests
to invalidate TS.Recent values after a flow was idle for more
than 24 days. This is the part making usec_ts a problem
for peers following this recommendation for long living
idle flows.

[2] Attempts to standardize usec ts went nowhere:

https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf
https://datatracker.ietf.org/doc/draft-wang-tcpm-low-latency-opt/

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 04a0e647ef74..6df715b6e51d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -152,6 +152,7 @@ struct tcp_request_sock {
 	u64				snt_synack; /* first SYNACK sent time */
 	bool				tfo_listener;
 	bool				is_mptcp;
+	s8				req_usec_ts;
 #if IS_ENABLED(CONFIG_MPTCP)
 	bool				drop_req;
 #endif
@@ -257,7 +258,8 @@ struct tcp_sock {
 	u8	compressed_ack;
 	u8	dup_ack_counter:2,
 		tlp_retrans:1,	/* TLP is a retransmission */
-		unused:5;
+		tcp_usec_ts:1, /* TSval values in usec */
+		unused:4;
 	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u8	chrono_type:2,	/* current chronograph type */
-- 
cgit v1.2.3


From 89141f965325de9aac3805cd506df788179b2809 Mon Sep 17 00:00:00 2001
From: Dmitry Antipov <dmantipov@yandex.ru>
Date: Wed, 4 Oct 2023 18:30:29 +0300
Subject: wifi: remove unused argument of ieee80211_get_tdls_action()

Remove unused 'hdr_size' argument of 'ieee80211_get_tdls_action()'
and adjust 'ieee80211_report_used_skb()' accordingly.

Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Link: https://lore.kernel.org/r/20231004153032.206134-1-dmantipov@yandex.ru
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 62b4469c6866..045a776ee547 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4490,12 +4490,11 @@ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim,
 /**
  * ieee80211_get_tdls_action - get tdls packet action (or -1, if not tdls packet)
  * @skb: the skb containing the frame, length will not be checked
- * @hdr_size: the size of the ieee80211_hdr that starts at skb->data
  *
  * This function assumes the frame is a data frame, and that the network header
  * is in the correct place.
  */
-static inline int ieee80211_get_tdls_action(struct sk_buff *skb, u32 hdr_size)
+static inline int ieee80211_get_tdls_action(struct sk_buff *skb)
 {
 	if (!skb_is_nonlinear(skb) &&
 	    skb->len > (skb_network_offset(skb) + 2)) {
-- 
cgit v1.2.3


From 91535613b6090fc968c601d11d4e2f16b333713c Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Mon, 16 Oct 2023 14:52:48 +0300
Subject: wifi: mac80211: don't drop all unprotected public action frames

Not all public action frames have a protected variant. When MFP is
enabled drop only public action frames that have a dual protected
variant.

Fixes: 76a3059cf124 ("wifi: mac80211: drop some unprotected action frames")
Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20231016145213.2973e3c8d3bb.I6198b8d3b04cf4a97b06660d346caec3032f232a@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index bd2f6e19c357..b24fb80782c5 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4355,6 +4355,35 @@ static inline bool ieee80211_is_public_action(struct ieee80211_hdr *hdr,
 	return mgmt->u.action.category == WLAN_CATEGORY_PUBLIC;
 }
 
+/**
+ * ieee80211_is_protected_dual_of_public_action - check if skb contains a
+ * protected dual of public action management frame
+ * @skb: the skb containing the frame, length will be checked
+ *
+ * Return: true if the skb contains a protected dual of public action
+ * management frame, false otherwise.
+ */
+static inline bool
+ieee80211_is_protected_dual_of_public_action(struct sk_buff *skb)
+{
+	u8 action;
+
+	if (!ieee80211_is_public_action((void *)skb->data, skb->len) ||
+	    skb->len < IEEE80211_MIN_ACTION_SIZE + 1)
+		return false;
+
+	action = *(u8 *)(skb->data + IEEE80211_MIN_ACTION_SIZE);
+
+	return action != WLAN_PUB_ACTION_20_40_BSS_COEX &&
+		action != WLAN_PUB_ACTION_DSE_REG_LOC_ANN &&
+		action != WLAN_PUB_ACTION_MSMT_PILOT &&
+		action != WLAN_PUB_ACTION_TDLS_DISCOVER_RES &&
+		action != WLAN_PUB_ACTION_LOC_TRACK_NOTI &&
+		action != WLAN_PUB_ACTION_FTM_REQUEST &&
+		action != WLAN_PUB_ACTION_FTM_RESPONSE &&
+		action != WLAN_PUB_ACTION_FILS_DISCOVERY;
+}
+
 /**
  * _ieee80211_is_group_privacy_action - check if frame is a group addressed
  * privacy action frame
-- 
cgit v1.2.3


From 6f15b178cd6315c997981f76c6ebed7ad39144c5 Mon Sep 17 00:00:00 2001
From: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
Date: Thu, 5 Oct 2023 15:42:42 +0530
Subject: EDAC/versal: Add a Xilinx Versal memory controller driver

Add a EDAC driver for the RAS capabilities on the Xilinx integrated DDR
Memory Controllers (DDRMCs) which support both DDR4 and LPDDR4/4X memory
interfaces. It has four programmable Network-on-Chip (NoC) interface
ports and is designed to handle multiple streams of traffic. The driver
reports correctable and uncorrectable errors, and also creates debugfs
entries for testing through error injection.

  [ bp:
   - Add a pointer to the documentation about the register unlock code.
   - Squash in a fix for a Smatch static checker issue as reported by
     Dan Carpenter:
     https://lore.kernel.org/r/a4db6f93-8e5f-4d55-a7b8-b5a987d48a58@moroto.mountain
  ]

Co-developed-by: Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
Signed-off-by: Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20231005101242.14621-3-shubhrajyoti.datta@amd.com
---
 include/linux/firmware/xlnx-zynqmp.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index e8b12ec8b060..d1ea3898564c 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -100,6 +100,18 @@
 #define SD_ITAPDLY	0xFF180314
 #define SD_OTAPDLYSEL	0xFF180318
 
+/**
+ * XPM_EVENT_ERROR_MASK_DDRMC_CR: Error event mask for DDRMC MC Correctable ECC Error.
+ */
+#define XPM_EVENT_ERROR_MASK_DDRMC_CR		BIT(18)
+
+/**
+ * XPM_EVENT_ERROR_MASK_DDRMC_NCR: Error event mask for DDRMC MC Non-Correctable ECC Error.
+ */
+#define XPM_EVENT_ERROR_MASK_DDRMC_NCR		BIT(19)
+#define XPM_EVENT_ERROR_MASK_NOC_NCR		BIT(13)
+#define XPM_EVENT_ERROR_MASK_NOC_CR		BIT(12)
+
 enum pm_api_cb_id {
 	PM_INIT_SUSPEND_CB = 30,
 	PM_ACKNOWLEDGE_CB = 31,
-- 
cgit v1.2.3


From 2793a8b015f7f1caadb9bce9c63dc659f7522676 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 24 Oct 2023 03:09:13 +0300
Subject: bpf: exact states comparison for iterator convergence checks

Convergence for open coded iterators is computed in is_state_visited()
by examining states with branches count > 1 and using states_equal().
states_equal() computes sub-state relation using read and precision marks.
Read and precision marks are propagated from children states,
thus are not guaranteed to be complete inside a loop when branches
count > 1. This could be demonstrated using the following unsafe program:

     1. r7 = -16
     2. r6 = bpf_get_prandom_u32()
     3. while (bpf_iter_num_next(&fp[-8])) {
     4.   if (r6 != 42) {
     5.     r7 = -32
     6.     r6 = bpf_get_prandom_u32()
     7.     continue
     8.   }
     9.   r0 = r10
    10.   r0 += r7
    11.   r8 = *(u64 *)(r0 + 0)
    12.   r6 = bpf_get_prandom_u32()
    13. }

Here verifier would first visit path 1-3, create a checkpoint at 3
with r7=-16, continue to 4-7,3 with r7=-32.

Because instructions at 9-12 had not been visitied yet existing
checkpoint at 3 does not have read or precision mark for r7.
Thus states_equal() would return true and verifier would discard
current state, thus unsafe memory access at 11 would not be caught.

This commit fixes this loophole by introducing exact state comparisons
for iterator convergence logic:
- registers are compared using regs_exact() regardless of read or
  precision marks;
- stack slots have to have identical type.

Unfortunately, this is too strict even for simple programs like below:

    i = 0;
    while(iter_next(&it))
      i++;

At each iteration step i++ would produce a new distinct state and
eventually instruction processing limit would be reached.

To avoid such behavior speculatively forget (widen) range for
imprecise scalar registers, if those registers were not precise at the
end of the previous iteration and do not match exactly.

This a conservative heuristic that allows to verify wide range of
programs, however it precludes verification of programs that conjure
an imprecise value on the first loop iteration and use it as precise
on the second.

Test case iter_task_vma_for_each() presents one of such cases:

        unsigned int seen = 0;
        ...
        bpf_for_each(task_vma, vma, task, 0) {
                if (seen >= 1000)
                        break;
                ...
                seen++;
        }

Here clang generates the following code:

<LBB0_4>:
      24:       r8 = r6                          ; stash current value of
                ... body ...                       'seen'
      29:       r1 = r10
      30:       r1 += -0x8
      31:       call bpf_iter_task_vma_next
      32:       r6 += 0x1                        ; seen++;
      33:       if r0 == 0x0 goto +0x2 <LBB0_6>  ; exit on next() == NULL
      34:       r7 += 0x10
      35:       if r8 < 0x3e7 goto -0xc <LBB0_4> ; loop on seen < 1000

<LBB0_6>:
      ... exit ...

Note that counter in r6 is copied to r8 and then incremented,
conditional jump is done using r8. Because of this precision mark for
r6 lags one state behind of precision mark on r8 and widening logic
kicks in.

Adding barrier_var(seen) after conditional is sufficient to force
clang use the same register for both counting and conditional jump.

This issue was discussed in the thread [1] which was started by
Andrew Werner <awerner32@gmail.com> demonstrating a similar bug
in callback functions handling. The callbacks would be addressed
in a followup patch.

[1] https://lore.kernel.org/bpf/97a90da09404c65c8e810cf83c94ac703705dc0e.camel@gmail.com/

Co-developed-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Co-developed-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231024000917.12153-4-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e67cd45a85be..38b788228594 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -384,6 +384,7 @@ struct bpf_verifier_state {
 	 */
 	struct bpf_idx_pair *jmp_history;
 	u32 jmp_history_cnt;
+	u32 dfs_depth;
 };
 
 #define bpf_get_spilled_reg(slot, frame, mask)				\
-- 
cgit v1.2.3


From 2a0992829ea3864939d917a5c7b48be6629c6217 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 24 Oct 2023 03:09:15 +0300
Subject: bpf: correct loop detection for iterators convergence

It turns out that .branches > 0 in is_state_visited() is not a
sufficient condition to identify if two verifier states form a loop
when iterators convergence is computed. This commit adds logic to
distinguish situations like below:

 (I)            initial       (II)            initial
                  |                             |
                  V                             V
     .---------> hdr                           ..
     |            |                             |
     |            V                             V
     |    .------...                    .------..
     |    |       |                     |       |
     |    V       V                     V       V
     |   ...     ...               .-> hdr     ..
     |    |       |                |    |       |
     |    V       V                |    V       V
     |   succ <- cur               |   succ <- cur
     |    |                        |    |
     |    V                        |    V
     |   ...                       |   ...
     |    |                        |    |
     '----'                        '----'

For both (I) and (II) successor 'succ' of the current state 'cur' was
previously explored and has branches count at 0. However, loop entry
'hdr' corresponding to 'succ' might be a part of current DFS path.
If that is the case 'succ' and 'cur' are members of the same loop
and have to be compared exactly.

Co-developed-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Co-developed-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Reviewed-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231024000917.12153-6-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 38b788228594..24213a99cc79 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -373,10 +373,25 @@ struct bpf_verifier_state {
 	struct bpf_active_lock active_lock;
 	bool speculative;
 	bool active_rcu_lock;
+	/* If this state was ever pointed-to by other state's loop_entry field
+	 * this flag would be set to true. Used to avoid freeing such states
+	 * while they are still in use.
+	 */
+	bool used_as_loop_entry;
 
 	/* first and last insn idx of this verifier state */
 	u32 first_insn_idx;
 	u32 last_insn_idx;
+	/* If this state is a part of states loop this field points to some
+	 * parent of this state such that:
+	 * - it is also a member of the same states loop;
+	 * - DFS states traversal starting from initial state visits loop_entry
+	 *   state before this state.
+	 * Used to compute topmost loop entry for state loops.
+	 * State loops might appear because of open coded iterators logic.
+	 * See get_loop_entry() for more information.
+	 */
+	struct bpf_verifier_state *loop_entry;
 	/* jmp history recorded from first to last.
 	 * backtracking is using it to go from last to first.
 	 * For most states jmp_history_cnt is [0-3].
-- 
cgit v1.2.3


From b95303e0aeaf446b65169dd4142cacdaeb7d4c8b Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Thu, 19 Oct 2023 11:33:21 +0800
Subject: sched: Add cpus_share_resources API

Add cpus_share_resources() API. This is the preparation for the
optimization of select_idle_cpu() on platforms with cluster scheduler
level.

On a machine with clusters cpus_share_resources() will test whether
two cpus are within the same cluster. On a non-cluster machine it
will behaves the same as cpus_share_cache(). So we use "resources"
here for cache resources.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-and-reviewed-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20231019033323.54147-2-yangyicong@huawei.com
---
 include/linux/sched/sd_flags.h | 7 +++++++
 include/linux/sched/topology.h | 8 +++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index fad77b5172e2..a8b28647aafc 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -109,6 +109,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
  */
 SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
+/*
+ * Domain members share CPU cluster (LLC tags or L2 cache)
+ *
+ * NEEDS_GROUPS: Clusters are shared between groups.
+ */
+SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
+
 /*
  * Domain members share CPU package resources (i.e. caches)
  *
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 67b573d5bf28..4c14fe127223 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void)
 #ifdef CONFIG_SCHED_CLUSTER
 static inline int cpu_cluster_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES;
+	return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
 }
 #endif
 
@@ -179,6 +179,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
 bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_resources(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
@@ -232,6 +233,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 	return true;
 }
 
+static inline bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
 #endif	/* !CONFIG_SMP */
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
-- 
cgit v1.2.3


From 984ffb6a4366752c949f7b39640aecdce222607f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 20 Oct 2023 12:35:33 +0200
Subject: sched/fair: Remove SIS_PROP

SIS_UTIL seems to work well, lets remove the old thing.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20231020134337.GD33965@noisy.programming.kicks-ass.net
---
 include/linux/sched/topology.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 4c14fe127223..de545ba85218 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -109,8 +109,6 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
-	u64 avg_scan_cost;		/* select_idle_sibling */
-
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
-- 
cgit v1.2.3


From c54e52f84d7aa590e90e1f73f462517ac40051e1 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 23 Oct 2023 14:35:03 +0100
Subject: arm64, irqchip/gic-v3, ACPI: Move MADT GICC enabled check into a
 helper

ACPI, irqchip and the architecture code all inspect the MADT
enabled bit for a GICC entry in the MADT.

The addition of an 'online capable' bit means all these sites need
updating.

Move the current checks behind a helper to make future updates easier.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Signed-off-by: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Acked-by: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/E1quv5D-00AeNJ-U8@rmk-PC.armlinux.org.uk
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/acpi.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a73246c3c35e..fae2aa0b5e8b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -256,6 +256,11 @@ acpi_table_parse_cedt(enum acpi_cedt_type id,
 int acpi_parse_mcfg (struct acpi_table_header *header);
 void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);
 
+static inline bool acpi_gicc_is_usable(struct acpi_madt_generic_interrupt *gicc)
+{
+	return gicc->flags & ACPI_MADT_ENABLED;
+}
+
 /* the following numa functions are architecture-dependent */
 void acpi_numa_slit_init (struct acpi_table_slit *slit);
 
-- 
cgit v1.2.3


From 8c9c727b6142325ed5697240fceb99cbeb4ac2ec Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Tue, 24 Oct 2023 14:50:53 +0100
Subject: vfio: Move iova_bitmap into iommufd

Both VFIO and IOMMUFD will need iova bitmap for storing dirties and walking
the user bitmaps, so move to the common dependency into IOMMUFD.  In doing
so, create the symbol IOMMUFD_DRIVER which designates the builtin code that
will be used by drivers when selected. Today this means MLX5_VFIO_PCI and
PDS_VFIO_PCI. IOMMU drivers will do the same (in future patches) when
supporting dirty tracking and select IOMMUFD_DRIVER accordingly.

Given that the symbol maybe be disabled, add header definitions in
iova_bitmap.h for when IOMMUFD_DRIVER=n

Link: https://lore.kernel.org/r/20231024135109.73787-3-joao.m.martins@oracle.com
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Brett Creeley <brett.creeley@amd.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iova_bitmap.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iova_bitmap.h b/include/linux/iova_bitmap.h
index c006cf0a25f3..1c338f5e5b7a 100644
--- a/include/linux/iova_bitmap.h
+++ b/include/linux/iova_bitmap.h
@@ -7,6 +7,7 @@
 #define _IOVA_BITMAP_H_
 
 #include <linux/types.h>
+#include <linux/errno.h>
 
 struct iova_bitmap;
 
@@ -14,6 +15,7 @@ typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap,
 				unsigned long iova, size_t length,
 				void *opaque);
 
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER)
 struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
 				      unsigned long page_size,
 				      u64 __user *data);
@@ -22,5 +24,29 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
 			 iova_bitmap_fn_t fn);
 void iova_bitmap_set(struct iova_bitmap *bitmap,
 		     unsigned long iova, size_t length);
+#else
+static inline struct iova_bitmap *iova_bitmap_alloc(unsigned long iova,
+						    size_t length,
+						    unsigned long page_size,
+						    u64 __user *data)
+{
+	return NULL;
+}
+
+static inline void iova_bitmap_free(struct iova_bitmap *bitmap)
+{
+}
+
+static inline int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
+				       iova_bitmap_fn_t fn)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void iova_bitmap_set(struct iova_bitmap *bitmap,
+				   unsigned long iova, size_t length)
+{
+}
+#endif
 
 #endif
-- 
cgit v1.2.3


From 750e2e902b7180cb82d2f9b1e372e32087bb8b1b Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Tue, 24 Oct 2023 14:50:55 +0100
Subject: iommu: Add iommu_domain ops for dirty tracking

Add to iommu domain operations a set of callbacks to perform dirty
tracking, particulary to start and stop tracking and to read and clear the
dirty data.

Drivers are generally expected to dynamically change its translation
structures to toggle the tracking and flush some form of control state
structure that stands in the IOVA translation path. Though it's not
mandatory, as drivers can also enable dirty tracking at boot, and just
clear the dirty bits before setting dirty tracking. For each of the newly
added IOMMU core APIs:

iommu_cap::IOMMU_CAP_DIRTY_TRACKING: new device iommu_capable value when
probing for capabilities of the device.

.set_dirty_tracking(): an iommu driver is expected to change its
translation structures and enable dirty tracking for the devices in the
iommu_domain. For drivers making dirty tracking always-enabled, it should
just return 0.

.read_and_clear_dirty(): an iommu driver is expected to walk the pagetables
for the iova range passed in and use iommu_dirty_bitmap_record() to record
dirty info per IOVA. When detecting that a given IOVA is dirty it should
also clear its dirty state from the PTE, *unless* the flag
IOMMU_DIRTY_NO_CLEAR is passed in -- flushing is steered from the caller of
the domain_op via iotlb_gather. The iommu core APIs use the same data
structure in use for dirty tracking for VFIO device dirty (struct
iova_bitmap) abstracted by iommu_dirty_bitmap_record() helper function.

domain::dirty_ops: IOMMU domains will store the dirty ops depending on
whether the iommu device supports dirty tracking or not. iommu drivers can
then use this field to figure if the dirty tracking is supported+enforced
on attach. The enforcement is enable via domain_alloc_user() which is done
via IOMMUFD hwpt flag introduced later.

Link: https://lore.kernel.org/r/20231024135109.73787-5-joao.m.martins@oracle.com
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/io-pgtable.h |  4 +++
 include/linux/iommu.h      | 70 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 1b7a44b35616..25142a0e2fc2 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -166,6 +166,10 @@ struct io_pgtable_ops {
 			      struct iommu_iotlb_gather *gather);
 	phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
 				    unsigned long iova);
+	int (*read_and_clear_dirty)(struct io_pgtable_ops *ops,
+				    unsigned long iova, size_t size,
+				    unsigned long flags,
+				    struct iommu_dirty_bitmap *dirty);
 };
 
 /**
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3861d66b65c1..1d42bdb37cbc 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -13,6 +13,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/of.h>
+#include <linux/iova_bitmap.h>
 #include <uapi/linux/iommu.h>
 
 #define IOMMU_READ	(1 << 0)
@@ -37,6 +38,7 @@ struct bus_type;
 struct device;
 struct iommu_domain;
 struct iommu_domain_ops;
+struct iommu_dirty_ops;
 struct notifier_block;
 struct iommu_sva;
 struct iommu_fault_event;
@@ -95,6 +97,8 @@ struct iommu_domain_geometry {
 struct iommu_domain {
 	unsigned type;
 	const struct iommu_domain_ops *ops;
+	const struct iommu_dirty_ops *dirty_ops;
+
 	unsigned long pgsize_bitmap;	/* Bitmap of page sizes in use */
 	struct iommu_domain_geometry geometry;
 	struct iommu_dma_cookie *iova_cookie;
@@ -133,6 +137,7 @@ enum iommu_cap {
 	 * usefully support the non-strict DMA flush queue.
 	 */
 	IOMMU_CAP_DEFERRED_FLUSH,
+	IOMMU_CAP_DIRTY_TRACKING,	/* IOMMU supports dirty tracking */
 };
 
 /* These are the possible reserved region types */
@@ -227,6 +232,35 @@ struct iommu_iotlb_gather {
 	bool			queued;
 };
 
+/**
+ * struct iommu_dirty_bitmap - Dirty IOVA bitmap state
+ * @bitmap: IOVA bitmap
+ * @gather: Range information for a pending IOTLB flush
+ */
+struct iommu_dirty_bitmap {
+	struct iova_bitmap *bitmap;
+	struct iommu_iotlb_gather *gather;
+};
+
+/* Read but do not clear any dirty bits */
+#define IOMMU_DIRTY_NO_CLEAR (1 << 0)
+
+/**
+ * struct iommu_dirty_ops - domain specific dirty tracking operations
+ * @set_dirty_tracking: Enable or Disable dirty tracking on the iommu domain
+ * @read_and_clear_dirty: Walk IOMMU page tables for dirtied PTEs marshalled
+ *                        into a bitmap, with a bit represented as a page.
+ *                        Reads the dirty PTE bits and clears it from IO
+ *                        pagetables.
+ */
+struct iommu_dirty_ops {
+	int (*set_dirty_tracking)(struct iommu_domain *domain, bool enabled);
+	int (*read_and_clear_dirty)(struct iommu_domain *domain,
+				    unsigned long iova, size_t size,
+				    unsigned long flags,
+				    struct iommu_dirty_bitmap *dirty);
+};
+
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
@@ -641,6 +675,28 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather)
 	return gather && gather->queued;
 }
 
+static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty,
+					   struct iova_bitmap *bitmap,
+					   struct iommu_iotlb_gather *gather)
+{
+	if (gather)
+		iommu_iotlb_gather_init(gather);
+
+	dirty->bitmap = bitmap;
+	dirty->gather = gather;
+}
+
+static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
+					     unsigned long iova,
+					     unsigned long length)
+{
+	if (dirty->bitmap)
+		iova_bitmap_set(dirty->bitmap, iova, length);
+
+	if (dirty->gather)
+		iommu_iotlb_gather_add_range(dirty->gather, iova, length);
+}
+
 /* PCI device grouping function */
 extern struct iommu_group *pci_device_group(struct device *dev);
 /* Generic device grouping function */
@@ -746,6 +802,8 @@ struct iommu_fwspec {};
 struct iommu_device {};
 struct iommu_fault_param {};
 struct iommu_iotlb_gather {};
+struct iommu_dirty_bitmap {};
+struct iommu_dirty_ops {};
 
 static inline bool iommu_present(const struct bus_type *bus)
 {
@@ -978,6 +1036,18 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather)
 	return false;
 }
 
+static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty,
+					   struct iova_bitmap *bitmap,
+					   struct iommu_iotlb_gather *gather)
+{
+}
+
+static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
+					     unsigned long iova,
+					     unsigned long length)
+{
+}
+
 static inline void iommu_device_unregister(struct iommu_device *iommu)
 {
 }
-- 
cgit v1.2.3


From 66c62769bcf6aa142e2309278980a2e52f4b08db Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 23 Oct 2023 21:07:58 +0300
Subject: exportfs: add helpers to check if filesystem can encode/decode file
 handles

The logic of whether filesystem can encode/decode file handles is open
coded in many places.

In preparation to changing the logic, move the open coded logic into
inline helpers.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231023180801.2953446-2-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/exportfs.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 11fbd0ee1370..5b3c9f30b422 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -233,6 +233,33 @@ extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
 extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid,
 			      int *max_len, int flags);
 
+static inline bool exportfs_can_encode_fid(const struct export_operations *nop)
+{
+	return nop;
+}
+
+static inline bool exportfs_can_decode_fh(const struct export_operations *nop)
+{
+	return nop && nop->fh_to_dentry;
+}
+
+static inline bool exportfs_can_encode_fh(const struct export_operations *nop,
+					  int fh_flags)
+{
+	/*
+	 * If a non-decodeable file handle was requested, we only need to make
+	 * sure that filesystem can encode file handles.
+	 */
+	if (fh_flags & EXPORT_FH_FID)
+		return exportfs_can_encode_fid(nop);
+
+	/*
+	 * If a decodeable file handle was requested, we need to make sure that
+	 * filesystem can also decode file handles.
+	 */
+	return exportfs_can_decode_fh(nop);
+}
+
 static inline int exportfs_encode_fid(struct inode *inode, struct fid *fid,
 				      int *max_len)
 {
-- 
cgit v1.2.3


From ae8d3522e5b7208d238c893a00957864536983dc Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Thu, 19 Oct 2023 18:56:18 +0000
Subject: KVM: arm64: Add PMU event filter bits required if EL3 is implemented

Suzuki noticed that KVM's PMU emulation is oblivious to the NSU and NSK
event filter bits. On systems that have EL3 these bits modify the
filter behavior in non-secure EL0 and EL1, respectively. Even though the
kernel doesn't use these bits, it is entirely possible some other guest
OS does. Additionally, it would appear that these and the M bit are
required by the architecture if EL3 is implemented.

Allow the EL3 event filter bits to be set if EL3 is advertised in the
guest's ID register. Implement the behavior of NSU and NSK according to
the pseudocode, and entirely ignore the M bit for perf event creation.

Reported-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20231019185618.3442949-3-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 include/linux/perf/arm_pmuv3.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index e3899bd77f5c..9c226adf938a 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -234,9 +234,12 @@
 /*
  * Event filters for PMUv3
  */
-#define ARMV8_PMU_EXCLUDE_EL1	(1U << 31)
-#define ARMV8_PMU_EXCLUDE_EL0	(1U << 30)
-#define ARMV8_PMU_INCLUDE_EL2	(1U << 27)
+#define ARMV8_PMU_EXCLUDE_EL1		(1U << 31)
+#define ARMV8_PMU_EXCLUDE_EL0		(1U << 30)
+#define ARMV8_PMU_EXCLUDE_NS_EL1	(1U << 29)
+#define ARMV8_PMU_EXCLUDE_NS_EL0	(1U << 28)
+#define ARMV8_PMU_INCLUDE_EL2		(1U << 27)
+#define ARMV8_PMU_EXCLUDE_EL3		(1U << 26)
 
 /*
  * PMUSERENR: user enable reg
-- 
cgit v1.2.3


From a3a62ca256ab1d1ba771ee243ec4a8e8b7856f08 Mon Sep 17 00:00:00 2001
From: Raag Jadav <raag.jadav@intel.com>
Date: Tue, 24 Oct 2023 11:50:13 +0530
Subject: ACPI: utils: Introduce acpi_dev_uid_match() for matching _UID

Introduce acpi_dev_uid_match() helper that matches the device with
supplied _UID string.

Signed-off-by: Raag Jadav <raag.jadav@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index afd94c9b8b8a..db3a33e19c97 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -787,6 +787,11 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
 
 struct acpi_device;
 
+static inline bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2)
+{
+	return false;
+}
+
 static inline bool
 acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2)
 {
-- 
cgit v1.2.3


From 6ca80638b90cec66547011ee1ef79e534589989a Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian.fainelli@broadcom.com>
Date: Mon, 23 Oct 2023 11:17:28 -0700
Subject: net: dsa: Use conduit and user terms

Use more inclusive terms throughout the DSA subsystem by moving away
from "master" which is replaced by "conduit" and "slave" which is
replaced by "user". No functional changes.

Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/20231023181729.1191071-2-florian.fainelli@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dsa/sja1105.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index c177322f793d..b9dd35d4b8f5 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -28,7 +28,7 @@
 /* Source and Destination MAC of follow-up meta frames.
  * Whereas the choice of SMAC only affects the unique identification of the
  * switch as sender of meta frames, the DMAC must be an address that is present
- * in the DSA master port's multicast MAC filter.
+ * in the DSA conduit port's multicast MAC filter.
  * 01-80-C2-00-00-0E is a good choice for this, as all profiles of IEEE 1588
  * over L2 use this address for some purpose already.
  */
-- 
cgit v1.2.3


From 15fcedd43a0810a5482ef759a18ab852d012cead Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:52:24 -0700
Subject: kexec: Annotate struct crash_mem with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct crash_mem.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Eric Biederman <ebiederm@xmission.com>
Cc: kexec@lists.infradead.org
Acked-by: Baoquan He <bhe@redhat.com>
Link: https://lore.kernel.org/r/20230922175224.work.712-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/crash_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 0c06561bf5ff..08704c29fdb4 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -92,7 +92,7 @@ int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
 struct crash_mem {
 	unsigned int max_nr_ranges;
 	unsigned int nr_ranges;
-	struct range ranges[];
+	struct range ranges[] __counted_by(max_nr_ranges);
 };
 
 extern int crash_exclude_mem_range(struct crash_mem *mem,
-- 
cgit v1.2.3


From 8daaed76383aaae1d094d3d65d34890b2c6da24b Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 14 Sep 2023 17:55:22 +0200
Subject: sh: Remove superhyway bus support

The superhyway bus driver was only referenced on SH4-202, which is now gone,
so remove it all as well.

I could find no trace of anything ever calling superhyway_register_driver(),
not in the git history but also not on the web, so I assume this has never
served any purpose on mainline kernels.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Link: https://lore.kernel.org/r/20230914155523.3839811-3-arnd@kernel.org
Signed-off-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
---
 include/linux/superhyway.h | 107 ---------------------------------------------
 1 file changed, 107 deletions(-)
 delete mode 100644 include/linux/superhyway.h

(limited to 'include/linux')

diff --git a/include/linux/superhyway.h b/include/linux/superhyway.h
deleted file mode 100644
index 8d3376775813..000000000000
--- a/include/linux/superhyway.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * include/linux/superhyway.h
- *
- * SuperHyway Bus definitions
- *
- * Copyright (C) 2004, 2005  Paul Mundt <lethal@linux-sh.org>
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#ifndef __LINUX_SUPERHYWAY_H
-#define __LINUX_SUPERHYWAY_H
-
-#include <linux/device.h>
-
-/*
- * SuperHyway IDs
- */
-#define SUPERHYWAY_DEVICE_ID_SH5_DMAC	0x0183
-
-struct superhyway_vcr_info {
-	u8	perr_flags;	/* P-port Error flags */
-	u8	merr_flags;	/* Module Error flags */
-	u16	mod_vers;	/* Module Version */
-	u16	mod_id;		/* Module ID */
-	u8	bot_mb;		/* Bottom Memory block */
-	u8	top_mb;		/* Top Memory block */
-};
-
-struct superhyway_ops {
-	int (*read_vcr)(unsigned long base, struct superhyway_vcr_info *vcr);
-	int (*write_vcr)(unsigned long base, struct superhyway_vcr_info vcr);
-};
-
-struct superhyway_bus {
-	struct superhyway_ops *ops;
-};
-
-extern struct superhyway_bus superhyway_channels[];
-
-struct superhyway_device_id {
-	unsigned int id;
-	unsigned long driver_data;
-};
-
-struct superhyway_device;
-extern struct bus_type superhyway_bus_type;
-
-struct superhyway_driver {
-	char *name;
-
-	const struct superhyway_device_id *id_table;
-	struct device_driver drv;
-
-	int (*probe)(struct superhyway_device *dev, const struct superhyway_device_id *id);
-	void (*remove)(struct superhyway_device *dev);
-};
-
-#define to_superhyway_driver(d)	container_of((d), struct superhyway_driver, drv)
-
-struct superhyway_device {
-	char name[32];
-
-	struct device dev;
-
-	struct superhyway_device_id id;
-	struct superhyway_driver *drv;
-	struct superhyway_bus *bus;
-
-	int num_resources;
-	struct resource *resource;
-	struct superhyway_vcr_info vcr;
-};
-
-#define to_superhyway_device(d)	container_of((d), struct superhyway_device, dev)
-
-#define superhyway_get_drvdata(d)	dev_get_drvdata(&(d)->dev)
-#define superhyway_set_drvdata(d,p)	dev_set_drvdata(&(d)->dev, (p))
-
-static inline int
-superhyway_read_vcr(struct superhyway_device *dev, unsigned long base,
-		    struct superhyway_vcr_info *vcr)
-{
-	return dev->bus->ops->read_vcr(base, vcr);
-}
-
-static inline int
-superhyway_write_vcr(struct superhyway_device *dev, unsigned long base,
-		     struct superhyway_vcr_info vcr)
-{
-	return dev->bus->ops->write_vcr(base, vcr);
-}
-
-extern int superhyway_scan_bus(struct superhyway_bus *);
-
-/* drivers/sh/superhyway/superhyway.c */
-int superhyway_register_driver(struct superhyway_driver *);
-void superhyway_unregister_driver(struct superhyway_driver *);
-int superhyway_add_device(unsigned long base, struct superhyway_device *, struct superhyway_bus *);
-int superhyway_add_devices(struct superhyway_bus *bus, struct superhyway_device **devices, int nr_devices);
-
-/* drivers/sh/superhyway/superhyway-sysfs.c */
-extern const struct attribute_group *superhyway_dev_groups[];
-
-#endif /* __LINUX_SUPERHYWAY_H */
-
-- 
cgit v1.2.3


From df8b030d82dd1224881acb4e778fa06c4824f72b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Thu, 12 Oct 2023 12:23:38 +0200
Subject: HID: core: remove #ifdef CONFIG_PM from hid_driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow HID drivers to pass ->suspend, ->resume and ->reset_resume via
pm_ptr().
Through the usage of pm_ptr() the CONFIG_PM-dependent code will always be
compiled, protecting against bitrot.
The linker will then garbage-collect the unused function avoiding any overhead.

The only overhead in the final kernel image and at runtime are a few
extra bytes in 'struct hid_driver'.

The same approach is chosen by 'struct usb_driver' and other subsystems.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/r/20231012-hid-pm_ptr-v1-1-0a71531ca93b@weissschuh.net
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hid.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 964ca1f15e3f..5a8387a4a712 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -833,11 +833,11 @@ struct hid_driver {
 	void (*feature_mapping)(struct hid_device *hdev,
 			struct hid_field *field,
 			struct hid_usage *usage);
-#ifdef CONFIG_PM
+
 	int (*suspend)(struct hid_device *hdev, pm_message_t message);
 	int (*resume)(struct hid_device *hdev);
 	int (*reset_resume)(struct hid_device *hdev);
-#endif
+
 /* private: */
 	struct device_driver driver;
 };
-- 
cgit v1.2.3


From 3de6047f1832010d24963f58206298fc75085816 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 21 Sep 2023 21:07:38 +0100
Subject: highmem: Add folio_release_kmap()

This is the folio equivalent of unmap_and_put_page(), which remains as
a wrapper for it.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20230921200746.3303942-1-willy@infradead.org>
---
 include/linux/highmem.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 99c474de800d..4cacc0e43b51 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -551,10 +551,24 @@ static inline void folio_zero_range(struct folio *folio,
 	zero_user_segments(&folio->page, start, start + length, 0, 0);
 }
 
-static inline void unmap_and_put_page(struct page *page, void *addr)
+/**
+ * folio_release_kmap - Unmap a folio and drop a refcount.
+ * @folio: The folio to release.
+ * @addr: The address previously returned by a call to kmap_local_folio().
+ *
+ * It is common, eg in directory handling to kmap a folio.  This function
+ * unmaps the folio and drops the refcount that was being held to keep the
+ * folio alive while we accessed it.
+ */
+static inline void folio_release_kmap(struct folio *folio, void *addr)
 {
 	kunmap_local(addr);
-	put_page(page);
+	folio_put(folio);
+}
+
+static inline void unmap_and_put_page(struct page *page, void *addr)
+{
+	folio_release_kmap(page_folio(page), addr);
 }
 
 #endif /* _LINUX_HIGHMEM_H */
-- 
cgit v1.2.3


From 61d4fb0b349ec1b33119913c3b0bd109de30142c Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 25 Oct 2023 12:14:37 +0200
Subject: file, i915: fix file reference for mmap_singleton()

Today we got a report at [1] for rcu stalls on the i915 testsuite in [2]
due to the conversion of files to SLAB_TYPSSAFE_BY_RCU. Afaict,
get_file_rcu() goes into an infinite loop trying to carefully verify
that i915->gem.mmap_singleton hasn't changed - see the splat below.

So I stared at this code to figure out what it actually does. It seems
that the i915->gem.mmap_singleton pointer itself never had rcu semantics.

The i915->gem.mmap_singleton is replaced in
file->f_op->release::singleton_release():

        static int singleton_release(struct inode *inode, struct file *file)
        {
                struct drm_i915_private *i915 = file->private_data;

                cmpxchg(&i915->gem.mmap_singleton, file, NULL);
                drm_dev_put(&i915->drm);

                return 0;
        }

The cmpxchg() is ordered against a concurrent update of
i915->gem.mmap_singleton from mmap_singleton(). IOW, when
mmap_singleton() fails to get a reference on i915->gem.mmap_singleton:

While mmap_singleton() does

        rcu_read_lock();
        file = get_file_rcu(&i915->gem.mmap_singleton);
        rcu_read_unlock();

it allocates a new file via anon_inode_getfile() and does

        smp_store_mb(i915->gem.mmap_singleton, file);

So, then what happens in the case of this bug is that at some point
fput() is called and drops the file->f_count to zero leaving the pointer
in i915->gem.mmap_singleton in tact.

Now, there might be delays until
file->f_op->release::singleton_release() is called and
i915->gem.mmap_singleton is set to NULL.

Say concurrently another task hits mmap_singleton() and does:

        rcu_read_lock();
        file = get_file_rcu(&i915->gem.mmap_singleton);
        rcu_read_unlock();

When get_file_rcu() fails to get a reference via atomic_inc_not_zero()
it will try the reload from i915->gem.mmap_singleton expecting it to be
NULL, assuming it has comparable semantics as we expect in
__fget_files_rcu().

But it hasn't so it reloads the same pointer again, trying the same
atomic_inc_not_zero() again and doing so until
file->f_op->release::singleton_release() of the old file has been
called.

So, in contrast to __fget_files_rcu() here we want to not retry when
atomic_inc_not_zero() has failed. We only want to retry in case we
managed to get a reference but the pointer did change on reload.

<3> [511.395679] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
<3> [511.395716] rcu:   Tasks blocked on level-1 rcu_node (CPUs 0-9): P6238
<3> [511.395934] rcu:   (detected by 16, t=65002 jiffies, g=123977, q=439 ncpus=20)
<6> [511.395944] task:i915_selftest   state:R  running task     stack:10568 pid:6238  tgid:6238  ppid:1001   flags:0x00004002
<6> [511.395962] Call Trace:
<6> [511.395966]  <TASK>
<6> [511.395974]  ? __schedule+0x3a8/0xd70
<6> [511.395995]  ? asm_sysvec_apic_timer_interrupt+0x1a/0x20
<6> [511.396003]  ? lockdep_hardirqs_on+0xc3/0x140
<6> [511.396013]  ? asm_sysvec_apic_timer_interrupt+0x1a/0x20
<6> [511.396029]  ? get_file_rcu+0x10/0x30
<6> [511.396039]  ? get_file_rcu+0x10/0x30
<6> [511.396046]  ? i915_gem_object_mmap+0xbc/0x450 [i915]
<6> [511.396509]  ? i915_gem_mmap+0x272/0x480 [i915]
<6> [511.396903]  ? mmap_region+0x253/0xb60
<6> [511.396925]  ? do_mmap+0x334/0x5c0
<6> [511.396939]  ? vm_mmap_pgoff+0x9f/0x1c0
<6> [511.396949]  ? rcu_is_watching+0x11/0x50
<6> [511.396962]  ? igt_mmap_offset+0xfc/0x110 [i915]
<6> [511.397376]  ? __igt_mmap+0xb3/0x570 [i915]
<6> [511.397762]  ? igt_mmap+0x11e/0x150 [i915]
<6> [511.398139]  ? __trace_bprintk+0x76/0x90
<6> [511.398156]  ? __i915_subtests+0xbf/0x240 [i915]
<6> [511.398586]  ? __pfx___i915_live_setup+0x10/0x10 [i915]
<6> [511.399001]  ? __pfx___i915_live_teardown+0x10/0x10 [i915]
<6> [511.399433]  ? __run_selftests+0xbc/0x1a0 [i915]
<6> [511.399875]  ? i915_live_selftests+0x4b/0x90 [i915]
<6> [511.400308]  ? i915_pci_probe+0x106/0x200 [i915]
<6> [511.400692]  ? pci_device_probe+0x95/0x120
<6> [511.400704]  ? really_probe+0x164/0x3c0
<6> [511.400715]  ? __pfx___driver_attach+0x10/0x10
<6> [511.400722]  ? __driver_probe_device+0x73/0x160
<6> [511.400731]  ? driver_probe_device+0x19/0xa0
<6> [511.400741]  ? __driver_attach+0xb6/0x180
<6> [511.400749]  ? __pfx___driver_attach+0x10/0x10
<6> [511.400756]  ? bus_for_each_dev+0x77/0xd0
<6> [511.400770]  ? bus_add_driver+0x114/0x210
<6> [511.400781]  ? driver_register+0x5b/0x110
<6> [511.400791]  ? i915_init+0x23/0xc0 [i915]
<6> [511.401153]  ? __pfx_i915_init+0x10/0x10 [i915]
<6> [511.401503]  ? do_one_initcall+0x57/0x270
<6> [511.401515]  ? rcu_is_watching+0x11/0x50
<6> [511.401521]  ? kmalloc_trace+0xa3/0xb0
<6> [511.401532]  ? do_init_module+0x5f/0x210
<6> [511.401544]  ? load_module+0x1d00/0x1f60
<6> [511.401581]  ? init_module_from_file+0x86/0xd0
<6> [511.401590]  ? init_module_from_file+0x86/0xd0
<6> [511.401613]  ? idempotent_init_module+0x17c/0x230
<6> [511.401639]  ? __x64_sys_finit_module+0x56/0xb0
<6> [511.401650]  ? do_syscall_64+0x3c/0x90
<6> [511.401659]  ? entry_SYSCALL_64_after_hwframe+0x6e/0xd8
<6> [511.401684]  </TASK>

Link: [1]: https://lore.kernel.org/intel-gfx/SJ1PR11MB6129CB39EED831784C331BAFB9DEA@SJ1PR11MB6129.namprd11.prod.outlook.com
Link: [2]: https://intel-gfx-ci.01.org/tree/linux-next/next-20231013/bat-dg2-11/igt@i915_selftest@live@mman.html#dmesg-warnings10963
Cc: Jann Horn <jannh@google.com>,
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20231025-formfrage-watscheln-84526cd3bd7d@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index cb8bfa1f8ecb..b35815277cc6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1044,6 +1044,7 @@ static inline struct file *get_file(struct file *f)
 }
 
 struct file *get_file_rcu(struct file __rcu **f);
+struct file *get_file_active(struct file **f);
 
 #define file_count(x)	atomic_long_read(&(x)->f_count)
 
-- 
cgit v1.2.3


From 3decb8564eff88a2533f83b01cec2cf9259c3eaf Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 16 Oct 2023 21:10:49 +0100
Subject: buffer: make folio_create_empty_buffers() return a buffer_head

Patch series "Finish the create_empty_buffers() transition", v2.

Pankaj recently added folio_create_empty_buffers() as the folio equivalent
to create_empty_buffers().  This patch set finishes the conversion by
first converting all remaining filesystems to call
folio_create_empty_buffers(), then renaming it back to
create_empty_buffers().  I took the opportunity to make a few
simplifications like making folio_create_empty_buffers() return the head
buffer and extracting get_nth_bh() from nilfs2.

A few of the patches in this series aren't directly related to
create_empty_buffers(), but I saw them while I was working on this and
thought they'd be easy enough to add to this series.  Compile-tested only,
other than ext4.


This patch (of 26):

Almost all callers want to know the first BH that was allocated for this
folio.  We already have that handy, so return it.

Link: https://lkml.kernel.org/r/20231016201114.1928083-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231016201114.1928083-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 3dc4720e4773..1001244a8941 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -203,8 +203,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
 void create_empty_buffers(struct page *, unsigned long,
 			unsigned long b_state);
-void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
-				unsigned long b_state);
+struct buffer_head *folio_create_empty_buffers(struct folio *folio,
+		unsigned long blocksize, unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_async_write(struct buffer_head *bh, int uptodate);
-- 
cgit v1.2.3


From 0217fbb0271a7c78e16629cf45375916ec2eb35f Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 16 Oct 2023 21:10:52 +0100
Subject: buffer: add get_nth_bh()

Extract this useful helper from nilfs_page_get_nth_block()

Link: https://lkml.kernel.org/r/20231016201114.1928083-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 1001244a8941..3d85a0cf0ca5 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -457,6 +457,28 @@ __bread(struct block_device *bdev, sector_t block, unsigned size)
 	return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
 }
 
+/**
+ * get_nth_bh - Get a reference on the n'th buffer after this one.
+ * @bh: The buffer to start counting from.
+ * @count: How many buffers to skip.
+ *
+ * This is primarily useful for finding the nth buffer in a folio; in
+ * that case you pass the head buffer and the byte offset in the folio
+ * divided by the block size.  It can be used for other purposes, but
+ * it will wrap at the end of the folio rather than returning NULL or
+ * proceeding to the next folio for you.
+ *
+ * Return: The requested buffer with an elevated refcount.
+ */
+static inline __must_check
+struct buffer_head *get_nth_bh(struct buffer_head *bh, unsigned int count)
+{
+	while (count--)
+		bh = bh->b_this_page;
+	get_bh(bh);
+	return bh;
+}
+
 bool block_dirty_folio(struct address_space *mapping, struct folio *folio);
 
 #ifdef CONFIG_BUFFER_HEAD
-- 
cgit v1.2.3


From 0a88810d9b76e6ecfd234f5728e27344a39e3ff8 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 16 Oct 2023 21:11:14 +0100
Subject: buffer: remove folio_create_empty_buffers()

With all users converted, remove the old create_empty_buffers() and rename
folio_create_empty_buffers() to create_empty_buffers().

Link: https://lkml.kernel.org/r/20231016201114.1928083-28-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Pankaj Raghav <p.raghav@samsung.com>
Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 3d85a0cf0ca5..5f23ee599889 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -201,9 +201,7 @@ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
 					gfp_t gfp);
 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
-void create_empty_buffers(struct page *, unsigned long,
-			unsigned long b_state);
-struct buffer_head *folio_create_empty_buffers(struct folio *folio,
+struct buffer_head *create_empty_buffers(struct folio *folio,
 		unsigned long blocksize, unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
-- 
cgit v1.2.3


From ca71fe1ad9221a89c6a25f49159c600d9e598ae1 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:54 +0800
Subject: mm, pcp: avoid to drain PCP when process exit

Patch series "mm: PCP high auto-tuning", v3.

The page allocation performance requirements of different workloads are
often different.  So, we need to tune the PCP (Per-CPU Pageset) high on
each CPU automatically to optimize the page allocation performance.

The list of patches in series is as follows,

[1/9] mm, pcp: avoid to drain PCP when process exit
[2/9] cacheinfo: calculate per-CPU data cache size
[3/9] mm, pcp: reduce lock contention for draining high-order pages
[4/9] mm: restrict the pcp batch scale factor to avoid too long latency
[5/9] mm, page_alloc: scale the number of pages that are batch allocated
[6/9] mm: add framework for PCP high auto-tuning
[7/9] mm: tune PCP high automatically
[8/9] mm, pcp: decrease PCP high if free pages < high watermark
[9/9] mm, pcp: reduce detecting time of consecutive high order page freeing

Patch [1/9], [2/9], [3/9] optimize the PCP draining for consecutive
high-order pages freeing.

Patch [4/9], [5/9] optimize batch freeing and allocating.

Patch [6/9], [7/9], [8/9] implement and optimize a PCP high
auto-tuning method.

Patch [9/9] optimize the PCP draining for consecutive high order page
freeing based on PCP high auto-tuning.

The test results for patches with performance impact are as follows,

kbuild
======

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroup.  This simulates the
kbuild server that is used by 0-Day kbuild service.

	build time   lock contend%	free_high	alloc_zone
	----------	----------	---------	----------
base	     100.0	      14.0          100.0            100.0
patch1	      99.5	      12.8	     19.5	      95.6
patch3	      99.4	      12.6	      7.1	      95.6
patch5	      98.6	      11.0	      8.1	      97.1
patch7	      95.1	       0.5	      2.8	      15.6
patch9	      95.0	       1.0	      8.8	      20.0

The PCP draining optimization (patch [1/9], [3/9]) and PCP batch
allocation optimization (patch [5/9]) reduces zone lock contention a
little.  The PCP high auto-tuning (patch [7/9], [9/9]) reduces build time
visibly.  Where the tuning target: the number of pages allocated from zone
reduces greatly.  So, the zone contention cycles% reduces greatly.

With PCP tuning patches (patch [7/9], [9/9]), the average used memory
during test increases up to 18.4% because more pages are cached in PCP.
But at the end of the test, the number of the used memory decreases to the
same level as that of the base patch.  That is, the pages cached in PCP
will be released to zone after not being used actively.

netperf SCTP_STREAM_MANY
========================

On a 2-socket Intel server with 128 logical CPU, we tested
SCTP_STREAM_MANY test case of netperf test suite with 64-pair processes.

	     score   lock contend%	free_high	alloc_zone  cache miss rate%
	     -----	----------	---------	----------  ----------------
base	     100.0	       2.1          100.0            100.0	         1.3
patch1	      99.4	       2.1	     99.4	      99.4		 1.3
patch3	     106.4	       1.3	     13.3	     106.3		 1.3
patch5	     106.0	       1.2	     13.2	     105.9		 1.3
patch7	     103.4	       1.9	      6.7	      90.3		 7.6
patch9	     108.6	       1.3	     13.7	     108.6		 1.3

The PCP draining optimization (patch [1/9]+[3/9]) improves performance.
The PCP high auto-tuning (patch [7/9]) reduces performance a little
because PCP draining cannot be triggered in time sometimes.  So, the cache
miss rate% increases.  The further PCP draining optimization (patch [9/9])
based on PCP tuning restore the performance.

lmbench3 UNIX (AF_UNIX)
=======================

On a 2-socket Intel server with 128 logical CPU, we tested UNIX
(AF_UNIX socket) test case of lmbench3 test suite with 16-pair
processes.

	     score   lock contend%	free_high	alloc_zone  cache miss rate%
	     -----	----------	---------	----------  ----------------
base	     100.0	      51.4          100.0            100.0	         0.2
patch1	     116.8	      46.1           69.5	     104.3	         0.2
patch3	     199.1	      21.3            7.0	     104.9	         0.2
patch5	     200.0	      20.8            7.1	     106.9	         0.3
patch7	     191.6	      19.9            6.8	     103.8	         2.8
patch9	     193.4	      21.7            7.0	     104.7	         2.1

The PCP draining optimization (patch [1/9], [3/9]) improves performance
much.  The PCP tuning (patch [7/9]) reduces performance a little because
PCP draining cannot be triggered in time sometimes.  The further PCP
draining optimization (patch [9/9]) based on PCP tuning restores the
performance partly.

The patchset adds several fields in struct per_cpu_pages.  The struct
layout before/after the patchset is as follows,

base
====

struct per_cpu_pages {
	spinlock_t                 lock;                 /*     0     4 */
	int                        count;                /*     4     4 */
	int                        high;                 /*     8     4 */
	int                        batch;                /*    12     4 */
	short int                  free_factor;          /*    16     2 */
	short int                  expire;               /*    18     2 */

	/* XXX 4 bytes hole, try to pack */

	struct list_head           lists[13];            /*    24   208 */

	/* size: 256, cachelines: 4, members: 7 */
	/* sum members: 228, holes: 1, sum holes: 4 */
	/* padding: 24 */
} __attribute__((__aligned__(64)));

patched
=======

struct per_cpu_pages {
	spinlock_t                 lock;                 /*     0     4 */
	int                        count;                /*     4     4 */
	int                        high;                 /*     8     4 */
	int                        high_min;             /*    12     4 */
	int                        high_max;             /*    16     4 */
	int                        batch;                /*    20     4 */
	u8                         flags;                /*    24     1 */
	u8                         alloc_factor;         /*    25     1 */
	u8                         expire;               /*    26     1 */

	/* XXX 1 byte hole, try to pack */

	short int                  free_count;           /*    28     2 */

	/* XXX 2 bytes hole, try to pack */

	struct list_head           lists[13];            /*    32   208 */

	/* size: 256, cachelines: 4, members: 11 */
	/* sum members: 237, holes: 2, sum holes: 3 */
	/* padding: 16 */
} __attribute__((__aligned__(64)));

The size of the struct doesn't changed with the patchset.


This patch (of 9):

In commit f26b3fa04611 ("mm/page_alloc: limit number of high-order pages
on PCP during bulk free"), the PCP (Per-CPU Pageset) will be drained when
PCP is mostly used for high-order pages freeing to improve the cache-hot
pages reusing between page allocation and freeing CPUs.

But, the PCP draining mechanism may be triggered unexpectedly when process
exits.  With some customized trace point, it was found that PCP draining
(free_high == true) was triggered with the order-1 page freeing with the
following call stack,

 => free_unref_page_commit
 => free_unref_page
 => __mmdrop
 => exit_mm
 => do_exit
 => do_group_exit
 => __x64_sys_exit_group
 => do_syscall_64

Checking the source code, this is the page table PGD freeing
(mm_free_pgd()).  It's a order-1 page freeing if
CONFIG_PAGE_TABLE_ISOLATION=y.  Which is a common configuration for
security.

Just before that, page freeing with the following call stack was found,

 => free_unref_page_commit
 => free_unref_page_list
 => release_pages
 => tlb_batch_pages_flush
 => tlb_finish_mmu
 => exit_mmap
 => __mmput
 => exit_mm
 => do_exit
 => do_group_exit
 => __x64_sys_exit_group
 => do_syscall_64

So, when a process exits,

- a large number of user pages of the process will be freed without
  page allocation, it's highly possible that pcp->free_factor becomes >
  0.  In fact, this is expected behavior to improve process exit
  performance.

- after freeing all user pages, the PGD will be freed, which is a
  order-1 page freeing, PCP will be drained.

All in all, when a process exits, it's high possible that the PCP will be
drained.  This is an unexpected behavior.

To avoid this, in the patch, the PCP draining will only be triggered for 2
consecutive high-order page freeing.

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroup.  This simulates the
kbuild server that is used by 0-Day kbuild service.  With the patch, the
cycles% of the spinlock contention (mostly for zone lock) decreases from
14.0% to 12.8% (with PCP size == 367).  The number of PCP draining for
high order pages freeing (free_high) decreases 80.5%.

This helps network workload too for reduced zone lock contention.  On a
2-socket Intel server with 128 logical CPU, with the patch, the network
bandwidth of the UNIX (AF_UNIX) test case of lmbench test suite with
16-pair processes increase 16.8%.  The cycles% of the spinlock contention
(mostly for zone lock) decreases from 51.4% to 46.1%.  The number of PCP
draining for high order pages freeing (free_high) decreases 30.5%.  The
cache miss rate keeps 0.2%.

Link: https://lkml.kernel.org/r/20231016053002.756205-1-ying.huang@intel.com
Link: https://lkml.kernel.org/r/20231016053002.756205-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 486587fcd27f..de313f1c15f9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -674,12 +674,22 @@ enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
+/*
+ * Flags used in pcp->flags field.
+ *
+ * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the
+ * previous page freeing.  To avoid to drain PCP for an accident
+ * high-order page freeing.
+ */
+#define	PCPF_PREV_FREE_HIGH_ORDER	BIT(0)
+
 struct per_cpu_pages {
 	spinlock_t lock;	/* Protects lists field */
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
-	short free_factor;	/* batch scaling factor during free */
+	u8 flags;		/* protected by pcp->lock */
+	u8 free_factor;		/* batch scaling factor during free */
 #ifdef CONFIG_NUMA
 	short expire;		/* When 0, remote pagesets are drained */
 #endif
-- 
cgit v1.2.3


From 94a3bfe4073cd88b05f7fb201ea7bf9dfa2cf5d5 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:55 +0800
Subject: cacheinfo: calculate size of per-CPU data cache slice

This can be used to estimate the size of the data cache slice that can be
used by one CPU under ideal circumstances.  Both DATA caches and UNIFIED
caches are used in calculation.  So, the users need to consider the impact
of the code cache usage.

Because the cache inclusive/non-inclusive information isn't available now,
we just use the size of the per-CPU slice of LLC to make the result more
predictable across architectures.  This may be improved when more cache
information is available in the future.

A brute-force algorithm to iterate all online CPUs is used to avoid to
allocate an extra cpumask, especially in offline callback.

Link: https://lkml.kernel.org/r/20231016053002.756205-3-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cacheinfo.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index a5cfd44fab45..d504eb4b49ab 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -73,6 +73,7 @@ struct cacheinfo {
 
 struct cpu_cacheinfo {
 	struct cacheinfo *info_list;
+	unsigned int per_cpu_data_slice_size;
 	unsigned int num_levels;
 	unsigned int num_leaves;
 	bool cpu_map_populated;
-- 
cgit v1.2.3


From 362d37a106dd3f6431b2fdd91d9208b0d023b50d Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:56 +0800
Subject: mm, pcp: reduce lock contention for draining high-order pages

In commit f26b3fa04611 ("mm/page_alloc: limit number of high-order pages
on PCP during bulk free"), the PCP (Per-CPU Pageset) will be drained when
PCP is mostly used for high-order pages freeing to improve the cache-hot
pages reusing between page allocating and freeing CPUs.

On system with small per-CPU data cache slice, pages shouldn't be cached
before draining to guarantee cache-hot.  But on a system with large
per-CPU data cache slice, some pages can be cached before draining to
reduce zone lock contention.

So, in this patch, instead of draining without any caching, "pcp->batch"
pages will be cached in PCP before draining if the size of the per-CPU
data cache slice is more than "3 * batch".

In theory, if the size of per-CPU data cache slice is more than "2 *
batch", we can reuse cache-hot pages between CPUs.  But considering the
other usage of cache (code, other data accessing, etc.), "3 * batch" is
used.

Note: "3 * batch" is chosen to make sure the optimization works on recent
x86_64 server CPUs.  If you want to increase it, please check whether it
breaks the optimization.

On a 2-socket Intel server with 128 logical CPU, with the patch, the
network bandwidth of the UNIX (AF_UNIX) test case of lmbench test suite
with 16-pair processes increase 70.5%.  The cycles% of the spinlock
contention (mostly for zone lock) decreases from 46.1% to 21.3%.  The
number of PCP draining for high order pages freeing (free_high) decreases
89.9%.  The cache miss rate keeps 0.2%.

Link: https://lkml.kernel.org/r/20231016053002.756205-4-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h    | 1 +
 include/linux/mmzone.h | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 665f06675c83..665edc11fb9f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -325,6 +325,7 @@ void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
 
 void page_alloc_init_late(void);
+void setup_pcp_cacheinfo(void);
 
 /*
  * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index de313f1c15f9..efe72b3f7872 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -680,8 +680,14 @@ enum zone_watermarks {
  * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the
  * previous page freeing.  To avoid to drain PCP for an accident
  * high-order page freeing.
+ *
+ * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before
+ * draining PCP for consecutive high-order pages freeing without
+ * allocation if data cache slice of CPU is large enough.  To reduce
+ * zone lock contention and keep cache-hot pages reusing.
  */
 #define	PCPF_PREV_FREE_HIGH_ORDER	BIT(0)
+#define	PCPF_FREE_HIGH_BATCH		BIT(1)
 
 struct per_cpu_pages {
 	spinlock_t lock;	/* Protects lists field */
-- 
cgit v1.2.3


From c0a242394cb980bd00e1f61dc8aacb453d2bbe6a Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:58 +0800
Subject: mm, page_alloc: scale the number of pages that are batch allocated

When a task is allocating a large number of order-0 pages, it may acquire
the zone->lock multiple times allocating pages in batches.  This may
unnecessarily contend on the zone lock when allocating very large number
of pages.  This patch adapts the size of the batch based on the recent
pattern to scale the batch size for subsequent allocations.

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroup.  This simulates the
kbuild server that is used by 0-Day kbuild service.  With the patch, the
cycles% of the spinlock contention (mostly for zone lock) decreases from
12.6% to 11.0% (with PCP size == 367).

Link: https://lkml.kernel.org/r/20231016053002.756205-6-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index efe72b3f7872..e6cc2238c65f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -695,9 +695,10 @@ struct per_cpu_pages {
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
 	u8 flags;		/* protected by pcp->lock */
+	u8 alloc_factor;	/* batch scaling factor during allocate */
 	u8 free_factor;		/* batch scaling factor during free */
 #ifdef CONFIG_NUMA
-	short expire;		/* When 0, remote pagesets are drained */
+	u8 expire;		/* When 0, remote pagesets are drained */
 #endif
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
-- 
cgit v1.2.3


From 90b41691b9881376fe784e13b5766ec3676fdb55 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:59 +0800
Subject: mm: add framework for PCP high auto-tuning

The page allocation performance requirements of different workloads are
usually different.  So, we need to tune PCP (per-CPU pageset) high to
optimize the workload page allocation performance.  Now, we have a system
wide sysctl knob (percpu_pagelist_high_fraction) to tune PCP high by hand.
But, it's hard to find out the best value by hand.  And one global
configuration may not work best for the different workloads that run on
the same system.  One solution to these issues is to tune PCP high of each
CPU automatically.

This patch adds the framework for PCP high auto-tuning.  With it,
pcp->high of each CPU will be changed automatically by tuning algorithm at
runtime.  The minimal high (pcp->high_min) is the original PCP high value
calculated based on the low watermark pages.  While the maximal high
(pcp->high_max) is the PCP high value when percpu_pagelist_high_fraction
sysctl knob is set to MIN_PERCPU_PAGELIST_HIGH_FRACTION.  That is, the
maximal pcp->high that can be set via sysctl knob by hand.

It's possible that PCP high auto-tuning doesn't work well for some
workloads.  So, when PCP high is tuned by hand via the sysctl knob, the
auto-tuning will be disabled.  The PCP high set by hand will be used
instead.

This patch only adds the framework, so pcp->high will be set to
pcp->high_min (original default) always.  We will add actual auto-tuning
algorithm in the following patches in the series.

Link: https://lkml.kernel.org/r/20231016053002.756205-7-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e6cc2238c65f..775abc899e80 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -693,6 +693,8 @@ struct per_cpu_pages {
 	spinlock_t lock;	/* Protects lists field */
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
+	int high_min;		/* min high watermark */
+	int high_max;		/* max high watermark */
 	int batch;		/* chunk size for buddy add/remove */
 	u8 flags;		/* protected by pcp->lock */
 	u8 alloc_factor;	/* batch scaling factor during allocate */
@@ -852,7 +854,8 @@ struct zone {
 	 * the high and batch values are copied to individual pagesets for
 	 * faster access
 	 */
-	int pageset_high;
+	int pageset_high_min;
+	int pageset_high_max;
 	int pageset_batch;
 
 #ifndef CONFIG_SPARSEMEM
-- 
cgit v1.2.3


From 51a755c56dc05a8b31ed28d24f28354946dc7529 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:30:00 +0800
Subject: mm: tune PCP high automatically

The target to tune PCP high automatically is as follows,

- Minimize allocation/freeing from/to shared zone

- Minimize idle pages in PCP

- Minimize pages in PCP if the system free pages is too few

To reach these target, a tuning algorithm as follows is designed,

- When we refill PCP via allocating from the zone, increase PCP high.
  Because if we had larger PCP, we could avoid to allocate from the
  zone.

- In periodic vmstat updating kworker (via refresh_cpu_vm_stats()),
  decrease PCP high to try to free possible idle PCP pages.

- When page reclaiming is active for the zone, stop increasing PCP
  high in allocating path, decrease PCP high and free some pages in
  freeing path.

So, the PCP high can be tuned to the page allocating/freeing depth of
workloads eventually.

One issue of the algorithm is that if the number of pages allocated is
much more than that of pages freed on a CPU, the PCP high may become the
maximal value even if the allocating/freeing depth is small.  But this
isn't a severe issue, because there are no idle pages in this case.

One alternative choice is to increase PCP high when we drain PCP via
trying to free pages to the zone, but don't increase PCP high during PCP
refilling.  This can avoid the issue above.  But if the number of pages
allocated is much less than that of pages freed on a CPU, there will be
many idle pages in PCP and it is hard to free these idle pages.

1/8 (>> 3) of PCP high will be decreased periodically.  The value 1/8 is
kind of arbitrary.  Just to make sure that the idle PCP pages will be
freed eventually.

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroup.  This simulates the
kbuild server that is used by 0-Day kbuild service.  With the patch, the
build time decreases 3.5%.  The cycles% of the spinlock contention (mostly
for zone lock) decreases from 11.0% to 0.5%.  The number of PCP draining
for high order pages freeing (free_high) decreases 65.6%.  The number of
pages allocated from zone (instead of from PCP) decreases 83.9%.

Link: https://lkml.kernel.org/r/20231016053002.756205-8-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Suggested-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 665edc11fb9f..5b917e5b9350 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -320,6 +320,7 @@ extern void page_frag_free(void *addr);
 #define free_page(addr) free_pages((addr), 0)
 
 void page_alloc_init_cpuhp(void);
+int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
-- 
cgit v1.2.3


From 57c0419c5f0ea2ccab8700895c8fac20ba1eb21f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:30:01 +0800
Subject: mm, pcp: decrease PCP high if free pages < high watermark

One target of PCP is to minimize pages in PCP if the system free pages is
too few.  To reach that target, when page reclaiming is active for the
zone (ZONE_RECLAIM_ACTIVE), we will stop increasing PCP high in allocating
path, decrease PCP high and free some pages in freeing path.  But this may
be too late because the background page reclaiming may introduce latency
for some workloads.  So, in this patch, during page allocation we will
detect whether the number of free pages of the zone is below high
watermark.  If so, we will stop increasing PCP high in allocating path,
decrease PCP high and free some pages in freeing path.  With this, we can
reduce the possibility of the premature background page reclaiming caused
by too large PCP.

The high watermark checking is done in allocating path to reduce the
overhead in hotter freeing path.

Link: https://lkml.kernel.org/r/20231016053002.756205-9-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 775abc899e80..b92ab001e146 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1016,6 +1016,7 @@ enum zone_flags {
 					 * Cleared when kswapd is woken.
 					 */
 	ZONE_RECLAIM_ACTIVE,		/* kswapd may be scanning the zone. */
+	ZONE_BELOW_HIGH,		/* zone is below high watermark. */
 };
 
 static inline unsigned long zone_managed_pages(struct zone *zone)
-- 
cgit v1.2.3


From 6ccdcb6d3a741c4e005ca6ffd4a62ddf8b5bead3 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:30:02 +0800
Subject: mm, pcp: reduce detecting time of consecutive high order page freeing

In current PCP auto-tuning design, if the number of pages allocated is
much more than that of pages freed on a CPU, the PCP high may become the
maximal value even if the allocating/freeing depth is small, for example,
in the sender of network workloads.  If a CPU was used as sender
originally, then it is used as receiver after context switching, we need
to fill the whole PCP with maximal high before triggering PCP draining for
consecutive high order freeing.  This will hurt the performance of some
network workloads.

To solve the issue, in this patch, we will track the consecutive page
freeing with a counter in stead of relying on PCP draining.  So, we can
detect consecutive page freeing much earlier.

On a 2-socket Intel server with 128 logical CPU, we tested
SCTP_STREAM_MANY test case of netperf test suite with 64-pair processes.
With the patch, the network bandwidth improves 5.0%.  This restores the
performance drop caused by PCP auto-tuning.

Link: https://lkml.kernel.org/r/20231016053002.756205-10-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b92ab001e146..3c25226beeed 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -698,10 +698,10 @@ struct per_cpu_pages {
 	int batch;		/* chunk size for buddy add/remove */
 	u8 flags;		/* protected by pcp->lock */
 	u8 alloc_factor;	/* batch scaling factor during allocate */
-	u8 free_factor;		/* batch scaling factor during free */
 #ifdef CONFIG_NUMA
 	u8 expire;		/* When 0, remote pagesets are drained */
 #endif
+	short free_count;	/* consecutive free count */
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
 	struct list_head lists[NR_PCP_LISTS];
-- 
cgit v1.2.3


From 1aacbd354313f25c855e662e41c04e2abf71444a Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Thu, 19 Oct 2023 15:53:42 -0700
Subject: mm: kmem: add direct objcg pointer to task_struct

To charge a freshly allocated kernel object to a memory cgroup, the kernel
needs to obtain an objcg pointer.  Currently it does it indirectly by
obtaining the memcg pointer first and then calling to
__get_obj_cgroup_from_memcg().

Usually tasks spend their entire life belonging to the same object cgroup.
So it makes sense to save the objcg pointer on task_struct directly, so
it can be obtained faster.  It requires some work on fork, exit and cgroup
migrate paths, but these paths are way colder.

To avoid any costly synchronization the following rules are applied:
1) A task sets it's objcg pointer itself.

2) If a task is being migrated to another cgroup, the least
   significant bit of the objcg pointer is set atomically.

3) On the allocation path the objcg pointer is obtained locklessly
   using the READ_ONCE() macro and the least significant bit is
   checked. If it's set, the following procedure is used to update
   it locklessly:
       - task->objcg is zeroed using cmpxcg
       - new objcg pointer is obtained
       - task->objcg is updated using try_cmpxchg
       - operation is repeated if try_cmpxcg fails
   It guarantees that no updates will be lost if task migration
   is racing against objcg pointer update. It also allows to keep
   both read and write paths fully lockless.

Because the task is keeping a reference to the objcg, it can't go away
while the task is alive.

This commit doesn't change the way the remote memcg charging works.

Link: https://lkml.kernel.org/r/20231019225346.1822282-3-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..60de42715b56 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1443,6 +1443,10 @@ struct task_struct {
 	struct mem_cgroup		*active_memcg;
 #endif
 
+#ifdef CONFIG_MEMCG_KMEM
+	struct obj_cgroup		*objcg;
+#endif
+
 #ifdef CONFIG_BLK_CGROUP
 	struct gendisk			*throttle_disk;
 #endif
-- 
cgit v1.2.3


From 675d6c9b59e313ca2573c93e8fd87011a99bb8ce Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Thu, 19 Oct 2023 15:53:43 -0700
Subject: mm: kmem: make memcg keep a reference to the original objcg

Keep a reference to the original objcg object for the entire life of a
memcg structure.

This allows to simplify the synchronization on the kernel memory
allocation paths: pinning a (live) memcg will also pin the corresponding
objcg.

The memory overhead of this change is minimal because object cgroups
usually outlive their corresponding memory cgroups even without this
change, so it's only an additional pointer per memcg.

Link: https://lkml.kernel.org/r/20231019225346.1822282-4-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6674c12725d5..cc110cc8fdfc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -289,7 +289,13 @@ struct mem_cgroup {
 
 #ifdef CONFIG_MEMCG_KMEM
 	int kmemcg_id;
-	struct obj_cgroup __rcu *objcg;
+	/*
+	 * memcg->objcg is wiped out as a part of the objcg repaprenting
+	 * process. memcg->orig_objcg preserves a pointer (and a reference)
+	 * to the original objcg until the end of live of memcg.
+	 */
+	struct obj_cgroup __rcu	*objcg;
+	struct obj_cgroup	*orig_objcg;
 	/* list of inherited objcgs, protected by objcg_lock */
 	struct list_head objcg_list;
 #endif
-- 
cgit v1.2.3


From e86828e5446d95676835679837d995dec188d2be Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Thu, 19 Oct 2023 15:53:44 -0700
Subject: mm: kmem: scoped objcg protection

Switch to a scope-based protection of the objcg pointer on slab/kmem
allocation paths.  Instead of using the get_() semantics in the
pre-allocation hook and put the reference afterwards, let's rely on the
fact that objcg is pinned by the scope.

It's possible because:
1) if the objcg is received from the current task struct, the task is
   keeping a reference to the objcg.
2) if the objcg is received from an active memcg (remote charging),
   the memcg is pinned by the scope and has a reference to the
   corresponding objcg.

Link: https://lkml.kernel.org/r/20231019225346.1822282-5-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 9 +++++++++
 include/linux/sched/mm.h   | 4 ++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index cc110cc8fdfc..8006bc3bd7bf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1796,6 +1796,15 @@ bool mem_cgroup_kmem_disabled(void);
 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
 void __memcg_kmem_uncharge_page(struct page *page, int order);
 
+/*
+ * The returned objcg pointer is safe to use without additional
+ * protection within a scope. The scope is defined either by
+ * the current task (similar to the "current" global variable)
+ * or by set_active_memcg() pair.
+ * Please, use obj_cgroup_get() to get a reference if the pointer
+ * needs to be used outside of the local scope.
+ */
+struct obj_cgroup *current_obj_cgroup(void);
 struct obj_cgroup *get_obj_cgroup_from_current(void);
 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 8d89c8c4fac1..9a19f1b42f64 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -403,6 +403,10 @@ DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
  * __GFP_ACCOUNT allocations till the end of the scope will be charged to the
  * given memcg.
  *
+ * Please, make sure that caller has a reference to the passed memcg structure,
+ * so its lifetime is guaranteed to exceed the scope between two
+ * set_active_memcg() calls.
+ *
  * NOTE: This function can nest. Users must save the return value and
  * reset the previous value after their own charging scope is over.
  */
-- 
cgit v1.2.3


From e56808fef8f71a192b2740c0b6ea8be7ab865d54 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Thu, 19 Oct 2023 15:53:46 -0700
Subject: mm: kmem: reimplement get_obj_cgroup_from_current()

Reimplement get_obj_cgroup_from_current() using current_obj_cgroup().
get_obj_cgroup_from_current() and current_obj_cgroup() share 80% of the
code, so the new implementation is almost trivial.

get_obj_cgroup_from_current() is a convenient function used by the
bpf subsystem, so there is no reason to get rid of it completely.

Link: https://lkml.kernel.org/r/20231019225346.1822282-7-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8006bc3bd7bf..b3d2e3e60eed 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1805,9 +1805,18 @@ void __memcg_kmem_uncharge_page(struct page *page, int order);
  * needs to be used outside of the local scope.
  */
 struct obj_cgroup *current_obj_cgroup(void);
-struct obj_cgroup *get_obj_cgroup_from_current(void);
 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);
 
+static inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+	struct obj_cgroup *objcg = current_obj_cgroup();
+
+	if (objcg)
+		obj_cgroup_get(objcg);
+
+	return objcg;
+}
+
 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
 
-- 
cgit v1.2.3


From 1d44f2e6d178163a94980fd5f9a4b04b6b36535b Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:48 +0800
Subject: mm_types: add virtual and _last_cpupid into struct folio

Patch series "mm: convert page cpupid functions to folios", v3.

The cpupid(or access time) used by numa balancing is stored in flags or
_last_cpupid(if LAST_CPUPID_NOT_IN_PAGE_FLAGS) of page, this is to convert
page cpupid to folio cpupid, a new _last_cpupid is added into folio, which
make us to use folio->_last_cpupid directly, and the page cpupid functions
are converted to folio ones.

  page_cpupid_last()		-> folio_last_cpupid()
  xchg_page_access_time()	-> folio_xchg_access_time()
  page_cpupid_xchg_last()	-> folio_xchg_last_cpupid()


This patch (of 19):

If WANT_PAGE_VIRTUAL and LAST_CPUPID_NOT_IN_PAGE_FLAGS defined, the
'virtual' and '_last_cpupid' are in struct page, and since _last_cpupid is
used by numa balancing feature, it is better to move it before KMSAN
metadata from struct page, also add them into struct folio to make us to
access them from folio directly.

Link: https://lkml.kernel.org/r/20231018140806.2783514-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20231018140806.2783514-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 21eb56145f57..692e41213cd3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -199,6 +199,10 @@ struct page {
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
 
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+	int _last_cpupid;
+#endif
+
 #ifdef CONFIG_KMSAN
 	/*
 	 * KMSAN metadata for this page:
@@ -210,10 +214,6 @@ struct page {
 	struct page *kmsan_shadow;
 	struct page *kmsan_origin;
 #endif
-
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-	int _last_cpupid;
-#endif
 } _struct_page_alignment;
 
 /*
@@ -272,6 +272,8 @@ typedef struct {
  * @_refcount: Do not access this member directly.  Use folio_ref_count()
  *    to find how many references there are to this folio.
  * @memcg_data: Memory Control Group data.
+ * @virtual: Virtual address in the kernel direct map.
+ * @_last_cpupid: IDs of last CPU and last process that accessed the folio.
  * @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
  * @_nr_pages_mapped: Do not use directly, call folio_mapcount().
  * @_pincount: Do not use directly, call folio_maybe_dma_pinned().
@@ -317,6 +319,12 @@ struct folio {
 			atomic_t _refcount;
 #ifdef CONFIG_MEMCG
 			unsigned long memcg_data;
+#endif
+#if defined(WANT_PAGE_VIRTUAL)
+			void *virtual;
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+			int _last_cpupid;
 #endif
 	/* private: the union with struct page is transitional */
 		};
@@ -373,6 +381,12 @@ FOLIO_MATCH(_refcount, _refcount);
 #ifdef CONFIG_MEMCG
 FOLIO_MATCH(memcg_data, memcg_data);
 #endif
+#if defined(WANT_PAGE_VIRTUAL)
+FOLIO_MATCH(virtual, virtual);
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+FOLIO_MATCH(_last_cpupid, _last_cpupid);
+#endif
 #undef FOLIO_MATCH
 #define FOLIO_MATCH(pg, fl)						\
 	static_assert(offsetof(struct folio, fl) ==			\
-- 
cgit v1.2.3


From 155c98cfcf961327adedabd629edfc2301cf354b Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:49 +0800
Subject: mm: add folio_last_cpupid()

Add folio_last_cpupid() wrapper, which is required to convert
page_cpupid_last() to folio vertion later in the series.

Link: https://lkml.kernel.org/r/20231018140806.2783514-3-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 86e040e886e4..0c4ff852930b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1796,6 +1796,11 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static inline int folio_last_cpupid(struct folio *folio)
+{
+	return page_cpupid_last(&folio->page);
+}
+
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From f39eac30a8f334f0765ef78fe4d13b3fd5bfa3fd Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:53 +0800
Subject: mm: remove page_cpupid_last()

Since all calls use folio_last_cpupid(), remove page_cpupid_last().

Link: https://lkml.kernel.org/r/20231018140806.2783514-7-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0c4ff852930b..f8b7a1825720 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1702,18 +1702,18 @@ static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 	return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
 }
 
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
 {
-	return page->_last_cpupid;
+	return folio->_last_cpupid;
 }
 static inline void page_cpupid_reset_last(struct page *page)
 {
 	page->_last_cpupid = -1 & LAST_CPUPID_MASK;
 }
 #else
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
 {
-	return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
+	return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
 }
 
 extern int page_cpupid_xchg_last(struct page *page, int cpupid);
@@ -1752,9 +1752,9 @@ static inline int xchg_page_access_time(struct page *page, int time)
 	return 0;
 }
 
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
 {
-	return page_to_nid(page); /* XXX */
+	return folio_nid(folio); /* XXX */
 }
 
 static inline int cpupid_to_nid(int cpupid)
@@ -1796,11 +1796,6 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-static inline int folio_last_cpupid(struct folio *folio)
-{
-	return page_cpupid_last(&folio->page);
-}
-
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From 55c199385c4465e9abe1a3d6d1aba348d0356e03 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:54 +0800
Subject: mm: add folio_xchg_access_time()

Add folio_xchg_access_time() wrapper, which is required to convert
xchg_page_access_time() to folio vertion later in the series.

Link: https://lkml.kernel.org/r/20231018140806.2783514-8-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f8b7a1825720..dee63b820f0c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1796,6 +1796,11 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static inline int folio_xchg_access_time(struct folio *folio, int time)
+{
+	return xchg_page_access_time(&folio->page, time);
+}
+
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From f393084382fa3bbd5840b428d538dbcb33be0186 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:58 +0800
Subject: mm: remove xchg_page_access_time()

Since all calls use folio_xchg_access_time(), remove
xchg_page_access_time().

Link: https://lkml.kernel.org/r/20231018140806.2783514-12-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dee63b820f0c..e003465238c4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1724,11 +1724,12 @@ static inline void page_cpupid_reset_last(struct page *page)
 }
 #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
 
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
 {
 	int last_time;
 
-	last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
+	last_time = page_cpupid_xchg_last(&folio->page,
+					  time >> PAGE_ACCESS_TIME_BUCKETS);
 	return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
 
@@ -1747,7 +1748,7 @@ static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 	return page_to_nid(page); /* XXX */
 }
 
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
 {
 	return 0;
 }
@@ -1796,11 +1797,6 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-static inline int folio_xchg_access_time(struct folio *folio, int time)
-{
-	return xchg_page_access_time(&folio->page, time);
-}
-
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From 136d0b47576f8701d68c2d504e7237d9fdc4ebbd Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:59 +0800
Subject: mm: add folio_xchg_last_cpupid()

Add folio_xchg_last_cpupid() wrapper, which is required to convert
page_cpupid_xchg_last() to folio vertion later in the series.

Link: https://lkml.kernel.org/r/20231018140806.2783514-13-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e003465238c4..00e0874f7ce2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1797,6 +1797,11 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
+{
+	return page_cpupid_xchg_last(&folio->page, cpupid);
+}
+
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From c08b7e3830dbf24dd2552ddeea84f00393842f1b Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:08:03 +0800
Subject: mm: make finish_mkwrite_fault() static

Make finish_mkwrite_fault static since it is not used outside of
memory.c.

Link: https://lkml.kernel.org/r/20231018140806.2783514-17-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00e0874f7ce2..5d6e1d7656c2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1348,7 +1348,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 		struct page *page, unsigned int nr, unsigned long addr);
 
 vm_fault_t finish_fault(struct vm_fault *vmf);
-vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
 #endif
 
 /*
-- 
cgit v1.2.3


From 8f0f4788b1247c2f92ecacd8f86ce0b379b807b9 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:08:06 +0800
Subject: mm: remove page_cpupid_xchg_last()

Since all calls use folio_xchg_last_cpupid(), remove
page_cpupid_xchg_last().

Link: https://lkml.kernel.org/r/20231018140806.2783514-20-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5d6e1d7656c2..0e496a8b8e50 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1696,9 +1696,9 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
 
 #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
 {
-	return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
+	return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK);
 }
 
 static inline int folio_last_cpupid(struct folio *folio)
@@ -1715,7 +1715,7 @@ static inline int folio_last_cpupid(struct folio *folio)
 	return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
 }
 
-extern int page_cpupid_xchg_last(struct page *page, int cpupid);
+int folio_xchg_last_cpupid(struct folio *folio, int cpupid);
 
 static inline void page_cpupid_reset_last(struct page *page)
 {
@@ -1727,8 +1727,8 @@ static inline int folio_xchg_access_time(struct folio *folio, int time)
 {
 	int last_time;
 
-	last_time = page_cpupid_xchg_last(&folio->page,
-					  time >> PAGE_ACCESS_TIME_BUCKETS);
+	last_time = folio_xchg_last_cpupid(folio,
+					   time >> PAGE_ACCESS_TIME_BUCKETS);
 	return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
 
@@ -1742,9 +1742,9 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
 {
-	return page_to_nid(page); /* XXX */
+	return folio_nid(folio); /* XXX */
 }
 
 static inline int folio_xchg_access_time(struct folio *folio, int time)
@@ -1796,11 +1796,6 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
-{
-	return page_cpupid_xchg_last(&folio->page, cpupid);
-}
-
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From 80203f1ca086835100843f1474bd6dd4a48cc73b Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Wed, 18 Oct 2023 18:29:47 +0800
Subject: bootmem: use kmemleak_free_part_phys in free_bootmem_page

Since kmemleak_alloc_phys() rather than kmemleak_alloc() was called from
memblock_alloc_range_nid(), kmemleak_free_part_phys() should be used to
delete kmemleak object in free_bootmem_page().  In debug mode, there are
following warning:

 kmemleak: Partially freeing unknown object at 0xffff97345aff7000 (size 4096)

Link: https://lkml.kernel.org/r/20231018102952.3339837-3-liushixin2@huawei.com
Fixes: 028725e73375 ("bootmem: remove the vmemmap pages from kmemleak in free_bootmem_page")
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Patrick Wang <patrick.wang.shcn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/bootmem_info.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
index e1a3c9c9754c..cffa38a73618 100644
--- a/include/linux/bootmem_info.h
+++ b/include/linux/bootmem_info.h
@@ -60,7 +60,7 @@ static inline void get_page_bootmem(unsigned long info, struct page *page,
 
 static inline void free_bootmem_page(struct page *page)
 {
-	kmemleak_free_part(page_to_virt(page), PAGE_SIZE);
+	kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);
 	free_reserved_page(page);
 }
 #endif
-- 
cgit v1.2.3


From b455f39d228935f88eebcd1f7c1a6981093c6a3b Mon Sep 17 00:00:00 2001
From: "Vishal Moola (Oracle)" <vishal.moola@gmail.com>
Date: Fri, 20 Oct 2023 11:33:30 -0700
Subject: mm/khugepaged: convert alloc_charge_hpage() to use folios

Also remove count_memcg_page_event now that its last caller no longer uses
it and reword hpage_collapse_alloc_page() to hpage_collapse_alloc_folio().

This removes 1 call to compound_head() and helps convert khugepaged to
use folios throughout.

Link: https://lkml.kernel.org/r/20231020183331.10770-5-vishal.moola@gmail.com
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Rik van Riel <riel@surriel.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b3d2e3e60eed..7bdcf3020d7a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1087,15 +1087,6 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
 	local_irq_restore(flags);
 }
 
-static inline void count_memcg_page_event(struct page *page,
-					  enum vm_event_item idx)
-{
-	struct mem_cgroup *memcg = page_memcg(page);
-
-	if (memcg)
-		count_memcg_events(memcg, idx, 1);
-}
-
 static inline void count_memcg_folio_events(struct folio *folio,
 		enum vm_event_item idx, unsigned long nr)
 {
@@ -1598,11 +1589,6 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg,
 {
 }
 
-static inline void count_memcg_page_event(struct page *page,
-					  int idx)
-{
-}
-
 static inline void count_memcg_folio_events(struct folio *folio,
 		enum vm_event_item idx, unsigned long nr)
 {
-- 
cgit v1.2.3


From 35f5d94187a6a3a8df2cba54beccca1c2379edb8 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 19 Oct 2023 19:49:20 +0000
Subject: mm/damon: implement a function for max nr_accesses safe calculation

Patch series "avoid divide-by-zero due to max_nr_accesses overflow".

The maximum nr_accesses of given DAMON context can be calculated by
dividing the aggregation interval by the sampling interval.  Some logics
in DAMON uses the maximum nr_accesses as a divisor.  Hence, the value
shouldn't be zero.  Such case is avoided since DAMON avoids setting the
agregation interval as samller than the sampling interval.  However, since
nr_accesses is unsigned int while the intervals are unsigned long, the
maximum nr_accesses could be zero while casting.

Avoid the divide-by-zero by implementing a function that handles the
corner case (first patch), and replaces the vulnerable direct max
nr_accesses calculations (remaining patches).

Note that the patches for the replacements are divided for broken commits,
to make backporting on required tres easier.  Especially, the last patch
is for a patch that not yet merged into the mainline but in mm tree.


This patch (of 4):

The maximum nr_accesses of given DAMON context can be calculated by
dividing the aggregation interval by the sampling interval.  Some logics
in DAMON uses the maximum nr_accesses as a divisor.  Hence, the value
shouldn't be zero.  Such case is avoided since DAMON avoids setting the
agregation interval as samller than the sampling interval.  However, since
nr_accesses is unsigned int while the intervals are unsigned long, the
maximum nr_accesses could be zero while casting.  Implement a function
that handles the corner case.

Note that this commit is not fixing the real issue since this is only
introducing the safe function that will replaces the problematic
divisions.  The replacements will be made by followup commits, to make
backporting on stable series easier.

Link: https://lkml.kernel.org/r/20231019194924.100347-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20231019194924.100347-2-sj@kernel.org
Fixes: 198f0f4c58b9 ("mm/damon/vaddr,paddr: support pageout prioritization")
Signed-off-by: SeongJae Park <sj@kernel.org>
Reported-by: Jakub Acs <acsjakub@amazon.de>
Cc: <stable@vger.kernel.org>	[5.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 27b995c22497..ab2f17d9926b 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -681,6 +681,13 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx)
 	return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR;
 }
 
+static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs)
+{
+	/* {aggr,sample}_interval are unsigned long, hence could overflow */
+	return min(attrs->aggr_interval / attrs->sample_interval,
+			(unsigned long)UINT_MAX);
+}
+
 
 int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
 int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
-- 
cgit v1.2.3


From 10969b5571387047cd461a9c701b8b9cd007f6c7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2023 02:15:09 -0700
Subject: hugetlbfs: drop shared NUMA mempolicy pretence

Patch series "mempolicy: cleanups leading to NUMA mpol without vma", v2.

Mostly cleanups in mm/mempolicy.c, but finally removing the pseudo-vma
from shmem folio allocation, and removing the mmap_lock around folio
migration for mbind and migrate_pages syscalls.


This patch (of 12):

hugetlbfs_fallocate() goes through the motions of pasting a shared NUMA
mempolicy onto its pseudo-vma, but how could there ever be a shared NUMA
mempolicy for this file?  hugetlb_vm_ops has never offered a set_policy
method, and hugetlbfs_parse_param() has never supported any mpol options
for a mount-wide default policy.

It's just an illusion: clean it away so as not to confuse others, giving
us more freedom to adjust shmem's set_policy/get_policy implementation.
But hugetlbfs_inode_info is still required, just to accommodate seals.

Yes, shared NUMA mempolicy support could be added to hugetlbfs, with a
set_policy method and/or mpol mount option (Andi's first posting did
include an admitted-unsatisfactory hugetlb_set_policy()); but it seems
that nobody has bothered to add that in the nineteen years since v2.6.7
made it possible, and there is at least one company that has invested
enough into hugetlbfs, that I guess they have learnt well enough how to
manage its NUMA, without needing shared mempolicy.

Remove linux/mempolicy.h from linux/hugetlb.h: include linux/pagemap.h in
its place, because hugetlb.h's recently added use of filemap_lock_folio()
requires that (although most .configs and .c's get it in some other way).

Link: https://lkml.kernel.org/r/ebc0987e-beff-8bfb-9283-234c2cbd17c5@google.com
Link: https://lkml.kernel.org/r/cae82d4b-904a-faaf-282a-34fcc188c81f@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 205469aa0613..158ff156710b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -30,7 +30,7 @@ void free_huge_folio(struct folio *folio);
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-#include <linux/mempolicy.h>
+#include <linux/pagemap.h>
 #include <linux/shm.h>
 #include <asm/tlbflush.h>
 
@@ -545,7 +545,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 }
 
 struct hugetlbfs_inode_info {
-	struct shared_policy policy;
 	struct inode vfs_inode;
 	unsigned int seals;
 };
-- 
cgit v1.2.3


From c36f6e6dff4d32ec8b6da8f553933727a57a7a4a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2023 02:20:14 -0700
Subject: mempolicy trivia: slightly more consistent naming

Before getting down to work, do a little cleanup, mainly of inconsistent
variable naming.  I gave up trying to rationalize mpol versus pol versus
policy, and node versus nid, but let's avoid p and nd.  Remove a few
superfluous blank lines, but add one; and here prefer vma->vm_policy to
vma_policy(vma) - the latter being appropriate in other sources, which
have to allow for !CONFIG_NUMA.  That intriguing line about KERNEL_DS?
should have gone in v2.6.15, when numa_policy_init() stopped using
set_mempolicy(2)'s system call handler.

Link: https://lkml.kernel.org/r/68287974-b6ae-7df-4ba-d19ddd69cbf@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 1d7f4ec2614c..a807976fe95d 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -124,10 +124,9 @@ struct shared_policy {
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
-int mpol_set_shared_policy(struct shared_policy *info,
-				struct vm_area_struct *vma,
-				struct mempolicy *new);
-void mpol_free_shared_policy(struct shared_policy *p);
+int mpol_set_shared_policy(struct shared_policy *sp,
+			   struct vm_area_struct *vma, struct mempolicy *mpol);
+void mpol_free_shared_policy(struct shared_policy *sp);
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 					    unsigned long idx);
 
@@ -191,7 +190,7 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	return true;
 }
 
-static inline void mpol_put(struct mempolicy *p)
+static inline void mpol_put(struct mempolicy *pol)
 {
 }
 
@@ -210,7 +209,7 @@ static inline void mpol_shared_policy_init(struct shared_policy *sp,
 {
 }
 
-static inline void mpol_free_shared_policy(struct shared_policy *p)
+static inline void mpol_free_shared_policy(struct shared_policy *sp)
 {
 }
 
-- 
cgit v1.2.3


From 93397c3b7684555b7cec726cd13eef6742d191fe Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2023 02:21:34 -0700
Subject: mempolicy trivia: use pgoff_t in shared mempolicy tree

Prefer the more explicit "pgoff_t" to "unsigned long" when dealing with a
shared mempolicy tree.  Delete confusing comment about pseudo mm vmas.

Link: https://lkml.kernel.org/r/5451157-3818-4af5-fd2c-5d26a5d1dc53@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index a807976fe95d..acdb12fcb6cd 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -105,22 +105,16 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 
 /*
  * Tree of shared policies for a shared memory region.
- * Maintain the policies in a pseudo mm that contains vmas. The vmas
- * carry the policy. As a special twist the pseudo mm is indexed in pages, not
- * bytes, so that we can work with shared memory segments bigger than
- * unsigned long.
  */
-
-struct sp_node {
-	struct rb_node nd;
-	unsigned long start, end;
-	struct mempolicy *policy;
-};
-
 struct shared_policy {
 	struct rb_root root;
 	rwlock_t lock;
 };
+struct sp_node {
+	struct rb_node nd;
+	pgoff_t start, end;
+	struct mempolicy *policy;
+};
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
@@ -128,7 +122,7 @@ int mpol_set_shared_policy(struct shared_policy *sp,
 			   struct vm_area_struct *vma, struct mempolicy *mpol);
 void mpol_free_shared_policy(struct shared_policy *sp);
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
-					    unsigned long idx);
+					    pgoff_t idx);
 
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
@@ -214,7 +208,7 @@ static inline void mpol_free_shared_policy(struct shared_policy *sp)
 }
 
 static inline struct mempolicy *
-mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From ddc1a5cbc05dc62743a2f409b96faa5cf95ba064 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 Oct 2023 13:39:08 -0700
Subject: mempolicy: alloc_pages_mpol() for NUMA policy without vma

Shrink shmem's stack usage by eliminating the pseudo-vma from its folio
allocation.  alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the
principal actor for passing mempolicy choice down to __alloc_pages(),
rather than vma_alloc_folio(gfp, order, vma, addr, hugepage).

vma_alloc_folio() and alloc_pages() remain, but as wrappers around
alloc_pages_mpol().  alloc_pages_bulk_*() untouched, except to provide the
additional args to policy_nodemask(), which subsumes policy_node().
Cleanup throughout, cutting out some unhelpful "helpers".

It would all be much simpler without MPOL_INTERLEAVE, but that adds a
dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd
("tmpfs: distribute interleave better across nodes"), which added ino bias
to the interleave, hidden from mm/mempolicy.c until this commit.

Hence "ilx" throughout, the "interleave index".  Originally I thought it
could be done just with nid, but that's wrong: the nodemask may come from
the shared policy layer below a shmem vma, or it may come from the task
layer above a shmem vma; and without the final nodemask then nodeid cannot
be decided.  And how ilx is applied depends also on page order.

The interleave index is almost always irrelevant unless MPOL_INTERLEAVE:
with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX
passed down from vma-less alloc_pages() is also used as hint not to use
THP-style hugepage allocation - to avoid the overhead of a hugepage arg
(though I don't understand why we never just added a GFP bit for THP - if
it actually needs a different allocation strategy from other pages of the
same order).  vma_alloc_folio() still carries its hugepage arg here, but
it is not used, and should be removed when agreed.

get_vma_policy() no longer allows a NULL vma: over time I believe we've
eradicated all the places which used to need it e.g.  swapoff and madvise
used to pass NULL vma to read_swap_cache_async(), but now know the vma.

[hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()]
  Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com
Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com
Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h       | 10 +++++++++-
 include/linux/mempolicy.h | 20 +++++++++++++++++---
 include/linux/mm.h        |  2 +-
 3 files changed, 27 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 5b917e5b9350..de292a007138 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -8,6 +8,7 @@
 #include <linux/topology.h>
 
 struct vm_area_struct;
+struct mempolicy;
 
 /* Convert GFP flags to their corresponding migrate type */
 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
@@ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 
 #ifdef CONFIG_NUMA
 struct page *alloc_pages(gfp_t gfp, unsigned int order);
-struct folio *folio_alloc(gfp_t gfp, unsigned order);
+struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+		struct mempolicy *mpol, pgoff_t ilx, int nid);
+struct folio *folio_alloc(gfp_t gfp, unsigned int order);
 struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
 		unsigned long addr, bool hugepage);
 #else
@@ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
 	return alloc_pages_node(numa_node_id(), gfp_mask, order);
 }
+static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+		struct mempolicy *mpol, pgoff_t ilx, int nid)
+{
+	return alloc_pages(gfp, order);
+}
 static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
 {
 	return __folio_alloc_node(gfp, order, numa_node_id());
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index acdb12fcb6cd..931b118336f4 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -17,6 +17,8 @@
 
 struct mm_struct;
 
+#define NO_INTERLEAVE_INDEX (-1UL)	/* use task il_prev for interleaving */
+
 #ifdef CONFIG_NUMA
 
 /*
@@ -126,7 +128,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
-		unsigned long addr);
+		unsigned long addr, pgoff_t *ilx);
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+		unsigned long addr, int order, pgoff_t *ilx);
 bool vma_policy_mof(struct vm_area_struct *vma);
 
 extern void numa_default_policy(void);
@@ -140,8 +144,6 @@ extern int huge_node(struct vm_area_struct *vma,
 extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
 extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
 				const nodemask_t *mask);
-extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
-
 extern unsigned int mempolicy_slab_node(void);
 
 extern enum zone_type policy_zone;
@@ -179,6 +181,11 @@ extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
 
 struct mempolicy {};
 
+static inline struct mempolicy *get_task_policy(struct task_struct *p)
+{
+	return NULL;
+}
+
 static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
 	return true;
@@ -213,6 +220,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
 	return NULL;
 }
 
+static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+				unsigned long addr, int order, pgoff_t *ilx)
+{
+	*ilx = 0;
+	return NULL;
+}
+
 static inline int
 vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0e496a8b8e50..14d5aaff96d0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -619,7 +619,7 @@ struct vm_operations_struct {
 	 * policy.
 	 */
 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
-					unsigned long addr);
+					unsigned long addr, pgoff_t *ilx);
 #endif
 	/*
 	 * Called by vm_normal_page() for special PTEs to find the
-- 
cgit v1.2.3


From 72e315f7a750281b4410ac30d8930f735459e72d Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2023 02:27:47 -0700
Subject: mempolicy: mmap_lock is not needed while migrating folios

mbind(2) holds down_write of current task's mmap_lock throughout
(exclusive because it needs to set the new mempolicy on the vmas);
migrate_pages(2) holds down_read of pid's mmap_lock throughout.

They both hold mmap_lock across the internal migrate_pages(), under which
all new page allocations (huge or small) are made.  I'm nervous about it;
and migrate_pages() certainly does not need mmap_lock itself.  It's done
this way for mbind(2), because its page allocator is vma_alloc_folio() or
alloc_hugetlb_folio_vma(), both of which depend on vma and address.

Now that we have alloc_pages_mpol(), depending on (refcounted) memory
policy and interleave index, mbind(2) can be modified to use that or
alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the
internal migrate_pages() at all: add alloc_migration_target_by_mpol() to
replace mbind's new_page().

(After that change, alloc_hugetlb_folio_vma() is used by nothing but a
userfaultfd function: move it out of hugetlb.h and into the #ifdef.)

migrate_pages(2) has chosen its target node before migrating, so can
continue to use the standard alloc_migration_target(); but let it take and
drop mmap_lock just around migrate_to_node()'s queue_pages_range():
neither the node-to-node calculations nor the page migrations need it.

It seems unlikely, but it is conceivable that some userspace depends on
the kernel's mmap_lock exclusion here, instead of doing its own locking:
more likely in a testsuite than in real life.  It is also possible, of
course, that some pages on the list will be munmapped by another thread
before they are migrated, or a newer memory policy applied to the range by
that time: but such races could happen before, as soon as mmap_lock was
dropped, so it does not appear to be a concern.

Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 158ff156710b..d3acecc5db4b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -748,8 +748,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 				nodemask_t *nmask, gfp_t gfp_mask);
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
-				unsigned long address);
 int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
 			pgoff_t idx);
 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
@@ -1072,13 +1070,6 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 	return NULL;
 }
 
-static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
-					       struct vm_area_struct *vma,
-					       unsigned long address)
-{
-	return NULL;
-}
-
 static inline int __alloc_bootmem_huge_page(struct hstate *h)
 {
 	return 0;
-- 
cgit v1.2.3


From b56ebe7c896dc78b5865ec2c4b1dae3c93537517 Mon Sep 17 00:00:00 2001
From: Koichiro Den <den@valinux.co.jp>
Date: Thu, 26 Oct 2023 12:20:36 +0900
Subject: x86/apic/msi: Fix misconfigured non-maskable MSI quirk

commit ef8dd01538ea ("genirq/msi: Make interrupt allocation less
convoluted"), reworked the code so that the x86 specific quirk for affinity
setting of non-maskable PCI/MSI interrupts is not longer activated if
necessary.

This could be solved by restoring the original logic in the core MSI code,
but after a deeper analysis it turned out that the quirk flag is not
required at all.

The quirk is only required when the PCI/MSI device cannot mask the MSI
interrupts, which in turn also prevents reservation mode from being enabled
for the affected interrupt.

This allows ot remove the NOMASK quirk bit completely as msi_set_affinity()
can instead check whether reservation mode is enabled for the interrupt,
which gives exactly the same answer.

Even in the momentary non-existing case that the reservation mode would be
not set for a maskable MSI interrupt this would not cause any harm as it
just would cause msi_set_affinity() to go needlessly through the
functionaly equivalent slow path, which works perfectly fine with maskable
interrupts as well.

Rework msi_set_affinity() to query the reservation mode and remove all
NOMASK quirk logic from the core code.

[ tglx: Massaged changelog ]

Fixes: ef8dd01538ea ("genirq/msi: Make interrupt allocation less convoluted")
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Koichiro Den <den@valinux.co.jp>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20231026032036.2462428-1-den@valinux.co.jp
---
 include/linux/irq.h | 26 ++++----------------------
 include/linux/msi.h |  6 ------
 2 files changed, 4 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index d8a6fdce9373..90081afa10ce 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -215,8 +215,6 @@ struct irq_data {
  * IRQD_SINGLE_TARGET		- IRQ allows only a single affinity target
  * IRQD_DEFAULT_TRIGGER_SET	- Expected trigger already been set
  * IRQD_CAN_RESERVE		- Can use reservation mode
- * IRQD_MSI_NOMASK_QUIRK	- Non-maskable MSI quirk for affinity change
- *				  required
  * IRQD_HANDLE_ENFORCE_IRQCTX	- Enforce that handle_irq_*() is only invoked
  *				  from actual interrupt context.
  * IRQD_AFFINITY_ON_ACTIVATE	- Affinity is set on activation. Don't call
@@ -247,11 +245,10 @@ enum {
 	IRQD_SINGLE_TARGET		= BIT(24),
 	IRQD_DEFAULT_TRIGGER_SET	= BIT(25),
 	IRQD_CAN_RESERVE		= BIT(26),
-	IRQD_MSI_NOMASK_QUIRK		= BIT(27),
-	IRQD_HANDLE_ENFORCE_IRQCTX	= BIT(28),
-	IRQD_AFFINITY_ON_ACTIVATE	= BIT(29),
-	IRQD_IRQ_ENABLED_ON_SUSPEND	= BIT(30),
-	IRQD_RESEND_WHEN_IN_PROGRESS    = BIT(31),
+	IRQD_HANDLE_ENFORCE_IRQCTX	= BIT(27),
+	IRQD_AFFINITY_ON_ACTIVATE	= BIT(28),
+	IRQD_IRQ_ENABLED_ON_SUSPEND	= BIT(29),
+	IRQD_RESEND_WHEN_IN_PROGRESS    = BIT(30),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -426,21 +423,6 @@ static inline bool irqd_can_reserve(struct irq_data *d)
 	return __irqd_to_state(d) & IRQD_CAN_RESERVE;
 }
 
-static inline void irqd_set_msi_nomask_quirk(struct irq_data *d)
-{
-	__irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK;
-}
-
-static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d)
-{
-	__irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK;
-}
-
-static inline bool irqd_msi_nomask_quirk(struct irq_data *d)
-{
-	return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK;
-}
-
 static inline void irqd_set_affinity_on_activate(struct irq_data *d)
 {
 	__irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE;
diff --git a/include/linux/msi.h b/include/linux/msi.h
index a50ea79522f8..ddace8c34dcf 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -547,12 +547,6 @@ enum {
 	MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS	= (1 << 5),
 	/* Free MSI descriptors */
 	MSI_FLAG_FREE_MSI_DESCS		= (1 << 6),
-	/*
-	 * Quirk to handle MSI implementations which do not provide
-	 * masking. Currently known to affect x86, but has to be partially
-	 * handled in the core MSI code.
-	 */
-	MSI_FLAG_NOMASK_QUIRK		= (1 << 7),
 
 	/* Mask for the generic functionality */
 	MSI_GENERIC_FLAGS_MASK		= GENMASK(15, 0),
-- 
cgit v1.2.3


From 54d606816b32401de5431f6776a78b1de135bfa2 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Wed, 25 Oct 2023 21:39:29 -0700
Subject: iommu: Add IOMMU_DOMAIN_NESTED

Introduce a new domain type for a user I/O page table, which is nested on
top of another user space address represented by a PAGING domain. This
new domain can be allocated by the domain_alloc_user op, and attached to
a device through the existing iommu_attach_device/group() interfaces.

The mappings of a nested domain are managed by user space software, so it
is not necessary to have map/unmap callbacks.

Link: https://lore.kernel.org/r/20231026043938.63898-2-yi.l.liu@intel.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 1d42bdb37cbc..bc303cb2af37 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -67,6 +67,9 @@ struct iommu_domain_geometry {
 
 #define __IOMMU_DOMAIN_SVA	(1U << 4)  /* Shared process address space */
 
+#define __IOMMU_DOMAIN_NESTED	(1U << 6)  /* User-managed address space nested
+					      on a stage-2 translation        */
+
 #define IOMMU_DOMAIN_ALLOC_FLAGS ~__IOMMU_DOMAIN_DMA_FQ
 /*
  * This are the possible domain-types
@@ -93,6 +96,7 @@ struct iommu_domain_geometry {
 				 __IOMMU_DOMAIN_DMA_API |	\
 				 __IOMMU_DOMAIN_DMA_FQ)
 #define IOMMU_DOMAIN_SVA	(__IOMMU_DOMAIN_SVA)
+#define IOMMU_DOMAIN_NESTED	(__IOMMU_DOMAIN_NESTED)
 
 struct iommu_domain {
 	unsigned type;
-- 
cgit v1.2.3


From 2bdabb8e82f564d19eeeb7c83e6b2467af0707cb Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu@intel.com>
Date: Wed, 25 Oct 2023 21:39:34 -0700
Subject: iommu: Pass in parent domain with user_data to domain_alloc_user op

domain_alloc_user op already accepts user flags for domain allocation, add
a parent domain pointer and a driver specific user data support as well.
The user data would be tagged with a type for iommu drivers to add their
own driver specific user data per hw_pagetable.

Add a struct iommu_user_data as a bundle of data_ptr/data_len/type from an
iommufd core uAPI structure. Make the user data opaque to the core, since
a userspace driver must match the kernel driver. In the future, if drivers
share some common parameter, there would be a generic parameter as well.

Link: https://lore.kernel.org/r/20231026043938.63898-7-yi.l.liu@intel.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Co-developed-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bc303cb2af37..2ddd99f55471 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -265,6 +265,21 @@ struct iommu_dirty_ops {
 				    struct iommu_dirty_bitmap *dirty);
 };
 
+/**
+ * struct iommu_user_data - iommu driver specific user space data info
+ * @type: The data type of the user buffer
+ * @uptr: Pointer to the user buffer for copy_from_user()
+ * @len: The length of the user buffer in bytes
+ *
+ * A user space data is an uAPI that is defined in include/uapi/linux/iommufd.h
+ * @type, @uptr and @len should be just copied from an iommufd core uAPI struct.
+ */
+struct iommu_user_data {
+	unsigned int type;
+	void __user *uptr;
+	size_t len;
+};
+
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
@@ -279,8 +294,12 @@ struct iommu_dirty_ops {
  *                     parameters as defined in include/uapi/linux/iommufd.h.
  *                     Unlike @domain_alloc, it is called only by IOMMUFD and
  *                     must fully initialize the new domain before return.
- *                     Upon success, a domain is returned. Upon failure,
- *                     ERR_PTR must be returned.
+ *                     Upon success, if the @user_data is valid and the @parent
+ *                     points to a kernel-managed domain, the new domain must be
+ *                     IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be
+ *                     NULL while the @user_data can be optionally provided, the
+ *                     new domain must support __IOMMU_DOMAIN_PAGING.
+ *                     Upon failure, ERR_PTR must be returned.
  * @probe_device: Add device to iommu driver handling
  * @release_device: Remove device from iommu driver handling
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
@@ -313,7 +332,9 @@ struct iommu_ops {
 
 	/* Domain allocation and freeing by the iommu driver */
 	struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type);
-	struct iommu_domain *(*domain_alloc_user)(struct device *dev, u32 flags);
+	struct iommu_domain *(*domain_alloc_user)(
+		struct device *dev, u32 flags, struct iommu_domain *parent,
+		const struct iommu_user_data *user_data);
 
 	struct iommu_device *(*probe_device)(struct device *dev);
 	void (*release_device)(struct device *dev);
-- 
cgit v1.2.3


From e9d36c07bb787840e4813fb09a929a17d522a69f Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 25 Oct 2023 21:39:36 -0700
Subject: iommu: Add iommu_copy_struct_from_user helper

Wrap up the data type/pointer/len sanity and a copy_struct_from_user call
for iommu drivers to copy driver specific data via struct iommu_user_data.
And expect it to be used in the domain_alloc_user op for example.

Link: https://lore.kernel.org/r/20231026043938.63898-9-yi.l.liu@intel.com
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Co-developed-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 2ddd99f55471..8fb1b41b4d15 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -280,6 +280,46 @@ struct iommu_user_data {
 	size_t len;
 };
 
+/**
+ * __iommu_copy_struct_from_user - Copy iommu driver specific user space data
+ * @dst_data: Pointer to an iommu driver specific user data that is defined in
+ *            include/uapi/linux/iommufd.h
+ * @src_data: Pointer to a struct iommu_user_data for user space data info
+ * @data_type: The data type of the @dst_data. Must match with @src_data.type
+ * @data_len: Length of current user data structure, i.e. sizeof(struct _dst)
+ * @min_len: Initial length of user data structure for backward compatibility.
+ *           This should be offsetofend using the last member in the user data
+ *           struct that was initially added to include/uapi/linux/iommufd.h
+ */
+static inline int __iommu_copy_struct_from_user(
+	void *dst_data, const struct iommu_user_data *src_data,
+	unsigned int data_type, size_t data_len, size_t min_len)
+{
+	if (src_data->type != data_type)
+		return -EINVAL;
+	if (WARN_ON(!dst_data || !src_data))
+		return -EINVAL;
+	if (src_data->len < min_len || data_len < src_data->len)
+		return -EINVAL;
+	return copy_struct_from_user(dst_data, data_len, src_data->uptr,
+				     src_data->len);
+}
+
+/**
+ * iommu_copy_struct_from_user - Copy iommu driver specific user space data
+ * @kdst: Pointer to an iommu driver specific user data that is defined in
+ *        include/uapi/linux/iommufd.h
+ * @user_data: Pointer to a struct iommu_user_data for user space data info
+ * @data_type: The data type of the @kdst. Must match with @user_data->type
+ * @min_last: The last memember of the data structure @kdst points in the
+ *            initial version.
+ * Return 0 for success, otherwise -error.
+ */
+#define iommu_copy_struct_from_user(kdst, user_data, data_type, min_last) \
+	__iommu_copy_struct_from_user(kdst, user_data, data_type,         \
+				      sizeof(*kdst),                      \
+				      offsetofend(typeof(*kdst), min_last))
+
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
-- 
cgit v1.2.3


From e5d8be7406ca7b6b251c09b786f08176337c0999 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 27 Sep 2023 20:47:31 -0300
Subject: iommu: Move IOMMU_DOMAIN_BLOCKED global statics to
 ops->blocked_domain

Following the pattern of identity domains, just assign the BLOCKED domain
global statics to a value in ops. Update the core code to use the global
static directly.

Update powerpc to use the new scheme and remove its empty domain_alloc
callback.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: Sven Peter <sven@svenpeter.dev>
Link: https://lore.kernel.org/r/1-v2-bff223cf6409+282-dart_paging_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c28178f3690a..73daa4fb168b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -265,6 +265,8 @@ struct iommu_iotlb_gather {
  * @owner: Driver module providing these ops
  * @identity_domain: An always available, always attachable identity
  *                   translation.
+ * @blocked_domain: An always available, always attachable blocking
+ *                  translation.
  * @default_domain: If not NULL this will always be set as the default domain.
  *                  This should be an IDENTITY/BLOCKED/PLATFORM domain.
  *                  Do not use in new drivers.
@@ -303,6 +305,7 @@ struct iommu_ops {
 	unsigned long pgsize_bitmap;
 	struct module *owner;
 	struct iommu_domain *identity_domain;
+	struct iommu_domain *blocked_domain;
 	struct iommu_domain *default_domain;
 };
 
-- 
cgit v1.2.3


From b827ac419721a106ae2fccaa40576b0594edad92 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 26 Oct 2023 16:37:58 -0400
Subject: exportfs: Change bcachefs fid_type enum to avoid conflicts

Per Amir Goldstein, the fid types that bcachefs picked conflicted with
xfs and fuse, which previously were in use but not deviced in the master
enum.

Since bcachefs is still out of tree, we can move.

https://lore.kernel.org/linux-next/20231026203733.fx65mjyic4pka3e5@moria.home.lan/T/#ma59f65ba61f605b593e69f4690dbd317526d83ba

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/exportfs.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index f49a7d31167e..f75e0914d40d 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -98,18 +98,18 @@ enum fid_type {
 	 */
 	FILEID_FAT_WITH_PARENT = 0x72,
 
-	/*
-	 * 64 bit inode number, 32 bit subvolume, 32 bit generation number:
-	 */
-	FILEID_BCACHEFS_WITHOUT_PARENT = 0x80,
-	FILEID_BCACHEFS_WITH_PARENT = 0x81,
-
 	/*
 	 * 128 bit child FID (struct lu_fid)
 	 * 128 bit parent FID (struct lu_fid)
 	 */
 	FILEID_LUSTRE = 0x97,
 
+	/*
+	 * 64 bit inode number, 32 bit subvolume, 32 bit generation number:
+	 */
+	FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1,
+	FILEID_BCACHEFS_WITH_PARENT = 0xb2,
+
 	/*
 	 * 64 bit unique kernfs id
 	 */
-- 
cgit v1.2.3


From c845f5f3590ef4669fe5464f8a42be6442cd174b Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:21:54 +0100
Subject: net/tcp: Add TCP-AO config and structures

Introduce new kernel config option and common structures as well as
helpers to be used by TCP-AO code.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 6df715b6e51d..64e7b560fa79 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -447,13 +447,18 @@ struct tcp_sock {
 	bool	syn_smc;	/* SYN includes SMC */
 #endif
 
-#ifdef CONFIG_TCP_MD5SIG
-/* TCP AF-Specific parts; only used by MD5 Signature support so far */
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+/* TCP AF-Specific parts; only used by TCP-AO/MD5 Signature support so far */
 	const struct tcp_sock_af_ops	*af_specific;
 
+#ifdef CONFIG_TCP_MD5SIG
 /* TCP MD5 Signature Option information */
 	struct tcp_md5sig_info	__rcu *md5sig_info;
 #endif
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info	__rcu *ao_info;
+#endif
+#endif
 
 /* TCP fastopen related information */
 	struct tcp_fastopen_request *fastopen_req;
-- 
cgit v1.2.3


From 4954f17ddefc51d218625dcdfaf422a253dad3fa Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:21:55 +0100
Subject: net/tcp: Introduce TCP_AO setsockopt()s

Add 3 setsockopt()s:
1. TCP_AO_ADD_KEY to add a new Master Key Tuple (MKT) on a socket
2. TCP_AO_DEL_KEY to delete present MKT from a socket
3. TCP_AO_INFO to change flags, Current_key/RNext_key on a TCP-AO sk

Userspace has to introduce keys on every socket it wants to use TCP-AO
option on, similarly to TCP_MD5SIG/TCP_MD5SIG_EXT.
RFC5925 prohibits definition of MKTs that would match the same peer,
so do sanity checks on the data provided by userspace. Be as
conservative as possible, including refusal of defining MKT on
an established connection with no AO, removing the key in-use and etc.

(1) and (2) are to be used by userspace key manager to add/remove keys.
(3) main purpose is to set RNext_key, which (as prescribed by RFC5925)
is the KeyID that will be requested in TCP-AO header from the peer to
sign their segments with.

At this moment the life of ao_info ends in tcp_v4_destroy_sock().

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sockptr.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h
index bae5e2369b4f..307961b41541 100644
--- a/include/linux/sockptr.h
+++ b/include/linux/sockptr.h
@@ -55,6 +55,29 @@ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size)
 	return copy_from_sockptr_offset(dst, src, 0, size);
 }
 
+static inline int copy_struct_from_sockptr(void *dst, size_t ksize,
+		sockptr_t src, size_t usize)
+{
+	size_t size = min(ksize, usize);
+	size_t rest = max(ksize, usize) - size;
+
+	if (!sockptr_is_kernel(src))
+		return copy_struct_from_user(dst, ksize, src.user, size);
+
+	if (usize < ksize) {
+		memset(dst + size, 0, rest);
+	} else if (usize > ksize) {
+		char *p = src.kernel;
+
+		while (rest--) {
+			if (*p++)
+				return -E2BIG;
+		}
+	}
+	memcpy(dst, src.kernel, size);
+	return 0;
+}
+
 static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset,
 		const void *src, size_t size)
 {
-- 
cgit v1.2.3


From decde2586b34b99684faff1eab41e5c496c27fb6 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:01 +0100
Subject: net/tcp: Add TCP-AO sign to twsk

Add support for sockets in time-wait state.
ao_info as well as all keys are inherited on transition to time-wait
socket. The lifetime of ao_info is now protected by ref counter, so
that tcp_ao_destroy_sock() will destruct it only when the last user is
gone.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 64e7b560fa79..eec6e7e5312e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -514,6 +514,9 @@ struct tcp_timewait_sock {
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	  *tw_md5_key;
 #endif
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info	__rcu *ao_info;
+#endif
 };
 
 static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
-- 
cgit v1.2.3


From 06b22ef29591f625ef877ae00d82192938e29e60 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:02 +0100
Subject: net/tcp: Wire TCP-AO to request sockets

Now when the new request socket is created from the listening socket,
it's recorded what MKT was used by the peer. tcp_rsk_used_ao() is
a new helper for checking if TCP-AO option was used to create the
request socket.
tcp_ao_copy_all_matching() will copy all keys that match the peer on the
request socket, as well as preparing them for the usage (creating
traffic keys).

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index eec6e7e5312e..ec4e9367f5b0 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -166,6 +166,11 @@ struct tcp_request_sock {
 						  * after data-in-SYN.
 						  */
 	u8				syn_tos;
+#ifdef CONFIG_TCP_AO
+	u8				ao_keyid;
+	u8				ao_rcv_next;
+	u8				maclen;
+#endif
 };
 
 static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
@@ -173,6 +178,19 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
 	return (struct tcp_request_sock *)req;
 }
 
+static inline bool tcp_rsk_used_ao(const struct request_sock *req)
+{
+	/* The real length of MAC is saved in the request socket,
+	 * signing anything with zero-length makes no sense, so here is
+	 * a little hack..
+	 */
+#ifndef CONFIG_TCP_AO
+	return false;
+#else
+	return tcp_rsk(req)->maclen != 0;
+#endif
+}
+
 #define TCP_RMEM_TO_WIN_SCALE 8
 
 struct tcp_sock {
-- 
cgit v1.2.3


From 62f47bf9e2c00eea457cad8fa43c24ed0282b37a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 25 Oct 2023 15:30:15 +0300
Subject: net: Add MDB get device operation

Add MDB net device operation that will be invoked by rtnetlink code in
response to received RTM_GETMDB messages. Subsequent patches will
implement the operation in the bridge and VXLAN drivers.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b8bf669212cc..a16c9cc063fe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1609,6 +1609,10 @@ struct net_device_ops {
 	int			(*ndo_mdb_dump)(struct net_device *dev,
 						struct sk_buff *skb,
 						struct netlink_callback *cb);
+	int			(*ndo_mdb_get)(struct net_device *dev,
+					       struct nlattr *tb[], u32 portid,
+					       u32 seq,
+					       struct netlink_ext_ack *extack);
 	int			(*ndo_bridge_setlink)(struct net_device *dev,
 						      struct nlmsghdr *nlh,
 						      u16 flags,
-- 
cgit v1.2.3


From e8eed5f7366f1f5decb694168bd06fb59ef6b12c Mon Sep 17 00:00:00 2001
From: Damian Muszynski <damian.muszynski@intel.com>
Date: Fri, 20 Oct 2023 15:49:25 +0200
Subject: units: Add BYTES_PER_*BIT

There is going to be a new user of the BYTES_PER_[K/M/G]BIT definition
besides possibly existing ones. Add them to the header.

Signed-off-by: Damian Muszynski <damian.muszynski@intel.com>
Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
Reviewed-by: Tero Kristo <tero.kristo@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/units.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/units.h b/include/linux/units.h
index 2793a41e73a2..ff1bd6b5f5b3 100644
--- a/include/linux/units.h
+++ b/include/linux/units.h
@@ -31,6 +31,10 @@
 #define MICROWATT_PER_MILLIWATT	1000UL
 #define MICROWATT_PER_WATT	1000000UL
 
+#define BYTES_PER_KBIT		(KILO / BITS_PER_BYTE)
+#define BYTES_PER_MBIT		(MEGA / BITS_PER_BYTE)
+#define BYTES_PER_GBIT		(GIGA / BITS_PER_BYTE)
+
 #define ABSOLUTE_ZERO_MILLICELSIUS -273150
 
 static inline long milli_kelvin_to_millicelsius(long t)
-- 
cgit v1.2.3


From c626910f3f1bbce6ad18bc613d895d2a089ed95e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 22 Oct 2023 01:10:44 -0700
Subject: crypto: ahash - remove support for nonzero alignmask

Currently, the ahash API checks the alignment of all key and result
buffers against the algorithm's declared alignmask, and for any
unaligned buffers it falls back to manually aligned temporary buffers.

This is virtually useless, however.  First, since it does not apply to
the message, its effect is much more limited than e.g. is the case for
the alignmask for "skcipher".  Second, the key and result buffers are
given as virtual addresses and cannot (in general) be DMA'ed into, so
drivers end up having to copy to/from them in software anyway.  As a
result it's easy to use memcpy() or the unaligned access helpers.

The crypto_hash_walk_*() helper functions do use the alignmask to align
the message.  But with one exception those are only used for shash
algorithms being exposed via the ahash API, not for native ahashes, and
aligning the message is not required in this case, especially now that
alignmask support has been removed from shash.  The exception is the
n2_core driver, which doesn't set an alignmask.

In any case, no ahash algorithms actually set a nonzero alignmask
anymore.  Therefore, remove support for it from ahash.  The benefit is
that all the code to handle "misaligned" buffers in the ahash API goes
away, reducing the overhead of the ahash API.

This follows the same change that was made to shash.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/crypto.h | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index f3c3a3b27fac..b164da5e129e 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -110,7 +110,6 @@
  *	  crypto_aead_walksize() (with the remainder going at the end), no chunk
  *	  can cross a page boundary or a scatterlist element boundary.
  *    ahash:
- *	- The result buffer must be aligned to the algorithm's alignmask.
  *	- crypto_ahash_finup() must not be used unless the algorithm implements
  *	  ->finup() natively.
  */
@@ -278,18 +277,20 @@ struct compress_alg {
  * @cra_ctxsize: Size of the operational context of the transformation. This
  *		 value informs the kernel crypto API about the memory size
  *		 needed to be allocated for the transformation context.
- * @cra_alignmask: Alignment mask for the input and output data buffer. The data
- *		   buffer containing the input data for the algorithm must be
- *		   aligned to this alignment mask. The data buffer for the
- *		   output data must be aligned to this alignment mask. Note that
- *		   the Crypto API will do the re-alignment in software, but
- *		   only under special conditions and there is a performance hit.
- *		   The re-alignment happens at these occasions for different
- *		   @cra_u types: cipher -- For both input data and output data
- *		   buffer; ahash -- For output hash destination buf; shash --
- *		   For output hash destination buf.
- *		   This is needed on hardware which is flawed by design and
- *		   cannot pick data from arbitrary addresses.
+ * @cra_alignmask: For cipher, skcipher, lskcipher, and aead algorithms this is
+ *		   1 less than the alignment, in bytes, that the algorithm
+ *		   implementation requires for input and output buffers.  When
+ *		   the crypto API is invoked with buffers that are not aligned
+ *		   to this alignment, the crypto API automatically utilizes
+ *		   appropriately aligned temporary buffers to comply with what
+ *		   the algorithm needs.  (For scatterlists this happens only if
+ *		   the algorithm uses the skcipher_walk helper functions.)  This
+ *		   misalignment handling carries a performance penalty, so it is
+ *		   preferred that algorithms do not set a nonzero alignmask.
+ *		   Also, crypto API users may wish to allocate buffers aligned
+ *		   to the alignmask of the algorithm being used, in order to
+ *		   avoid the API having to realign them.  Note: the alignmask is
+ *		   not supported for hash algorithms and is always 0 for them.
  * @cra_priority: Priority of this transformation implementation. In case
  *		  multiple transformations with same @cra_name are available to
  *		  the Crypto API, the kernel will use the one with highest
-- 
cgit v1.2.3


From 2ee7c1bcf3d1c91ede9d914c52fa2f56c449b75a Mon Sep 17 00:00:00 2001
From: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Date: Sun, 22 Oct 2023 19:22:03 +0100
Subject: x509: Add OIDs for FIPS 202 SHA-3 hash and signatures

Add OID for FIPS 202 SHA-3 family of hash functions, RSA & ECDSA
signatures using those. Limit to 256 or larger sizes, for
interoperability reasons. 224 is too weak for any practical uses.

Signed-off-by: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/oid_registry.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 8b79e55cfcec..3921fbed0b28 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -129,6 +129,17 @@ enum OID {
 	OID_TPMImportableKey,		/* 2.23.133.10.1.4 */
 	OID_TPMSealedData,		/* 2.23.133.10.1.5 */
 
+	/* CSOR FIPS-202 SHA-3 */
+	OID_sha3_256,                           /* 2.16.840.1.101.3.4.2.8 */
+	OID_sha3_384,                           /* 2.16.840.1.101.3.4.2.9 */
+	OID_sha3_512,                           /* 2.16.840.1.101.3.4.2.10 */
+	OID_id_ecdsa_with_sha3_256,             /* 2.16.840.1.101.3.4.3.10 */
+	OID_id_ecdsa_with_sha3_384,             /* 2.16.840.1.101.3.4.3.11 */
+	OID_id_ecdsa_with_sha3_512,             /* 2.16.840.1.101.3.4.3.12 */
+	OID_id_rsassa_pkcs1_v1_5_with_sha3_256, /* 2.16.840.1.101.3.4.3.14 */
+	OID_id_rsassa_pkcs1_v1_5_with_sha3_384, /* 2.16.840.1.101.3.4.3.15 */
+	OID_id_rsassa_pkcs1_v1_5_with_sha3_512, /* 2.16.840.1.101.3.4.3.16 */
+
 	OID__NR
 };
 
-- 
cgit v1.2.3


From f4cf4e5db331a5ce69e3f0b21d322cac0f4e4b5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Mon, 23 Oct 2023 12:27:59 +0200
Subject: Revert "nvmem: add new config option"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 517f14d9cf3533d5ab4fded195ab6f80a92e378f.

Config option "no_of_node" is no longer needed since adding a more
explicit and targeted option "add_legacy_fixed_of_cells".

That "no_of_node" config option was needed *earlier* to help mtd's case.

DT nodes of MTD partitions (that are also NVMEM devices) may contain
subnodes. Those SHOULD NOT be treated as NVMEM fixed cells.

To prevent NVMEM core code from parsing subnodes a "no_of_node" option
was added (and set to true in mtd) to make for_each_child_of_node() in
NVMEM a no-op. That was a bit hacky because it was messing with
"of_node" pointer to achieve some side-effect.

With the introduction of "add_legacy_fixed_of_cells" config option
things got more explicit. MTD subsystem simply tells NVMEM when to look
for fixed cells and there is no need to hack "of_node" pointer anymore.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231023102759.31529-1-zajec5@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/nvmem-provider.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 1b81adebdb8b..e3930835235b 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -89,7 +89,6 @@ struct nvmem_cell_info {
  * @read_only:	Device is read-only.
  * @root_only:	Device is accessibly to root only.
  * @of_node:	If given, this will be used instead of the parent's of_node.
- * @no_of_node:	Device should not use the parent's of_node even if it's !NULL.
  * @reg_read:	Callback to read data.
  * @reg_write:	Callback to write data.
  * @size:	Device size.
@@ -122,7 +121,6 @@ struct nvmem_config {
 	bool			ignore_wp;
 	struct nvmem_layout	*layout;
 	struct device_node	*of_node;
-	bool			no_of_node;
 	nvmem_reg_read_t	reg_read;
 	nvmem_reg_write_t	reg_write;
 	int	size;
-- 
cgit v1.2.3


From 54b406e10f0334ee0245d5129c8def444a49cd9b Mon Sep 17 00:00:00 2001
From: Abhijit Gangurde <abhijit.gangurde@amd.com>
Date: Tue, 17 Oct 2023 21:34:59 +0530
Subject: cdx: Remove cdx controller list from cdx bus system

Remove xarray list of cdx controller. Instead, use platform bus
to locate the cdx controller using compat string used by cdx
controller platform driver.
Also, use ida to allocate a unique id for the controller.

Signed-off-by: Abhijit Gangurde <abhijit.gangurde@amd.com>
Link: https://lore.kernel.org/r/20231017160505.10640-2-abhijit.gangurde@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/cdx/cdx_bus.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index bead71b7bc73..82c27b8c94e1 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -63,12 +63,14 @@ struct cdx_ops {
  * @dev: Linux device associated with the CDX controller.
  * @priv: private data
  * @id: Controller ID
+ * @controller_registered: controller registered with bus
  * @ops: CDX controller ops
  */
 struct cdx_controller {
 	struct device *dev;
 	void *priv;
 	u32 id;
+	bool controller_registered;
 	struct cdx_ops *ops;
 };
 
-- 
cgit v1.2.3


From ce558a391d80b9f47a462107977e8e81fe2f2962 Mon Sep 17 00:00:00 2001
From: Abhijit Gangurde <abhijit.gangurde@amd.com>
Date: Tue, 17 Oct 2023 21:35:02 +0530
Subject: cdx: Register cdx bus as a device on cdx subsystem

While scanning for CDX devices, register newly discovered bus as a
cdx device. CDX device attributes are visible based on device type.

Signed-off-by: Abhijit Gangurde <abhijit.gangurde@amd.com>
Link: https://lore.kernel.org/r/20231017160505.10640-5-abhijit.gangurde@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/cdx/cdx_bus.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index 82c27b8c94e1..b5e4b7e05666 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -88,6 +88,7 @@ struct cdx_controller {
  * @dma_mask: Default DMA mask
  * @flags: CDX device flags
  * @req_id: Requestor ID associated with CDX device
+ * @is_bus: Is this bus device
  * @driver_override: driver name to force a match; do not set directly,
  *                   because core frees it; use driver_set_override() to
  *                   set or clear it.
@@ -104,6 +105,7 @@ struct cdx_device {
 	u64 dma_mask;
 	u16 flags;
 	u32 req_id;
+	bool is_bus;
 	const char *driver_override;
 };
 
-- 
cgit v1.2.3


From e3cfd49cb9491ac1cc233a4b9e098688b4ab281d Mon Sep 17 00:00:00 2001
From: Abhijit Gangurde <abhijit.gangurde@amd.com>
Date: Tue, 17 Oct 2023 21:35:03 +0530
Subject: cdx: add support for bus enable and disable

CDX bus needs to be disabled before updating/writing devices
in the FPGA. Once the devices are written, the bus shall be
rescanned. This change provides sysfs entry to enable/disable the
CDX bus.

Co-developed-by: Nipun Gupta <nipun.gupta@amd.com>
Signed-off-by: Nipun Gupta <nipun.gupta@amd.com>
Signed-off-by: Abhijit Gangurde <abhijit.gangurde@amd.com>
Link: https://lore.kernel.org/r/20231017160505.10640-6-abhijit.gangurde@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/cdx/cdx_bus.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index b5e4b7e05666..18e95076d1d5 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -28,6 +28,10 @@ struct cdx_device_config {
 	u8 type;
 };
 
+typedef int (*cdx_bus_enable_cb)(struct cdx_controller *cdx, u8 bus_num);
+
+typedef int (*cdx_bus_disable_cb)(struct cdx_controller *cdx, u8 bus_num);
+
 typedef int (*cdx_scan_cb)(struct cdx_controller *cdx);
 
 typedef int (*cdx_dev_configure_cb)(struct cdx_controller *cdx,
@@ -49,11 +53,15 @@ typedef int (*cdx_dev_configure_cb)(struct cdx_controller *cdx,
 
 /**
  * struct cdx_ops - Callbacks supported by CDX controller.
+ * @bus_enable: enable bus on the controller
+ * @bus_disable: disable bus on the controller
  * @scan: scan the devices on the controller
  * @dev_configure: configuration like reset, master_enable,
  *		   msi_config etc for a CDX device
  */
 struct cdx_ops {
+	cdx_bus_enable_cb bus_enable;
+	cdx_bus_disable_cb bus_disable;
 	cdx_scan_cb scan;
 	cdx_dev_configure_cb dev_configure;
 };
@@ -89,6 +97,7 @@ struct cdx_controller {
  * @flags: CDX device flags
  * @req_id: Requestor ID associated with CDX device
  * @is_bus: Is this bus device
+ * @enabled: is this bus enabled
  * @driver_override: driver name to force a match; do not set directly,
  *                   because core frees it; use driver_set_override() to
  *                   set or clear it.
@@ -106,6 +115,7 @@ struct cdx_device {
 	u16 flags;
 	u32 req_id;
 	bool is_bus;
+	bool enabled;
 	const char *driver_override;
 };
 
-- 
cgit v1.2.3


From fa10f413091a43f801f82b3cf484f15d6fc9266f Mon Sep 17 00:00:00 2001
From: Abhijit Gangurde <abhijit.gangurde@amd.com>
Date: Tue, 17 Oct 2023 21:35:05 +0530
Subject: cdx: add sysfs for subsystem, class and revision

CDX controller provides subsystem vendor, subsystem device, class and
revision info of the device along with vendor and device ID in native
endian format. CDX Bus system uses this information to bind the cdx
device to the cdx device driver.

Co-developed-by: Puneet Gupta <puneet.gupta@amd.com>
Signed-off-by: Puneet Gupta <puneet.gupta@amd.com>
Co-developed-by: Nipun Gupta <nipun.gupta@amd.com>
Signed-off-by: Nipun Gupta <nipun.gupta@amd.com>
Signed-off-by: Abhijit Gangurde <abhijit.gangurde@amd.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansen-van-vuuren@amd.com>
Tested-by: Nikhil Agarwal <nikhil.agarwal@amd.com>
Link: https://lore.kernel.org/r/20231017160505.10640-8-abhijit.gangurde@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/cdx/cdx_bus.h     | 27 +++++++++++++++++++++++++--
 include/linux/mod_devicetable.h | 10 ++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index 18e95076d1d5..cf9738e3a957 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -38,6 +38,19 @@ typedef int (*cdx_dev_configure_cb)(struct cdx_controller *cdx,
 				    u8 bus_num, u8 dev_num,
 				    struct cdx_device_config *dev_config);
 
+/**
+ * CDX_DEVICE - macro used to describe a specific CDX device
+ * @vend: the 16 bit CDX Vendor ID
+ * @dev: the 16 bit CDX Device ID
+ *
+ * This macro is used to create a struct cdx_device_id that matches a
+ * specific device. The subvendor and subdevice fields will be set to
+ * CDX_ANY_ID.
+ */
+#define CDX_DEVICE(vend, dev) \
+	.vendor = (vend), .device = (dev), \
+	.subvendor = CDX_ANY_ID, .subdevice = CDX_ANY_ID
+
 /**
  * CDX_DEVICE_DRIVER_OVERRIDE - macro used to describe a CDX device with
  *                              override_only flags.
@@ -46,10 +59,12 @@ typedef int (*cdx_dev_configure_cb)(struct cdx_controller *cdx,
  * @driver_override: the 32 bit CDX Device override_only
  *
  * This macro is used to create a struct cdx_device_id that matches only a
- * driver_override device.
+ * driver_override device. The subvendor and subdevice fields will be set to
+ * CDX_ANY_ID.
  */
 #define CDX_DEVICE_DRIVER_OVERRIDE(vend, dev, driver_override) \
-	.vendor = (vend), .device = (dev), .override_only = (driver_override)
+	.vendor = (vend), .device = (dev), .subvendor = CDX_ANY_ID,\
+	.subdevice = CDX_ANY_ID, .override_only = (driver_override)
 
 /**
  * struct cdx_ops - Callbacks supported by CDX controller.
@@ -88,6 +103,10 @@ struct cdx_controller {
  * @cdx: CDX controller associated with the device
  * @vendor: Vendor ID for CDX device
  * @device: Device ID for CDX device
+ * @subsystem_vendor: Subsystem Vendor ID for CDX device
+ * @subsystem_device: Subsystem Device ID for CDX device
+ * @class: Class for the CDX device
+ * @revision: Revision of the CDX device
  * @bus_num: Bus number for this CDX device
  * @dev_num: Device number for this device
  * @res: array of MMIO region entries
@@ -107,6 +126,10 @@ struct cdx_device {
 	struct cdx_controller *cdx;
 	u16 vendor;
 	u16 device;
+	u16 subsystem_vendor;
+	u16 subsystem_device;
+	u32 class;
+	u8 revision;
 	u8 bus_num;
 	u8 dev_num;
 	struct resource res[MAX_CDX_DEV_RESOURCES];
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index b0678b093cb2..aa3c28781248 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -935,6 +935,12 @@ enum {
  * struct cdx_device_id - CDX device identifier
  * @vendor: Vendor ID
  * @device: Device ID
+ * @subvendor: Subsystem vendor ID (or CDX_ANY_ID)
+ * @subdevice: Subsystem device ID (or CDX_ANY_ID)
+ * @class: Device class
+ *         Most drivers do not need to specify class/class_mask
+ *         as vendor/device is normally sufficient.
+ * @class_mask: Limit which sub-fields of the class field are compared.
  * @override_only: Match only when dev->driver_override is this driver.
  *
  * Type of entries in the "device Id" table for CDX devices supported by
@@ -943,6 +949,10 @@ enum {
 struct cdx_device_id {
 	__u16 vendor;
 	__u16 device;
+	__u16 subvendor;
+	__u16 subdevice;
+	__u32 class;
+	__u32 class_mask;
 	__u32 override_only;
 };
 
-- 
cgit v1.2.3


From 571d91dcadfa3cef499010b4eddb9b58b0da4d24 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Wed, 25 Oct 2023 13:16:19 -0700
Subject: perf: Add branch stack counters

Currently, the additional information of a branch entry is stored in a
u64 space. With more and more information added, the space is running
out. For example, the information of occurrences of events will be added
for each branch.

Two places were suggested to append the counters.
https://lore.kernel.org/lkml/20230802215814.GH231007@hirez.programming.kicks-ass.net/
One place is right after the flags of each branch entry. It changes the
existing struct perf_branch_entry. The later ARCH specific
implementation has to be really careful to consistently pick
the right struct.
The other place is right after the entire struct perf_branch_stack.
The disadvantage is that the pointer of the extra space has to be
recorded. The common interface perf_sample_save_brstack() has to be
updated.

The latter is much straightforward, and should be easily understood and
maintained. It is implemented in the patch.

Add a new branch sample type, PERF_SAMPLE_BRANCH_COUNTERS, to indicate
the event which is recorded in the branch info.

The "u64 counters" may store the occurrences of several events. The
information regarding the number of events/counters and the width of
each counter should be exposed via sysfs as a reference for the perf
tool. Define the branch_counter_nr and branch_counter_width ABI here.
The support will be implemented later in the Intel-specific patch.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231025201626.3000228-1-kan.liang@linux.intel.com
---
 include/linux/perf_event.h | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0367d748fae0..7897ef066027 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1139,6 +1139,10 @@ static inline bool branch_sample_priv(const struct perf_event *event)
 	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE;
 }
 
+static inline bool branch_sample_counters(const struct perf_event *event)
+{
+	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS;
+}
 
 struct perf_sample_data {
 	/*
@@ -1173,6 +1177,7 @@ struct perf_sample_data {
 	struct perf_callchain_entry	*callchain;
 	struct perf_raw_record		*raw;
 	struct perf_branch_stack	*br_stack;
+	u64				*br_stack_cntr;
 	union perf_sample_weight	weight;
 	union  perf_mem_data_src	data_src;
 	u64				txn;
@@ -1250,7 +1255,8 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
 
 static inline void perf_sample_save_brstack(struct perf_sample_data *data,
 					    struct perf_event *event,
-					    struct perf_branch_stack *brs)
+					    struct perf_branch_stack *brs,
+					    u64 *brs_cntr)
 {
 	int size = sizeof(u64); /* nr */
 
@@ -1258,7 +1264,16 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data,
 		size += sizeof(u64);
 	size += brs->nr * sizeof(struct perf_branch_entry);
 
+	/*
+	 * The extension space for counters is appended after the
+	 * struct perf_branch_stack. It is used to store the occurrences
+	 * of events of each branch.
+	 */
+	if (brs_cntr)
+		size += brs->nr * sizeof(u64);
+
 	data->br_stack = brs;
+	data->br_stack_cntr = brs_cntr;
 	data->dyn_size += size;
 	data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
 }
-- 
cgit v1.2.3


From 1f2376cd03dd3b965d130ed46a7c92769d614ba1 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Wed, 25 Oct 2023 13:16:21 -0700
Subject: perf: Add branch_sample_call_stack

Add a helper function to check call stack sample type.

The later patch will invoke the function in several places.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231025201626.3000228-3-kan.liang@linux.intel.com
---
 include/linux/perf_event.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7897ef066027..ac1a59c1f252 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1144,6 +1144,11 @@ static inline bool branch_sample_counters(const struct perf_event *event)
 	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS;
 }
 
+static inline bool branch_sample_call_stack(const struct perf_event *event)
+{
+	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK;
+}
+
 struct perf_sample_data {
 	/*
 	 * Fields set by perf_sample_data_init() unconditionally,
-- 
cgit v1.2.3


From 6777877eb7a3290cf0a8a6b621e46f72f9d94b6b Mon Sep 17 00:00:00 2001
From: Terry Bowman <terry.bowman@amd.com>
Date: Wed, 18 Oct 2023 19:17:05 +0200
Subject: PCI/AER: Refactor cper_print_aer() for use by CXL driver module

The CXL driver plans to use cper_print_aer() for logging restricted CXL
host (RCH) AER errors. cper_print_aer() is not currently exported and
therefore not usable by the CXL drivers built as loadable modules. Export
the cper_print_aer() function. Use the EXPORT_SYMBOL_NS_GPL() variant
to restrict the export to CXL drivers.

The CONFIG_ACPI_APEI_PCIEAER kernel config is currently used to enable
cper_print_aer(). cper_print_aer() logs the AER registers and is
useful in PCIE AER logging outside of APEI. Remove the
CONFIG_ACPI_APEI_PCIEAER dependency to enable cper_print_aer().

The cper_print_aer() function name implies CPER specific use but is useful
in non-CPER cases as well. Rename cper_print_aer() to pci_print_aer().

Also, update cxl_core to import CXL namespace imports.

Co-developed-by: Robert Richter <rrichter@amd.com>
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Signed-off-by: Robert Richter <rrichter@amd.com>
Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
Cc: Oliver O'Halloran <oohall@gmail.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: linux-pci@vger.kernel.org
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/20231018171713.1883517-13-rrichter@amd.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/aer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/aer.h b/include/linux/aer.h
index 29cc10220952..f6ea2f57d808 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -51,7 +51,7 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
 static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
 #endif
 
-void cper_print_aer(struct pci_dev *dev, int aer_severity,
+void pci_print_aer(struct pci_dev *dev, int aer_severity,
 		    struct aer_capability_regs *aer);
 int cper_severity_to_aer(int cper_severity);
 void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
-- 
cgit v1.2.3


From a103f46633fdcddc2aaca506420f177e8803a2bd Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 12 Oct 2023 11:53:54 -0700
Subject: acpi: Move common tables helper functions to common lib

Some of the routines in ACPI driver/acpi/tables.c can be shared with
parsing CDAT. CDAT is a device-provided data structure that is formatted
similar to a platform provided ACPI table. CDAT is used by CXL and can
exist on platforms that do not use ACPI. Split out the common routine
from ACPI to accommodate platforms that do not support ACPI and move that
to /lib. The common routines can be built outside of ACPI if
FIRMWARE_TABLES is selected.

Link: https://lore.kernel.org/linux-cxl/CAJZ5v0jipbtTNnsA0-o5ozOk8ZgWnOg34m34a9pPenTyRLj=6A@mail.gmail.com/
Suggested-by: "Rafael J. Wysocki" <rafael@kernel.org>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/169713683430.2205276.17899451119920103445.stgit@djiang5-mobl3
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/acpi.h     | 42 +++++++++++-------------------------------
 include/linux/fw_table.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 31 deletions(-)
 create mode 100644 include/linux/fw_table.h

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a73246c3c35e..0d334975c0c9 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -15,6 +15,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/property.h>
 #include <linux/uuid.h>
+#include <linux/fw_table.h>
 
 struct irq_domain;
 struct irq_domain_ops;
@@ -24,6 +25,16 @@ struct irq_domain_ops;
 #endif
 #include <acpi/acpi.h>
 
+#ifdef CONFIG_ACPI_TABLE_LIB
+#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI)
+#define __init_or_acpilib
+#define __initdata_or_acpilib
+#else
+#define EXPORT_SYMBOL_ACPI_LIB(x)
+#define __init_or_acpilib __init
+#define __initdata_or_acpilib __initdata
+#endif
+
 #ifdef	CONFIG_ACPI
 
 #include <linux/list.h>
@@ -119,21 +130,8 @@ enum acpi_address_range_id {
 
 
 /* Table Handlers */
-union acpi_subtable_headers {
-	struct acpi_subtable_header common;
-	struct acpi_hmat_structure hmat;
-	struct acpi_prmt_module_header prmt;
-	struct acpi_cedt_header cedt;
-};
-
 typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table);
 
-typedef int (*acpi_tbl_entry_handler)(union acpi_subtable_headers *header,
-				      const unsigned long end);
-
-typedef int (*acpi_tbl_entry_handler_arg)(union acpi_subtable_headers *header,
-					  void *arg, const unsigned long end);
-
 /* Debugger support */
 
 struct acpi_debugger_ops {
@@ -207,14 +205,6 @@ static inline int acpi_debugger_notify_command_complete(void)
 		(!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \
 		((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
 
-struct acpi_subtable_proc {
-	int id;
-	acpi_tbl_entry_handler handler;
-	acpi_tbl_entry_handler_arg handler_arg;
-	void *arg;
-	int count;
-};
-
 void __iomem *__acpi_map_table(unsigned long phys, unsigned long size);
 void __acpi_unmap_table(void __iomem *map, unsigned long size);
 int early_acpi_boot_init(void);
@@ -229,16 +219,6 @@ void acpi_reserve_initial_tables (void);
 void acpi_table_init_complete (void);
 int acpi_table_init (void);
 
-#ifdef CONFIG_ACPI_TABLE_LIB
-#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI)
-#define __init_or_acpilib
-#define __initdata_or_acpilib
-#else
-#define EXPORT_SYMBOL_ACPI_LIB(x)
-#define __init_or_acpilib __init
-#define __initdata_or_acpilib __initdata
-#endif
-
 int acpi_table_parse(char *id, acpi_tbl_table_handler handler);
 int __init_or_acpilib acpi_table_parse_entries(char *id,
 		unsigned long table_size, int entry_id,
diff --git a/include/linux/fw_table.h b/include/linux/fw_table.h
new file mode 100644
index 000000000000..ff8fa58d5818
--- /dev/null
+++ b/include/linux/fw_table.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ *  fw_tables.h - Parsing support for ACPI and ACPI-like tables provided by
+ *                platform or device firmware
+ *
+ *  Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ *  Copyright (C) 2023 Intel Corp.
+ */
+#ifndef _FW_TABLE_H_
+#define _FW_TABLE_H_
+
+union acpi_subtable_headers;
+
+typedef int (*acpi_tbl_entry_handler)(union acpi_subtable_headers *header,
+				      const unsigned long end);
+
+typedef int (*acpi_tbl_entry_handler_arg)(union acpi_subtable_headers *header,
+					  void *arg, const unsigned long end);
+
+struct acpi_subtable_proc {
+	int id;
+	acpi_tbl_entry_handler handler;
+	acpi_tbl_entry_handler_arg handler_arg;
+	void *arg;
+	int count;
+};
+
+#include <linux/acpi.h>
+#include <acpi/acpi.h>
+
+union acpi_subtable_headers {
+	struct acpi_subtable_header common;
+	struct acpi_hmat_structure hmat;
+	struct acpi_prmt_module_header prmt;
+	struct acpi_cedt_header cedt;
+};
+
+int acpi_parse_entries_array(char *id, unsigned long table_size,
+			     struct acpi_table_header *table_header,
+			     struct acpi_subtable_proc *proc,
+			     int proc_num, unsigned int max_entries);
+
+#endif
-- 
cgit v1.2.3


From e719b4d156749f02eafed31a3c515f2aa9dcc72a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:07 +0200
Subject: block: Provide bdev_open_* functions

Create struct bdev_handle that contains all parameters that need to be
passed to blkdev_put() and provide bdev_open_* functions that return
this structure instead of plain bdev pointer. This will eventually allow
us to pass one more argument to blkdev_put() (renamed to bdev_release())
without too much hassle.

Acked-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-1-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blkdev.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index eef450f25982..03d3adc3ff34 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1479,14 +1479,24 @@ extern const struct blk_holder_ops fs_holder_ops;
 #define sb_open_mode(flags) \
 	(BLK_OPEN_READ | (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))
 
+struct bdev_handle {
+	struct block_device *bdev;
+	void *holder;
+};
+
 struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 		const struct blk_holder_ops *hops);
 struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
 		void *holder, const struct blk_holder_ops *hops);
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+		const struct blk_holder_ops *hops);
+struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
+		void *holder, const struct blk_holder_ops *hops);
 int bd_prepare_to_claim(struct block_device *bdev, void *holder,
 		const struct blk_holder_ops *hops);
 void bd_abort_claiming(struct block_device *bdev, void *holder);
 void blkdev_put(struct block_device *bdev, void *holder);
+void bdev_release(struct bdev_handle *handle);
 
 /* just for blk-cgroup, don't use elsewhere */
 struct block_device *blkdev_get_no_open(dev_t dev);
-- 
cgit v1.2.3


From 841dd789b8625eb9288aaa2be9f10872e6622033 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:08 +0200
Subject: block: Use bdev_open_by_dev() in blkdev_open()

Convert blkdev_open() to use bdev_open_by_dev(). To be able to propagate
handle from blkdev_open() to blkdev_release() we need to stop using
existence of file->private_data to determine exclusive block device
opens. Use bdev_handle->mode for this purpose since file->f_flags
isn't usable for this (O_EXCL is cleared from the flags during open).

Acked-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-2-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 03d3adc3ff34..51fa7ffdee83 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1482,6 +1482,7 @@ extern const struct blk_holder_ops fs_holder_ops;
 struct bdev_handle {
 	struct block_device *bdev;
 	void *holder;
+	blk_mode_t mode;
 };
 
 struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
-- 
cgit v1.2.3


From 7ac86df899f0025cfd554c057efb30e5a4ef95b0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:11 +0200
Subject: pktcdvd: Convert to bdev_open_by_dev()

Convert pktcdvd to use bdev_open_by_dev().

Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-5-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pktcdvd.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h
index 80cb00db42a4..79594aeb160d 100644
--- a/include/linux/pktcdvd.h
+++ b/include/linux/pktcdvd.h
@@ -154,7 +154,9 @@ struct packet_stacked_data
 
 struct pktcdvd_device
 {
-	struct block_device	*bdev;		/* dev attached */
+	struct bdev_handle	*bdev_handle;	/* dev attached */
+	/* handle acquired for bdev during pkt_open_dev() */
+	struct bdev_handle	*open_bdev_handle;
 	dev_t			pkt_dev;	/* our dev */
 	struct packet_settings	settings;
 	struct packet_stats	stats;
-- 
cgit v1.2.3


From c2fce61fb22e3235d29512558c4e93e184e2f68b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:16 +0200
Subject: dm: Convert to bdev_open_by_dev()

Convert device mapper to use bdev_open_by_dev() and pass the handle
around.

CC: Alasdair Kergon <agk@redhat.com>
CC: Mike Snitzer <snitzer@kernel.org>
CC: dm-devel@redhat.com
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-10-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/device-mapper.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 69d0435c7ebb..772ab4d74d94 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -165,6 +165,7 @@ void dm_error(const char *message);
 
 struct dm_dev {
 	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	struct dax_device *dax_dev;
 	blk_mode_t mode;
 	char name[16];
-- 
cgit v1.2.3


From 4c6bca43c547fe9bc1d7d1519b1d6430fee2cae2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:24 +0200
Subject: mm/swap: Convert to use bdev_open_by_dev()

Convert swapping code to use bdev_open_by_dev() and pass the handle
around.

CC: linux-mm@kvack.org
CC: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-18-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/swap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 493487ed7c38..f6dd6575b905 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -298,6 +298,7 @@ struct swap_info_struct {
 	unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */
 	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
 	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
+	struct bdev_handle *bdev_handle;/* open handle of the bdev */
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
 	unsigned int old_block_size;	/* seldom referenced */
-- 
cgit v1.2.3


From f4a48bc36cdfae7c603e8e3f2a51e2a283f3f365 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:25 +0200
Subject: fs: Convert to bdev_open_by_dev()

Convert mount code to use bdev_open_by_dev() and propagate the handle
around to bdev_release().

Acked-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-19-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a40823c3c67..fb5accebdcdf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1221,6 +1221,7 @@ struct super_block {
 	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
 	struct block_device	*s_bdev;
+	struct bdev_handle	*s_bdev_handle;
 	struct backing_dev_info *s_bdi;
 	struct mtd_info		*s_mtd;
 	struct hlist_node	s_instances;
-- 
cgit v1.2.3


From 6a4e59eeedc3018cb57722eecfcbb49431aeb05f Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Mon, 23 Oct 2023 02:06:05 +0900
Subject: linux/init: remove __memexit* annotations

We have never used __memexit, __memexitdata, or __memexitconst.

These were unneeded.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/init.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index 266c3e1640d4..01b52c9c7526 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -89,9 +89,6 @@
 						  __latent_entropy
 #define __meminitdata    __section(".meminit.data")
 #define __meminitconst   __section(".meminit.rodata")
-#define __memexit        __section(".memexit.text") __exitused __cold notrace
-#define __memexitdata    __section(".memexit.data")
-#define __memexitconst   __section(".memexit.rodata")
 
 /* For assembly routines */
 #define __HEAD		.section	".head.text","ax"
-- 
cgit v1.2.3


From 29166faac5486cbc34228431cd049e1c1be451ac Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sun, 15 Oct 2023 23:34:25 +0200
Subject: ACPI: Add helper acpi_use_parent_companion

In several drivers devices use the ACPI companion of the parent.
Add a helper for this use case to avoid code duplication.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Andi Shyti <andi.shyti@kernel.org>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 include/linux/acpi.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a73246c3c35e..d17a280ce7c7 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1539,4 +1539,9 @@ static inline void acpi_device_notify(struct device *dev) { }
 static inline void acpi_device_notify_remove(struct device *dev) { }
 #endif
 
+static inline void acpi_use_parent_companion(struct device *dev)
+{
+	ACPI_COMPANION_SET(dev, ACPI_COMPANION(dev->parent));
+}
+
 #endif	/*_LINUX_ACPI_H*/
-- 
cgit v1.2.3


From e21fc2038c1b978b89bbc3d45a4c5c6ef598e178 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 23 Oct 2023 21:07:59 +0300
Subject: exportfs: make ->encode_fh() a mandatory method for NFS export

Rename the default helper for encoding FILEID_INO32_GEN* file handles to
generic_encode_ino32_fh() and convert the filesystems that used the
default implementation to use the generic helper explicitly.

After this change, exportfs_encode_inode_fh() no longer has a default
implementation to encode FILEID_INO32_GEN* file handles.

This is a step towards allowing filesystems to encode non-decodeable
file handles for fanotify without having to implement any
export_operations.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231023180801.2953446-3-amir73il@gmail.com
Acked-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/exportfs.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 5b3c9f30b422..85bd027494e5 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -235,7 +235,7 @@ extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid,
 
 static inline bool exportfs_can_encode_fid(const struct export_operations *nop)
 {
-	return nop;
+	return nop && nop->encode_fh;
 }
 
 static inline bool exportfs_can_decode_fh(const struct export_operations *nop)
@@ -279,6 +279,13 @@ extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 /*
  * Generic helpers for filesystems.
  */
+#ifdef CONFIG_EXPORTFS
+int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
+			    struct inode *parent);
+#else
+#define generic_encode_ino32_fh NULL
+#endif
+
 extern struct dentry *generic_fh_to_dentry(struct super_block *sb,
 	struct fid *fid, int fh_len, int fh_type,
 	struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
-- 
cgit v1.2.3


From 41d1ddd2717c758b8606a66d57d2cc63b41373c0 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 23 Oct 2023 21:08:00 +0300
Subject: exportfs: define FILEID_INO64_GEN* file handle types

Similar to the common FILEID_INO32* file handle types, define common
FILEID_INO64* file handle types.

The type values of FILEID_INO64_GEN and FILEID_INO64_GEN_PARENT are the
values returned by fuse and xfs for 64bit ino encoded file handle types.

Note that these type value are filesystem specific and they do not define
a universal file handle format, for example:
fuse encodes FILEID_INO64_GEN as [ino-hi32,ino-lo32,gen] and xfs encodes
FILEID_INO64_GEN as [hostr-order-ino64,gen] (a.k.a xfs_fid64).

The FILEID_INO64_GEN fhandle type is going to be used for file ids for
fanotify from filesystems that do not support NFS export.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231023180801.2953446-4-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/exportfs.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 85bd027494e5..4119d3ee72eb 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -98,6 +98,17 @@ enum fid_type {
 	 */
 	FILEID_FAT_WITH_PARENT = 0x72,
 
+	/*
+	 * 64 bit inode number, 32 bit generation number.
+	 */
+	FILEID_INO64_GEN = 0x81,
+
+	/*
+	 * 64 bit inode number, 32 bit generation number,
+	 * 64 bit parent inode number, 32 bit parent generation.
+	 */
+	FILEID_INO64_GEN_PARENT = 0x82,
+
 	/*
 	 * 128 bit child FID (struct lu_fid)
 	 * 128 bit parent FID (struct lu_fid)
-- 
cgit v1.2.3


From 64343119d7b80b4ee9ba7703390681608a17f2c5 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 23 Oct 2023 21:08:01 +0300
Subject: exportfs: support encoding non-decodeable file handles by default

AT_HANDLE_FID was added as an API for name_to_handle_at() that request
the encoding of a file id, which is not intended to be decoded.

This file id is used by fanotify to describe objects in events.

So far, overlayfs is the only filesystem that supports encoding
non-decodeable file ids, by providing export_operations with an
->encode_fh() method and without a ->decode_fh() method.

Add support for encoding non-decodeable file ids to all the filesystems
that do not provide export_operations, by encoding a file id of type
FILEID_INO64_GEN from { i_ino, i_generation }.

A filesystem may that does not support NFS export, can opt-out of
encoding non-decodeable file ids for fanotify by defining an empty
export_operations struct (i.e. with a NULL ->encode_fh() method).

This allows the use of fanotify events with file ids on filesystems
like 9p which do not support NFS export to bring fanotify in feature
parity with inotify on those filesystems.

Note that fanotify also requires that the filesystems report a non-null
fsid.  Currently, many simple filesystems that have support for inotify
(e.g. debugfs, tracefs, sysfs) report a null fsid, so can still not be
used with fanotify in file id reporting mode.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231023180801.2953446-5-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/exportfs.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 4119d3ee72eb..21bae8bfeef1 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -134,7 +134,11 @@ struct fid {
 			u32 parent_ino;
 			u32 parent_gen;
 		} i32;
- 		struct {
+		struct {
+			u64 ino;
+			u32 gen;
+		} __packed i64;
+		struct {
  			u32 block;
  			u16 partref;
  			u16 parent_partref;
@@ -246,7 +250,7 @@ extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid,
 
 static inline bool exportfs_can_encode_fid(const struct export_operations *nop)
 {
-	return nop && nop->encode_fh;
+	return !nop || nop->encode_fh;
 }
 
 static inline bool exportfs_can_decode_fh(const struct export_operations *nop)
@@ -259,7 +263,7 @@ static inline bool exportfs_can_encode_fh(const struct export_operations *nop,
 {
 	/*
 	 * If a non-decodeable file handle was requested, we only need to make
-	 * sure that filesystem can encode file handles.
+	 * sure that filesystem did not opt-out of encoding fid.
 	 */
 	if (fh_flags & EXPORT_FH_FID)
 		return exportfs_can_encode_fid(nop);
-- 
cgit v1.2.3


From d9e5d9221d7f82a2736f091bbc5b54ab8d6ef701 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 26 Oct 2023 23:45:40 +0300
Subject: fs: fix build error with CONFIG_EXPORTFS=m or not defined

Many of the filesystems that call the generic exportfs helpers do not
select the EXPORTFS config.

Move generic_encode_ino32_fh() to libfs.c, same as generic_fh_to_*()
to avoid having to fix all those config dependencies.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202310262151.renqMvme-lkp@intel.com/
Fixes: dfaf653dc415 ("exportfs: make ->encode_fh() a mandatory method for NFS export")
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231026204540.143217-1-amir73il@gmail.com
Tested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/exportfs.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 21bae8bfeef1..e0e69dafaa43 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -294,17 +294,12 @@ extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 /*
  * Generic helpers for filesystems.
  */
-#ifdef CONFIG_EXPORTFS
 int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
 			    struct inode *parent);
-#else
-#define generic_encode_ino32_fh NULL
-#endif
-
-extern struct dentry *generic_fh_to_dentry(struct super_block *sb,
+struct dentry *generic_fh_to_dentry(struct super_block *sb,
 	struct fid *fid, int fh_len, int fh_type,
 	struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
-extern struct dentry *generic_fh_to_parent(struct super_block *sb,
+struct dentry *generic_fh_to_parent(struct super_block *sb,
 	struct fid *fid, int fh_len, int fh_type,
 	struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
 
-- 
cgit v1.2.3


From dcc4e5728eeaeda84878ca0018758cff1abfca21 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 27 Oct 2023 08:56:38 -0700
Subject: seq_buf: Introduce DECLARE_SEQ_BUF and seq_buf_str()

Solve two ergonomic issues with struct seq_buf;

1) Too much boilerplate is required to initialize:

	struct seq_buf s;
	char buf[32];

	seq_buf_init(s, buf, sizeof(buf));

Instead, we can build this directly on the stack. Provide
DECLARE_SEQ_BUF() macro to do this:

	DECLARE_SEQ_BUF(s, 32);

2) %NUL termination is fragile and requires 2 steps to get a valid
   C String (and is a layering violation exposing the "internals" of
   seq_buf):

	seq_buf_terminate(s);
	do_something(s->buffer);

Instead, we can just return s->buffer directly after terminating it in
the refactored seq_buf_terminate(), now known as seq_buf_str():

	do_something(seq_buf_str(s));

Link: https://lore.kernel.org/linux-trace-kernel/20231027155634.make.260-kees@kernel.org
Link: https://lore.kernel.org/linux-trace-kernel/20231026194033.it.702-kees@kernel.org/

Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Yun Zhou <yun.zhou@windriver.com>
Cc: Jacob Keller <jacob.e.keller@intel.com>
Cc: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 8483e4b2d0d2..5fb1f12c33f9 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -21,9 +21,18 @@ struct seq_buf {
 	size_t			len;
 };
 
+#define DECLARE_SEQ_BUF(NAME, SIZE)			\
+	char __ ## NAME ## _buffer[SIZE] = "";		\
+	struct seq_buf NAME = {				\
+		.buffer = &__ ## NAME ## _buffer,	\
+		.size = SIZE,				\
+	}
+
 static inline void seq_buf_clear(struct seq_buf *s)
 {
 	s->len = 0;
+	if (s->size)
+		s->buffer[0] = '\0';
 }
 
 static inline void
@@ -69,8 +78,8 @@ static inline unsigned int seq_buf_used(struct seq_buf *s)
 }
 
 /**
- * seq_buf_terminate - Make sure buffer is nul terminated
- * @s: the seq_buf descriptor to terminate.
+ * seq_buf_str - get %NUL-terminated C string from seq_buf
+ * @s: the seq_buf handle
  *
  * This makes sure that the buffer in @s is nul terminated and
  * safe to read as a string.
@@ -81,16 +90,20 @@ static inline unsigned int seq_buf_used(struct seq_buf *s)
  *
  * After this function is called, s->buffer is safe to use
  * in string operations.
+ *
+ * Returns @s->buf after making sure it is terminated.
  */
-static inline void seq_buf_terminate(struct seq_buf *s)
+static inline const char *seq_buf_str(struct seq_buf *s)
 {
 	if (WARN_ON(s->size == 0))
-		return;
+		return "";
 
 	if (seq_buf_buffer_left(s))
 		s->buffer[s->len] = 0;
 	else
 		s->buffer[s->size - 1] = 0;
+
+	return s->buffer;
 }
 
 /**
-- 
cgit v1.2.3


From 2bce6368c46b835a133f7f4946eea9c4513828dd Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 24 Oct 2023 14:46:58 -0400
Subject: closures: Better memory barriers

atomic_(dec|sub)_return_release() are a thing now - use them.

Also, delete the useless barrier in set_closure_fn(): it's redundant
with the memory barrier in closure_put(0.

Since closure_put() would now otherwise just have a release barrier, we
also need a new barrier when the ref hits 0 -
smp_acquire__after_ctrl_dep().

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/closure.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/closure.h b/include/linux/closure.h
index 722a586bb224..bdab17050bc8 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -233,8 +233,6 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
 	closure_set_ip(cl);
 	cl->fn = fn;
 	cl->wq = wq;
-	/* between atomic_dec() in closure_put() */
-	smp_mb__before_atomic();
 }
 
 static inline void closure_queue(struct closure *cl)
-- 
cgit v1.2.3


From ee526b88caaa4b4182144bf2576af2c3b1e9c759 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 24 Oct 2023 14:46:58 -0400
Subject: closures: Fix race in closure_sync()

As pointed out by Linus, closure_sync() was racy; we could skip blocking
immediately after a get() and a put(), but then that would skip any
barrier corresponding to the other thread's put() barrier.

To fix this, always do the full __closure_sync() sequence whenever any
get() has happened and the closure might have been used by other
threads.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/closure.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/closure.h b/include/linux/closure.h
index bdab17050bc8..de7bb47d8a46 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -154,6 +154,7 @@ struct closure {
 	struct closure		*parent;
 
 	atomic_t		remaining;
+	bool			closure_get_happened;
 
 #ifdef CONFIG_DEBUG_CLOSURES
 #define CLOSURE_MAGIC_DEAD	0xc054dead
@@ -185,7 +186,11 @@ static inline unsigned closure_nr_remaining(struct closure *cl)
  */
 static inline void closure_sync(struct closure *cl)
 {
-	if (closure_nr_remaining(cl) != 1)
+#ifdef CONFIG_DEBUG_CLOSURES
+	BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
+#endif
+
+	if (cl->closure_get_happened)
 		__closure_sync(cl);
 }
 
@@ -257,6 +262,8 @@ static inline void closure_queue(struct closure *cl)
  */
 static inline void closure_get(struct closure *cl)
 {
+	cl->closure_get_happened = true;
+
 #ifdef CONFIG_DEBUG_CLOSURES
 	BUG_ON((atomic_inc_return(&cl->remaining) &
 		CLOSURE_REMAINING_MASK) <= 1);
@@ -279,6 +286,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
 		closure_get(parent);
 
 	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+	cl->closure_get_happened = false;
 
 	closure_debug_create(cl);
 	closure_set_ip(cl);
-- 
cgit v1.2.3


From 0da611a8702101814257a7c03f6caf0574c83b98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Fri, 8 Sep 2023 10:27:23 +0200
Subject: dma-buf: add dma_fence_timestamp helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a fence signals there is a very small race window where the timestamp
isn't updated yet. sync_file solves this by busy waiting for the
timestamp to appear, but on other ocassions didn't handled this
correctly.

Provide a dma_fence_timestamp() helper function for this and use it in
all appropriate cases.

Another alternative would be to grab the spinlock when that happens.

v2 by teddy: add a wait parameter to wait for the timestamp to show up, in case
   the accurate timestamp is needed and/or the timestamp is not based on
   ktime (e.g. hw timestamp)
v3 chk: drop the parameter again for unified handling

Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Fixes: 1774baa64f93 ("drm/scheduler: Change scheduled fence track v2")
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
CC: stable@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20230929104725.2358-1-christian.koenig@amd.com
---
 include/linux/dma-fence.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 0d678e9a7b24..ebe78bd3d121 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -568,6 +568,25 @@ static inline void dma_fence_set_error(struct dma_fence *fence,
 	fence->error = error;
 }
 
+/**
+ * dma_fence_timestamp - helper to get the completion timestamp of a fence
+ * @fence: fence to get the timestamp from.
+ *
+ * After a fence is signaled the timestamp is updated with the signaling time,
+ * but setting the timestamp can race with tasks waiting for the signaling. This
+ * helper busy waits for the correct timestamp to appear.
+ */
+static inline ktime_t dma_fence_timestamp(struct dma_fence *fence)
+{
+	if (WARN_ON(!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)))
+		return ktime_get();
+
+	while (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags))
+		cpu_relax();
+
+	return fence->timestamp;
+}
+
 signed long dma_fence_wait_timeout(struct dma_fence *,
 				   bool intr, signed long timeout);
 signed long dma_fence_wait_any_timeout(struct dma_fence **fences,
-- 
cgit v1.2.3


From 3c70342f1f0045dc827bb2f02d814ce31e0e0d05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= <mirq-linux@rere.qmqm.pl>
Date: Mon, 28 Aug 2023 22:16:11 +0200
Subject: mfd: core: Un-constify mfd_cell.of_reg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enable dynamically filling in the whole mfd_cell structure. All other
fields already allow that.

Fixes: 466a62d7642f ("mfd: core: Make a best effort attempt to match devices with the correct of_nodes")
Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Link: https://lore.kernel.org/r/b73fe4bc4bd6ba1af90940a640ed65fe254c0408.1693253717.git.mirq-linux@rere.qmqm.pl
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h
index 47e7a3a61ce6..e8bcad641d8c 100644
--- a/include/linux/mfd/core.h
+++ b/include/linux/mfd/core.h
@@ -92,7 +92,7 @@ struct mfd_cell {
 	 * (above) when matching OF nodes with devices that have identical
 	 * compatible strings
 	 */
-	const u64 of_reg;
+	u64 of_reg;
 
 	/* Set to 'true' to use 'of_reg' (above) - allows for of_reg=0 */
 	bool use_of_reg;
-- 
cgit v1.2.3


From 89b00c328f5623c428e6aa8563c3a648f8190252 Mon Sep 17 00:00:00 2001
From: Ying Sun <sunying@nj.iscas.ac.cn>
Date: Wed, 13 Sep 2023 16:45:59 +0800
Subject: mfd: ab8500: Remove non-existent configuration "#ifdef
 CONFIG_AB8500_DEBUG"

The CONFIG_AB8500_DEBUG has been deleted in:

  3d4d1266597c0 ("mfd: ab8500: Drop debugfs module")

The condition "#ifdef CONFIG_AB8500_DEBUG" in:

   include/linux/mfd/abx500/ab8500.h:502

...cannot be valid. It is recommended to delete redundant code.

Suggested-by: Yanjie Ren <renyanjie01@gmail.com>
Signed-off-by: Ying Sun <sunying@nj.iscas.ac.cn>
Link: https://lore.kernel.org/r/20230913084559.18141-1-sunying@nj.iscas.ac.cn
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/abx500/ab8500.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/abx500/ab8500.h b/include/linux/mfd/abx500/ab8500.h
index 09fb3c56e7d7..76d326ea8eba 100644
--- a/include/linux/mfd/abx500/ab8500.h
+++ b/include/linux/mfd/abx500/ab8500.h
@@ -499,13 +499,7 @@ static inline int is_ab9540_2p0_or_earlier(struct ab8500 *ab)
 
 void ab8500_override_turn_on_stat(u8 mask, u8 set);
 
-#ifdef CONFIG_AB8500_DEBUG
-extern int prcmu_abb_read(u8 slave, u8 reg, u8 *value, u8 size);
-void ab8500_dump_all_banks(struct device *dev);
-void ab8500_debug_register_interrupt(int line);
-#else
 static inline void ab8500_dump_all_banks(struct device *dev) {}
 static inline void ab8500_debug_register_interrupt(int line) {}
-#endif
 
 #endif /* MFD_AB8500_H */
-- 
cgit v1.2.3


From 1c7ea43fc42b09080657c83683846c4be672bd91 Mon Sep 17 00:00:00 2001
From: Fabrice Gasnier <fabrice.gasnier@foss.st.com>
Date: Tue, 29 Aug 2023 15:40:25 +0200
Subject: mfd: stm32-timers: Add support for interrupts

There are two types of STM32 timers that may have:

- a global interrupt line
- 4 dedicated interrupt lines.

Those interrupts are optional as defined in the dt-bindings. Enforce checks
on either one, four or no interrupts are provided with their names.
Optionally get them here, to be used by child devices.

Signed-off-by: Fabrice Gasnier <fabrice.gasnier@foss.st.com>
Link: https://lore.kernel.org/r/20230829134029.2402868-5-fabrice.gasnier@foss.st.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/stm32-timers.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h
index 1b94325febb3..ca35af30745f 100644
--- a/include/linux/mfd/stm32-timers.h
+++ b/include/linux/mfd/stm32-timers.h
@@ -102,6 +102,15 @@ enum stm32_timers_dmas {
 	STM32_TIMERS_MAX_DMAS,
 };
 
+/* STM32 Timer may have either a unique global interrupt or 4 interrupt lines */
+enum stm32_timers_irqs {
+	STM32_TIMERS_IRQ_GLOBAL_BRK, /* global or brk IRQ */
+	STM32_TIMERS_IRQ_UP,
+	STM32_TIMERS_IRQ_TRG_COM,
+	STM32_TIMERS_IRQ_CC,
+	STM32_TIMERS_MAX_IRQS,
+};
+
 /**
  * struct stm32_timers_dma - STM32 timer DMA handling.
  * @completion:		end of DMA transfer completion
@@ -123,6 +132,8 @@ struct stm32_timers {
 	struct regmap *regmap;
 	u32 max_arr;
 	struct stm32_timers_dma dma; /* Only to be used by the parent */
+	unsigned int nr_irqs;
+	int irq[STM32_TIMERS_MAX_IRQS];
 };
 
 #if IS_REACHABLE(CONFIG_MFD_STM32_TIMERS)
-- 
cgit v1.2.3


From 16c4c1bb7e01d5de797c43d7a2396434f2078b0a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 26 Sep 2023 22:08:32 +0300
Subject: mfd: lpc_ich: Convert gpio_version to be enum

We have an anonymous enum for the GPIO versions. Make it named
and use this type for the gpio_version member of struct lpc_ich_info.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230926190834.932233-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/lpc_ich.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/lpc_ich.h b/include/linux/mfd/lpc_ich.h
index ea4a4b1b246a..83621e0ccf33 100644
--- a/include/linux/mfd/lpc_ich.h
+++ b/include/linux/mfd/lpc_ich.h
@@ -15,7 +15,7 @@
 #define ICH_RES_GPE0	1
 
 /* GPIO compatibility */
-enum {
+enum lpc_gpio_versions {
 	ICH_I3100_GPIO,
 	ICH_V5_GPIO,
 	ICH_V6_GPIO,
@@ -29,7 +29,7 @@ enum {
 struct lpc_ich_info {
 	char name[32];
 	unsigned int iTCO_version;
-	unsigned int gpio_version;
+	enum lpc_gpio_versions gpio_version;
 	enum intel_spi_type spi_type;
 	u8 use_gpio;
 };
-- 
cgit v1.2.3


From 1f84f88dc15036d63fa7c84833a440370f22a74b Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 26 Sep 2023 22:08:33 +0300
Subject: mfd: lpc_ich: Move APL GPIO resources to a custom structure

We are expecting more platforms that want to instantiate
the GPIO device via P2SB. For them prepare the custom structure
and move Apollo Lake data there. Refactor the code accordingly.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230926190834.932233-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/lpc_ich.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/lpc_ich.h b/include/linux/mfd/lpc_ich.h
index 83621e0ccf33..1fbda1f8967d 100644
--- a/include/linux/mfd/lpc_ich.h
+++ b/include/linux/mfd/lpc_ich.h
@@ -26,11 +26,14 @@ enum lpc_gpio_versions {
 	AVOTON_GPIO,
 };
 
+struct lpc_ich_gpio_info;
+
 struct lpc_ich_info {
 	char name[32];
 	unsigned int iTCO_version;
 	enum lpc_gpio_versions gpio_version;
 	enum intel_spi_type spi_type;
+	const struct lpc_ich_gpio_info *gpio_info;
 	u8 use_gpio;
 };
 
-- 
cgit v1.2.3


From 272f99edab36974b58347e97ee105885e385fa88 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Tue, 3 Oct 2023 17:29:27 +0200
Subject: spmi: rename spmi device lookup helper

Rename the SPMI device helper which is used to lookup a device from its
OF node as spmi_find_device_by_of_node() so that it reflects the
implementation and matches how other helpers like this are named.

This will specifically make it more clear that this is a lookup function
which returns a reference counted structure.

Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Link: https://lore.kernel.org/r/20231003152927.15000-6-johan+linaro@kernel.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/spmi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spmi.h b/include/linux/spmi.h
index eac1956a8727..2a4ce4144f9f 100644
--- a/include/linux/spmi.h
+++ b/include/linux/spmi.h
@@ -166,7 +166,7 @@ static inline void spmi_driver_unregister(struct spmi_driver *sdrv)
 
 struct device_node;
 
-struct spmi_device *spmi_device_from_of(struct device_node *np);
+struct spmi_device *spmi_find_device_by_of_node(struct device_node *np);
 int spmi_register_read(struct spmi_device *sdev, u8 addr, u8 *buf);
 int spmi_ext_register_read(struct spmi_device *sdev, u8 addr, u8 *buf,
 			   size_t len);
-- 
cgit v1.2.3


From a72cac6067fdfde280ece0ec5c055659a8de6508 Mon Sep 17 00:00:00 2001
From: Si-Wei Liu <si-wei.liu@oracle.com>
Date: Wed, 18 Oct 2023 20:14:41 +0300
Subject: vdpa: introduce dedicated descriptor group for virtqueue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In some cases, the access to the virtqueue's descriptor area, device
and driver areas (precluding indirect descriptor table in guest memory)
may have to be confined to a different address space than where its
buffers reside. Without loss of simplicity and generality with already
established terminology, let's fold up these 3 areas and call them
as a whole as descriptor table group, or descriptor group for short.
Specifically, in case of split virtqueues, descriptor group consists of
regions for Descriptor Table, Available Ring and Used Ring; for packed
virtqueues layout, descriptor group contains Descriptor Ring, Driver
and Device Event Suppression structures.

The group ID for a dedicated descriptor group can be obtained through a
new .get_vq_desc_group() op. If driver implements this op, it means that
the descriptor, device and driver areas of the virtqueue may reside
in a dedicated group than where its buffers reside, a.k.a the default
virtqueue group through the .get_vq_group() op.

In principle, the descriptor group may or may not have same group ID
as the default group. Even if the descriptor group has a different ID,
meaning the vq's descriptor group areas can optionally move to a
separate address space than where guest memory resides, the descriptor
group may still start from a default address space, same as where its
buffers reside. To move the descriptor group to a different address
space, .set_group_asid() has to be called to change the ASID binding
for the group, which is no different than what needs to be done on any
other virtqueue group. On the other hand, the .reset() semantics also
applies on descriptor table group, meaning the device reset will clear
all ASID bindings and move all virtqueue groups including descriptor
group back to the default address space, i.e. in ASID 0.

QEMU's shadow virtqueue is going to utilize dedicated descriptor group
to speed up map and unmap operations, yielding tremendous downtime
reduction by avoiding the full and slow remap cycle in SVQ switching.

Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231018171456.1624030-4-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Si-Wei Liu <si-wei.liu@oracle.com>
Tested-by: Si-Wei Liu <si-wei.liu@oracle.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 include/linux/vdpa.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 0e652026b776..d376309b99cf 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -204,6 +204,16 @@ struct vdpa_map_file {
  *				@vdev: vdpa device
  *				@idx: virtqueue index
  *				Returns u32: group id for this virtqueue
+ * @get_vq_desc_group:		Get the group id for the descriptor table of
+ *				a specific virtqueue (optional)
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ *				Returns u32: group id for the descriptor table
+ *				portion of this virtqueue. Could be different
+ *				than the one from @get_vq_group, in which case
+ *				the access to the descriptor table can be
+ *				confined to a separate asid, isolating from
+ *				the virtqueue's buffer address access.
  * @get_device_features:	Get virtio features supported by the device
  *				@vdev: vdpa device
  *				Returns the virtio features support by the
@@ -360,6 +370,7 @@ struct vdpa_config_ops {
 	/* Device ops */
 	u32 (*get_vq_align)(struct vdpa_device *vdev);
 	u32 (*get_vq_group)(struct vdpa_device *vdev, u16 idx);
+	u32 (*get_vq_desc_group)(struct vdpa_device *vdev, u16 idx);
 	u64 (*get_device_features)(struct vdpa_device *vdev);
 	u64 (*get_backend_features)(const struct vdpa_device *vdev);
 	int (*set_driver_features)(struct vdpa_device *vdev, u64 features);
-- 
cgit v1.2.3


From 03dd63c8fae4599439a30e0dde91061789260dbe Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Wed, 18 Oct 2023 20:14:53 +0300
Subject: vdpa/mlx5: Enable hw support for vq descriptor mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Vq descriptor mappings are supported in hardware by filling in an
additional mkey which contains the descriptor mappings to the hw vq.

A previous patch in this series added support for hw mkey (mr) creation
for ASID 1.

This patch fills in both the vq data and vq descriptor mkeys based on
group ASID mapping.

The feature is signaled to the vdpa core through the presence of the
.get_vq_desc_group op.

Acked-by: Jason Wang <jasowang@redhat.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Message-Id: <20231018171456.1624030-16-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Si-Wei Liu <si-wei.liu@oracle.com>
Tested-by: Si-Wei Liu <si-wei.liu@oracle.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 include/linux/mlx5/mlx5_ifc_vdpa.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h
index 9becdc3fa503..b86d51a855f6 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -74,7 +74,11 @@ struct mlx5_ifc_virtio_q_bits {
 	u8    reserved_at_320[0x8];
 	u8    pd[0x18];
 
-	u8    reserved_at_340[0xc0];
+	u8    reserved_at_340[0x20];
+
+	u8    desc_group_mkey[0x20];
+
+	u8    reserved_at_380[0x80];
 };
 
 struct mlx5_ifc_virtio_net_q_object_bits {
@@ -141,6 +145,7 @@ enum {
 	MLX5_VIRTQ_MODIFY_MASK_STATE                    = (u64)1 << 0,
 	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS      = (u64)1 << 3,
 	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4,
+	MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY          = (u64)1 << 14,
 };
 
 enum {
-- 
cgit v1.2.3


From a5d7df87843dd9025015e4038dd927e893cc8b8b Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Mon, 11 Sep 2023 14:31:04 -0700
Subject: virtio: kdoc for struct virtio_pci_modern_device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Finally following up to Simon's suggestion for some kdoc attention
on struct virtio_pci_modern_device.

Link: https://lore.kernel.org/netdev/ZE%2FQS0lnUvxFacjf@corigine.com/
Cc: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Message-Id: <20230911213104.14391-1-shannon.nelson@amd.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/virtio_pci_modern.h | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index 067ac1d789bc..a38c729d1973 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -12,37 +12,47 @@ struct virtio_pci_modern_common_cfg {
 	__le16 queue_reset;		/* read-write */
 };
 
+/**
+ * struct virtio_pci_modern_device - info for modern PCI virtio
+ * @pci_dev:	    Ptr to the PCI device struct
+ * @common:	    Position of the common capability in the PCI config
+ * @device:	    Device-specific data (non-legacy mode)
+ * @notify_base:    Base of vq notifications (non-legacy mode)
+ * @notify_pa:	    Physical base of vq notifications
+ * @isr:	    Where to read and clear interrupt
+ * @notify_len:	    So we can sanity-check accesses
+ * @device_len:	    So we can sanity-check accesses
+ * @notify_map_cap: Capability for when we need to map notifications per-vq
+ * @notify_offset_multiplier: Multiply queue_notify_off by this value
+ *                            (non-legacy mode).
+ * @modern_bars:    Bitmask of BARs
+ * @id:		    Device and vendor id
+ * @device_id_check: Callback defined before vp_modern_probe() to be used to
+ *		    verify the PCI device is a vendor's expected device rather
+ *		    than the standard virtio PCI device
+ *		    Returns the found device id or ERRNO
+ * @dma_mask:	    Optional mask instead of the traditional DMA_BIT_MASK(64),
+ *		    for vendor devices with DMA space address limitations
+ */
 struct virtio_pci_modern_device {
 	struct pci_dev *pci_dev;
 
 	struct virtio_pci_common_cfg __iomem *common;
-	/* Device-specific data (non-legacy mode)  */
 	void __iomem *device;
-	/* Base of vq notifications (non-legacy mode). */
 	void __iomem *notify_base;
-	/* Physical base of vq notifications */
 	resource_size_t notify_pa;
-	/* Where to read and clear interrupt */
 	u8 __iomem *isr;
 
-	/* So we can sanity-check accesses. */
 	size_t notify_len;
 	size_t device_len;
 
-	/* Capability for when we need to map notifications per-vq. */
 	int notify_map_cap;
 
-	/* Multiply queue_notify_off by this value. (non-legacy mode). */
 	u32 notify_offset_multiplier;
-
 	int modern_bars;
-
 	struct virtio_device_id id;
 
-	/* optional check for vendor virtio device, returns dev_id or -ERRNO */
 	int (*device_id_check)(struct pci_dev *pdev);
-
-	/* optional mask for devices with limited DMA space */
 	u64 dma_mask;
 };
 
-- 
cgit v1.2.3


From e0592acd1ef2497b861ef7ed6eda14b092b1e667 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Thu, 19 Oct 2023 11:49:02 +0800
Subject: virtio_pci: add check for common cfg size

Some buggy devices, the common cfg size may not match the features.

This patch checks the common cfg size for the
features(VIRTIO_F_NOTIF_CONFIG_DATA, VIRTIO_F_RING_RESET). When the
common cfg size does not match the corresponding feature, we fail the
probe and print error message.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231019034902.7346-1-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_pci_modern.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index a38c729d1973..d0f2797420f7 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -45,6 +45,7 @@ struct virtio_pci_modern_device {
 
 	size_t notify_len;
 	size_t device_len;
+	size_t common_len;
 
 	int notify_map_cap;
 
-- 
cgit v1.2.3


From d2cf1b6e3b85dcb2bb3e8cd7924beede34fbbf0e Mon Sep 17 00:00:00 2001
From: Si-Wei Liu <si-wei.liu@oracle.com>
Date: Sat, 21 Oct 2023 02:25:13 -0700
Subject: vdpa: introduce .reset_map operation callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some device specific IOMMU parent drivers have long standing bogus
behavior that mistakenly clean up the maps during .reset. By definition,
this is violation to the on-chip IOMMU ops (i.e. .set_map, or .dma_map &
.dma_unmap) in those offending drivers, as the removal of internal maps
is completely agnostic to the upper layer, causing inconsistent view
between the userspace and the kernel. Some userspace app like QEMU gets
around of this brokenness by proactively removing and adding back all
the maps around vdpa device reset, but such workaround actually penalize
other well-behaved driver setup, where vdpa reset always comes with the
associated mapping cost, especially for kernel vDPA devices
(use_va=false) that have high cost on pinning. It's imperative to
rectify this behavior and remove the problematic code from all those
non-compliant parent drivers.

The reason why a separate .reset_map op is introduced is because this
allows a simple on-chip IOMMU model without exposing too much device
implementation detail to the upper vdpa layer. The .dma_map/unmap or
.set_map driver API is meant to be used to manipulate the IOTLB
mappings, and has been abstracted in a way similar to how a real IOMMU
device maps or unmaps pages for certain memory ranges. However, apart
from this there also exists other mapping needs, in which case 1:1
passthrough mapping has to be used by other users (read virtio-vdpa). To
ease parent/vendor driver implementation and to avoid abusing DMA ops in
an unexpacted way, these on-chip IOMMU devices can start with 1:1
passthrough mapping mode initially at the time of creation. Then the
.reset_map op can be used to switch iotlb back to this initial state
without having to expose a complex two-dimensional IOMMU device model.

The .reset_map is not a MUST for every parent that implements the
.dma_map or .set_map API, because device may work with DMA ops directly
by implement their own to manipulate system memory mappings, so don't
have to use .reset_map to achieve a simple IOMMU device model for 1:1
passthrough mapping.

Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <1697880319-4937-2-git-send-email-si-wei.liu@oracle.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 include/linux/vdpa.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index d376309b99cf..26ae6ae1eac3 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -327,6 +327,15 @@ struct vdpa_map_file {
  *				@iova: iova to be unmapped
  *				@size: size of the area
  *				Returns integer: success (0) or error (< 0)
+ * @reset_map:			Reset device memory mapping to the default
+ *				state (optional)
+ *				Needed for devices that are using device
+ *				specific DMA translation and prefer mapping
+ *				to be decoupled from the virtio life cycle,
+ *				i.e. device .reset op does not reset mapping
+ *				@vdev: vdpa device
+ *				@asid: address space identifier
+ *				Returns integer: success (0) or error (< 0)
  * @get_vq_dma_dev:		Get the dma device for a specific
  *				virtqueue (optional)
  *				@vdev: vdpa device
@@ -405,6 +414,7 @@ struct vdpa_config_ops {
 		       u64 iova, u64 size, u64 pa, u32 perm, void *opaque);
 	int (*dma_unmap)(struct vdpa_device *vdev, unsigned int asid,
 			 u64 iova, u64 size);
+	int (*reset_map)(struct vdpa_device *vdev, unsigned int asid);
 	int (*set_group_asid)(struct vdpa_device *vdev, unsigned int group,
 			      unsigned int asid);
 	struct device *(*get_vq_dma_dev)(struct vdpa_device *vdev, u16 idx);
-- 
cgit v1.2.3


From a26f2e4e68ee3130e5d5acb4f58807041aaea905 Mon Sep 17 00:00:00 2001
From: Si-Wei Liu <si-wei.liu@oracle.com>
Date: Sat, 21 Oct 2023 02:25:16 -0700
Subject: vdpa: introduce .compat_reset operation callback

Some device specific IOMMU parent drivers have long standing bogus
behaviour that mistakenly clean up the maps during .reset. By
definition, this is violation to the on-chip IOMMU ops (i.e. .set_map,
or .dma_map & .dma_unmap) in those offending drivers, as the removal of
internal maps is completely agnostic to the upper layer, causing
inconsistent view between the userspace and the kernel. Some userspace
app like QEMU gets around of this brokenness by proactively removing and
adding back all the maps around vdpa device reset, but such workaround
actually penaltize other well-behaved driver setup, where vdpa reset
always comes with the associated mapping cost, especially for kernel
vDPA devices (use_va=false) that have high cost on pinning. It's
imperative to rectify this behaviour and remove the problematic code
from all those non-compliant parent drivers.

However, we cannot unconditionally remove the bogus map-cleaning code
from the buggy .reset implementation, as there might exist userspace
apps that already rely on the behaviour on some setup. Introduce a
.compat_reset driver op to keep compatibility with older userspace. New
and well behaved parent driver should not bother to implement such op,
but only those drivers that are doing or used to do non-compliant
map-cleaning reset will have to.

Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
Message-Id: <1697880319-4937-5-git-send-email-si-wei.liu@oracle.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 include/linux/vdpa.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 26ae6ae1eac3..6b8cbf75712d 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -252,6 +252,17 @@ struct vdpa_map_file {
  * @reset:			Reset device
  *				@vdev: vdpa device
  *				Returns integer: success (0) or error (< 0)
+ * @compat_reset:		Reset device with compatibility quirks to
+ *				accommodate older userspace. Only needed by
+ *				parent driver which used to have bogus reset
+ *				behaviour, and has to maintain such behaviour
+ *				for compatibility with older userspace.
+ *				Historically compliant driver only has to
+ *				implement .reset, Historically non-compliant
+ *				driver should implement both.
+ *				@vdev: vdpa device
+ *				@flags: compatibility quirks for reset
+ *				Returns integer: success (0) or error (< 0)
  * @suspend:			Suspend the device (optional)
  *				@vdev: vdpa device
  *				Returns integer: success (0) or error (< 0)
@@ -393,6 +404,8 @@ struct vdpa_config_ops {
 	u8 (*get_status)(struct vdpa_device *vdev);
 	void (*set_status)(struct vdpa_device *vdev, u8 status);
 	int (*reset)(struct vdpa_device *vdev);
+	int (*compat_reset)(struct vdpa_device *vdev, u32 flags);
+#define VDPA_RESET_F_CLEAN_MAP 1
 	int (*suspend)(struct vdpa_device *vdev);
 	int (*resume)(struct vdpa_device *vdev);
 	size_t (*get_config_size)(struct vdpa_device *vdev);
-- 
cgit v1.2.3


From bc91df5c70ac720eca18bd1f4a288f2582713d3e Mon Sep 17 00:00:00 2001
From: Si-Wei Liu <si-wei.liu@oracle.com>
Date: Sat, 21 Oct 2023 02:25:17 -0700
Subject: vhost-vdpa: clean iotlb map during reset for older userspace

Using .compat_reset op from the previous patch, the buggy .reset
behaviour can be kept as-is on older userspace apps, which don't ack the
IOTLB_PERSIST backend feature. As this compatibility quirk is limited to
those drivers that used to be buggy in the past, it won't affect change
the behaviour or affect ABI on the setups with API compliant driver.

The separation of .compat_reset from the regular .reset allows
vhost-vdpa able to know which driver had broken behaviour before, so it
can apply the corresponding compatibility quirk to the individual driver
whenever needed.  Compared to overloading the existing .reset with
flags, .compat_reset won't cause any extra burden to the implementation
of every compliant driver.

[mst: squashed in two fixup commits]

Message-Id: <1697880319-4937-6-git-send-email-si-wei.liu@oracle.com>
Message-Id: <1698102863-21122-1-git-send-email-si-wei.liu@oracle.com>
Reported-by: Dragos Tatulea <dtatulea@nvidia.com>
Tested-by: Dragos Tatulea <dtatulea@nvidia.com>
Message-Id: <1698275594-19204-1-git-send-email-si-wei.liu@oracle.com>
Reported-by: Lei Yang <leiyang@redhat.com>
Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 include/linux/vdpa.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 6b8cbf75712d..db15ac07f8a6 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -519,14 +519,17 @@ static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev)
 	return vdev->dma_dev;
 }
 
-static inline int vdpa_reset(struct vdpa_device *vdev)
+static inline int vdpa_reset(struct vdpa_device *vdev, u32 flags)
 {
 	const struct vdpa_config_ops *ops = vdev->config;
 	int ret;
 
 	down_write(&vdev->cf_lock);
 	vdev->features_valid = false;
-	ret = ops->reset(vdev);
+	if (ops->compat_reset && flags)
+		ret = ops->compat_reset(vdev, flags);
+	else
+		ret = ops->reset(vdev);
 	up_write(&vdev->cf_lock);
 	return ret;
 }
-- 
cgit v1.2.3


From 3503895788d402d6a3814085ed582c364ec3e903 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 31 Oct 2023 12:02:06 -0400
Subject: virtio_pci: move structure to a header

These are guest/host interfaces, so they belong in the header where e.g.
qemu will know to find them.

Note: we added a new structure as opposed to extending existing one
because someone might be relying on the size of the existing structure
staying unchanged.  Add a warning to avoid using sizeof.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 include/linux/virtio_pci_modern.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index d0f2797420f7..a09e13a577a9 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -5,13 +5,6 @@
 #include <linux/pci.h>
 #include <linux/virtio_pci.h>
 
-struct virtio_pci_modern_common_cfg {
-	struct virtio_pci_common_cfg cfg;
-
-	__le16 queue_notify_data;	/* read-write */
-	__le16 queue_reset;		/* read-write */
-};
-
 /**
  * struct virtio_pci_modern_device - info for modern PCI virtio
  * @pci_dev:	    Ptr to the PCI device struct
-- 
cgit v1.2.3


From 8001f49394e353f035306a45bcf504f06fca6355 Mon Sep 17 00:00:00 2001
From: Krister Johansen <kjlx@templeofstupid.com>
Date: Fri, 27 Oct 2023 14:46:40 -0700
Subject: proc: sysctl: prevent aliased sysctls from getting passed to init

The code that checks for unknown boot options is unaware of the sysctl
alias facility, which maps bootparams to sysctl values.  If a user sets
an old value that has a valid alias, a message about an invalid
parameter will be printed during boot, and the parameter will get passed
to init.  Fix by checking for the existence of aliased parameters in the
unknown boot parameter code.  If an alias exists, don't return an error
or pass the value to init.

Signed-off-by: Krister Johansen <kjlx@templeofstupid.com>
Cc: stable@vger.kernel.org
Fixes: 0a477e1ae21b ("kernel/sysctl: support handling command line aliases")
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/sysctl.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 09d7429d67c0..61b40ea81f4d 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -242,6 +242,7 @@ extern void __register_sysctl_init(const char *path, struct ctl_table *table,
 extern struct ctl_table_header *register_sysctl_mount_point(const char *path);
 
 void do_sysctl_args(void);
+bool sysctl_is_alias(char *param);
 int do_proc_douintvec(struct ctl_table *table, int write,
 		      void *buffer, size_t *lenp, loff_t *ppos,
 		      int (*conv)(unsigned long *lvalp,
@@ -287,6 +288,11 @@ static inline void setup_sysctl_set(struct ctl_table_set *p,
 static inline void do_sysctl_args(void)
 {
 }
+
+static inline bool sysctl_is_alias(char *param)
+{
+	return false;
+}
 #endif /* CONFIG_SYSCTL */
 
 int sysctl_max_threads(struct ctl_table *table, int write, void *buffer,
-- 
cgit v1.2.3


From a9b8d90f87263f985fa14250fc8caf7bfcde8803 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 24 Aug 2023 14:52:19 -0400
Subject: NFSv4: fairly test all delegations on a SEQ4_ revocation

When the client is required to use TEST_STATEID to discover which
delegation(s) have been revoked, it may continually test delegations at the
head of the list if the server continues to be unsatisfied and send
SEQ4_STATUS_RECALLABLE_STATE_REVOKED.  For a large number of delegations
this behavior is prone to live-lock because the client may never be able to
test and free revoked state at the end of the list since the
SEQ4_STATUS_RECALLABLE_STATE_REVOKED will cause us to flag delegations at
the head of the list to be tested.  This problem is further exacerbated by
the state manager's willingness to be scheduled out on a busy system while
testing the list of delegations.

Keep a generation counter for each attempt to test all delegations, and
skip delegations that have already been tested in the current pass.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Torkil Svensgaard <torkil@drcmr.dk>
Tested-by: Ruben Vestergaard <rubenv@drcmr.dk>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs_sb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index cd628c4b011e..cd797e00fe35 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -239,6 +239,7 @@ struct nfs_server {
 	struct list_head	delegations;
 	struct list_head	ss_copies;
 
+	unsigned long		delegation_gen;
 	unsigned long		mig_gen;
 	unsigned long		mig_status;
 #define NFS_MIG_IN_TRANSITION		(1)
-- 
cgit v1.2.3


From bfca5fb4e97c46503ddfc582335917b0cc228264 Mon Sep 17 00:00:00 2001
From: felix <fuzhen5@huawei.com>
Date: Mon, 23 Oct 2023 09:40:19 +0800
Subject: SUNRPC: Fix RPC client cleaned up the freed pipefs dentries

RPC client pipefs dentries cleanup is in separated rpc_remove_pipedir()
workqueue,which takes care about pipefs superblock locking.
In some special scenarios, when kernel frees the pipefs sb of the
current client and immediately alloctes a new pipefs sb,
rpc_remove_pipedir function would misjudge the existence of pipefs
sb which is not the one it used to hold. As a result,
the rpc_remove_pipedir would clean the released freed pipefs dentries.

To fix this issue, rpc_remove_pipedir should check whether the
current pipefs sb is consistent with the original pipefs sb.

This error can be catched by KASAN:
=========================================================
[  250.497700] BUG: KASAN: slab-use-after-free in dget_parent+0x195/0x200
[  250.498315] Read of size 4 at addr ffff88800a2ab804 by task kworker/0:18/106503
[  250.500549] Workqueue: events rpc_free_client_work
[  250.501001] Call Trace:
[  250.502880]  kasan_report+0xb6/0xf0
[  250.503209]  ? dget_parent+0x195/0x200
[  250.503561]  dget_parent+0x195/0x200
[  250.503897]  ? __pfx_rpc_clntdir_depopulate+0x10/0x10
[  250.504384]  rpc_rmdir_depopulate+0x1b/0x90
[  250.504781]  rpc_remove_client_dir+0xf5/0x150
[  250.505195]  rpc_free_client_work+0xe4/0x230
[  250.505598]  process_one_work+0x8ee/0x13b0
...
[   22.039056] Allocated by task 244:
[   22.039390]  kasan_save_stack+0x22/0x50
[   22.039758]  kasan_set_track+0x25/0x30
[   22.040109]  __kasan_slab_alloc+0x59/0x70
[   22.040487]  kmem_cache_alloc_lru+0xf0/0x240
[   22.040889]  __d_alloc+0x31/0x8e0
[   22.041207]  d_alloc+0x44/0x1f0
[   22.041514]  __rpc_lookup_create_exclusive+0x11c/0x140
[   22.041987]  rpc_mkdir_populate.constprop.0+0x5f/0x110
[   22.042459]  rpc_create_client_dir+0x34/0x150
[   22.042874]  rpc_setup_pipedir_sb+0x102/0x1c0
[   22.043284]  rpc_client_register+0x136/0x4e0
[   22.043689]  rpc_new_client+0x911/0x1020
[   22.044057]  rpc_create_xprt+0xcb/0x370
[   22.044417]  rpc_create+0x36b/0x6c0
...
[   22.049524] Freed by task 0:
[   22.049803]  kasan_save_stack+0x22/0x50
[   22.050165]  kasan_set_track+0x25/0x30
[   22.050520]  kasan_save_free_info+0x2b/0x50
[   22.050921]  __kasan_slab_free+0x10e/0x1a0
[   22.051306]  kmem_cache_free+0xa5/0x390
[   22.051667]  rcu_core+0x62c/0x1930
[   22.051995]  __do_softirq+0x165/0x52a
[   22.052347]
[   22.052503] Last potentially related work creation:
[   22.052952]  kasan_save_stack+0x22/0x50
[   22.053313]  __kasan_record_aux_stack+0x8e/0xa0
[   22.053739]  __call_rcu_common.constprop.0+0x6b/0x8b0
[   22.054209]  dentry_free+0xb2/0x140
[   22.054540]  __dentry_kill+0x3be/0x540
[   22.054900]  shrink_dentry_list+0x199/0x510
[   22.055293]  shrink_dcache_parent+0x190/0x240
[   22.055703]  do_one_tree+0x11/0x40
[   22.056028]  shrink_dcache_for_umount+0x61/0x140
[   22.056461]  generic_shutdown_super+0x70/0x590
[   22.056879]  kill_anon_super+0x3a/0x60
[   22.057234]  rpc_kill_sb+0x121/0x200

Fixes: 0157d021d23a ("SUNRPC: handle RPC client pipefs dentries by network namespace aware routines")
Signed-off-by: felix <fuzhen5@huawei.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index af7358277f1c..e9d4377d03c6 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -92,6 +92,7 @@ struct rpc_clnt {
 	};
 	const struct cred	*cl_cred;
 	unsigned int		cl_max_connect; /* max number of transports not to the same IP */
+	struct super_block *pipefs_sb;
 };
 
 /*
-- 
cgit v1.2.3


From f003a717ae9086b1e8a4663124a96862df7282e7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Sep 2023 18:33:33 +0100
Subject: nfs: Convert nfs_symlink() to use a folio

Use the folio APIs, saving about four calls to compound_head().
Convert back to a page in each of the individual protocol implementations.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_xdr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 12bbb5c63664..539b57fbf3ce 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1772,7 +1772,7 @@ struct nfs_rpc_ops {
 	void	(*rename_rpc_prepare)(struct rpc_task *task, struct nfs_renamedata *);
 	int	(*rename_done) (struct rpc_task *task, struct inode *old_dir, struct inode *new_dir);
 	int	(*link)    (struct inode *, struct inode *, const struct qstr *);
-	int	(*symlink) (struct inode *, struct dentry *, struct page *,
+	int	(*symlink) (struct inode *, struct dentry *, struct folio *,
 			    unsigned int, struct iattr *);
 	int	(*mkdir)   (struct inode *, struct dentry *, struct iattr *);
 	int	(*rmdir)   (struct inode *, const struct qstr *);
-- 
cgit v1.2.3


From 2c7ccb3c362bc86aeb3e52d6fbf15f7f480ca961 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 13 Sep 2023 16:54:14 -0700
Subject: module: Clarify documentation of module_param_call()

Commit 9bbb9e5a3310 ("param: use ops in struct kernel_param, rather than
get and set fns directly") added the comment that module_param_call()
was deprecated, during a large scale refactoring to bring sanity to type
casting back then. In 2017 following more cleanups, it became useful
again as it wraps a common pattern of creating an ops struct for a
given get/set pair:

  b2f270e87473 ("module: Prepare to convert all module_param_call() prototypes")
  ece1996a21ee ("module: Do not paper over type mismatches in module_param_call()")

        static const struct kernel_param_ops __param_ops_##name = \
                { .flags = 0, .set = _set, .get = _get }; \
        __module_param_call(MODULE_PARAM_PREFIX, \
                            name, &__param_ops_##name, arg, perm, -1, 0)

        __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, -1, 0)

Many users of module_param_cb() appear to be almost universally
open-coding the same thing that module_param_call() does now. Don't
discourage[1] people from using module_param_call(): clarify the comment
to show that module_param_cb() is useful if you repeatedly use the same
pair of get/set functions.

[1] https://lore.kernel.org/lkml/202308301546.5C789E5EC@keescook/

Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Johan Hovold <johan@kernel.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Nick Desaulniers <ndesaulniers@gooogle.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Joe Perches <joe@perches.com>
Cc: linux-modules@vger.kernel.org
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/moduleparam.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 962cd41a2cb5..d4452f93d060 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -293,7 +293,11 @@ struct kparam_array
 	= { __param_str_##name, THIS_MODULE, ops,			\
 	    VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } }
 
-/* Obsolete - use module_param_cb() */
+/*
+ * Useful for describing a set/get pair used only once (i.e. for this
+ * parameter). For repeated set/get pairs (i.e. the same struct
+ * kernel_param_ops), use module_param_cb() instead.
+ */
 #define module_param_call(name, _set, _get, arg, perm)			\
 	static const struct kernel_param_ops __param_ops_##name =	\
 		{ .flags = 0, .set = _set, .get = _get };		\
-- 
cgit v1.2.3


From 60da3640b07ce03706a8c77a3740ebad8b9af063 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Fri, 16 Jun 2023 09:51:32 +0800
Subject: module: Make is_mapping_symbol() return bool

The return value of is_mapping_symbol() is true or false,
so change its type to reflect that.

Suggested-by: Xi Zhang <zhangxi@kylinos.cn>
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/module_symbol.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/module_symbol.h b/include/linux/module_symbol.h
index 1269543d0634..77c9895b9ddb 100644
--- a/include/linux/module_symbol.h
+++ b/include/linux/module_symbol.h
@@ -3,7 +3,7 @@
 #define _LINUX_MODULE_SYMBOL_H
 
 /* This ignores the intensely annoying "mapping symbols" found in ELF files. */
-static inline int is_mapping_symbol(const char *str)
+static inline bool is_mapping_symbol(const char *str)
 {
 	if (str[0] == '.' && str[1] == 'L')
 		return true;
-- 
cgit v1.2.3


From bb32500fb9b78215e4ef6ee8b4345c5f5d7eafb4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 31 Oct 2023 12:24:53 -0400
Subject: tracing: Have trace_event_file have ref counters

The following can crash the kernel:

 # cd /sys/kernel/tracing
 # echo 'p:sched schedule' > kprobe_events
 # exec 5>>events/kprobes/sched/enable
 # > kprobe_events
 # exec 5>&-

The above commands:

 1. Change directory to the tracefs directory
 2. Create a kprobe event (doesn't matter what one)
 3. Open bash file descriptor 5 on the enable file of the kprobe event
 4. Delete the kprobe event (removes the files too)
 5. Close the bash file descriptor 5

The above causes a crash!

 BUG: kernel NULL pointer dereference, address: 0000000000000028
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: 0000 [#1] PREEMPT SMP PTI
 CPU: 6 PID: 877 Comm: bash Not tainted 6.5.0-rc4-test-00008-g2c6b6b1029d4-dirty #186
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
 RIP: 0010:tracing_release_file_tr+0xc/0x50

What happens here is that the kprobe event creates a trace_event_file
"file" descriptor that represents the file in tracefs to the event. It
maintains state of the event (is it enabled for the given instance?).
Opening the "enable" file gets a reference to the event "file" descriptor
via the open file descriptor. When the kprobe event is deleted, the file is
also deleted from the tracefs system which also frees the event "file"
descriptor.

But as the tracefs file is still opened by user space, it will not be
totally removed until the final dput() is called on it. But this is not
true with the event "file" descriptor that is already freed. If the user
does a write to or simply closes the file descriptor it will reference the
event "file" descriptor that was just freed, causing a use-after-free bug.

To solve this, add a ref count to the event "file" descriptor as well as a
new flag called "FREED". The "file" will not be freed until the last
reference is released. But the FREE flag will be set when the event is
removed to prevent any more modifications to that event from happening,
even if there's still a reference to the event "file" descriptor.

Link: https://lore.kernel.org/linux-trace-kernel/20231031000031.1e705592@gandalf.local.home/
Link: https://lore.kernel.org/linux-trace-kernel/20231031122453.7a48b923@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Mark Rutland <mark.rutland@arm.com>
Fixes: f5ca233e2e66d ("tracing: Increase trace array ref count on enable and filter files")
Reported-by: Beau Belgrave <beaub@linux.microsoft.com>
Tested-by: Beau Belgrave <beaub@linux.microsoft.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 12207dc6722d..696f8dc4aa53 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -492,6 +492,7 @@ enum {
 	EVENT_FILE_FL_TRIGGER_COND_BIT,
 	EVENT_FILE_FL_PID_FILTER_BIT,
 	EVENT_FILE_FL_WAS_ENABLED_BIT,
+	EVENT_FILE_FL_FREED_BIT,
 };
 
 extern struct trace_event_file *trace_get_event_file(const char *instance,
@@ -630,6 +631,7 @@ extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...);
  *  TRIGGER_COND  - When set, one or more triggers has an associated filter
  *  PID_FILTER    - When set, the event is filtered based on pid
  *  WAS_ENABLED   - Set when enabled to know to clear trace on module removal
+ *  FREED         - File descriptor is freed, all fields should be considered invalid
  */
 enum {
 	EVENT_FILE_FL_ENABLED		= (1 << EVENT_FILE_FL_ENABLED_BIT),
@@ -643,6 +645,7 @@ enum {
 	EVENT_FILE_FL_TRIGGER_COND	= (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),
 	EVENT_FILE_FL_PID_FILTER	= (1 << EVENT_FILE_FL_PID_FILTER_BIT),
 	EVENT_FILE_FL_WAS_ENABLED	= (1 << EVENT_FILE_FL_WAS_ENABLED_BIT),
+	EVENT_FILE_FL_FREED		= (1 << EVENT_FILE_FL_FREED_BIT),
 };
 
 struct trace_event_file {
@@ -671,6 +674,7 @@ struct trace_event_file {
 	 * caching and such. Which is mostly OK ;-)
 	 */
 	unsigned long		flags;
+	atomic_t		ref;	/* ref count for opened files */
 	atomic_t		sm_ref;	/* soft-mode reference counter */
 	atomic_t		tm_ref;	/* trigger-mode reference counter */
 };
-- 
cgit v1.2.3


From 44365329f8219fc379097c2c9a75ff53f123764f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 1 Nov 2023 13:25:46 -0400
Subject: eventfs: Hold eventfs_mutex when calling callback functions

The callback function that is used to create inodes and dentries is not
protected by anything and the data that is passed to it could become
stale. After eventfs_remove_dir() is called by the tracing system, it is
free to remove the events that are associated to that directory.
Unfortunately, that means the callbacks must not be called after that.

     CPU0				CPU1
     ----				----
 eventfs_root_lookup() {
				 eventfs_remove_dir() {
				      mutex_lock(&event_mutex);
				      ei->is_freed = set;
				      mutex_unlock(&event_mutex);
				 }
				 kfree(event_call);

    for (...) {
      entry = &ei->entries[i];
      r = entry->callback() {
          call = data;		// call == event_call above
          if (call->flags ...)

 [ USE AFTER FREE BUG ]

The safest way to protect this is to wrap the callback with:

 mutex_lock(&eventfs_mutex);
 if (!ei->is_freed)
     r = entry->callback();
 else
     r = -1;
 mutex_unlock(&eventfs_mutex);

This will make sure that the callback will not be called after it is
freed. But now it needs to be known that the callback is called while
holding internal eventfs locks, and that it must not call back into the
eventfs / tracefs system. There's no reason it should anyway, but document
that as well.

Link: https://lore.kernel.org/all/CA+G9fYu9GOEbD=rR5eMR-=HJ8H6rMsbzDC2ZY5=Y50WpWAE7_Q@mail.gmail.com/
Link: https://lkml.kernel.org/r/20231101172649.906696613@goodmis.org

Cc: Ajay Kaher <akaher@vmware.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode")
Reported-by: Linux Kernel Functional Testing <lkft@linaro.org>
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Tested-by: Linux Kernel Functional Testing <lkft@linaro.org>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracefs.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index 13359b1a35d1..7a5fe17b6bf9 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -23,9 +23,52 @@ struct file_operations;
 
 struct eventfs_file;
 
+/**
+ * eventfs_callback - A callback function to create dynamic files in eventfs
+ * @name: The name of the file that is to be created
+ * @mode: return the file mode for the file (RW access, etc)
+ * @data: data to pass to the created file ops
+ * @fops: the file operations of the created file
+ *
+ * The evetnfs files are dynamically created. The struct eventfs_entry array
+ * is passed to eventfs_create_dir() or eventfs_create_events_dir() that will
+ * be used to create the files within those directories. When a lookup
+ * or access to a file within the directory is made, the struct eventfs_entry
+ * array is used to find a callback() with the matching name that is being
+ * referenced (for lookups, the entire array is iterated and each callback
+ * will be called).
+ *
+ * The callback will be called with @name for the name of the file to create.
+ * The callback can return less than 1 to indicate  that no file should be
+ * created.
+ *
+ * If a file is to be created, then @mode should be populated with the file
+ * mode (permissions) for which the file is created for. This would be
+ * used to set the created inode i_mode field.
+ *
+ * The @data should be set to the data passed to the other file operations
+ * (read, write, etc). Note, @data will also point to the data passed in
+ * to eventfs_create_dir() or eventfs_create_events_dir(), but the callback
+ * can replace the data if it chooses to. Otherwise, the original data
+ * will be used for the file operation functions.
+ *
+ * The @fops should be set to the file operations that will be used to create
+ * the inode.
+ *
+ * NB. This callback is called while holding internal locks of the eventfs
+ *     system. The callback must not call any code that might also call into
+ *     the tracefs or eventfs system or it will risk creating a deadlock.
+ */
 typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
 				const struct file_operations **fops);
 
+/**
+ * struct eventfs_entry - dynamically created eventfs file call back handler
+ * @name:	Then name of the dynamic file in an eventfs directory
+ * @callback:	The callback to get the fops of the file when it is created
+ *
+ * See evenfs_callback() typedef for how to set up @callback.
+ */
 struct eventfs_entry {
 	const char			*name;
 	eventfs_callback		callback;
-- 
cgit v1.2.3


From 391145ba2accc48b596f3d438af1a6255b62a555 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 31 Oct 2023 14:56:24 -0700
Subject: bpf: Add __bpf_kfunc_{start,end}_defs macros

BPF kfuncs are meant to be called from BPF programs. Accordingly, most
kfuncs are not called from anywhere in the kernel, which the
-Wmissing-prototypes warning is unhappy about. We've peppered
__diag_ignore_all("-Wmissing-prototypes", ... everywhere kfuncs are
defined in the codebase to suppress this warning.

This patch adds two macros meant to bound one or many kfunc definitions.
All existing kfunc definitions which use these __diag calls to suppress
-Wmissing-prototypes are migrated to use the newly-introduced macros.
A new __diag_ignore_all - for "-Wmissing-declarations" - is added to the
__bpf_kfunc_start_defs macro based on feedback from Andrii on an earlier
version of this patch [0] and another recent mailing list thread [1].

In the future we might need to ignore different warnings or do other
kfunc-specific things. This change will make it easier to make such
modifications for all kfunc defs.

  [0]: https://lore.kernel.org/bpf/CAEf4BzaE5dRWtK6RPLnjTW-MW9sx9K3Fn6uwqCTChK2Dcb1Xig@mail.gmail.com/
  [1]: https://lore.kernel.org/bpf/ZT+2qCc%2FaXep0%2FLf@krava/

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20231031215625.2343848-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index c2231c64d60b..dc5ce962f600 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -84,6 +84,15 @@
  */
 #define __bpf_kfunc __used noinline
 
+#define __bpf_kfunc_start_defs()					       \
+	__diag_push();							       \
+	__diag_ignore_all("-Wmissing-declarations",			       \
+			  "Global kfuncs as their definitions will be in BTF");\
+	__diag_ignore_all("-Wmissing-prototypes",			       \
+			  "Global kfuncs as their definitions will be in BTF")
+
+#define __bpf_kfunc_end_defs() __diag_pop()
+
 /*
  * Return the name of the passed struct, if exists, or halt the build if for
  * example the structure gets renamed. In this way, developers have to revisit
-- 
cgit v1.2.3


From 15fb6f2b6c4c3c129adc2412ae12ec15e60a6adb Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 31 Oct 2023 14:56:25 -0700
Subject: bpf: Add __bpf_hook_{start,end} macros

Not all uses of __diag_ignore_all(...) in BPF-related code in order to
suppress warnings are wrapping kfunc definitions. Some "hook point"
definitions - small functions meant to be used as attach points for
fentry and similar BPF progs - need to suppress -Wmissing-declarations.

We could use __bpf_kfunc_{start,end}_defs added in the previous patch in
such cases, but this might be confusing to someone unfamiliar with BPF
internals. Instead, this patch adds __bpf_hook_{start,end} macros,
currently having the same effect as __bpf_kfunc_{start,end}_defs, then
uses them to suppress warnings for two hook points in the kernel itself
and some bpf_testmod hook points as well.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Cc: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20231031215625.2343848-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index dc5ce962f600..59d404e22814 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -92,6 +92,8 @@
 			  "Global kfuncs as their definitions will be in BTF")
 
 #define __bpf_kfunc_end_defs() __diag_pop()
+#define __bpf_hook_start() __bpf_kfunc_start_defs()
+#define __bpf_hook_end() __bpf_kfunc_end_defs()
 
 /*
  * Return the name of the passed struct, if exists, or halt the build if for
-- 
cgit v1.2.3


From f55d8e60f10909dbc5524e261041e1d28d7d20d8 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 28 Oct 2023 21:25:11 +0200
Subject: net: ethtool: Fix documentation of ethtool_sprintf()

This function takes a pointer to a pointer, unlike sprintf() which is
passed a plain pointer. Fix up the documentation to make this clear.

Fixes: 7888fe53b706 ("ethtool: Add common function for filling out strings")
Cc: Alexander Duyck <alexanderduyck@fb.com>
Cc: Justin Stitt <justinstitt@google.com>
Cc: stable@vger.kernel.org
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Justin Stitt <justinstitt@google.com>
Link: https://lore.kernel.org/r/20231028192511.100001-1-andrew@lunn.ch
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/ethtool.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 226a36ed5aa1..689028257fcc 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -1045,10 +1045,10 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add,
 
 /**
  * ethtool_sprintf - Write formatted string to ethtool string data
- * @data: Pointer to start of string to update
+ * @data: Pointer to a pointer to the start of string to update
  * @fmt: Format of string to write
  *
- * Write formatted string to data. Update data to point at start of
+ * Write formatted string to *data. Update *data to point at start of
  * next string.
  */
 extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...);
-- 
cgit v1.2.3


From e8ae8ad479e2d037daa33756e5e72850a7bd37a9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 24 Oct 2023 09:53:33 +1100
Subject: Fix termination state for idr_for_each_entry_ul()

The comment for idr_for_each_entry_ul() states

  after normal termination @entry is left with the value NULL

This is not correct in the case where UINT_MAX has an entry in the idr.
In that case @entry will be non-NULL after termination.
No current code depends on the documentation being correct, but to
save future code we should fix it.

Also fix idr_for_each_entry_continue_ul().  While this is not documented
as leaving @entry as NULL, the mellanox driver appears to depend on
it doing so.  So make that explicit in the documentation as well as in
the code.

Fixes: e33d2b74d805 ("idr: fix overflow case for idr_for_each_entry_ul()")
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Chris Mi <chrism@mellanox.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/idr.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/idr.h b/include/linux/idr.h
index a0dce14090a9..da5f5fa4a3a6 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -200,7 +200,7 @@ static inline void idr_preload_end(void)
  */
 #define idr_for_each_entry_ul(idr, entry, tmp, id)			\
 	for (tmp = 0, id = 0;						\
-	     tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \
+	     ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \
 	     tmp = id, ++id)
 
 /**
@@ -224,10 +224,12 @@ static inline void idr_preload_end(void)
  * @id: Entry ID.
  *
  * Continue to iterate over entries, continuing after the current position.
+ * After normal termination @entry is left with the value NULL.  This
+ * is convenient for a "not found" value.
  */
 #define idr_for_each_entry_continue_ul(idr, entry, tmp, id)		\
 	for (tmp = id;							\
-	     tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \
+	     ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \
 	     tmp = id, ++id)
 
 /*
-- 
cgit v1.2.3


From cdbab6236605dc11780779d9af689aea7d58cab1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 31 Oct 2023 06:19:45 +0000
Subject: tcp: fix fastopen code vs usec TS

After blamed commit, TFO client-ack-dropped-then-recovery-ms-timestamps
packetdrill test failed.

David Morley and Neal Cardwell started investigating and Neal pointed
that we had :

tcp_conn_request()
  tcp_try_fastopen()
   -> tcp_fastopen_create_child
     -> child = inet_csk(sk)->icsk_af_ops->syn_recv_sock()
       -> tcp_create_openreq_child()
          -> copy req_usec_ts from req:
          newtp->tcp_usec_ts = treq->req_usec_ts;
          // now the new TFO server socket always does usec TS, no matter
          // what the route options are...
  send_synack()
    -> tcp_make_synack()
        // disable tcp_rsk(req)->req_usec_ts if route option is not present:
        if (tcp_rsk(req)->req_usec_ts < 0)
                tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);

tcp_conn_request() has the initial dst, we can initialize
tcp_rsk(req)->req_usec_ts there instead of later in send_synack();

This means tcp_rsk(req)->req_usec_ts can be a boolean.

Many thanks to David an Neal for their help.

Fixes: 614e8316aa4c ("tcp: add support for usec resolution in TCP TS values")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202310302216.f79d78bc-oliver.sang@intel.com
Suggested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Morley <morleyd@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index ec4e9367f5b0..68f3d315d2e1 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -152,7 +152,7 @@ struct tcp_request_sock {
 	u64				snt_synack; /* first SYNACK sent time */
 	bool				tfo_listener;
 	bool				is_mptcp;
-	s8				req_usec_ts;
+	bool				req_usec_ts;
 #if IS_ENABLED(CONFIG_MPTCP)
 	bool				drop_req;
 #endif
-- 
cgit v1.2.3


From ed9009ad300c0f15a3ecfe9613547b1962bde02c Mon Sep 17 00:00:00 2001
From: Bean Huo <beanhuo@micron.com>
Date: Mon, 30 Oct 2023 23:48:09 +0100
Subject: mmc: Add quirk MMC_QUIRK_BROKEN_CACHE_FLUSH for Micron eMMC Q2J54A

Micron MTFC4GACAJCN eMMC supports cache but requires that flush cache
operation be allowed only after a write has occurred. Otherwise, the
cache flush command or subsequent commands will time out.

Signed-off-by: Bean Huo <beanhuo@micron.com>
Signed-off-by: Rafael Beims <rafael.beims@toradex.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20231030224809.59245-1-beanhuo@iokpp.de
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/mmc/card.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index daa2f40d9ce6..7b12eebc5586 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -295,7 +295,9 @@ struct mmc_card {
 #define MMC_QUIRK_BROKEN_HPI	(1<<13)		/* Disable broken HPI support */
 #define MMC_QUIRK_BROKEN_SD_DISCARD	(1<<14)	/* Disable broken SD discard support */
 #define MMC_QUIRK_BROKEN_SD_CACHE	(1<<15)	/* Disable broken SD cache support */
+#define MMC_QUIRK_BROKEN_CACHE_FLUSH	(1<<16)	/* Don't flush cache until the write has occurred */
 
+	bool			written_flag;	/* Indicates eMMC has been written since power on */
 	bool			reenable_cmdq;	/* Re-enable Command Queue */
 
 	unsigned int		erase_size;	/* erase size in sectors */
-- 
cgit v1.2.3


From 5c5f0d2b5f92c47baf82b9b211e27edd7d195158 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Fri, 9 Jun 2023 13:04:14 +0800
Subject: libceph: add doutc and *_client debug macros support

This will help print the fsid and client's global_id in debug logs,
and also print the function names.

[ idryomov: %lld -> %llu, leading space for doutc(), don't include
  __func__ in pr_*() variants ]

Link: https://tracker.ceph.com/issues/61590
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_debug.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h
index d5a5da838caf..11a92a946016 100644
--- a/include/linux/ceph/ceph_debug.h
+++ b/include/linux/ceph/ceph_debug.h
@@ -19,12 +19,25 @@
 	pr_debug("%.*s %12.12s:%-4d : " fmt,				\
 		 8 - (int)sizeof(KBUILD_MODNAME), "    ",		\
 		 kbasename(__FILE__), __LINE__, ##__VA_ARGS__)
+#  define doutc(client, fmt, ...)					\
+	pr_debug("%.*s %12.12s:%-4d : [%pU %llu] " fmt,			\
+		 8 - (int)sizeof(KBUILD_MODNAME), "    ",		\
+		 kbasename(__FILE__), __LINE__,				\
+		 &client->fsid, client->monc.auth->global_id,		\
+		 ##__VA_ARGS__)
 # else
 /* faux printk call just to see any compiler warnings. */
 #  define dout(fmt, ...)	do {				\
 		if (0)						\
 			printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
 	} while (0)
+#  define doutc(client, fmt, ...)	do {			\
+		if (0)						\
+			printk(KERN_DEBUG "[%pU %llu] " fmt,	\
+			&client->fsid,				\
+			client->monc.auth->global_id,		\
+			##__VA_ARGS__);				\
+		} while (0)
 # endif
 
 #else
@@ -33,7 +46,32 @@
  * or, just wrap pr_debug
  */
 # define dout(fmt, ...)	pr_debug(" " fmt, ##__VA_ARGS__)
+# define doutc(client, fmt, ...)					\
+	pr_debug(" [%pU %llu] %s: " fmt, &client->fsid,			\
+		 client->monc.auth->global_id, __func__, ##__VA_ARGS__)
 
 #endif
 
+#define pr_notice_client(client, fmt, ...)				\
+	pr_notice("[%pU %llu]: " fmt, &client->fsid,			\
+		  client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_info_client(client, fmt, ...)				\
+	pr_info("[%pU %llu]: " fmt, &client->fsid,			\
+		client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_warn_client(client, fmt, ...)				\
+	pr_warn("[%pU %llu]: " fmt, &client->fsid,			\
+		client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_warn_once_client(client, fmt, ...)				\
+	pr_warn_once("[%pU %llu]: " fmt, &client->fsid,			\
+		     client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_err_client(client, fmt, ...)					\
+	pr_err("[%pU %llu]: " fmt, &client->fsid,			\
+	       client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_warn_ratelimited_client(client, fmt, ...)			\
+	pr_warn_ratelimited("[%pU %llu]: " fmt, &client->fsid,		\
+			    client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_err_ratelimited_client(client, fmt, ...)			\
+	pr_err_ratelimited("[%pU %llu]: " fmt, &client->fsid,		\
+			   client->monc.auth->global_id, ##__VA_ARGS__)
+
 #endif
-- 
cgit v1.2.3


From 197b7d792d6aead2e30d4b2c054ffabae2ed73dc Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Fri, 9 Jun 2023 15:15:47 +0800
Subject: ceph: pass the mdsc to several helpers

We will use the 'mdsc' to get the global_id in the following commits.

Link: https://tracker.ceph.com/issues/61590
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/mdsmap.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 4c3e0648dc27..89f1931f1ba6 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -5,6 +5,8 @@
 #include <linux/bug.h>
 #include <linux/ceph/types.h>
 
+struct ceph_mds_client;
+
 /*
  * mds map - describe servers in the mds cluster.
  *
@@ -65,7 +67,8 @@ static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
 }
 
 extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2);
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+				       void *end, bool msgr2);
 extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
 extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
 
-- 
cgit v1.2.3


From 522dc5108f07ef30e2c7399e59b9547d382308ff Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Mon, 12 Jun 2023 15:41:10 +0800
Subject: libceph, ceph: move mdsmap.h to fs/ceph

The mdsmap.h is only used by CephFS, so move it to fs/ceph.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/mdsmap.h | 75 ---------------------------------------------
 1 file changed, 75 deletions(-)
 delete mode 100644 include/linux/ceph/mdsmap.h

(limited to 'include/linux')

diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
deleted file mode 100644
index 89f1931f1ba6..000000000000
--- a/include/linux/ceph/mdsmap.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _FS_CEPH_MDSMAP_H
-#define _FS_CEPH_MDSMAP_H
-
-#include <linux/bug.h>
-#include <linux/ceph/types.h>
-
-struct ceph_mds_client;
-
-/*
- * mds map - describe servers in the mds cluster.
- *
- * we limit fields to those the client actually xcares about
- */
-struct ceph_mds_info {
-	u64 global_id;
-	struct ceph_entity_addr addr;
-	s32 state;
-	int num_export_targets;
-	bool laggy;
-	u32 *export_targets;
-};
-
-struct ceph_mdsmap {
-	u32 m_epoch, m_client_epoch, m_last_failure;
-	u32 m_root;
-	u32 m_session_timeout;          /* seconds */
-	u32 m_session_autoclose;        /* seconds */
-	u64 m_max_file_size;
-	u64 m_max_xattr_size;		/* maximum size for xattrs blob */
-	u32 m_max_mds;			/* expected up:active mds number */
-	u32 m_num_active_mds;		/* actual up:active mds number */
-	u32 possible_max_rank;		/* possible max rank index */
-	struct ceph_mds_info *m_info;
-
-	/* which object pools file data can be stored in */
-	int m_num_data_pg_pools;
-	u64 *m_data_pg_pools;
-	u64 m_cas_pg_pool;
-
-	bool m_enabled;
-	bool m_damaged;
-	int m_num_laggy;
-};
-
-static inline struct ceph_entity_addr *
-ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
-{
-	if (w >= m->possible_max_rank)
-		return NULL;
-	return &m->m_info[w].addr;
-}
-
-static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
-{
-	BUG_ON(w < 0);
-	if (w >= m->possible_max_rank)
-		return CEPH_MDS_STATE_DNE;
-	return m->m_info[w].state;
-}
-
-static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
-{
-	if (w >= 0 && w < m->possible_max_rank)
-		return m->m_info[w].laggy;
-	return false;
-}
-
-extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
-				       void *end, bool msgr2);
-extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
-extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
-
-#endif
-- 
cgit v1.2.3


From 1b90344614cc5949666328b37f03edec1d4e2873 Mon Sep 17 00:00:00 2001
From: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Date: Mon, 7 Aug 2023 15:26:15 +0200
Subject: fs: export mnt_idmap_get/mnt_idmap_put

These helpers are required to support idmapped mounts in CephFS.

Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/mnt_idmapping.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index 057c89867aa2..b8da2db4ecd2 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -115,6 +115,9 @@ static inline bool vfsgid_eq_kgid(vfsgid_t vfsgid, kgid_t kgid)
 
 int vfsgid_in_group_p(vfsgid_t vfsgid);
 
+struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
+void mnt_idmap_put(struct mnt_idmap *idmap);
+
 vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
 		     struct user_namespace *fs_userns, kuid_t kuid);
 
-- 
cgit v1.2.3


From 5ccd8530dd7ba97531a50ffa11eabe258d65a7af Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Aug 2023 15:26:17 +0200
Subject: ceph: handle idmapped mounts in create_request_message()

Inode operations that create a new filesystem object such as ->mknod,
->create, ->mkdir() and others don't take a {g,u}id argument explicitly.
Instead the caller's fs{g,u}id is used for the {g,u}id of the new
filesystem object.

In order to ensure that the correct {g,u}id is used map the caller's
fs{g,u}id for creation requests. This doesn't require complex changes.
It suffices to pass in the relevant idmapping recorded in the request
message. If this request message was triggered from an inode operation
that creates filesystem objects it will have passed down the relevant
idmaping. If this is a request message that was triggered from an inode
operation that doens't need to take idmappings into account the initial
idmapping is passed down which is an identity mapping.

This change uses a new cephfs protocol extension CEPHFS_FEATURE_HAS_OWNER_UIDGID
which adds two new fields (owner_{u,g}id) to the request head structure.
So, we need to ensure that MDS supports it otherwise we need to fail
any IO that comes through an idmapped mount because we can't process it
in a proper way. MDS server without such an extension will use caller_{u,g}id
fields to set a new inode owner UID/GID which is incorrect because caller_{u,g}id
values are unmapped. At the same time we can't map these fields with an
idmapping as it can break UID/GID-based permission checks logic on the
MDS side. This problem was described with a lot of details at [1], [2].

[1] https://lore.kernel.org/lkml/CAEivzxfw1fHO2TFA4dx3u23ZKK6Q+EThfzuibrhA3RKM=ZOYLg@mail.gmail.com/
[2] https://lore.kernel.org/all/20220104140414.155198-3-brauner@kernel.org/

Link: https://github.com/ceph/ceph/pull/52575
Link: https://tracker.ceph.com/issues/62217
Co-Developed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index f3b3593254b9..ee1d0e5f9789 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -357,6 +357,11 @@ enum {
 	CEPH_MDS_OP_RENAMESNAP = 0x01403,
 };
 
+#define IS_CEPH_MDS_OP_NEWINODE(op) (op == CEPH_MDS_OP_CREATE     || \
+				     op == CEPH_MDS_OP_MKNOD      || \
+				     op == CEPH_MDS_OP_MKDIR      || \
+				     op == CEPH_MDS_OP_SYMLINK)
+
 extern const char *ceph_mds_op_name(int op);
 
 #define CEPH_SETATTR_MODE              (1 << 0)
@@ -497,7 +502,7 @@ struct ceph_mds_request_head_legacy {
 	union ceph_mds_request_args args;
 } __attribute__ ((packed));
 
-#define CEPH_MDS_REQUEST_HEAD_VERSION  2
+#define CEPH_MDS_REQUEST_HEAD_VERSION  3
 
 struct ceph_mds_request_head_old {
 	__le16 version;                /* struct version */
@@ -528,6 +533,9 @@ struct ceph_mds_request_head {
 
 	__le32 ext_num_retry;          /* new count retry attempts */
 	__le32 ext_num_fwd;            /* new count fwd attempts */
+
+	__le32 struct_len;             /* to store size of struct ceph_mds_request_head */
+	__le32 owner_uid, owner_gid;   /* used for OPs which create inodes */
 } __attribute__ ((packed));
 
 /* cap/lease release record */
-- 
cgit v1.2.3


From 9fd00df05e81a2e1080ce6e9abc35533dca99d74 Mon Sep 17 00:00:00 2001
From: Zbigniew Lukwinski <zbigniew.lukwinski@linux.intel.com>
Date: Mon, 16 Oct 2023 00:23:34 +0200
Subject: i3c: master: handle IBIs in order they came

IBI shall be handled in order they appear on the bus. Otherwise could hit
case when order of handling them in device driver will be different. It may
lead to invalid assembling fragmented packets or events order broken.

Added separate workqueue with option WQ_MEM_RECLAIM for each device driver.
This ensures IBI handling order and improves IBI handling performance: IBI
handlers for device B are not blocked by IBI handlers for device A.

Original solution (single workqueue in main driver) was able to handle also
general IBI (not related to specific device) like HJ or MR. So leaving this
for such purposes.

Signed-off-by: Zbigniew Lukwinski <zbigniew.lukwinski@linux.intel.com>
Link: https://lore.kernel.org/r/20231015222334.1652401-2-zbigniew.lukwinski@linux.intel.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/master.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 4fd6a777150f..349d7d0dda73 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -129,6 +129,7 @@ struct i3c_ibi_slot {
  *		     rejected by the master
  * @num_slots: number of IBI slots reserved for this device
  * @enabled: reflect the IBI status
+ * @wq: workqueue used to execute IBI handlers.
  * @handler: IBI handler specified at i3c_device_request_ibi() call time. This
  *	     handler will be called from the controller workqueue, and as such
  *	     is allowed to sleep (though it is recommended to process the IBI
@@ -151,6 +152,7 @@ struct i3c_device_ibi_info {
 	unsigned int max_payload_len;
 	unsigned int num_slots;
 	unsigned int enabled;
+	struct workqueue_struct *wq;
 	void (*handler)(struct i3c_device *dev,
 			const struct i3c_ibi_payload *payload);
 };
@@ -469,7 +471,7 @@ struct i3c_master_controller_ops {
  * @boardinfo.i2c: list of I2C boardinfo objects
  * @boardinfo: board-level information attached to devices connected on the bus
  * @bus: I3C bus exposed by this master
- * @wq: workqueue used to execute IBI handlers. Can also be used by master
+ * @wq: workqueue which can be used by master
  *	drivers if they need to postpone operations that need to take place
  *	in a thread context. Typical examples are Hot Join processing which
  *	requires taking the bus lock in maintenance, which in turn, can only
-- 
cgit v1.2.3


From 2654521d774f9461234bcf3d089185404abd110b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 5 Sep 2023 20:53:09 +0200
Subject: gpiolib: remove gpiochip_find()

With all users of gpiochip_find() converted to using gpio_device_find(),
we can now remove this function from the kernel.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 1d454dc944b3..0aed62f0c633 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -605,9 +605,6 @@ int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc,
 				    void *data, struct lock_class_key *lock_key,
 				    struct lock_class_key *request_key);
 
-struct gpio_chip *gpiochip_find(void *data,
-				int (*match)(struct gpio_chip *gc, void *data));
-
 struct gpio_device *gpio_device_find(void *data,
 				int (*match)(struct gpio_chip *gc, void *data));
 struct gpio_device *gpio_device_find_by_label(const char *label);
-- 
cgit v1.2.3


From 9596ebf87c4f3f439d658af4d7cc2a302bea3750 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 2 Oct 2023 20:53:47 +0200
Subject: pinctrl: remove unneeded extern specifiers from consumer.h

The 'extern' specifiers are not needed for function declarations. Remove
all of them from the pinctrl/consumer.h header.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/pinctrl/consumer.h | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 4729d54e8995..22eef0a513ce 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -25,27 +25,27 @@ struct pinctrl_state;
 #ifdef CONFIG_PINCTRL
 
 /* External interface to pin control */
-extern bool pinctrl_gpio_can_use_line(unsigned gpio);
-extern int pinctrl_gpio_request(unsigned gpio);
-extern void pinctrl_gpio_free(unsigned gpio);
-extern int pinctrl_gpio_direction_input(unsigned gpio);
-extern int pinctrl_gpio_direction_output(unsigned gpio);
-extern int pinctrl_gpio_set_config(unsigned gpio, unsigned long config);
-
-extern struct pinctrl * __must_check pinctrl_get(struct device *dev);
-extern void pinctrl_put(struct pinctrl *p);
-extern struct pinctrl_state * __must_check pinctrl_lookup_state(struct pinctrl *p,
-								const char *name);
-extern int pinctrl_select_state(struct pinctrl *p, struct pinctrl_state *s);
-
-extern struct pinctrl * __must_check devm_pinctrl_get(struct device *dev);
-extern void devm_pinctrl_put(struct pinctrl *p);
-extern int pinctrl_select_default_state(struct device *dev);
+bool pinctrl_gpio_can_use_line(unsigned gpio);
+int pinctrl_gpio_request(unsigned gpio);
+void pinctrl_gpio_free(unsigned gpio);
+int pinctrl_gpio_direction_input(unsigned gpio);
+int pinctrl_gpio_direction_output(unsigned gpio);
+int pinctrl_gpio_set_config(unsigned gpio, unsigned long config);
+
+struct pinctrl * __must_check pinctrl_get(struct device *dev);
+void pinctrl_put(struct pinctrl *p);
+struct pinctrl_state * __must_check pinctrl_lookup_state(struct pinctrl *p,
+							 const char *name);
+int pinctrl_select_state(struct pinctrl *p, struct pinctrl_state *s);
+
+struct pinctrl * __must_check devm_pinctrl_get(struct device *dev);
+void devm_pinctrl_put(struct pinctrl *p);
+int pinctrl_select_default_state(struct device *dev);
 
 #ifdef CONFIG_PM
-extern int pinctrl_pm_select_default_state(struct device *dev);
-extern int pinctrl_pm_select_sleep_state(struct device *dev);
-extern int pinctrl_pm_select_idle_state(struct device *dev);
+int pinctrl_pm_select_default_state(struct device *dev);
+int pinctrl_pm_select_sleep_state(struct device *dev);
+int pinctrl_pm_select_idle_state(struct device *dev);
 #else
 static inline int pinctrl_pm_select_default_state(struct device *dev)
 {
-- 
cgit v1.2.3


From ec963d04ca8651cc3738bc8e013b3ab7fdb95e3d Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 09:12:13 +0200
Subject: pinctrl: provide new GPIO-to-pinctrl glue helpers

Currently the pinctrl GPIO helpers all take a number from the global
GPIO numberspace - of which we're trying to get rid of as argument.

These helpers are almost universally called from GPIOLIB driver
callbacks which take a pointer to the backing gpio_chip and the
controller-relative offset as arguments.

Let's provide improved variants of these functions that match the
GPIOLIB signatures as the first step in removing the older flavor.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 46 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 22eef0a513ce..c95c13983376 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -17,6 +17,7 @@
 #include <linux/pinctrl/pinctrl-state.h>
 
 struct device;
+struct gpio_chip;
 
 /* This struct is private to the core and should be regarded as a cookie */
 struct pinctrl;
@@ -26,11 +27,20 @@ struct pinctrl_state;
 
 /* External interface to pin control */
 bool pinctrl_gpio_can_use_line(unsigned gpio);
+bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request(unsigned gpio);
+int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free(unsigned gpio);
+void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input(unsigned gpio);
+int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
+				     unsigned int offset);
 int pinctrl_gpio_direction_output(unsigned gpio);
+int pinctrl_gpio_direction_output_new(struct gpio_chip *gc,
+				      unsigned int offset);
 int pinctrl_gpio_set_config(unsigned gpio, unsigned long config);
+int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
+				unsigned long config);
 
 struct pinctrl * __must_check pinctrl_get(struct device *dev);
 void pinctrl_put(struct pinctrl *p);
@@ -68,30 +78,66 @@ static inline bool pinctrl_gpio_can_use_line(unsigned gpio)
 	return true;
 }
 
+static inline bool
+pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return true;
+}
+
 static inline int pinctrl_gpio_request(unsigned gpio)
 {
 	return 0;
 }
 
+static inline int
+pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return 0;
+}
+
 static inline void pinctrl_gpio_free(unsigned gpio)
 {
 }
 
+static inline void
+pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset)
+{
+}
+
 static inline int pinctrl_gpio_direction_input(unsigned gpio)
 {
 	return 0;
 }
 
+static inline int
+pinctrl_gpio_direction_input_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return 0;
+}
+
 static inline int pinctrl_gpio_direction_output(unsigned gpio)
 {
 	return 0;
 }
 
+static inline int
+pinctrl_gpio_direction_output_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return 0;
+}
+
 static inline int pinctrl_gpio_set_config(unsigned gpio, unsigned long config)
 {
 	return 0;
 }
 
+static inline int
+pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
+			    unsigned long config)
+{
+	return 0;
+}
+
 static inline struct pinctrl * __must_check pinctrl_get(struct device *dev)
 {
 	return NULL;
-- 
cgit v1.2.3


From a063e57adf7baa91eee52eb4131557c3a43f4c3a Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 10 Oct 2023 14:59:36 +0200
Subject: pinctrl: remove pinctrl_gpio_can_use_line()

There are no more users of pinctrl_gpio_can_use_line() so remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index c95c13983376..3991271cc39f 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -26,7 +26,6 @@ struct pinctrl_state;
 #ifdef CONFIG_PINCTRL
 
 /* External interface to pin control */
-bool pinctrl_gpio_can_use_line(unsigned gpio);
 bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request(unsigned gpio);
 int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
@@ -73,11 +72,6 @@ static inline int pinctrl_pm_select_idle_state(struct device *dev)
 
 #else /* !CONFIG_PINCTRL */
 
-static inline bool pinctrl_gpio_can_use_line(unsigned gpio)
-{
-	return true;
-}
-
 static inline bool
 pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
 {
-- 
cgit v1.2.3


From 699f0784631eee4f482291fdcc23c87a055ab5f7 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 10 Oct 2023 15:04:24 +0200
Subject: pinctrl: remove pinctrl_gpio_request()

There are no more users of pinctrl_gpio_request() so remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 3991271cc39f..83a414a347cf 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -27,7 +27,6 @@ struct pinctrl_state;
 
 /* External interface to pin control */
 bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset);
-int pinctrl_gpio_request(unsigned gpio);
 int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free(unsigned gpio);
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
@@ -78,11 +77,6 @@ pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
 	return true;
 }
 
-static inline int pinctrl_gpio_request(unsigned gpio)
-{
-	return 0;
-}
-
 static inline int
 pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
 {
-- 
cgit v1.2.3


From 1d2c88450f77d9d62883a4a2ecc5e840cc7c74ee Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 10 Oct 2023 15:11:28 +0200
Subject: pinctrl: remove pinctrl_gpio_free()

There are no more users of pinctrl_gpio_free() so remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 83a414a347cf..90b2f0b78c4b 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -28,7 +28,6 @@ struct pinctrl_state;
 /* External interface to pin control */
 bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
-void pinctrl_gpio_free(unsigned gpio);
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input(unsigned gpio);
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
@@ -83,10 +82,6 @@ pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
 	return 0;
 }
 
-static inline void pinctrl_gpio_free(unsigned gpio)
-{
-}
-
 static inline void
 pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset)
 {
-- 
cgit v1.2.3


From aec96797f9efcaf444400a9d92900de3acc13e51 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 10 Oct 2023 15:13:38 +0200
Subject: pinctrl: remove pinctrl_gpio_direction_input()

There are no more users of pinctrl_gpio_direction_input() so remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 90b2f0b78c4b..249bb28a52d6 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -29,7 +29,6 @@ struct pinctrl_state;
 bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
-int pinctrl_gpio_direction_input(unsigned gpio);
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
 				     unsigned int offset);
 int pinctrl_gpio_direction_output(unsigned gpio);
@@ -87,11 +86,6 @@ pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset)
 {
 }
 
-static inline int pinctrl_gpio_direction_input(unsigned gpio)
-{
-	return 0;
-}
-
 static inline int
 pinctrl_gpio_direction_input_new(struct gpio_chip *gc, unsigned int offset)
 {
-- 
cgit v1.2.3


From 45d2055b0067739253883dc541f37c86aad45c92 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 10 Oct 2023 15:14:33 +0200
Subject: pinctrl: remove pinctrl_gpio_direction_output()

There are no more users of pinctrl_gpio_direction_output() so remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 249bb28a52d6..e25e9b0599ac 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -31,7 +31,6 @@ int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
 				     unsigned int offset);
-int pinctrl_gpio_direction_output(unsigned gpio);
 int pinctrl_gpio_direction_output_new(struct gpio_chip *gc,
 				      unsigned int offset);
 int pinctrl_gpio_set_config(unsigned gpio, unsigned long config);
@@ -92,11 +91,6 @@ pinctrl_gpio_direction_input_new(struct gpio_chip *gc, unsigned int offset)
 	return 0;
 }
 
-static inline int pinctrl_gpio_direction_output(unsigned gpio)
-{
-	return 0;
-}
-
 static inline int
 pinctrl_gpio_direction_output_new(struct gpio_chip *gc, unsigned int offset)
 {
-- 
cgit v1.2.3


From ab56e2bfceecae12e3b656b1641ecb961de6f92c Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 10 Oct 2023 15:16:54 +0200
Subject: pinctrl: remove pinctrl_gpio_set_config()

There are no more users of pinctrl_gpio_set_config() so remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index e25e9b0599ac..aa3a7a7ca52d 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -33,7 +33,6 @@ int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
 				     unsigned int offset);
 int pinctrl_gpio_direction_output_new(struct gpio_chip *gc,
 				      unsigned int offset);
-int pinctrl_gpio_set_config(unsigned gpio, unsigned long config);
 int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
 				unsigned long config);
 
@@ -97,11 +96,6 @@ pinctrl_gpio_direction_output_new(struct gpio_chip *gc, unsigned int offset)
 	return 0;
 }
 
-static inline int pinctrl_gpio_set_config(unsigned gpio, unsigned long config)
-{
-	return 0;
-}
-
 static inline int
 pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
 			    unsigned long config)
-- 
cgit v1.2.3


From 00762e416cd2001270a59fab417fbb1c30e2b7e5 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 11:54:45 +0200
Subject: treewide: rename pinctrl_gpio_can_use_line_new()

Now that pinctrl_gpio_can_use_line() is no longer used, let's drop the
'_new' suffix from its improved variant.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index aa3a7a7ca52d..4f9ead7f65d2 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -26,7 +26,7 @@ struct pinctrl_state;
 #ifdef CONFIG_PINCTRL
 
 /* External interface to pin control */
-bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset);
+bool pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
@@ -68,7 +68,7 @@ static inline int pinctrl_pm_select_idle_state(struct device *dev)
 #else /* !CONFIG_PINCTRL */
 
 static inline bool
-pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
+pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset)
 {
 	return true;
 }
-- 
cgit v1.2.3


From acb38be654f9af05fe626b37ea83e119cd310c3c Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 11:56:03 +0200
Subject: treewide: rename pinctrl_gpio_request_new()

Now that pinctrl_gpio_request() is no longer used, let's drop the '_new'
suffix from its improved variant.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 4f9ead7f65d2..05b2dae05f24 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -27,7 +27,7 @@ struct pinctrl_state;
 
 /* External interface to pin control */
 bool pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset);
-int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
+int pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
 				     unsigned int offset);
@@ -74,7 +74,7 @@ pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset)
 }
 
 static inline int
-pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
+pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 4fccb263f3a0dc07b306213930e0c25d1c9002b0 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 11:57:17 +0200
Subject: treewide: rename pinctrl_gpio_free_new()

Now that pinctrl_gpio_free()() is no longer used, let's drop the '_new'
suffix from its improved variant.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 05b2dae05f24..55b465edaff1 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -28,7 +28,7 @@ struct pinctrl_state;
 /* External interface to pin control */
 bool pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset);
-void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
+void pinctrl_gpio_free(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
 				     unsigned int offset);
 int pinctrl_gpio_direction_output_new(struct gpio_chip *gc,
@@ -80,7 +80,7 @@ pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset)
 }
 
 static inline void
-pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset)
+pinctrl_gpio_free(struct gpio_chip *gc, unsigned int offset)
 {
 }
 
-- 
cgit v1.2.3


From 315c46f9b696be82972290d50349c7824276b844 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 11:58:30 +0200
Subject: treewide: rename pinctrl_gpio_direction_input_new()

Now that pinctrl_gpio_direction_input() is no longer used, let's drop the
'_new' suffix from its improved variant.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 55b465edaff1..9531595d1abd 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -29,8 +29,8 @@ struct pinctrl_state;
 bool pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free(struct gpio_chip *gc, unsigned int offset);
-int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
-				     unsigned int offset);
+int pinctrl_gpio_direction_input(struct gpio_chip *gc,
+				 unsigned int offset);
 int pinctrl_gpio_direction_output_new(struct gpio_chip *gc,
 				      unsigned int offset);
 int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
@@ -85,7 +85,7 @@ pinctrl_gpio_free(struct gpio_chip *gc, unsigned int offset)
 }
 
 static inline int
-pinctrl_gpio_direction_input_new(struct gpio_chip *gc, unsigned int offset)
+pinctrl_gpio_direction_input(struct gpio_chip *gc, unsigned int offset)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From b679d6c06b2b29187374f9f4da7eea1961c2eeaa Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 11:59:51 +0200
Subject: treewide: rename pinctrl_gpio_direction_output_new()

Now that pinctrl_gpio_direction_output() is no longer used, let's drop
the '_new' suffix from its improved variant.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 9531595d1abd..e7d660e98e86 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -31,8 +31,8 @@ int pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input(struct gpio_chip *gc,
 				 unsigned int offset);
-int pinctrl_gpio_direction_output_new(struct gpio_chip *gc,
-				      unsigned int offset);
+int pinctrl_gpio_direction_output(struct gpio_chip *gc,
+				  unsigned int offset);
 int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
 				unsigned long config);
 
@@ -91,7 +91,7 @@ pinctrl_gpio_direction_input(struct gpio_chip *gc, unsigned int offset)
 }
 
 static inline int
-pinctrl_gpio_direction_output_new(struct gpio_chip *gc, unsigned int offset)
+pinctrl_gpio_direction_output(struct gpio_chip *gc, unsigned int offset)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From acf2981b84c344428baa10dbc9234749c68dedae Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 12:01:01 +0200
Subject: treewide: rename pinctrl_gpio_set_config_new()

Now that pinctrl_gpio_set_config() is no longer used, let's drop the
'_new' suffix from its improved variant.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index e7d660e98e86..73de70362b98 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -33,7 +33,7 @@ int pinctrl_gpio_direction_input(struct gpio_chip *gc,
 				 unsigned int offset);
 int pinctrl_gpio_direction_output(struct gpio_chip *gc,
 				  unsigned int offset);
-int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
+int pinctrl_gpio_set_config(struct gpio_chip *gc, unsigned int offset,
 				unsigned long config);
 
 struct pinctrl * __must_check pinctrl_get(struct device *dev);
@@ -97,7 +97,7 @@ pinctrl_gpio_direction_output(struct gpio_chip *gc, unsigned int offset)
 }
 
 static inline int
-pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
+pinctrl_gpio_set_config(struct gpio_chip *gc, unsigned int offset,
 			    unsigned long config)
 {
 	return 0;
-- 
cgit v1.2.3


From 8ae0e970319ac0b516d285650a744bab4ed3dd37 Mon Sep 17 00:00:00 2001
From: Jia He <justin.he@arm.com>
Date: Sat, 28 Oct 2023 10:20:58 +0000
Subject: dma-mapping: move dma_addressing_limited() out of line

This patch moves dma_addressing_limited() out of line, serving as a
preliminary step to prevent the introduction of a new publicly accessible
low-level helper when validating whether all system RAM is mapped within
the DMA mapping range.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jia He <justin.he@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f0ccca16a0ac..4a658de44ee9 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -144,6 +144,7 @@ bool dma_pci_p2pdma_supported(struct device *dev);
 int dma_set_mask(struct device *dev, u64 mask);
 int dma_set_coherent_mask(struct device *dev, u64 mask);
 u64 dma_get_required_mask(struct device *dev);
+bool dma_addressing_limited(struct device *dev);
 size_t dma_max_mapping_size(struct device *dev);
 size_t dma_opt_mapping_size(struct device *dev);
 bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
@@ -264,6 +265,10 @@ static inline u64 dma_get_required_mask(struct device *dev)
 {
 	return 0;
 }
+static inline bool dma_addressing_limited(struct device *dev)
+{
+	return false;
+}
 static inline size_t dma_max_mapping_size(struct device *dev)
 {
 	return 0;
@@ -465,20 +470,6 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask)
 	return dma_set_mask_and_coherent(dev, mask);
 }
 
-/**
- * dma_addressing_limited - return if the device is addressing limited
- * @dev:	device to check
- *
- * Return %true if the devices DMA mask is too small to address all memory in
- * the system, else %false.  Lack of addressing bits is the prime reason for
- * bounce buffering, but might not be the only one.
- */
-static inline bool dma_addressing_limited(struct device *dev)
-{
-	return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
-			    dma_get_required_mask(dev);
-}
-
 static inline unsigned int dma_get_max_seg_size(struct device *dev)
 {
 	if (dev->dma_parms && dev->dma_parms->max_segment_size)
-- 
cgit v1.2.3


From 1a229d8690a0f8951fc4aa8b76a7efab0d8de342 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Mon, 6 Nov 2023 12:06:54 +0100
Subject: Revert "usb: phy: add usb phy notify port status API"

This reverts commit a08799cf17c22375752abfad3b4a2b34b3acb287.

The recently added Realtek PHY drivers depend on the new port status
notification mechanism which was built on the deprecated USB PHY
implementation and devicetree binding.

Specifically, using these PHYs would require describing the very same
PHY using both the generic "phy" property and the deprecated "usb-phy"
property which is clearly wrong.

We should not be building new functionality on top of the legacy USB PHY
implementation even if it is currently stuck in some kind of
transitional limbo.

Revert the new notification interface which is broken by design.

Fixes: a08799cf17c2 ("usb: phy: add usb phy notify port status API")
Cc: stable@vger.kernel.org      # 6.6
Cc: Stanley Chang <stanley_chang@realtek.com>
Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Link: https://lore.kernel.org/r/20231106110654.31090-4-johan+linaro@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/phy.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h
index b513749582d7..e4de6bc1f69b 100644
--- a/include/linux/usb/phy.h
+++ b/include/linux/usb/phy.h
@@ -144,10 +144,6 @@ struct usb_phy {
 	 */
 	int	(*set_wakeup)(struct usb_phy *x, bool enabled);
 
-	/* notify phy port status change */
-	int	(*notify_port_status)(struct usb_phy *x, int port,
-				      u16 portstatus, u16 portchange);
-
 	/* notify phy connect status change */
 	int	(*notify_connect)(struct usb_phy *x,
 			enum usb_device_speed speed);
@@ -320,15 +316,6 @@ usb_phy_set_wakeup(struct usb_phy *x, bool enabled)
 		return 0;
 }
 
-static inline int
-usb_phy_notify_port_status(struct usb_phy *x, int port, u16 portstatus, u16 portchange)
-{
-	if (x && x->notify_port_status)
-		return x->notify_port_status(x, port, portstatus, portchange);
-	else
-		return 0;
-}
-
 static inline int
 usb_phy_notify_connect(struct usb_phy *x, enum usb_device_speed speed)
 {
-- 
cgit v1.2.3


From 75276847f4e262a52ccaf1a1c6b929280ddf77f6 Mon Sep 17 00:00:00 2001
From: Mark O'Donovan <shiftee@posteo.net>
Date: Wed, 25 Oct 2023 10:51:23 +0000
Subject: nvme-auth: auth success1 msg always includes resp

In cases where RVALID is false, the response is still transmitted,
but is cleared to zero.

Relevant extract from the spec:
Response R2, if valid (i.e., if the RVALID field is set to 01h),
cleared to 0h otherwise

Signed-off-by: Mark O'Donovan <shiftee@posteo.net>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index a7ba74babad7..44325c068b6a 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1732,7 +1732,7 @@ struct nvmf_auth_dhchap_success1_data {
 	__u8		rsvd2;
 	__u8		rvalid;
 	__u8		rsvd3[7];
-	/* 'hl' bytes of response value if 'rvalid' is set */
+	/* 'hl' bytes of response value */
 	__u8		rval[];
 };
 
-- 
cgit v1.2.3


From 6affe08aea5f3b630565676e227b41d55a6f009c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 26 Oct 2023 15:08:03 +0200
Subject: nvme: common: make keyring and auth separate modules

When only the keyring module is included but auth is not, modpost
complains about the lack of a module license tag:

ERROR: modpost: missing MODULE_LICENSE() in drivers/nvme/common/nvme-common.o

Address this by making both modules buildable standalone,
removing the now unnecessary CONFIG_NVME_COMMON symbol
in the process.

Also, now that NVME_KEYRING config symbol can be either a module or
built-in, the stubs need to check for '#if IS_ENABLED' rather than a
simple '#ifdef'.

Fixes: 9d77eb5277849 ("nvme-keyring: register '.nvme' keyring")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme-keyring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h
index 4efea9dd967c..6cc0696625f3 100644
--- a/include/linux/nvme-keyring.h
+++ b/include/linux/nvme-keyring.h
@@ -6,7 +6,7 @@
 #ifndef _NVME_KEYRING_H
 #define _NVME_KEYRING_H
 
-#ifdef CONFIG_NVME_KEYRING
+#if IS_ENABLED(CONFIG_NVME_KEYRING)
 
 key_serial_t nvme_tls_psk_default(struct key *keyring,
 		const char *hostnqn, const char *subnqn);
-- 
cgit v1.2.3


From 706add13676da7ad213b65e92b94af5efc8c4131 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 26 Oct 2023 15:08:04 +0200
Subject: nvme: keyring: fix conditional compilation

The keyring and auth functions can be called from both the host and
the target side and are controlled by Kconfig options for each of the
combinations, but the declarations are controlled by #ifdef checks
on the shared Kconfig symbols.

This leads to link failures in combinations where one of the frontends
is built-in and the other one is a module, and the keyring code
ends up in a module that is not reachable from the builtin code:

ld: drivers/nvme/host/core.o: in function `nvme_core_exit':
core.c:(.exit.text+0x4): undefined reference to `nvme_keyring_exit'
ld: drivers/nvme/host/core.o: in function `nvme_core_init':
core.c:(.init.text+0x94): undefined reference to `nvme_keyring_init

ld: drivers/nvme/host/tcp.o: in function `nvme_tcp_setup_ctrl':
tcp.c:(.text+0x4c18): undefined reference to `nvme_tls_psk_default'

Address this by moving nvme_keyring_init()/nvme_keyring_exit() into
module init/exit functions for the keyring module.

Fixes: be8e82caa6859 ("nvme-tcp: enable TLS handshake upcall")
Signed-off-by: Hannes Reinecke <hare@suse.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme-keyring.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h
index 6cc0696625f3..e10333d78dbb 100644
--- a/include/linux/nvme-keyring.h
+++ b/include/linux/nvme-keyring.h
@@ -12,8 +12,6 @@ key_serial_t nvme_tls_psk_default(struct key *keyring,
 		const char *hostnqn, const char *subnqn);
 
 key_serial_t nvme_keyring_id(void);
-int nvme_keyring_init(void);
-void nvme_keyring_exit(void);
 
 #else
 
@@ -26,11 +24,5 @@ static inline key_serial_t nvme_keyring_id(void)
 {
 	return 0;
 }
-static inline int nvme_keyring_init(void)
-{
-	return 0;
-}
-static inline void nvme_keyring_exit(void) {}
-
 #endif /* !CONFIG_NVME_KEYRING */
 #endif /* _NVME_KEYRING_H */
-- 
cgit v1.2.3


From 65120498aaf8d7320647a8b6d6de7db42e74ea52 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 8 Nov 2023 13:58:27 +0100
Subject: stackleak: add declarations for global functions

With -Wmissing-prototypes enabled, the stackleak code produces a couple of
warnings that have no declarations because they are only called from assembler:

stackleak.c:127:25: error: no previous prototype for 'stackleak_erase' [-Werror=missing-prototypes]
stackleak.c:139:25: error: no previous prototype for 'stackleak_erase_on_task_stack' [-Werror=missing-prototypes]
stackleak.c:151:25: error: no previous prototype for 'stackleak_erase_off_task_stack' [-Werror=missing-prototypes]
stackleak.c:159:49: error: no previous prototype for 'stackleak_track_stack' [-Werror=missing-prototypes]

Add declarations to the stackleak header to shut up the warnings.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20231108125843.3806765-7-arnd@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/stackleak.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h
index c36e7a3b45e7..3be2cb564710 100644
--- a/include/linux/stackleak.h
+++ b/include/linux/stackleak.h
@@ -14,6 +14,7 @@
 
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
 #include <asm/stacktrace.h>
+#include <linux/linkage.h>
 
 /*
  * The lowest address on tsk's stack which we can plausibly erase.
@@ -76,6 +77,11 @@ static inline void stackleak_task_init(struct task_struct *t)
 # endif
 }
 
+asmlinkage void noinstr stackleak_erase(void);
+asmlinkage void noinstr stackleak_erase_on_task_stack(void);
+asmlinkage void noinstr stackleak_erase_off_task_stack(void);
+void __no_caller_saved_registers noinstr stackleak_track_stack(void);
+
 #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */
 static inline void stackleak_task_init(struct task_struct *t) { }
 #endif
-- 
cgit v1.2.3


From 866d648059d5faf53f1cd960b43fe8365ad93ea7 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Tue, 31 Oct 2023 13:32:06 +0100
Subject: lsm: fix default return value for vm_enough_memory

1 is the return value that implements a "no-op" hook, not 0.

Cc: stable@vger.kernel.org
Fixes: 98e828a0650f ("security: Refactor declaration of LSM hooks")
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 99b8176c3738..4dd55fdfec26 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -48,7 +48,7 @@ LSM_HOOK(int, 0, quota_on, struct dentry *dentry)
 LSM_HOOK(int, 0, syslog, int type)
 LSM_HOOK(int, 0, settime, const struct timespec64 *ts,
 	 const struct timezone *tz)
-LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages)
+LSM_HOOK(int, 1, vm_enough_memory, struct mm_struct *mm, long pages)
 LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm)
 LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file)
 LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
-- 
cgit v1.2.3


From b36995b8609a5a8fe5cf259a1ee768fcaed919f8 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Tue, 31 Oct 2023 13:32:07 +0100
Subject: lsm: fix default return value for inode_getsecctx

-EOPNOTSUPP is the return value that implements a "no-op" hook, not 0.

Without this fix having only the BPF LSM enabled (with no programs
attached) can cause uninitialized variable reads in
nfsd4_encode_fattr(), because the BPF hook returns 0 without touching
the 'ctxlen' variable and the corresponding 'contextlen' variable in
nfsd4_encode_fattr() remains uninitialized, yet being treated as valid
based on the 0 return value.

Cc: stable@vger.kernel.org
Fixes: 98e828a0650f ("security: Refactor declaration of LSM hooks")
Reported-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 4dd55fdfec26..ff217a5ce552 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -273,7 +273,7 @@ LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen)
 LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode)
 LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen)
 LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen)
-LSM_HOOK(int, 0, inode_getsecctx, struct inode *inode, void **ctx,
+LSM_HOOK(int, -EOPNOTSUPP, inode_getsecctx, struct inode *inode, void **ctx,
 	 u32 *ctxlen)
 
 #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE)
-- 
cgit v1.2.3


From d23b5c577715892c87533b13923306acc6243f93 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 29 Oct 2023 06:14:29 +0000
Subject: cgroup: Make operations on the cgroup root_list RCU safe

At present, when we perform operations on the cgroup root_list, we must
hold the cgroup_mutex, which is a relatively heavyweight lock. In reality,
we can make operations on this list RCU-safe, eliminating the need to hold
the cgroup_mutex during traversal. Modifications to the list only occur in
the cgroup root setup and destroy paths, which should be infrequent in a
production environment. In contrast, traversal may occur frequently.
Therefore, making it RCU-safe would be beneficial.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 4a6b6b77ccb6..4caab0c6b361 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -563,6 +563,7 @@ struct cgroup_root {
 
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
+	struct rcu_head rcu;
 
 	/* Hierarchy-specific flags */
 	unsigned int flags;
-- 
cgit v1.2.3


From aecd408b7e50742868b3305c24325a89024e2a30 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 29 Oct 2023 06:14:32 +0000
Subject: cgroup: Add a new helper for cgroup1 hierarchy

A new helper is added for cgroup1 hierarchy:

- task_get_cgroup1
  Acquires the associated cgroup of a task within a specific cgroup1
  hierarchy. The cgroup1 hierarchy is identified by its hierarchy ID.

This helper function is added to facilitate the tracing of tasks within
a particular container or cgroup dir in BPF programs. It's important to
note that this helper is designed specifically for cgroup1 only.

tj: Use irsqsave/restore as suggested by Hou Tao <houtao@huaweicloud.com>.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Hou Tao <houtao@huaweicloud.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0ef0af66080e..34aaf0e87def 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -69,6 +69,7 @@ struct css_task_iter {
 extern struct file_system_type cgroup_fs_type;
 extern struct cgroup_root cgrp_dfl_root;
 extern struct css_set init_css_set;
+extern spinlock_t css_set_lock;
 
 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
 #include <linux/cgroup_subsys.h>
@@ -386,7 +387,6 @@ static inline void cgroup_unlock(void)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
-extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c)					\
 	rcu_dereference_check((task)->cgroups,				\
 		rcu_read_lock_sched_held() ||				\
@@ -853,4 +853,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 
 #endif /* CONFIG_CGROUP_BPF */
 
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
+
 #endif /* _LINUX_CGROUP_H */
-- 
cgit v1.2.3


From 74523c06ae20b83c5508a98af62393ac34913362 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Mon, 6 Nov 2023 20:57:23 -0800
Subject: bpf: Add __bpf_dynptr_data* for in kernel use

Different types of bpf dynptr have different internal data storage.
Specifically, SKB and XDP type of dynptr may have non-continuous data.
Therefore, it is not always safe to directly access dynptr->data.

Add __bpf_dynptr_data and __bpf_dynptr_data_rw to replace direct access to
dynptr->data.

Update bpf_verify_pkcs7_signature to use __bpf_dynptr_data instead of
dynptr->data.

Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://lore.kernel.org/bpf/20231107045725.2278852-2-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4825d3cdb29..eb84caf133df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1222,6 +1222,8 @@ enum bpf_dynptr_type {
 
 int bpf_dynptr_check_size(u32 size);
 u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr);
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len);
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len);
 
 #ifdef CONFIG_BPF_JIT
 int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
-- 
cgit v1.2.3


From 790ce3cfefb1b768dccd4eee324ddef0f0ce3db4 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 7 Nov 2023 00:56:37 -0800
Subject: bpf: Move GRAPH_{ROOT,NODE}_MASK macros into btf_field_type enum

This refactoring patch removes the unused BPF_GRAPH_NODE_OR_ROOT
btf_field_type and moves BPF_GRAPH_{NODE,ROOT} macros into the
btf_field_type enum. Further patches in the series will use
BPF_GRAPH_NODE, so let's move this useful definition out of btf.c.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20231107085639.3016113-5-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index eb84caf133df..4001d11be151 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -186,8 +186,8 @@ enum btf_field_type {
 	BPF_LIST_NODE  = (1 << 6),
 	BPF_RB_ROOT    = (1 << 7),
 	BPF_RB_NODE    = (1 << 8),
-	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
-				 BPF_RB_NODE | BPF_RB_ROOT,
+	BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE,
+	BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD,
 	BPF_REFCOUNT   = (1 << 9),
 };
 
-- 
cgit v1.2.3


From 3feb263bb516ee7e1da0acd22b15afbb9a7daa19 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Nov 2023 16:26:36 -0800
Subject: bpf: handle ldimm64 properly in check_cfg()

ldimm64 instructions are 16-byte long, and so have to be handled
appropriately in check_cfg(), just like the rest of BPF verifier does.

This has implications in three places:
  - when determining next instruction for non-jump instructions;
  - when determining next instruction for callback address ldimm64
    instructions (in visit_func_call_insn());
  - when checking for unreachable instructions, where second half of
    ldimm64 is expected to be unreachable;

We take this also as an opportunity to report jump into the middle of
ldimm64. And adjust few test_verifier tests accordingly.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reported-by: Hao Sun <sunhao.th@gmail.com>
Fixes: 475fb78fbf48 ("bpf: verifier (add branch/goto checks)")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231110002638.4168352-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4825d3cdb29..35bff17396c0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -909,10 +909,14 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size)
 	aux->ctx_field_size = size;
 }
 
+static bool bpf_is_ldimm64(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
+}
+
 static inline bool bpf_pseudo_func(const struct bpf_insn *insn)
 {
-	return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
-	       insn->src_reg == BPF_PSEUDO_FUNC;
+	return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
 }
 
 struct bpf_prog_ops {
-- 
cgit v1.2.3


From d27abbfd4888d79dd24baf50e774631046ac4732 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Wed, 25 Oct 2023 14:58:18 +0300
Subject: pwm: Fix double shift bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These enums are passed to set/test_bit().  The set/test_bit() functions
take a bit number instead of a shifted value.  Passing a shifted value
is a double shift bug like doing BIT(BIT(1)).  The double shift bug
doesn't cause a problem here because we are only checking 0 and 1 but
if the value was 5 or above then it can lead to a buffer overflow.

Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Reviewed-by: Sam Protsenko <semen.protsenko@linaro.org>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index e3b437587b32..cda3597b84f2 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -41,8 +41,8 @@ struct pwm_args {
 };
 
 enum {
-	PWMF_REQUESTED = 1 << 0,
-	PWMF_EXPORTED = 1 << 1,
+	PWMF_REQUESTED = 0,
+	PWMF_EXPORTED = 1,
 };
 
 /*
-- 
cgit v1.2.3


From abc28463c81853e4fdf8d009f71b2a3ce62a6f40 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 10 Nov 2023 19:59:05 +0900
Subject: kprobes: unify kprobes_exceptions_nofify() prototypes

Most architectures that support kprobes declare this function in their
own asm/kprobes.h header and provide an override, but some are missing
the prototype, which causes a warning for the __weak stub implementation:

kernel/kprobes.c:1865:12: error: no previous prototype for 'kprobe_exceptions_notify' [-Werror=missing-prototypes]
 1865 | int __weak kprobe_exceptions_notify(struct notifier_block *self,

Move the prototype into linux/kprobes.h so it is visible to all
the definitions.

Link: https://lore.kernel.org/all/20231108125843.3806765-4-arnd@kernel.org/

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/kprobes.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 365eb092e9c4..ab1da3142b06 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -445,6 +445,10 @@ int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 
 int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
 			    char *type, char *sym);
+
+int kprobe_exceptions_notify(struct notifier_block *self,
+			     unsigned long val, void *data);
+
 #else /* !CONFIG_KPROBES: */
 
 static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
-- 
cgit v1.2.3


From bef4a48f4ef798c4feddf045d49e53c8a97d5e37 Mon Sep 17 00:00:00 2001
From: Mark Hasemeyer <markhas@chromium.org>
Date: Tue, 7 Nov 2023 14:47:43 -0700
Subject: spi: Fix null dereference on suspend

A race condition exists where a synchronous (noqueue) transfer can be
active during a system suspend. This can cause a null pointer
dereference exception to occur when the system resumes.

Example order of events leading to the exception:
1. spi_sync() calls __spi_transfer_message_noqueue() which sets
   ctlr->cur_msg
2. Spi transfer begins via spi_transfer_one_message()
3. System is suspended interrupting the transfer context
4. System is resumed
6. spi_controller_resume() calls spi_start_queue() which resets cur_msg
   to NULL
7. Spi transfer context resumes and spi_finalize_current_message() is
   called which dereferences cur_msg (which is now NULL)

Wait for synchronous transfers to complete before suspending by
acquiring the bus mutex and setting/checking a suspend flag.

Signed-off-by: Mark Hasemeyer <markhas@chromium.org>
Link: https://lore.kernel.org/r/20231107144743.v1.1.I7987f05f61901f567f7661763646cb7d7919b528@changeid
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable@kernel.org
---
 include/linux/spi/spi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 86825c88b576..255a0562aea5 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -566,6 +566,7 @@ struct spi_controller {
 #define SPI_CONTROLLER_MUST_RX		BIT(3)	/* Requires rx */
 #define SPI_CONTROLLER_MUST_TX		BIT(4)	/* Requires tx */
 #define SPI_CONTROLLER_GPIO_SS		BIT(5)	/* GPIO CS must select slave */
+#define SPI_CONTROLLER_SUSPENDED	BIT(6)	/* Currently suspended */
 
 	/* Flag indicating if the allocation of this struct is devres-managed */
 	bool			devm_allocated;
-- 
cgit v1.2.3


From 689b097a06bafb461ec162fc3b3ecc9765cea67b Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Mon, 6 Nov 2023 03:18:02 +0000
Subject: compiler-gcc: Suppress -Wmissing-prototypes warning for all supported
 GCC

The kernel supports a minimum GCC version of 5.1.0 for building. However,
the "__diag_ignore_all" directive only suppresses the
"-Wmissing-prototypes" warning for GCC versions >= 8.0.0. As a result, when
building the kernel with older GCC versions, warnings may be triggered. The
example below illustrates the warnings reported by the kernel test robot
using GCC 7.5.0:

  compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0
  All warnings (new ones prefixed by >>):

   kernel/bpf/helpers.c:1893:19: warning: no previous prototype for 'bpf_obj_new_impl' [-Wmissing-prototypes]
    __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
                      ^~~~~~~~~~~~~~~~
   kernel/bpf/helpers.c:1907:19: warning: no previous prototype for 'bpf_percpu_obj_new_impl' [-Wmissing-prototypes]
    __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
   [...]

To address this, we should also suppress the "-Wmissing-prototypes" warning
for older GCC versions. "#pragma GCC diagnostic push" is supported as
of GCC 4.6, and both "-Wmissing-prototypes" and "-Wmissing-declarations"
are supported for all the GCC versions that we currently support.
Therefore, it is reasonable to suppress these warnings for all supported
GCC versions.

With this adjustment, it's important to note that after implementing
"__diag_ignore_all", it will effectively suppress warnings for all the
supported GCC versions.

In the future, if you wish to suppress warnings that are only supported on
higher GCC versions, it is advisable to explicitly use "__diag_ignore" to
specify the GCC version you are targeting.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202311031651.A7crZEur-lkp@intel.com/
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20231106031802.4188-1-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/compiler-gcc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 2ceba3fe4ec1..aebb65bf95a7 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -136,7 +136,7 @@
 #endif
 
 #define __diag_ignore_all(option, comment) \
-	__diag_GCC(8, ignore, option)
+	__diag(__diag_GCC_ignore option)
 
 /*
  * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size"
-- 
cgit v1.2.3


From 5c0930ccaad5a74d74e8b18b648c5eb21ed2fe94 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Nov 2023 15:57:13 +0100
Subject: hrtimers: Push pending hrtimers away from outgoing CPU earlier

2b8272ff4a70 ("cpu/hotplug: Prevent self deadlock on CPU hot-unplug")
solved the straight forward CPU hotplug deadlock vs. the scheduler
bandwidth timer. Yu discovered a more involved variant where a task which
has a bandwidth timer started on the outgoing CPU holds a lock and then
gets throttled. If the lock required by one of the CPU hotplug callbacks
the hotplug operation deadlocks because the unthrottling timer event is not
handled on the dying CPU and can only be recovered once the control CPU
reaches the hotplug state which pulls the pending hrtimers from the dead
CPU.

Solve this by pushing the hrtimers away from the dying CPU in the dying
callbacks. Nothing can queue a hrtimer on the dying CPU at that point because
all other CPUs spin in stop_machine() with interrupts disabled and once the
operation is finished the CPU is marked offline.

Reported-by: Yu Liao <liaoyu15@huawei.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Liu Tie <liutie4@huawei.com>
Link: https://lore.kernel.org/r/87a5rphara.ffs@tglx
---
 include/linux/cpuhotplug.h | 1 +
 include/linux/hrtimer.h    | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 068f7738be22..448f5f995adc 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -193,6 +193,7 @@ enum cpuhp_state {
 	CPUHP_AP_ARM_CORESIGHT_CTI_STARTING,
 	CPUHP_AP_ARM64_ISNDEP_STARTING,
 	CPUHP_AP_SMPCFD_DYING,
+	CPUHP_AP_HRTIMERS_DYING,
 	CPUHP_AP_X86_TBOOT_DYING,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DYING,
 	CPUHP_AP_ONLINE,
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0ee140176f10..f2044d5a652b 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -531,9 +531,9 @@ extern void sysrq_timer_list_show(void);
 
 int hrtimers_prepare_cpu(unsigned int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
-int hrtimers_dead_cpu(unsigned int cpu);
+int hrtimers_cpu_dying(unsigned int cpu);
 #else
-#define hrtimers_dead_cpu	NULL
+#define hrtimers_cpu_dying	NULL
 #endif
 
 #endif
-- 
cgit v1.2.3


From fe28f631fa941fba583d1c4f25895284b90af671 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Wed, 25 Oct 2023 14:25:52 -0400
Subject: workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs
 from wq_unbound_cpumask

When the "isolcpus" boot command line option is used to add a set
of isolated CPUs, those CPUs will be excluded automatically from
wq_unbound_cpumask to avoid running work functions from unbound
workqueues.

Recently cpuset has been extended to allow the creation of partitions
of isolated CPUs dynamically. To make it closer to the "isolcpus"
in functionality, the CPUs in those isolated cpuset partitions should be
excluded from wq_unbound_cpumask as well. This can be done currently by
explicitly writing to the workqueue's cpumask sysfs file after creating
the isolated partitions. However, this process can be error prone.

Ideally, the cpuset code should be allowed to request the workqueue code
to exclude those isolated CPUs from wq_unbound_cpumask so that this
operation can be done automatically and the isolated CPUs will be returned
back to wq_unbound_cpumask after the destructions of the isolated
cpuset partitions.

This patch adds a new workqueue_unbound_exclude_cpumask() function to
enable that. This new function will exclude the specified isolated
CPUs from wq_unbound_cpumask. To be able to restore those isolated
CPUs back after the destruction of isolated cpuset partitions, a new
wq_requested_unbound_cpumask is added to store the user provided unbound
cpumask either from the boot command line options or from writing to
the cpumask sysfs file. This new cpumask provides the basis for CPU
exclusion.

To enable users to understand how the wq_unbound_cpumask is being
modified internally, this patch also exposes the newly introduced
wq_requested_unbound_cpumask as well as a wq_isolated_cpumask to
store the cpumask to be excluded from wq_unbound_cpumask as read-only
sysfs files.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 24b1e5070f4d..b0b9604b76b8 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -491,7 +491,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void);
 void free_workqueue_attrs(struct workqueue_attrs *attrs);
 int apply_workqueue_attrs(struct workqueue_struct *wq,
 			  const struct workqueue_attrs *attrs);
-int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
+extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask);
 
 extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
 			struct work_struct *work);
-- 
cgit v1.2.3


From e76d28bdf9ba5388b8c4835a5199dc427b603188 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Fri, 3 Nov 2023 23:13:01 -0400
Subject: cgroup/rstat: Reduce cpu_lock hold time in
 cgroup_rstat_flush_locked()

When cgroup_rstat_updated() isn't being called concurrently with
cgroup_rstat_flush_locked(), its run time is pretty short. When
both are called concurrently, the cgroup_rstat_updated() run time
can spike to a pretty high value due to high cpu_lock hold time in
cgroup_rstat_flush_locked(). This can be problematic if the task calling
cgroup_rstat_updated() is a realtime task running on an isolated CPU
with a strict latency requirement. The cgroup_rstat_updated() call can
happen when there is a page fault even though the task is running in
user space most of the time.

The percpu cpu_lock is used to protect the update tree -
updated_next and updated_children. This protection is only needed when
cgroup_rstat_cpu_pop_updated() is being called. The subsequent flushing
operation which can take a much longer time does not need that protection
as it is already protected by cgroup_rstat_lock.

To reduce the cpu_lock hold time, we need to perform all the
cgroup_rstat_cpu_pop_updated() calls up front with the lock
released afterward before doing any flushing. This patch adds a new
cgroup_rstat_updated_list() function to return a singly linked list of
cgroups to be flushed.

Some instrumentation code are added to measure the cpu_lock hold time
right after lock acquisition to after releasing the lock. Parallel
kernel build on a 2-socket x86-64 server is used as the benchmarking
tool for measuring the lock hold time.

The maximum cpu_lock hold time before and after the patch are 100us and
29us respectively. So the worst case time is reduced to about 30% of
the original. However, there may be some OS or hardware noises like NMI
or SMI in the test system that can worsen the worst case value. Those
noises are usually tuned out in a real production environment to get
a better result.

OTOH, the lock hold time frequency distribution should give a better
idea of the performance benefit of the patch.  Below were the frequency
distribution before and after the patch:

     Hold time        Before patch       After patch
     ---------        ------------       -----------
       0-01 us           804,139         13,738,708
      01-05 us         9,772,767          1,177,194
      05-10 us         4,595,028              4,984
      10-15 us           303,481              3,562
      15-20 us            78,971              1,314
      20-25 us            24,583                 18
      25-30 us             6,908                 12
      30-40 us             8,015
      40-50 us             2,192
      50-60 us               316
      60-70 us                43
      70-80 us                 7
      80-90 us                 2
        >90 us                 3

Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 4caab0c6b361..37518436cfe7 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -496,6 +496,13 @@ struct cgroup {
 	struct cgroup_rstat_cpu __percpu *rstat_cpu;
 	struct list_head rstat_css_list;
 
+	/*
+	 * A singly-linked list of cgroup structures to be rstat flushed.
+	 * This is a scratch field to be used exclusively by
+	 * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock.
+	 */
+	struct cgroup	*rstat_flush_next;
+
 	/* cgroup basic resource statistics */
 	struct cgroup_base_stat last_bstat;
 	struct cgroup_base_stat bstat;
-- 
cgit v1.2.3


From 8156c7dd47b92fc4a70c9ea58e7a9e88c8bc32be Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Thu, 26 Oct 2023 16:48:21 +0200
Subject: regulator: Introduce handling for system-critical under-voltage
 events

Handle under-voltage events for crucial regulators to maintain system
stability and avoid issues during power drops.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://lore.kernel.org/r/20231026144824.4065145-3-o.rempel@pengutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/machine.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 621b7f4a3639..e0ddfb5593c9 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -49,6 +49,13 @@ struct regulator;
 #define DISABLE_IN_SUSPEND	1
 #define ENABLE_IN_SUSPEND	2
 
+/*
+ * Default time window (in milliseconds) following a critical under-voltage
+ * event during which less critical actions can be safely carried out by the
+ * system.
+ */
+#define REGULATOR_DEF_UV_LESS_CRITICAL_WINDOW_MS	10
+
 /* Regulator active discharge flags */
 enum regulator_active_discharge {
 	REGULATOR_ACTIVE_DISCHARGE_DEFAULT,
@@ -127,6 +134,8 @@ struct notification_limit {
  * @ramp_disable: Disable ramp delay when initialising or when setting voltage.
  * @soft_start: Enable soft start so that voltage ramps slowly.
  * @pull_down: Enable pull down when regulator is disabled.
+ * @system_critical: Set if the regulator is critical to system stability or
+ *                   functionality.
  * @over_current_protection: Auto disable on over current event.
  *
  * @over_current_detection: Configure over current limits.
@@ -214,6 +223,7 @@ struct regulation_constraints {
 	unsigned ramp_disable:1; /* disable ramp delay */
 	unsigned soft_start:1;	/* ramp voltage slowly */
 	unsigned pull_down:1;	/* pull down resistor when regulator off */
+	unsigned system_critical:1;	/* critical to system stability */
 	unsigned over_current_protection:1; /* auto disable on over current */
 	unsigned over_current_detection:1; /* notify on over current */
 	unsigned over_voltage_detection:1; /* notify on over voltage */
-- 
cgit v1.2.3


From 1e22152aa59d793743fc53051dd7a042f362aecb Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Thu, 26 Oct 2023 16:48:24 +0200
Subject: regulator: Implement uv_survival_time for handling under-voltage
 events

Add 'uv_survival_time' field to regulation_constraints for specifying
survival time post critical under-voltage event. Update the regulator
notifier call chain and Device Tree property parsing to use this new
field, allowing a configurable timeout before emergency shutdown.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://lore.kernel.org/r/20231026144824.4065145-6-o.rempel@pengutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/machine.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index e0ddfb5593c9..0cd76d264727 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -162,6 +162,13 @@ struct notification_limit {
  *		      regulator_active_discharge values are used for
  *		      initialisation.
  * @enable_time: Turn-on time of the rails (unit: microseconds)
+ * @uv_less_critical_window_ms: Specifies the time window (in milliseconds)
+ *                              following a critical under-voltage (UV) event
+ *                              during which less critical actions can be
+ *                              safely carried out by the system (for example
+ *                              logging). After this time window more critical
+ *                              actions should be done (for example prevent
+ *                              HW damage).
  */
 struct regulation_constraints {
 
@@ -213,6 +220,7 @@ struct regulation_constraints {
 	unsigned int settling_time_up;
 	unsigned int settling_time_down;
 	unsigned int enable_time;
+	unsigned int uv_less_critical_window_ms;
 
 	unsigned int active_discharge;
 
-- 
cgit v1.2.3


From f3b8788cde61b02f1e6c202f8fac4360e6adbafc Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:46 -0700
Subject: LSM: Identify modules by more than name

Create a struct lsm_id to contain identifying information about Linux
Security Modules (LSMs). At inception this contains the name of the
module and an identifier associated with the security module.  Change
the security_add_hooks() interface to use this structure.  Change the
individual modules to maintain their own struct lsm_id and pass it to
security_add_hooks().

The values are for LSM identifiers are defined in a new UAPI
header file linux/lsm.h. Each existing LSM has been updated to
include it's LSMID in the lsm_id.

The LSM ID values are sequential, with the oldest module
LSM_ID_CAPABILITY being the lowest value and the existing modules
numbered in the order they were included in the main line kernel.
This is an arbitrary convention for assigning the values, but
none better presents itself. The value 0 is defined as being invalid.
The values 1-99 are reserved for any special case uses which may
arise in the future. This may include attributes of the LSM
infrastructure itself, possibly related to namespacing or network
attribute management. A special range is identified for such attributes
to help reduce confusion for developers unfamiliar with LSMs.

LSM attribute values are defined for the attributes presented by
modules that are available today. As with the LSM IDs, The value 0
is defined as being invalid. The values 1-99 are reserved for any
special case uses which may arise in the future.

Cc: linux-security-module <linux-security-module@vger.kernel.org>
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Mickael Salaun <mic@digikod.net>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Nacked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
[PM: forward ported beyond v6.6 due merge window changes]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index dcb5e5b5eb13..7f0adb33caaa 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -42,6 +42,18 @@ struct security_hook_heads {
 	#undef LSM_HOOK
 } __randomize_layout;
 
+/**
+ * struct lsm_id - Identify a Linux Security Module.
+ * @lsm: name of the LSM, must be approved by the LSM maintainers
+ * @id: LSM ID number from uapi/linux/lsm.h
+ *
+ * Contains the information that identifies the LSM.
+ */
+struct lsm_id {
+	const char	*name;
+	u64		id;
+};
+
 /*
  * Security module hook list structure.
  * For use with generic list macros for common operations.
@@ -50,7 +62,7 @@ struct security_hook_list {
 	struct hlist_node		list;
 	struct hlist_head		*head;
 	union security_list_options	hook;
-	const char			*lsm;
+	const struct lsm_id		*lsmid;
 } __randomize_layout;
 
 /*
@@ -104,7 +116,7 @@ extern struct security_hook_heads security_hook_heads;
 extern char *lsm_names;
 
 extern void security_add_hooks(struct security_hook_list *hooks, int count,
-				const char *lsm);
+			       const struct lsm_id *lsmid);
 
 #define LSM_FLAG_LEGACY_MAJOR	BIT(0)
 #define LSM_FLAG_EXCLUSIVE	BIT(1)
-- 
cgit v1.2.3


From 9285c5ad9d00abfe0f4e2ce4039c8127e7a09738 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:47 -0700
Subject: LSM: Maintain a table of LSM attribute data

As LSMs are registered add their lsm_id pointers to a table.
This will be used later for attribute reporting.

Determine the number of possible security modules based on
their respective CONFIG options. This allows the number to be
known at build time. This allows data structures and tables
to use the constant.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Mickael Salaun <mic@digikod.net>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 1d1df326c881..50c178019a58 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -138,6 +138,8 @@ enum lockdown_reason {
 };
 
 extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1];
+extern u32 lsm_active_cnt;
+extern const struct lsm_id *lsm_idlist[];
 
 /* These functions are in security/commoncap.c */
 extern int cap_capable(const struct cred *cred, struct user_namespace *ns,
-- 
cgit v1.2.3


From 267c068e5f8b81b68cc4247c94dbba90a21a634e Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:48 -0700
Subject: proc: Use lsmids instead of lsm names for attrs

Use the LSM ID number instead of the LSM name to identify which
security module's attibute data should be shown in /proc/self/attr.
The security_[gs]etprocattr() functions have been changed to expect
the LSM ID. The change from a string comparison to an integer comparison
in these functions will provide a minor performance improvement.

Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Mickael Salaun <mic@digikod.net>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 50c178019a58..c81bca77f4f2 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -472,10 +472,9 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd);
 int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
 			unsigned nsops, int alter);
 void security_d_instantiate(struct dentry *dentry, struct inode *inode);
-int security_getprocattr(struct task_struct *p, const char *lsm, const char *name,
+int security_getprocattr(struct task_struct *p, int lsmid, const char *name,
 			 char **value);
-int security_setprocattr(const char *lsm, const char *name, void *value,
-			 size_t size);
+int security_setprocattr(int lsmid, const char *name, void *value, size_t size);
 int security_netlink_send(struct sock *sk, struct sk_buff *skb);
 int security_ismaclabel(const char *name);
 int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen);
@@ -1339,14 +1338,14 @@ static inline void security_d_instantiate(struct dentry *dentry,
 					  struct inode *inode)
 { }
 
-static inline int security_getprocattr(struct task_struct *p, const char *lsm,
+static inline int security_getprocattr(struct task_struct *p, int lsmid,
 				       const char *name, char **value)
 {
 	return -EINVAL;
 }
 
-static inline int security_setprocattr(const char *lsm, char *name,
-				       void *value, size_t size)
+static inline int security_setprocattr(int lsmid, char *name, void *value,
+				       size_t size)
 {
 	return -EINVAL;
 }
-- 
cgit v1.2.3


From a04a1198088a1378d0389c250cc684f649bcc91e Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:49 -0700
Subject: LSM: syscalls for current process attributes

Create a system call lsm_get_self_attr() to provide the security
module maintained attributes of the current process.
Create a system call lsm_set_self_attr() to set a security
module maintained attribute of the current process.
Historically these attributes have been exposed to user space via
entries in procfs under /proc/self/attr.

The attribute value is provided in a lsm_ctx structure. The structure
identifies the size of the attribute, and the attribute value. The format
of the attribute value is defined by the security module. A flags field
is included for LSM specific information. It is currently unused and must
be 0. The total size of the data, including the lsm_ctx structure and any
padding, is maintained as well.

struct lsm_ctx {
        __u64 id;
        __u64 flags;
        __u64 len;
        __u64 ctx_len;
        __u8 ctx[];
};

Two new LSM hooks are used to interface with the LSMs.
security_getselfattr() collects the lsm_ctx values from the
LSMs that support the hook, accounting for space requirements.
security_setselfattr() identifies which LSM the attribute is
intended for and passes it along.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h |  4 ++++
 include/linux/lsm_hooks.h     |  1 +
 include/linux/security.h      | 19 +++++++++++++++++++
 include/linux/syscalls.h      |  5 +++++
 4 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ff217a5ce552..c925a0d26edf 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -262,6 +262,10 @@ LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops,
 LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb)
 LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry,
 	 struct inode *inode)
+LSM_HOOK(int, -EOPNOTSUPP, getselfattr, unsigned int attr,
+	 struct lsm_ctx __user *ctx, size_t *size, u32 flags)
+LSM_HOOK(int, -EOPNOTSUPP, setselfattr, unsigned int attr,
+	 struct lsm_ctx *ctx, size_t size, u32 flags)
 LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name,
 	 char **value)
 LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 7f0adb33caaa..a2ade0ffe9e7 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -25,6 +25,7 @@
 #ifndef __LINUX_LSM_HOOKS_H
 #define __LINUX_LSM_HOOKS_H
 
+#include <uapi/linux/lsm.h>
 #include <linux/security.h>
 #include <linux/init.h>
 #include <linux/rculist.h>
diff --git a/include/linux/security.h b/include/linux/security.h
index c81bca77f4f2..dd1fe487385d 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -60,6 +60,7 @@ struct fs_parameter;
 enum fs_value_type;
 struct watch;
 struct watch_notification;
+struct lsm_ctx;
 
 /* Default (no) options for the capable function */
 #define CAP_OPT_NONE 0x0
@@ -472,6 +473,10 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd);
 int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
 			unsigned nsops, int alter);
 void security_d_instantiate(struct dentry *dentry, struct inode *inode);
+int security_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
+			 size_t __user *size, u32 flags);
+int security_setselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
+			 size_t size, u32 flags);
 int security_getprocattr(struct task_struct *p, int lsmid, const char *name,
 			 char **value);
 int security_setprocattr(int lsmid, const char *name, void *value, size_t size);
@@ -1338,6 +1343,20 @@ static inline void security_d_instantiate(struct dentry *dentry,
 					  struct inode *inode)
 { }
 
+static inline int security_getselfattr(unsigned int attr,
+				       struct lsm_ctx __user *ctx,
+				       size_t __user *size, u32 flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int security_setselfattr(unsigned int attr,
+				       struct lsm_ctx __user *ctx,
+				       size_t size, u32 flags)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int security_getprocattr(struct task_struct *p, int lsmid,
 				       const char *name, char **value)
 {
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index fd9d12de7e92..4e1e56a24f1e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -71,6 +71,7 @@ struct clone_args;
 struct open_how;
 struct mount_attr;
 struct landlock_ruleset_attr;
+struct lsm_ctx;
 enum landlock_rule_type;
 struct cachestat_range;
 struct cachestat;
@@ -949,6 +950,10 @@ asmlinkage long sys_cachestat(unsigned int fd,
 		struct cachestat_range __user *cstat_range,
 		struct cachestat __user *cstat, unsigned int flags);
 asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags);
+asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx,
+				      size_t *size, __u32 flags);
+asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx,
+				      size_t size, __u32 flags);
 
 /*
  * Architecture-specific system calls
-- 
cgit v1.2.3


From ad4aff9ec25f400608283c10d634cc4eeda83a02 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:50 -0700
Subject: LSM: Create lsm_list_modules system call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create a system call to report the list of Linux Security Modules
that are active on the system. The list is provided as an array
of LSM ID numbers.

The calling application can use this list determine what LSM
specific actions it might take. That might include choosing an
output format, determining required privilege or bypassing
security module specific behavior.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Reviewed-by: Mickaël Salaün <mic@digikod.net>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/syscalls.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 4e1e56a24f1e..feec5719750b 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -954,6 +954,7 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx,
 				      size_t *size, __u32 flags);
 asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx,
 				      size_t size, __u32 flags);
+asmlinkage long sys_lsm_list_modules(u64 *ids, size_t *size, u32 flags);
 
 /*
  * Architecture-specific system calls
-- 
cgit v1.2.3


From e1ca7129db2c3b3c4d261702905a752e6b2710b4 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:52 -0700
Subject: LSM: Helpers for attribute names and filling lsm_ctx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add lsm_name_to_attr(), which translates a text string to a
LSM_ATTR value if one is available.

Add lsm_fill_user_ctx(), which fills a struct lsm_ctx, including
the trailing attribute value.

Both are used in module specific components of LSM system calls.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Mickaël Salaün <mic@digikod.net>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index dd1fe487385d..334f75aa7289 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/sockptr.h>
+#include <uapi/linux/lsm.h>
 
 struct linux_binprm;
 struct cred;
@@ -264,6 +265,7 @@ int unregister_blocking_lsm_notifier(struct notifier_block *nb);
 /* prototypes */
 extern int security_init(void);
 extern int early_security_init(void);
+extern u64 lsm_name_to_attr(const char *name);
 
 /* Security operations */
 int security_binder_set_context_mgr(const struct cred *mgr);
@@ -490,6 +492,8 @@ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen);
 int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen);
 int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen);
 int security_locked_down(enum lockdown_reason what);
+int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context,
+		      size_t context_size, u64 id, u64 flags);
 #else /* CONFIG_SECURITY */
 
 static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data)
@@ -507,6 +511,11 @@ static inline  int unregister_blocking_lsm_notifier(struct notifier_block *nb)
 	return 0;
 }
 
+static inline u64 lsm_name_to_attr(const char *name)
+{
+	return LSM_ATTR_UNDEF;
+}
+
 static inline void security_free_mnt_opts(void **mnt_opts)
 {
 }
@@ -1415,6 +1424,11 @@ static inline int security_locked_down(enum lockdown_reason what)
 {
 	return 0;
 }
+static inline int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context,
+				    size_t context_size, u64 id, u64 flags)
+{
+	return -EOPNOTSUPP;
+}
 #endif	/* CONFIG_SECURITY */
 
 #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE)
-- 
cgit v1.2.3


From d7cf3412a9f6c547e5ee443fa7644e08898aa3e2 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 24 Oct 2023 14:44:00 -0400
Subject: lsm: consolidate buffer size handling into lsm_fill_user_ctx()

While we have a lsm_fill_user_ctx() helper function designed to make
life easier for LSMs which return lsm_ctx structs to userspace, we
didn't include all of the buffer length safety checks and buffer
padding adjustments in the helper.  This led to code duplication
across the different LSMs and the possibility for mistakes across the
different LSM subsystems.  In order to reduce code duplication and
decrease the chances of silly mistakes, we're consolidating all of
this code into the lsm_fill_user_ctx() helper.

The buffer padding is also modified from a fixed 8-byte alignment to
an alignment that matches the word length of the machine
(BITS_PER_LONG / 8).

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 334f75aa7289..750130a7b9dd 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -492,8 +492,8 @@ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen);
 int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen);
 int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen);
 int security_locked_down(enum lockdown_reason what);
-int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context,
-		      size_t context_size, u64 id, u64 flags);
+int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, size_t *uctx_len,
+		      void *val, size_t val_len, u64 id, u64 flags);
 #else /* CONFIG_SECURITY */
 
 static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data)
@@ -1424,8 +1424,9 @@ static inline int security_locked_down(enum lockdown_reason what)
 {
 	return 0;
 }
-static inline int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context,
-				    size_t context_size, u64 id, u64 flags)
+static inline int lsm_fill_user_ctx(struct lsm_ctx __user *uctx,
+				    size_t *uctx_len, void *val, size_t val_len,
+				    u64 id, u64 flags)
 {
 	return -EOPNOTSUPP;
 }
-- 
cgit v1.2.3


From 8569992d64b8f750e34b7858eac5d7daaf0f80fd Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Fri, 27 Oct 2023 11:21:45 -0700
Subject: KVM: Use gfn instead of hva for mmu_notifier_retry

Currently in mmu_notifier invalidate path, hva range is recorded and then
checked against by mmu_invalidate_retry_hva() in the page fault handling
path. However, for the soon-to-be-introduced private memory, a page fault
may not have a hva associated, checking gfn(gpa) makes more sense.

For existing hva based shared memory, gfn is expected to also work. The
only downside is when aliasing multiple gfns to a single hva, the
current algorithm of checking multiple ranges could result in a much
larger range being rejected. Such aliasing should be uncommon, so the
impact is expected small.

Suggested-by: Sean Christopherson <seanjc@google.com>
Cc: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
[sean: convert vmx_set_apic_access_page_addr() to gfn-based API]
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Xu Yilun <yilun.xu@linux.intel.com>
Message-Id: <20231027182217.3615211-4-seanjc@google.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fb6c6109fdca..11d091688346 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -787,8 +787,8 @@ struct kvm {
 	struct mmu_notifier mmu_notifier;
 	unsigned long mmu_invalidate_seq;
 	long mmu_invalidate_in_progress;
-	unsigned long mmu_invalidate_range_start;
-	unsigned long mmu_invalidate_range_end;
+	gfn_t mmu_invalidate_range_start;
+	gfn_t mmu_invalidate_range_end;
 #endif
 	struct list_head devices;
 	u64 manual_dirty_log_protect;
@@ -1392,10 +1392,9 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 #endif
 
-void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
-			      unsigned long end);
-void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
-			    unsigned long end);
+void kvm_mmu_invalidate_begin(struct kvm *kvm);
+void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
+void kvm_mmu_invalidate_end(struct kvm *kvm);
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
@@ -1970,9 +1969,9 @@ static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq)
 	return 0;
 }
 
-static inline int mmu_invalidate_retry_hva(struct kvm *kvm,
+static inline int mmu_invalidate_retry_gfn(struct kvm *kvm,
 					   unsigned long mmu_seq,
-					   unsigned long hva)
+					   gfn_t gfn)
 {
 	lockdep_assert_held(&kvm->mmu_lock);
 	/*
@@ -1981,10 +1980,20 @@ static inline int mmu_invalidate_retry_hva(struct kvm *kvm,
 	 * that might be being invalidated. Note that it may include some false
 	 * positives, due to shortcuts when handing concurrent invalidations.
 	 */
-	if (unlikely(kvm->mmu_invalidate_in_progress) &&
-	    hva >= kvm->mmu_invalidate_range_start &&
-	    hva < kvm->mmu_invalidate_range_end)
-		return 1;
+	if (unlikely(kvm->mmu_invalidate_in_progress)) {
+		/*
+		 * Dropping mmu_lock after bumping mmu_invalidate_in_progress
+		 * but before updating the range is a KVM bug.
+		 */
+		if (WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA ||
+				 kvm->mmu_invalidate_range_end == INVALID_GPA))
+			return 1;
+
+		if (gfn >= kvm->mmu_invalidate_range_start &&
+		    gfn < kvm->mmu_invalidate_range_end)
+			return 1;
+	}
+
 	if (kvm->mmu_invalidate_seq != mmu_seq)
 		return 1;
 	return 0;
-- 
cgit v1.2.3


From f128cf8cfbecccf95e891ae90d9c917df5117c7a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 27 Oct 2023 11:21:49 -0700
Subject: KVM: Convert KVM_ARCH_WANT_MMU_NOTIFIER to
 CONFIG_KVM_GENERIC_MMU_NOTIFIER

Convert KVM_ARCH_WANT_MMU_NOTIFIER into a Kconfig and select it where
appropriate to effectively maintain existing behavior.  Using a proper
Kconfig will simplify building more functionality on top of KVM's
mmu_notifier infrastructure.

Add a forward declaration of kvm_gfn_range to kvm_types.h so that
including arch/powerpc/include/asm/kvm_ppc.h's with CONFIG_KVM=n doesn't
generate warnings due to kvm_gfn_range being undeclared.  PPC defines
hooks for PR vs. HV without guarding them via #ifdeffery, e.g.

  bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
  bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
  bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
  bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);

Alternatively, PPC could forward declare kvm_gfn_range, but there's no
good reason not to define it in common KVM.

Acked-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Message-Id: <20231027182217.3615211-8-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h  | 6 +++---
 include/linux/kvm_types.h | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 11d091688346..5faba69403ac 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -253,7 +253,7 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
-#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 union kvm_mmu_notifier_arg {
 	pte_t pte;
 };
@@ -783,7 +783,7 @@ struct kvm {
 	struct hlist_head irq_ack_notifier_list;
 #endif
 
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 	struct mmu_notifier mmu_notifier;
 	unsigned long mmu_invalidate_seq;
 	long mmu_invalidate_in_progress;
@@ -1946,7 +1946,7 @@ extern const struct _kvm_stats_desc kvm_vm_stats_desc[];
 extern const struct kvm_stats_header kvm_vcpu_stats_header;
 extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[];
 
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq)
 {
 	if (unlikely(kvm->mmu_invalidate_in_progress))
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 6f4737d5046a..9d1f7835d8c1 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -6,6 +6,7 @@
 struct kvm;
 struct kvm_async_pf;
 struct kvm_device_ops;
+struct kvm_gfn_range;
 struct kvm_interrupt;
 struct kvm_irq_routing_table;
 struct kvm_memory_slot;
-- 
cgit v1.2.3


From bb58b90b1a8f753b582055adaf448214a8e22c31 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 27 Oct 2023 11:21:50 -0700
Subject: KVM: Introduce KVM_SET_USER_MEMORY_REGION2

Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional
information can be supplied without setting userspace up to fail.  The
padding in the new kvm_userspace_memory_region2 structure will be used to
pass a file descriptor in addition to the userspace_addr, i.e. allow
userspace to point at a file descriptor and map memory into a guest that
is NOT mapped into host userspace.

Alternatively, KVM could simply add "struct kvm_userspace_memory_region2"
without a new ioctl(), but as Paolo pointed out, adding a new ioctl()
makes detection of bad flags a bit more robust, e.g. if the new fd field
is guarded only by a flag and not a new ioctl(), then a userspace bug
(setting a "bad" flag) would generate out-of-bounds access instead of an
-EINVAL error.

Cc: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Message-Id: <20231027182217.3615211-9-seanjc@google.com>
Acked-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5faba69403ac..4e741ff27af3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1146,9 +1146,9 @@ enum kvm_mr_change {
 };
 
 int kvm_set_memory_region(struct kvm *kvm,
-			  const struct kvm_userspace_memory_region *mem);
+			  const struct kvm_userspace_memory_region2 *mem);
 int __kvm_set_memory_region(struct kvm *kvm,
-			    const struct kvm_userspace_memory_region *mem);
+			    const struct kvm_userspace_memory_region2 *mem);
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
-- 
cgit v1.2.3


From 16f95f3b95caded251a0440051e44a2fbe9e5f55 Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Fri, 27 Oct 2023 11:21:51 -0700
Subject: KVM: Add KVM_EXIT_MEMORY_FAULT exit to report faults to userspace

Add a new KVM exit type to allow userspace to handle memory faults that
KVM cannot resolve, but that userspace *may* be able to handle (without
terminating the guest).

KVM will initially use KVM_EXIT_MEMORY_FAULT to report implicit
conversions between private and shared memory.  With guest private memory,
there will be two kind of memory conversions:

  - explicit conversion: happens when the guest explicitly calls into KVM
    to map a range (as private or shared)

  - implicit conversion: happens when the guest attempts to access a gfn
    that is configured in the "wrong" state (private vs. shared)

On x86 (first architecture to support guest private memory), explicit
conversions will be reported via KVM_EXIT_HYPERCALL+KVM_HC_MAP_GPA_RANGE,
but reporting KVM_EXIT_HYPERCALL for implicit conversions is undesriable
as there is (obviously) no hypercall, and there is no guarantee that the
guest actually intends to convert between private and shared, i.e. what
KVM thinks is an implicit conversion "request" could actually be the
result of a guest code bug.

KVM_EXIT_MEMORY_FAULT will be used to report memory faults that appear to
be implicit conversions.

Note!  To allow for future possibilities where KVM reports
KVM_EXIT_MEMORY_FAULT and fills run->memory_fault on _any_ unresolved
fault, KVM returns "-EFAULT" (-1 with errno == EFAULT from userspace's
perspective), not '0'!  Due to historical baggage within KVM, exiting to
userspace with '0' from deep callstacks, e.g. in emulation paths, is
infeasible as doing so would require a near-complete overhaul of KVM,
whereas KVM already propagates -errno return codes to userspace even when
the -errno originated in a low level helper.

Report the gpa+size instead of a single gfn even though the initial usage
is expected to always report single pages.  It's entirely possible, likely
even, that KVM will someday support sub-page granularity faults, e.g.
Intel's sub-page protection feature allows for additional protections at
128-byte granularity.

Link: https://lore.kernel.org/all/20230908222905.1321305-5-amoorthy@google.com
Link: https://lore.kernel.org/all/ZQ3AmLO2SYv3DszH@google.com
Cc: Anish Moorthy <amoorthy@google.com>
Cc: David Matlack <dmatlack@google.com>
Suggested-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20231027182217.3615211-10-seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4e741ff27af3..96aa930536b1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2327,4 +2327,15 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
 /* Max number of entries allowed for each kvm dirty ring */
 #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
 
+static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+						 gpa_t gpa, gpa_t size)
+{
+	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
+	vcpu->run->memory_fault.gpa = gpa;
+	vcpu->run->memory_fault.size = size;
+
+	/* Flags are not (yet) defined or communicated to userspace. */
+	vcpu->run->memory_fault.flags = 0;
+}
+
 #endif
-- 
cgit v1.2.3


From 5a475554db1e476a14216e742ea2bdb77362d5d5 Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Fri, 27 Oct 2023 11:21:55 -0700
Subject: KVM: Introduce per-page memory attributes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In confidential computing usages, whether a page is private or shared is
necessary information for KVM to perform operations like page fault
handling, page zapping etc. There are other potential use cases for
per-page memory attributes, e.g. to make memory read-only (or no-exec,
or exec-only, etc.) without having to modify memslots.

Introduce the KVM_SET_MEMORY_ATTRIBUTES ioctl, advertised by
KVM_CAP_MEMORY_ATTRIBUTES, to allow userspace to set the per-page memory
attributes to a guest memory range.

Use an xarray to store the per-page attributes internally, with a naive,
not fully optimized implementation, i.e. prioritize correctness over
performance for the initial implementation.

Use bit 3 for the PRIVATE attribute so that KVM can use bits 0-2 for RWX
attributes/protections in the future, e.g. to give userspace fine-grained
control over read, write, and execute protections for guest memory.

Provide arch hooks for handling attribute changes before and after common
code sets the new attributes, e.g. x86 will use the "pre" hook to zap all
relevant mappings, and the "post" hook to track whether or not hugepages
can be used to map the range.

To simplify the implementation wrap the entire sequence with
kvm_mmu_invalidate_{begin,end}() even though the operation isn't strictly
guaranteed to be an invalidation.  For the initial use case, x86 *will*
always invalidate memory, and preventing arch code from creating new
mappings while the attributes are in flux makes it much easier to reason
about the correctness of consuming attributes.

It's possible that future usages may not require an invalidation, e.g.
if KVM ends up supporting RWX protections and userspace grants _more_
protections, but again opt for simplicity and punt optimizations to
if/when they are needed.

Suggested-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/all/Y2WB48kD0J4VGynX@google.com
Cc: Fuad Tabba <tabba@google.com>
Cc: Xu Yilun <yilun.xu@intel.com>
Cc: Mickaël Salaün <mic@digikod.net>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20231027182217.3615211-14-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 96aa930536b1..68a144cb7dbc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -256,6 +256,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 union kvm_mmu_notifier_arg {
 	pte_t pte;
+	unsigned long attributes;
 };
 
 struct kvm_gfn_range {
@@ -806,6 +807,10 @@ struct kvm {
 
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 	struct notifier_block pm_notifier;
+#endif
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+	/* Protected by slots_locks (for writes) and RCU (for reads) */
+	struct xarray mem_attr_array;
 #endif
 	char stats_id[KVM_STATS_NAME_SIZE];
 };
@@ -2338,4 +2343,18 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
 	vcpu->run->memory_fault.flags = 0;
 }
 
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
+{
+	return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
+}
+
+bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+				     unsigned long attrs);
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+					struct kvm_gfn_range *range);
+bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
+					 struct kvm_gfn_range *range);
+#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+
 #endif
-- 
cgit v1.2.3


From 0003e2a414687fff6a75250d381e4abf345d663f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 27 Oct 2023 11:21:56 -0700
Subject: mm: Add AS_UNMOVABLE to mark mapping as completely unmovable

Add an "unmovable" flag for mappings that cannot be migrated under any
circumstance.  KVM will use the flag for its upcoming GUEST_MEMFD support,
which will not support compaction/migration, at least not in the
foreseeable future.

Test AS_UNMOVABLE under folio lock as already done for the async
compaction/dirty folio case, as the mapping can be removed by truncation
while compaction is running.  To avoid having to lock every folio with a
mapping, assume/require that unmovable mappings are also unevictable, and
have mapping_set_unmovable() also set AS_UNEVICTABLE.

Cc: Matthew Wilcox <willy@infradead.org>
Co-developed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20231027182217.3615211-15-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/pagemap.h | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 351c3b7f93a1..82c9bf506b79 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -203,7 +203,8 @@ enum mapping_flags {
 	/* writeback related tags are not used */
 	AS_NO_WRITEBACK_TAGS = 5,
 	AS_LARGE_FOLIO_SUPPORT = 6,
-	AS_RELEASE_ALWAYS,	/* Call ->release_folio(), even if no private data */
+	AS_RELEASE_ALWAYS = 7,	/* Call ->release_folio(), even if no private data */
+	AS_UNMOVABLE	= 8,	/* The mapping cannot be moved, ever */
 };
 
 /**
@@ -289,6 +290,22 @@ static inline void mapping_clear_release_always(struct address_space *mapping)
 	clear_bit(AS_RELEASE_ALWAYS, &mapping->flags);
 }
 
+static inline void mapping_set_unmovable(struct address_space *mapping)
+{
+	/*
+	 * It's expected unmovable mappings are also unevictable. Compaction
+	 * migrate scanner (isolate_migratepages_block()) relies on this to
+	 * reduce page locking.
+	 */
+	set_bit(AS_UNEVICTABLE, &mapping->flags);
+	set_bit(AS_UNMOVABLE, &mapping->flags);
+}
+
+static inline bool mapping_unmovable(struct address_space *mapping)
+{
+	return test_bit(AS_UNMOVABLE, &mapping->flags);
+}
+
 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 {
 	return mapping->gfp_mask;
-- 
cgit v1.2.3


From 438cbcdf105d84449fceb39a2d0e16d0ec20708f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Beh=C3=BAn?= <kabel@kernel.org>
Date: Fri, 10 Nov 2023 13:05:46 +0100
Subject: net: mdio: fix typo in header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The quotes symbol in
  "EEE "link partner ability 1
should be at the end of the register name
  "EEE link partner ability 1"

Signed-off-by: Marek Behún <kabel@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mdio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 8fa23bdcedbf..007fd9c3e4b6 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -420,7 +420,7 @@ static inline u32 linkmode_adv_to_mii_t1_adv_m_t(unsigned long *advertising)
  * A function that translates value of following registers to the linkmode:
  * IEEE 802.3-2018 45.2.3.10 "EEE control and capability 1" register (3.20)
  * IEEE 802.3-2018 45.2.7.13 "EEE advertisement 1" register (7.60)
- * IEEE 802.3-2018 45.2.7.14 "EEE "link partner ability 1 register (7.61)
+ * IEEE 802.3-2018 45.2.7.14 "EEE link partner ability 1" register (7.61)
  */
 static inline void mii_eee_cap1_mod_linkmode_t(unsigned long *adv, u32 val)
 {
-- 
cgit v1.2.3


From 3fad96e9b21bed214c1593d7d7fb3e40d1fbf6f4 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Tue, 24 Oct 2023 11:57:15 +0100
Subject: firmware: arm_ffa: Declare ffa_bus_type structure in the header

smatch reports:
drivers/firmware/arm_ffa/bus.c:108:17: warning:
	symbol 'ffa_bus_type' was not declared. Should it be static?

ffa_bus_type is exported to be useful in the FF-A driver. So this
warning is not correct. However, declaring the ffa_bus_type structure
in the header like many other bus_types do already removes this warning.
So let us just do the same and get rid of the warning.

Link: https://lore.kernel.org/r/20231024105715.2369638-1-sudeep.holla@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 1abedb5b2e48..3d0fde57ba90 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -209,6 +209,8 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; }
 #define module_ffa_driver(__ffa_driver)	\
 	module_driver(__ffa_driver, ffa_register, ffa_unregister)
 
+extern struct bus_type ffa_bus_type;
+
 /* FFA transport related */
 struct ffa_partition_info {
 	u16 id;
-- 
cgit v1.2.3


From 4f0b9194bc119a9850a99e5e824808e2f468c348 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 3 Nov 2023 06:47:51 -0400
Subject: fs: Rename anon_inode_getfile_secure() and anon_inode_getfd_secure()

The call to the inode_init_security_anon() LSM hook is not the sole
reason to use anon_inode_getfile_secure() or anon_inode_getfd_secure().
For example, the functions also allow one to create a file with non-zero
size, without needing a full-blown filesystem.  In this case, you don't
need a "secure" version, just unique inodes; the current name of the
functions is confusing and does not explain well the difference with
the more "standard" anon_inode_getfile() and anon_inode_getfd().

Of course, there is another side of the coin; neither io_uring nor
userfaultfd strictly speaking need distinct inodes, and it is not
that clear anymore that anon_inode_create_get{file,fd}() allow the LSM
to intercept and block the inode's creation.  If one was so inclined,
anon_inode_getfile_secure() and anon_inode_getfd_secure() could be kept,
using the shared inode or a new one depending on CONFIG_SECURITY.
However, this is probably overkill, and potentially a cause of bugs in
different configurations.  Therefore, just add a comment to io_uring
and userfaultfd explaining the choice of the function.

While at it, remove the export for what is now anon_inode_create_getfd().
There is no in-tree module that uses it, and the old name is gone anyway.
If anybody actually needs the symbol, they can ask or they can just use
anon_inode_create_getfile(), which will be exported very soon for use
in KVM.

Suggested-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/anon_inodes.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
index 5deaddbd7927..93a5f16d03f3 100644
--- a/include/linux/anon_inodes.h
+++ b/include/linux/anon_inodes.h
@@ -15,13 +15,13 @@ struct inode;
 struct file *anon_inode_getfile(const char *name,
 				const struct file_operations *fops,
 				void *priv, int flags);
-struct file *anon_inode_getfile_secure(const char *name,
+struct file *anon_inode_create_getfile(const char *name,
 				       const struct file_operations *fops,
 				       void *priv, int flags,
 				       const struct inode *context_inode);
 int anon_inode_getfd(const char *name, const struct file_operations *fops,
 		     void *priv, int flags);
-int anon_inode_getfd_secure(const char *name,
+int anon_inode_create_getfd(const char *name,
 			    const struct file_operations *fops,
 			    void *priv, int flags,
 			    const struct inode *context_inode);
-- 
cgit v1.2.3


From a7800aa80ea4d5356b8474c2302812e9d4926fa6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 13 Nov 2023 05:42:34 -0500
Subject: KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing
 memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce an ioctl(), KVM_CREATE_GUEST_MEMFD, to allow creating file-based
memory that is tied to a specific KVM virtual machine and whose primary
purpose is to serve guest memory.

A guest-first memory subsystem allows for optimizations and enhancements
that are kludgy or outright infeasible to implement/support in a generic
memory subsystem.  With guest_memfd, guest protections and mapping sizes
are fully decoupled from host userspace mappings.   E.g. KVM currently
doesn't support mapping memory as writable in the guest without it also
being writable in host userspace, as KVM's ABI uses VMA protections to
define the allow guest protection.  Userspace can fudge this by
establishing two mappings, a writable mapping for the guest and readable
one for itself, but that’s suboptimal on multiple fronts.

Similarly, KVM currently requires the guest mapping size to be a strict
subset of the host userspace mapping size, e.g. KVM doesn’t support
creating a 1GiB guest mapping unless userspace also has a 1GiB guest
mapping.  Decoupling the mappings sizes would allow userspace to precisely
map only what is needed without impacting guest performance, e.g. to
harden against unintentional accesses to guest memory.

Decoupling guest and userspace mappings may also allow for a cleaner
alternative to high-granularity mappings for HugeTLB, which has reached a
bit of an impasse and is unlikely to ever be merged.

A guest-first memory subsystem also provides clearer line of sight to
things like a dedicated memory pool (for slice-of-hardware VMs) and
elimination of "struct page" (for offload setups where userspace _never_
needs to mmap() guest memory).

More immediately, being able to map memory into KVM guests without mapping
said memory into the host is critical for Confidential VMs (CoCo VMs), the
initial use case for guest_memfd.  While AMD's SEV and Intel's TDX prevent
untrusted software from reading guest private data by encrypting guest
memory with a key that isn't usable by the untrusted host, projects such
as Protected KVM (pKVM) provide confidentiality and integrity *without*
relying on memory encryption.  And with SEV-SNP and TDX, accessing guest
private memory can be fatal to the host, i.e. KVM must be prevent host
userspace from accessing guest memory irrespective of hardware behavior.

Attempt #1 to support CoCo VMs was to add a VMA flag to mark memory as
being mappable only by KVM (or a similarly enlightened kernel subsystem).
That approach was abandoned largely due to it needing to play games with
PROT_NONE to prevent userspace from accessing guest memory.

Attempt #2 to was to usurp PG_hwpoison to prevent the host from mapping
guest private memory into userspace, but that approach failed to meet
several requirements for software-based CoCo VMs, e.g. pKVM, as the kernel
wouldn't easily be able to enforce a 1:1 page:guest association, let alone
a 1:1 pfn:gfn mapping.  And using PG_hwpoison does not work for memory
that isn't backed by 'struct page', e.g. if devices gain support for
exposing encrypted memory regions to guests.

Attempt #3 was to extend the memfd() syscall and wrap shmem to provide
dedicated file-based guest memory.  That approach made it as far as v10
before feedback from Hugh Dickins and Christian Brauner (and others) led
to it demise.

Hugh's objection was that piggybacking shmem made no sense for KVM's use
case as KVM didn't actually *want* the features provided by shmem.  I.e.
KVM was using memfd() and shmem to avoid having to manage memory directly,
not because memfd() and shmem were the optimal solution, e.g. things like
read/write/mmap in shmem were dead weight.

Christian pointed out flaws with implementing a partial overlay (wrapping
only _some_ of shmem), e.g. poking at inode_operations or super_operations
would show shmem stuff, but address_space_operations and file_operations
would show KVM's overlay.  Paraphrashing heavily, Christian suggested KVM
stop being lazy and create a proper API.

Link: https://lore.kernel.org/all/20201020061859.18385-1-kirill.shutemov@linux.intel.com
Link: https://lore.kernel.org/all/20210416154106.23721-1-kirill.shutemov@linux.intel.com
Link: https://lore.kernel.org/all/20210824005248.200037-1-seanjc@google.com
Link: https://lore.kernel.org/all/20211111141352.26311-1-chao.p.peng@linux.intel.com
Link: https://lore.kernel.org/all/20221202061347.1070246-1-chao.p.peng@linux.intel.com
Link: https://lore.kernel.org/all/ff5c5b97-acdf-9745-ebe5-c6609dd6322e@google.com
Link: https://lore.kernel.org/all/20230418-anfallen-irdisch-6993a61be10b@brauner
Link: https://lore.kernel.org/all/ZEM5Zq8oo+xnApW9@google.com
Link: https://lore.kernel.org/linux-mm/20230306191944.GA15773@monkey
Link: https://lore.kernel.org/linux-mm/ZII1p8ZHlHaQ3dDl@casper.infradead.org
Cc: Fuad Tabba <tabba@google.com>
Cc: Vishal Annapurve <vannapurve@google.com>
Cc: Ackerley Tng <ackerleytng@google.com>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Maciej Szmigiero <mail@maciej.szmigiero.name>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Quentin Perret <qperret@google.com>
Cc: Michael Roth <michael.roth@amd.com>
Cc: Wang <wei.w.wang@intel.com>
Cc: Liam Merwick <liam.merwick@oracle.com>
Cc: Isaku Yamahata <isaku.yamahata@gmail.com>
Co-developed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Co-developed-by: Chao Peng <chao.p.peng@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Co-developed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Co-developed-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20231027182217.3615211-17-seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 68a144cb7dbc..a6de526c0426 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -589,8 +589,20 @@ struct kvm_memory_slot {
 	u32 flags;
 	short id;
 	u16 as_id;
+
+#ifdef CONFIG_KVM_PRIVATE_MEM
+	struct {
+		struct file __rcu *file;
+		pgoff_t pgoff;
+	} gmem;
+#endif
 };
 
+static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
+{
+	return slot && (slot->flags & KVM_MEM_GUEST_MEMFD);
+}
+
 static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
 {
 	return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
@@ -685,6 +697,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
 }
 #endif
 
+/*
+ * Arch code must define kvm_arch_has_private_mem if support for private memory
+ * is enabled.
+ */
+#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
+static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
+{
+	return false;
+}
+#endif
+
 struct kvm_memslots {
 	u64 generation;
 	atomic_long_t last_used_slot;
@@ -1400,6 +1423,7 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 void kvm_mmu_invalidate_begin(struct kvm *kvm);
 void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
 void kvm_mmu_invalidate_end(struct kvm *kvm);
+bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
@@ -2355,6 +2379,30 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
 					struct kvm_gfn_range *range);
 bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
 					 struct kvm_gfn_range *range);
+
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+	return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
+	       kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+#else
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+	return false;
+}
 #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
 
+#ifdef CONFIG_KVM_PRIVATE_MEM
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
+#else
+static inline int kvm_gmem_get_pfn(struct kvm *kvm,
+				   struct kvm_memory_slot *slot, gfn_t gfn,
+				   kvm_pfn_t *pfn, int *max_order)
+{
+	KVM_BUG_ON(1, kvm);
+	return -EIO;
+}
+#endif /* CONFIG_KVM_PRIVATE_MEM */
+
 #endif
-- 
cgit v1.2.3


From 8dd2eee9d526c30fccfe75da7ec5365c6476e510 Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Fri, 27 Oct 2023 11:22:02 -0700
Subject: KVM: x86/mmu: Handle page fault for private memory

Add support for resolving page faults on guest private memory for VMs
that differentiate between "shared" and "private" memory.  For such VMs,
KVM_MEM_GUEST_MEMFD memslots can include both fd-based private memory and
hva-based shared memory, and KVM needs to map in the "correct" variant,
i.e. KVM needs to map the gfn shared/private as appropriate based on the
current state of the gfn's KVM_MEMORY_ATTRIBUTE_PRIVATE flag.

For AMD's SEV-SNP and Intel's TDX, the guest effectively gets to request
shared vs. private via a bit in the guest page tables, i.e. what the guest
wants may conflict with the current memory attributes.  To support such
"implicit" conversion requests, exit to user with KVM_EXIT_MEMORY_FAULT
to forward the request to userspace.  Add a new flag for memory faults,
KVM_MEMORY_EXIT_FLAG_PRIVATE, to communicate whether the guest wants to
map memory as shared vs. private.

Like KVM_MEMORY_ATTRIBUTE_PRIVATE, use bit 3 for flagging private memory
so that KVM can use bits 0-2 for capturing RWX behavior if/when userspace
needs such information, e.g. a likely user of KVM_EXIT_MEMORY_FAULT is to
exit on missing mappings when handling guest page fault VM-Exits.  In
that case, userspace will want to know RWX information in order to
correctly/precisely resolve the fault.

Note, private memory *must* be backed by guest_memfd, i.e. shared mappings
always come from the host userspace page tables, and private mappings
always come from a guest_memfd instance.

Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Message-Id: <20231027182217.3615211-21-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a6de526c0426..67dfd4d79529 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2357,14 +2357,18 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
 #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
 
 static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
-						 gpa_t gpa, gpa_t size)
+						 gpa_t gpa, gpa_t size,
+						 bool is_write, bool is_exec,
+						 bool is_private)
 {
 	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
 	vcpu->run->memory_fault.gpa = gpa;
 	vcpu->run->memory_fault.size = size;
 
-	/* Flags are not (yet) defined or communicated to userspace. */
+	/* RWX flags are not (yet) defined or communicated to userspace. */
 	vcpu->run->memory_fault.flags = 0;
+	if (is_private)
+		vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
 }
 
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
-- 
cgit v1.2.3


From 2333afa17af0f4b6651214ee17cfd5ae5f47787a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 27 Oct 2023 11:22:03 -0700
Subject: KVM: Drop superfluous __KVM_VCPU_MULTIPLE_ADDRESS_SPACE macro

Drop __KVM_VCPU_MULTIPLE_ADDRESS_SPACE and instead check the value of
KVM_ADDRESS_SPACE_NUM.

No functional change intended.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Message-Id: <20231027182217.3615211-22-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 67dfd4d79529..db423ea9e3a4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -690,7 +690,7 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm);
 #define KVM_MEM_SLOTS_NUM SHRT_MAX
 #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS)
 
-#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
+#if KVM_ADDRESS_SPACE_NUM == 1
 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
 {
 	return 0;
-- 
cgit v1.2.3


From eed52e434bc33603ddb0af62b6c4ef818948489d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 27 Oct 2023 11:22:04 -0700
Subject: KVM: Allow arch code to track number of memslot address spaces per VM

Let x86 track the number of address spaces on a per-VM basis so that KVM
can disallow SMM memslots for confidential VMs.  Confidentials VMs are
fundamentally incompatible with emulating SMM, which as the name suggests
requires being able to read and write guest memory and register state.

Disallowing SMM will simplify support for guest private memory, as KVM
will not need to worry about tracking memory attributes for multiple
address spaces (SMM is the only "non-default" address space across all
architectures).

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Message-Id: <20231027182217.3615211-23-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index db423ea9e3a4..3ebc6912c54a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -80,8 +80,8 @@
 /* Two fragments for cross MMIO pages. */
 #define KVM_MAX_MMIO_FRAGMENTS	2
 
-#ifndef KVM_ADDRESS_SPACE_NUM
-#define KVM_ADDRESS_SPACE_NUM	1
+#ifndef KVM_MAX_NR_ADDRESS_SPACES
+#define KVM_MAX_NR_ADDRESS_SPACES	1
 #endif
 
 /*
@@ -690,7 +690,12 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm);
 #define KVM_MEM_SLOTS_NUM SHRT_MAX
 #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS)
 
-#if KVM_ADDRESS_SPACE_NUM == 1
+#if KVM_MAX_NR_ADDRESS_SPACES == 1
+static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm)
+{
+	return KVM_MAX_NR_ADDRESS_SPACES;
+}
+
 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
 {
 	return 0;
@@ -745,9 +750,9 @@ struct kvm {
 	struct mm_struct *mm; /* userspace tied to this vm */
 	unsigned long nr_memslot_pages;
 	/* The two memslot sets - active and inactive (per address space) */
-	struct kvm_memslots __memslots[KVM_ADDRESS_SPACE_NUM][2];
+	struct kvm_memslots __memslots[KVM_MAX_NR_ADDRESS_SPACES][2];
 	/* The current active memslot set for each address space */
-	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
+	struct kvm_memslots __rcu *memslots[KVM_MAX_NR_ADDRESS_SPACES];
 	struct xarray vcpu_array;
 	/*
 	 * Protected by slots_lock, but can be read outside if an
@@ -1017,7 +1022,7 @@ void kvm_put_kvm_no_destroy(struct kvm *kvm);
 
 static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
 {
-	as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM);
+	as_id = array_index_nospec(as_id, KVM_MAX_NR_ADDRESS_SPACES);
 	return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu,
 			lockdep_is_held(&kvm->slots_lock) ||
 			!refcount_read(&kvm->users_count));
-- 
cgit v1.2.3


From 2e122362d25e1b977aa5c5c88c03956be4228e02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Winiarski?= <michal.winiarski@intel.com>
Date: Tue, 24 Oct 2023 13:07:10 +0200
Subject: iosys-map: Rename locals used inside macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Widely used variable names can be used by macro users, potentially
leading to name collisions.
Suffix locals used inside the macros with an underscore, to reduce the
collision potential.

Suggested-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231024110710.3039807-1-michal.winiarski@intel.com
---
 include/linux/iosys-map.h | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iosys-map.h b/include/linux/iosys-map.h
index 1b06d074ade0..e3649a6563dd 100644
--- a/include/linux/iosys-map.h
+++ b/include/linux/iosys-map.h
@@ -168,9 +168,9 @@ struct iosys_map {
  * about the use of uninitialized variable.
  */
 #define IOSYS_MAP_INIT_OFFSET(map_, offset_) ({				\
-	struct iosys_map copy = *map_;					\
-	iosys_map_incr(&copy, offset_);					\
-	copy;								\
+	struct iosys_map copy_ = *map_;					\
+	iosys_map_incr(&copy_, offset_);				\
+	copy_;								\
 })
 
 /**
@@ -391,14 +391,14 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * Returns:
  * The value read from the mapping.
  */
-#define iosys_map_rd(map__, offset__, type__) ({				\
-	type__ val;								\
-	if ((map__)->is_iomem) {						\
-		__iosys_map_rd_io(val, (map__)->vaddr_iomem + (offset__), type__);\
-	} else {								\
-		__iosys_map_rd_sys(val, (map__)->vaddr + (offset__), type__);	\
-	}									\
-	val;									\
+#define iosys_map_rd(map__, offset__, type__) ({					\
+	type__ val_;									\
+	if ((map__)->is_iomem) {							\
+		__iosys_map_rd_io(val_, (map__)->vaddr_iomem + (offset__), type__);	\
+	} else {									\
+		__iosys_map_rd_sys(val_, (map__)->vaddr + (offset__), type__);		\
+	}										\
+	val_;										\
 })
 
 /**
@@ -413,13 +413,13 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * or if pointer may be unaligned (and problematic for the architecture
  * supported), use iosys_map_memcpy_to()
  */
-#define iosys_map_wr(map__, offset__, type__, val__) ({				\
-	type__ val = (val__);							\
-	if ((map__)->is_iomem) {						\
-		__iosys_map_wr_io(val, (map__)->vaddr_iomem + (offset__), type__);\
-	} else {								\
-		__iosys_map_wr_sys(val, (map__)->vaddr + (offset__), type__);	\
-	}									\
+#define iosys_map_wr(map__, offset__, type__, val__) ({					\
+	type__ val_ = (val__);								\
+	if ((map__)->is_iomem) {							\
+		__iosys_map_wr_io(val_, (map__)->vaddr_iomem + (offset__), type__);	\
+	} else {									\
+		__iosys_map_wr_sys(val_, (map__)->vaddr + (offset__), type__);		\
+	}										\
 })
 
 /**
@@ -485,9 +485,9 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * The value read from the mapping.
  */
 #define iosys_map_rd_field(map__, struct_offset__, struct_type__, field__) ({	\
-	struct_type__ *s;							\
+	struct_type__ *s_;							\
 	iosys_map_rd(map__, struct_offset__ + offsetof(struct_type__, field__),	\
-		     typeof(s->field__));					\
+		     typeof(s_->field__));					\
 })
 
 /**
@@ -508,9 +508,9 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * usage and memory layout.
  */
 #define iosys_map_wr_field(map__, struct_offset__, struct_type__, field__, val__) ({	\
-	struct_type__ *s;								\
+	struct_type__ *s_;								\
 	iosys_map_wr(map__, struct_offset__ + offsetof(struct_type__, field__),		\
-		     typeof(s->field__), val__);					\
+		     typeof(s_->field__), val__);					\
 })
 
 #endif /* __IOSYS_MAP_H__ */
-- 
cgit v1.2.3


From 889c58b3155ff4c8e8671c95daef63d6fabbb6b1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 12:34:46 +0200
Subject: perf/core: Fix cpuctx refcounting

Audit of the refcounting turned up that perf_pmu_migrate_context()
fails to migrate the ctx refcount.

Fixes: bd2756811766 ("perf: Rewrite core context handling")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20230612093539.085862001@infradead.org
Cc: <stable@vger.kernel.org>
---
 include/linux/perf_event.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index afb028c54f33..5547ba68e6e4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -843,11 +843,11 @@ struct perf_event {
 };
 
 /*
- *           ,-----------------------[1:n]----------------------.
- *           V                                                  V
- * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
- *           ^                      ^     |                     |
- *           `--------[1:n]---------'     `-[n:1]-> pmu <-[1:n]-'
+ *           ,-----------------------[1:n]------------------------.
+ *           V                                                    V
+ * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event
+ *                                        |                       |
+ *                                        `--[n:1]-> pmu <-[1:n]--'
  *
  *
  * struct perf_event_pmu_context  lifetime is refcount based and RCU freed
@@ -865,6 +865,9 @@ struct perf_event {
  * ctx->mutex pinning the configuration. Since we hold a reference on
  * group_leader (through the filedesc) it can't go away, therefore it's
  * associated pmu_ctx must exist and cannot change due to ctx->mutex.
+ *
+ * perf_event holds a refcount on perf_event_context
+ * perf_event holds a refcount on perf_event_pmu_context
  */
 struct perf_event_pmu_context {
 	struct pmu			*pmu;
-- 
cgit v1.2.3


From 84db47ca7146d7bd00eb5cf2b93989a971c84650 Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@amd.com>
Date: Fri, 20 Oct 2023 21:27:46 +0530
Subject: sched/numa: Fix mm numa_scan_seq based unconditional scan

Since commit fc137c0ddab2 ("sched/numa: enhance vma scanning logic")

NUMA Balancing allows updating PTEs to trap NUMA hinting faults if the
task had previously accessed VMA. However unconditional scan of VMAs are
allowed during initial phase of VMA creation until process's
mm numa_scan_seq reaches 2 even though current task had not accessed VMA.

Rationale:
 - Without initial scan subsequent PTE update may never happen.
 - Give fair opportunity to all the VMAs to be scanned and subsequently
understand the access pattern of all the VMAs.

But it has a corner case where, if a VMA is created after some time,
process's mm numa_scan_seq could be already greater than 2.

For e.g., values of mm numa_scan_seq when VMAs are created by running
mmtest autonuma benchmark briefly looks like:
start_seq=0 : 459
start_seq=2 : 138
start_seq=3 : 144
start_seq=4 : 8
start_seq=8 : 1
start_seq=9 : 1
This results in no unconditional PTE updates for those VMAs created after
some time.

Fix:
 - Note down the initial value of mm numa_scan_seq in per VMA start_seq.
 - Allow unconditional scan till start_seq + 2.

Result:
SUT: AMD EPYC Milan with 2 NUMA nodes 256 cpus.
base kernel: upstream 6.6-rc6 with Mels patches [1] applied.

kernbench
==========		base                  patched %gain
Amean    elsp-128      165.09 ( 0.00%)      164.78 *   0.19%*

Duration User       41404.28    41375.08
Duration System      9862.22     9768.48
Duration Elapsed      519.87      518.72

Ops NUMA PTE updates           1041416.00      831536.00
Ops NUMA hint faults            263296.00      220966.00
Ops NUMA pages migrated         258021.00      212769.00
Ops AutoNUMA cost                 1328.67        1114.69

autonumabench

NUMA01_THREADLOCAL
==================
Amean  elsp-NUMA01_THREADLOCAL   81.79 (0.00%)  67.74 *  17.18%*

Duration User       54832.73    47379.67
Duration System        75.00      185.75
Duration Elapsed      576.72      476.09

Ops NUMA PTE updates                  394429.00    11121044.00
Ops NUMA hint faults                    1001.00     8906404.00
Ops NUMA pages migrated                  288.00     2998694.00
Ops AutoNUMA cost                          7.77       44666.84

Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/2ea7cbce80ac7c62e90cbfb9653a7972f902439f.1697816692.git.raghavendra.kt@amd.com
---
 include/linux/mm_types.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 957ce38768b2..950df415d7de 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -600,6 +600,9 @@ struct vma_numab_state {
 	 */
 	unsigned long pids_active[2];
 
+	/* MM scan sequence ID when scan first started after VMA creation */
+	int start_scan_seq;
+
 	/*
 	 * MM scan sequence ID when the VMA was last completely scanned.
 	 * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
-- 
cgit v1.2.3


From 2227a957e1d5b1941be4e4207879ec74f4bb37f8 Mon Sep 17 00:00:00 2001
From: Abel Wu <wuyun.abel@bytedance.com>
Date: Wed, 15 Nov 2023 11:36:45 +0800
Subject: sched/eevdf: Sort the rbtree by virtual deadline

Sort the task timeline by virtual deadline and keep the min_vruntime
in the augmented tree, so we can avoid doubling the worst case cost
and make full use of the cached leftmost node to enable O(1) fastpath
picking in next patch.

Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231115033647.80785-3-wuyun.abel@bytedance.com
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..cd56d4018527 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -553,7 +553,7 @@ struct sched_entity {
 	struct load_weight		load;
 	struct rb_node			run_node;
 	u64				deadline;
-	u64				min_deadline;
+	u64				min_vruntime;
 
 	struct list_head		group_node;
 	unsigned int			on_rq;
-- 
cgit v1.2.3


From 5d69eca542ee17c618f9a55da52191d5e28b435f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 4 Nov 2023 11:59:18 +0100
Subject: sched: Unify runtime accounting across classes

All classes use sched_entity::exec_start to track runtime and have
copies of the exact same code around to compute runtime.

Collapse all that.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/54d148a144f26d9559698c4dd82d8859038a7380.1699095159.git.bristot@kernel.org
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cd56d4018527..44b46d9743bf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,7 +523,7 @@ struct sched_statistics {
 	u64				block_max;
 	s64				sum_block_runtime;
 
-	u64				exec_max;
+	s64				exec_max;
 	u64				slice_max;
 
 	u64				nr_migrations_cold;
-- 
cgit v1.2.3


From 63ba8422f876e32ee564ea95da9a7313b13ff0a1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 4 Nov 2023 11:59:21 +0100
Subject: sched/deadline: Introduce deadline servers

Low priority tasks (e.g., SCHED_OTHER) can suffer starvation if tasks
with higher priority (e.g., SCHED_FIFO) monopolize CPU(s).

RT Throttling has been introduced a while ago as a (mostly debug)
countermeasure one can utilize to reserve some CPU time for low priority
tasks (usually background type of work, e.g. workqueues, timers, etc.).
It however has its own problems (see documentation) and the undesired
effect of unconditionally throttling FIFO tasks even when no lower
priority activity needs to run (there are mechanisms to fix this issue
as well, but, again, with their own problems).

Introduce deadline servers to service low priority tasks needs under
starvation conditions. Deadline servers are built extending SCHED_DEADLINE
implementation to allow 2-level scheduling (a sched_deadline entity
becomes a container for lower priority scheduling entities).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/4968601859d920335cf85822eb573a5f179f04b8.1699095159.git.bristot@kernel.org
---
 include/linux/sched.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 44b46d9743bf..8d258162deb0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -63,11 +63,13 @@ struct robust_list_head;
 struct root_domain;
 struct rq;
 struct sched_attr;
+struct sched_dl_entity;
 struct seq_file;
 struct sighand_struct;
 struct signal_struct;
 struct task_delay_info;
 struct task_group;
+struct task_struct;
 struct user_event_mm;
 
 /*
@@ -607,6 +609,9 @@ struct sched_rt_entity {
 #endif
 } __randomize_layout;
 
+typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+
 struct sched_dl_entity {
 	struct rb_node			rb_node;
 
@@ -654,6 +659,7 @@ struct sched_dl_entity {
 	unsigned int			dl_yielded        : 1;
 	unsigned int			dl_non_contending : 1;
 	unsigned int			dl_overrun	  : 1;
+	unsigned int			dl_server         : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
@@ -668,7 +674,20 @@ struct sched_dl_entity {
 	 * timer is needed to decrease the active utilization at the correct
 	 * time.
 	 */
-	struct hrtimer inactive_timer;
+	struct hrtimer			inactive_timer;
+
+	/*
+	 * Bits for DL-server functionality. Also see the comment near
+	 * dl_server_update().
+	 *
+	 * @rq the runqueue this server is for
+	 *
+	 * @server_has_tasks() returns true if @server_pick return a
+	 * runnable task.
+	 */
+	struct rq			*rq;
+	dl_server_has_tasks_f		server_has_tasks;
+	dl_server_pick_f		server_pick;
 
 #ifdef CONFIG_RT_MUTEXES
 	/*
@@ -795,6 +814,7 @@ struct task_struct {
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
 	struct sched_dl_entity		dl;
+	struct sched_dl_entity		*dl_server;
 	const struct sched_class	*sched_class;
 
 #ifdef CONFIG_SCHED_CORE
-- 
cgit v1.2.3


From e4ab322fbaaaf84b23d6cb0e3317a7f68baf36dc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sun, 17 Sep 2023 13:22:17 +0200
Subject: cleanup: Add conditional guard support

Adds:

 - DEFINE_GUARD_COND() / DEFINE_LOCK_GUARD_1_COND() to extend existing
   guards with conditional lock primitives, eg. mutex_trylock(),
   mutex_lock_interruptible().

   nb. both primitives allow NULL 'locks', which cause the lock to
       fail (obviously).

 - extends scoped_guard() to not take the body when the the
   conditional guard 'fails'. eg.

     scoped_guard (mutex_intr, &task->signal_cred_guard_mutex) {
	...
     }

   will only execute the body when the mutex is held.

 - provides scoped_cond_guard(name, fail, args...); which extends
   scoped_guard() to do fail when the lock-acquire fails.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231102110706.460851167%40infradead.org
---
 include/linux/cleanup.h  | 52 +++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/mutex.h    |  3 ++-
 include/linux/rwsem.h    |  8 ++++----
 include/linux/spinlock.h | 15 ++++++++++++++
 4 files changed, 70 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 9f1a9c455b68..c2d09bc4f976 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -125,25 +125,55 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
  *	trivial wrapper around DEFINE_CLASS() above specifically
  *	for locks.
  *
+ * DEFINE_GUARD_COND(name, ext, condlock)
+ *	wrapper around EXTEND_CLASS above to add conditional lock
+ *	variants to a base class, eg. mutex_trylock() or
+ *	mutex_lock_interruptible().
+ *
  * guard(name):
- *	an anonymous instance of the (guard) class
+ *	an anonymous instance of the (guard) class, not recommended for
+ *	conditional locks.
  *
  * scoped_guard (name, args...) { }:
  *	similar to CLASS(name, scope)(args), except the variable (with the
  *	explicit name 'scope') is declard in a for-loop such that its scope is
  *	bound to the next (compound) statement.
  *
+ *	for conditional locks the loop body is skipped when the lock is not
+ *	acquired.
+ *
+ * scoped_cond_guard (name, fail, args...) { }:
+ *      similar to scoped_guard(), except it does fail when the lock
+ *      acquire fails.
+ *
  */
 
 #define DEFINE_GUARD(_name, _type, _lock, _unlock) \
-	DEFINE_CLASS(_name, _type, _unlock, ({ _lock; _T; }), _type _T)
+	DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \
+	static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
+	{ return *_T; }
+
+#define DEFINE_GUARD_COND(_name, _ext, _condlock) \
+	EXTEND_CLASS(_name, _ext, \
+		     ({ void *_t = _T; if (_T && !(_condlock)) _t = NULL; _t; }), \
+		     class_##_name##_t _T) \
+	static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
+	{ return class_##_name##_lock_ptr(_T); }
 
 #define guard(_name) \
 	CLASS(_name, __UNIQUE_ID(guard))
 
+#define __guard_ptr(_name) class_##_name##_lock_ptr
+
 #define scoped_guard(_name, args...)					\
 	for (CLASS(_name, scope)(args),					\
-	     *done = NULL; !done; done = (void *)1)
+	     *done = NULL; __guard_ptr(_name)(&scope) && !done; done = (void *)1)
+
+#define scoped_cond_guard(_name, _fail, args...) \
+	for (CLASS(_name, scope)(args), \
+	     *done = NULL; !done; done = (void *)1) \
+		if (!__guard_ptr(_name)(&scope)) _fail; \
+		else
 
 /*
  * Additional helper macros for generating lock guards with types, either for
@@ -152,6 +182,7 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
  *
  * DEFINE_LOCK_GUARD_0(name, lock, unlock, ...)
  * DEFINE_LOCK_GUARD_1(name, type, lock, unlock, ...)
+ * DEFINE_LOCK_GUARD_1_COND(name, ext, condlock)
  *
  * will result in the following type:
  *
@@ -173,6 +204,11 @@ typedef struct {							\
 static inline void class_##_name##_destructor(class_##_name##_t *_T)	\
 {									\
 	if (_T->lock) { _unlock; }					\
+}									\
+									\
+static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T)	\
+{									\
+	return _T->lock;						\
 }
 
 
@@ -201,4 +237,14 @@ __DEFINE_LOCK_GUARD_1(_name, _type, _lock)
 __DEFINE_UNLOCK_GUARD(_name, void, _unlock, __VA_ARGS__)		\
 __DEFINE_LOCK_GUARD_0(_name, _lock)
 
+#define DEFINE_LOCK_GUARD_1_COND(_name, _ext, _condlock)		\
+	EXTEND_CLASS(_name, _ext,					\
+		     ({ class_##_name##_t _t = { .lock = l }, *_T = &_t;\
+		        if (_T->lock && !(_condlock)) _T->lock = NULL;	\
+			_t; }),						\
+		     typeof_member(class_##_name##_t, lock) l)		\
+	static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
+	{ return class_##_name##_lock_ptr(_T); }
+
+
 #endif /* __LINUX_GUARDS_H */
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index a33aa9eb9fc3..95d11308f995 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -221,6 +221,7 @@ extern void mutex_unlock(struct mutex *lock);
 extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
 DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T))
-DEFINE_FREE(mutex, struct mutex *, if (_T) mutex_unlock(_T))
+DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T))
+DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0)
 
 #endif /* __LINUX_MUTEX_H */
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 1dd530ce8b45..9c29689ff505 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -203,11 +203,11 @@ extern void up_read(struct rw_semaphore *sem);
 extern void up_write(struct rw_semaphore *sem);
 
 DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T))
-DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))
-
-DEFINE_FREE(up_read, struct rw_semaphore *, if (_T) up_read(_T))
-DEFINE_FREE(up_write, struct rw_semaphore *, if (_T) up_write(_T))
+DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T))
+DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T) == 0)
 
+DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))
+DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
 
 /*
  * downgrade write lock to read lock
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 31d3d747a9db..ceb56b39c70f 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -507,6 +507,8 @@ DEFINE_LOCK_GUARD_1(raw_spinlock, raw_spinlock_t,
 		    raw_spin_lock(_T->lock),
 		    raw_spin_unlock(_T->lock))
 
+DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock))
+
 DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t,
 		    raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING),
 		    raw_spin_unlock(_T->lock))
@@ -515,23 +517,36 @@ DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t,
 		    raw_spin_lock_irq(_T->lock),
 		    raw_spin_unlock_irq(_T->lock))
 
+DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock))
+
 DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t,
 		    raw_spin_lock_irqsave(_T->lock, _T->flags),
 		    raw_spin_unlock_irqrestore(_T->lock, _T->flags),
 		    unsigned long flags)
 
+DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try,
+			 raw_spin_trylock_irqsave(_T->lock, _T->flags))
+
 DEFINE_LOCK_GUARD_1(spinlock, spinlock_t,
 		    spin_lock(_T->lock),
 		    spin_unlock(_T->lock))
 
+DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock))
+
 DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t,
 		    spin_lock_irq(_T->lock),
 		    spin_unlock_irq(_T->lock))
 
+DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try,
+			 spin_trylock_irq(_T->lock))
+
 DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t,
 		    spin_lock_irqsave(_T->lock, _T->flags),
 		    spin_unlock_irqrestore(_T->lock, _T->flags),
 		    unsigned long flags)
 
+DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try,
+			 spin_trylock_irqsave(_T->lock, _T->flags))
+
 #undef __LINUX_INSIDE_SPINLOCK_H
 #endif /* __LINUX_SPINLOCK_H */
-- 
cgit v1.2.3


From 1fda5bb66ad8fb24ecb3858e61a13a6548428898 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Fri, 10 Nov 2023 17:39:28 -0800
Subject: bpf: Do not allocate percpu memory at init stage

Kirill Shutemov reported significant percpu memory consumption increase after
booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for
non-fix-size percpu mem allocation"). The percpu memory consumption is
increased from 111MB to 969MB. The number is from /proc/meminfo.

I tried to reproduce the issue with my local VM which at most supports upto
255 cpus. With 252 cpus, without the above commit, the percpu memory
consumption immediately after boot is 57MB while with the above commit the
percpu memory consumption is 231MB.

This is not good since so far percpu memory from bpf memory allocator is not
widely used yet. Let us change pre-allocation in init stage to on-demand
allocation when verifier detects there is a need of percpu memory for bpf
program. With this change, percpu memory consumption after boot can be reduced
signicantly.

  [1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/

Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation")
Reported-and-tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231111013928.948838-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 35bff17396c0..6762dac3ef76 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,7 +56,7 @@ extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 extern struct kobject *btf_kobj;
 extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
-extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
+extern bool bpf_global_ma_set;
 
 typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
-- 
cgit v1.2.3


From 4aea6a6d61cd6e3df9ed98345638abad1b1e5276 Mon Sep 17 00:00:00 2001
From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Date: Thu, 30 Mar 2023 15:05:38 -0700
Subject: net/mlx5: Query maximum frequency adjustment of the PTP hardware
 clock

Some mlx5 devices do not support the default advertised maximum frequency
adjustment value for the PTP hardware clock that is set by the driver.
These devices need to be queried when initializing the clock functionality
in order to get the maximum supported frequency adjustment value. This
value can be greater than the minimum supported frequency adjustment across
mlx5 devices (50 million ppb).

Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6f3631425f38..ce2e71cd6d2a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10103,7 +10103,10 @@ enum {
 struct mlx5_ifc_mtutc_reg_bits {
 	u8         reserved_at_0[0x5];
 	u8         freq_adj_units[0x3];
-	u8         reserved_at_8[0x14];
+	u8         reserved_at_8[0x3];
+	u8         log_max_freq_adjustment[0x5];
+
+	u8         reserved_at_10[0xc];
 	u8         operation[0x4];
 
 	u8         freq_adjustment[0x20];
-- 
cgit v1.2.3


From 67420501e8681ae18f9f0ea0a69cd2f432100e70 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 11 Nov 2023 17:05:57 -0800
Subject: bpf: generalize reg_set_min_max() to handle non-const register
 comparisons

Generalize bounds adjustment logic of reg_set_min_max() to handle not
just register vs constant case, but in general any register vs any
register cases. For most of the operations it's trivial extension based
on range vs range comparison logic, we just need to properly pick
min/max of a range to compare against min/max of the other range.

For BPF_JSET we keep the original capabilities, just make sure JSET is
integrated in the common framework. This is manifested in the
internal-only BPF_JSET + BPF_X "opcode" to allow for simpler and more
uniform rev_opcode() handling. See the code for details. This allows to
reuse the same code exactly both for TRUE and FALSE branches without
explicitly handling both conditions with custom code.

Note also that now we don't need a special handling of BPF_JEQ/BPF_JNE
case none of the registers are constants. This is now just a normal
generic case handled by reg_set_min_max().

To make tnum handling cleaner, tnum_with_subreg() helper is added, as
that's a common operator when dealing with 32-bit subregister bounds.
This keeps the overall logic much less noisy when it comes to tnums.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231112010609.848406-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/tnum.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tnum.h b/include/linux/tnum.h
index 1c3948a1d6ad..3c13240077b8 100644
--- a/include/linux/tnum.h
+++ b/include/linux/tnum.h
@@ -106,6 +106,10 @@ int tnum_sbin(char *str, size_t size, struct tnum a);
 struct tnum tnum_subreg(struct tnum a);
 /* Returns the tnum with the lower 32-bit subreg cleared */
 struct tnum tnum_clear_subreg(struct tnum a);
+/* Returns the tnum with the lower 32-bit subreg in *reg* set to the lower
+ * 32-bit subreg in *subreg*
+ */
+struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg);
 /* Returns the tnum with the lower 32-bit subreg set to value */
 struct tnum tnum_const_subreg(struct tnum a, u32 value);
 /* Returns true if 32-bit subreg @a is a known constant*/
-- 
cgit v1.2.3


From 5f99f312bd3bedb3b266b0d26376a8c500cdc97f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 11 Nov 2023 17:06:00 -0800
Subject: bpf: add register bounds sanity checks and sanitization

Add simple sanity checks that validate well-formed ranges (min <= max)
across u64, s64, u32, and s32 ranges. Also for cases when the value is
constant (either 64-bit or 32-bit), we validate that ranges and tnums
are in agreement.

These bounds checks are performed at the end of BPF_ALU/BPF_ALU64
operations, on conditional jumps, and for LDX instructions (where subreg
zero/sign extension is probably the most important to check). This
covers most of the interesting cases.

Also, we validate the sanity of the return register when manually
adjusting it for some special helpers.

By default, sanity violation will trigger a warning in verifier log and
resetting register bounds to "unbounded" ones. But to aid development
and debugging, BPF_F_TEST_SANITY_STRICT flag is added, which will
trigger hard failure of verification with -EFAULT on register bounds
violations. This allows selftests to catch such issues. veristat will
also gain a CLI option to enable this behavior.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231112010609.848406-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 24213a99cc79..402b6bc44a1b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -602,6 +602,7 @@ struct bpf_verifier_env {
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
 	bool test_state_freq;		/* test verifier with different pruning frequency */
+	bool test_sanity_strict;	/* fail verification on sanity violations */
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_verifier_state_list *free_list;
-- 
cgit v1.2.3


From dfcb264a01a9199e8338a548731baf5bbe77ef19 Mon Sep 17 00:00:00 2001
From: Marek Vasut <marex@denx.de>
Date: Sat, 4 Nov 2023 16:49:06 +0100
Subject: power: supply: bq27xxx: Stop and start delayed work in suspend and
 resume

This driver uses delayed work to perform periodic battery state read out.
This delayed work is not stopped across suspend and resume cycle. The
read out can occur early in the resume cycle. In case of an I2C variant
of this hardware, that read out triggers I2C transfer. That I2C transfer
may happen while the I2C controller is still suspended, which produces a
WARNING in the kernel log.

Fix this by introducing trivial PM ops, which stop the delayed work before
the system enters suspend, and schedule the delayed work right after the
system resumes.

Signed-off-by: Marek Vasut <marex@denx.de>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20231104154920.68585-1-marex@denx.de
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/linux/power/bq27xxx_battery.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 7c8d65414a70..7d8025fb74b7 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -83,5 +83,6 @@ struct bq27xxx_device_info {
 void bq27xxx_battery_update(struct bq27xxx_device_info *di);
 int bq27xxx_battery_setup(struct bq27xxx_device_info *di);
 void bq27xxx_battery_teardown(struct bq27xxx_device_info *di);
+extern const struct dev_pm_ops bq27xxx_battery_battery_pm_ops;
 
 #endif
-- 
cgit v1.2.3


From 74d016ecc1a7974664e98d1afbf649cd4e0e0423 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 Nov 2023 22:41:27 -0500
Subject: new helper: user_path_locked_at()

Equivalent of kern_path_locked() taking dfd/userland name.
User introduced in the next commit.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/namei.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/namei.h b/include/linux/namei.h
index 3100371b5e32..74e0cc14ebf8 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -66,6 +66,7 @@ extern struct dentry *kern_path_create(int, const char *, struct path *, unsigne
 extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int);
 extern void done_path_create(struct path *, struct dentry *);
 extern struct dentry *kern_path_locked(const char *, struct path *);
+extern struct dentry *user_path_locked_at(int , const char __user *, struct path *);
 int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
 			   struct path *parent, struct qstr *last, int *type,
 			   const struct path *root);
-- 
cgit v1.2.3


From 3185d57cfcd34fadbe28f4ed57a6cb5122277ece Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Tue, 14 Nov 2023 11:42:02 +0100
Subject: indirect_call_wrapper: Fix typo in INDIRECT_CALL_$NR kerneldoc

Fix a small typo in the kerneldoc comment of the INDIRECT_CALL_$NR
macro.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20231114104202.4680-1-tklauser@distanz.ch
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/indirect_call_wrapper.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h
index c1c76a70a6ce..adb83a42a6b9 100644
--- a/include/linux/indirect_call_wrapper.h
+++ b/include/linux/indirect_call_wrapper.h
@@ -11,7 +11,7 @@
  *  @__VA_ARGS__: arguments for @f
  *
  * Avoid retpoline overhead for known builtin, checking @f vs each of them and
- * eventually invoking directly the builtin function. The functions are check
+ * eventually invoking directly the builtin function. The functions are checked
  * in the given order. Fallback to the indirect call.
  */
 #define INDIRECT_CALL_1(f, f1, ...)					\
-- 
cgit v1.2.3


From f7f965c982f7954b46db910146a7ffe0fe1eb5e1 Mon Sep 17 00:00:00 2001
From: Tao Zhang <quic_taozha@quicinc.com>
Date: Thu, 28 Sep 2023 14:29:36 +0800
Subject: coresight-tpdm: Introduce TPDM subtype to TPDM driver

Introduce the new subtype of "CORESIGHT_DEV_SUBTYPE_SOURCE_TPDM"
for TPDM components in driver.

Signed-off-by: Tao Zhang <quic_taozha@quicinc.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/1695882586-10306-4-git-send-email-quic_taozha@quicinc.com
---
 include/linux/coresight.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index a269fffaf991..a4cb7dd6ca23 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -64,6 +64,7 @@ enum coresight_dev_subtype_source {
 	CORESIGHT_DEV_SUBTYPE_SOURCE_PROC,
 	CORESIGHT_DEV_SUBTYPE_SOURCE_BUS,
 	CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE,
+	CORESIGHT_DEV_SUBTYPE_SOURCE_TPDM,
 	CORESIGHT_DEV_SUBTYPE_SOURCE_OTHERS,
 };
 
-- 
cgit v1.2.3


From 3fc6350fc8470d42f5e700ecd1c3d90f9dd9fd2d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 13 Nov 2023 13:12:49 +0200
Subject: treewide, spi: Get rid of SPI_MASTER_HALF_DUPLEX

The SPI_MASTER_HALF_DUPLEX is the legacy name of a definition
for a half duplex flag. Since all others had been replaced with
the respective SPI_CONTROLLER prefix get rid of the last one
as well. There is no functional change intended.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # For MMC
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> # for input
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20231113111249.3982461-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 255a0562aea5..7b4baff63c5c 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -1638,8 +1638,6 @@ spi_transfer_is_last(struct spi_controller *ctlr, struct spi_transfer *xfer)
 /* Compatibility layer */
 #define spi_master			spi_controller
 
-#define SPI_MASTER_HALF_DUPLEX		SPI_CONTROLLER_HALF_DUPLEX
-
 #define spi_master_get_devdata(_ctlr)	spi_controller_get_devdata(_ctlr)
 #define spi_master_set_devdata(_ctlr, _data)	\
 	spi_controller_set_devdata(_ctlr, _data)
-- 
cgit v1.2.3


From fac4a535758851215d23d7d92879aeee5035f51d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 8 Aug 2023 19:27:56 +0300
Subject: device property: Add fwnode_property_match_property_string()

Sometimes the users want to match the single value string property
against an array of predefined strings. Create a helper for them.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230808162800.61651-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/property.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 9f2585d705a8..2b8f07fc68a9 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -98,6 +98,18 @@ static inline bool device_is_compatible(const struct device *dev, const char *co
 	return fwnode_device_is_compatible(dev_fwnode(dev), compat);
 }
 
+int fwnode_property_match_property_string(const struct fwnode_handle *fwnode,
+					  const char *propname,
+					  const char * const *array, size_t n);
+
+static inline
+int device_property_match_property_string(const struct device *dev,
+					  const char *propname,
+					  const char * const *array, size_t n)
+{
+	return fwnode_property_match_property_string(dev_fwnode(dev), propname, array, n);
+}
+
 int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode,
 				       const char *prop, const char *nargs_prop,
 				       unsigned int nargs, unsigned int index,
-- 
cgit v1.2.3


From 96fa96e198f9707285003075fbbce7db6a485112 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Wed, 15 Nov 2023 11:39:18 +0000
Subject: net: linkmode: add linkmode_fill() helper

Add a linkmode_fill() helper, which will allow us to convert phylink's
open coded bitmap_fill() operations.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/linkmode.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
index 7303b4bc2ce0..287f590ed56b 100644
--- a/include/linux/linkmode.h
+++ b/include/linux/linkmode.h
@@ -10,6 +10,11 @@ static inline void linkmode_zero(unsigned long *dst)
 	bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
+static inline void linkmode_fill(unsigned long *dst)
+{
+	bitmap_fill(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
 static inline void linkmode_copy(unsigned long *dst, const unsigned long *src)
 {
 	bitmap_copy(dst, src, __ETHTOOL_LINK_MODE_MASK_NBITS);
-- 
cgit v1.2.3


From 9e0be3f50c0e8517d0238b62409c20bcb8cd8785 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Fri, 10 Nov 2023 13:07:22 +0100
Subject: linux/export: clean up the IA-64 KSYM_FUNC macro

With commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"),
there is no need to keep the IA-64 definition of the KSYM_FUNC macro.

Clean up the IA-64 definition of the KSYM_FUNC macro.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/export-internal.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h
index 45fca09b2319..69501e0ec239 100644
--- a/include/linux/export-internal.h
+++ b/include/linux/export-internal.h
@@ -50,9 +50,7 @@
 	    "	.previous"						"\n"	\
 	)
 
-#ifdef CONFIG_IA64
-#define KSYM_FUNC(name)		@fptr(name)
-#elif defined(CONFIG_PARISC) && defined(CONFIG_64BIT)
+#if defined(CONFIG_PARISC) && defined(CONFIG_64BIT)
 #define KSYM_FUNC(name)		P%name
 #else
 #define KSYM_FUNC(name)		name
-- 
cgit v1.2.3


From a9214a8883ceb82df55aa90d1c49ddb85fc1e3d5 Mon Sep 17 00:00:00 2001
From: Etienne Carriere <etienne.carriere@foss.st.com>
Date: Mon, 30 Oct 2023 09:48:10 +0100
Subject: tee: system session

Adds kernel client API function tee_client_system_session() for a client
to request a system service entry in TEE context.

This feature is needed to prevent a system deadlock when several TEE
client applications invoke TEE, consuming all TEE thread contexts
available in the secure world. The deadlock can happen in the OP-TEE
driver for example if all these TEE threads issue an RPC call from TEE
to Linux OS to access an eMMC RPMB partition (TEE secure storage) which
device clock or regulator controller is accessed through an OP-TEE SCMI
services. In that case, Linux SCMI driver must reach OP-TEE SCMI service
without waiting until one of the consumed TEE threads is freed.

Reviewed-by: Sumit Garg <sumit.garg@linaro.org>
Co-developed-by: Jens Wiklander <jens.wiklander@linaro.org>
Signed-off-by: Etienne Carriere <etienne.carriere@foss.st.com>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 include/linux/tee_drv.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h
index 17eb1c5205d3..911ddf92dcee 100644
--- a/include/linux/tee_drv.h
+++ b/include/linux/tee_drv.h
@@ -84,6 +84,7 @@ struct tee_param {
  * @release:		release this open file
  * @open_session:	open a new session
  * @close_session:	close a session
+ * @system_session:	declare session as a system session
  * @invoke_func:	invoke a trusted function
  * @cancel_req:		request cancel of an ongoing invoke or open
  * @supp_recv:		called for supplicant to get a command
@@ -100,6 +101,7 @@ struct tee_driver_ops {
 			    struct tee_ioctl_open_session_arg *arg,
 			    struct tee_param *param);
 	int (*close_session)(struct tee_context *ctx, u32 session);
+	int (*system_session)(struct tee_context *ctx, u32 session);
 	int (*invoke_func)(struct tee_context *ctx,
 			   struct tee_ioctl_invoke_arg *arg,
 			   struct tee_param *param);
@@ -429,6 +431,20 @@ int tee_client_open_session(struct tee_context *ctx,
  */
 int tee_client_close_session(struct tee_context *ctx, u32 session);
 
+/**
+ * tee_client_system_session() - Declare session as a system session
+ * @ctx:	TEE Context
+ * @session:	Session id
+ *
+ * This function requests TEE to provision an entry context ready to use for
+ * that session only. The provisioned entry context is used for command
+ * invocation and session closure, not for command cancelling requests.
+ * TEE releases the provisioned context upon session closure.
+ *
+ * Return < 0 on error else 0 if an entry context has been provisioned.
+ */
+int tee_client_system_session(struct tee_context *ctx, u32 session);
+
 /**
  * tee_client_invoke_func() - Invoke a function in a Trusted Application
  * @ctx:	TEE Context
-- 
cgit v1.2.3


From 5e4166461cf66a26f925011d90017da74e410747 Mon Sep 17 00:00:00 2001
From: Yang Hubin <yanghb2019@lzu.edu.cn>
Date: Sat, 4 Nov 2023 00:45:01 -0700
Subject: f2fs: the name of a struct is wrong in a comment.

The macro SUMMARY_SIZE represents the size of the struct f2fs_summary,

instead of the size of the struct summary.

Signed-off-by: Yang Hubin <yanghb2019@lzu.edu.cn>
Signed-off-by: Qian Haolai <qianhl2023@lzu.edu.cn>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/linux/f2fs_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 039fe0ce8d83..053137a0fe45 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -441,7 +441,7 @@ struct f2fs_sit_block {
  * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node)
  */
 #define ENTRIES_IN_SUM		(F2FS_BLKSIZE / 8)
-#define	SUMMARY_SIZE		(7)	/* sizeof(struct summary) */
+#define	SUMMARY_SIZE		(7)	/* sizeof(struct f2fs_summary) */
 #define	SUM_FOOTER_SIZE		(5)	/* sizeof(struct summary_footer) */
 #define SUM_ENTRY_SIZE		(SUMMARY_SIZE * ENTRIES_IN_SUM)
 
-- 
cgit v1.2.3


From ff8867af01daa7ea770bebf5f91199b7434b74e5 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 09:14:04 -0800
Subject: bpf: rename BPF_F_TEST_SANITY_STRICT to BPF_F_TEST_REG_INVARIANTS

Rename verifier internal flag BPF_F_TEST_SANITY_STRICT to more neutral
BPF_F_TEST_REG_INVARIANTS. This is a follow up to [0].

A few selftests and veristat need to be adjusted in the same patch as
well.

  [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231112010609.848406-5-andrii@kernel.org/

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231117171404.225508-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 402b6bc44a1b..52a4012b8255 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -602,7 +602,7 @@ struct bpf_verifier_env {
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
 	bool test_state_freq;		/* test verifier with different pruning frequency */
-	bool test_sanity_strict;	/* fail verification on sanity violations */
+	bool test_reg_invariants;	/* fail verification on register invariants violations */
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_verifier_state_list *free_list;
-- 
cgit v1.2.3


From 2eea9ce4310d8c0f8ef1dbe7b0e7d9219ff02b97 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 25 Oct 2023 16:02:00 +0200
Subject: mounts: keep list of mounts in an rbtree

When adding a mount to a namespace insert it into an rbtree rooted in the
mnt_namespace instead of a linear list.

The mnt.mnt_list is still used to set up the mount tree and for
propagation, but not after the mount has been added to a namespace.  Hence
mnt_list can live in union with rb_node.  Use MNT_ONRB mount flag to
validate that the mount is on the correct list.

This allows removing the cursor used for reading /proc/$PID/mountinfo.  The
mnt_id_unique of the next mount can be used as an index into the seq file.

Tested by inserting 100k bind mounts, unsharing the mount namespace, and
unmounting.  No performance regressions have been observed.

For the last mount in the 100k list the statmount() call was more than 100x
faster due to the mount ID lookup not having to do a linear search.  This
patch makes the overhead of mount ID lookup non-observable in this range.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20231025140205.3586473-3-mszeredi@redhat.com
Reviewed-by: Ian Kent <raven@themaw.net>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/mount.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mount.h b/include/linux/mount.h
index ac3dd2876197..c34c18b4e8f3 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -50,8 +50,7 @@ struct path;
 #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
 
 #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
-			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \
-			    MNT_CURSOR)
+			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB)
 
 #define MNT_INTERNAL	0x4000
 
@@ -65,7 +64,7 @@ struct path;
 #define MNT_SYNC_UMOUNT		0x2000000
 #define MNT_MARKED		0x4000000
 #define MNT_UMOUNT		0x8000000
-#define MNT_CURSOR		0x10000000
+#define MNT_ONRB		0x10000000
 
 struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */
-- 
cgit v1.2.3


From 982c3b3058433f20aba9fb032599cee5dfc17328 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Oct 2023 15:01:08 +0200
Subject: bdev: rename freeze and thaw helpers

We have bdev_mark_dead() etc and we're going to move block device
freezing to holder ops in the next patch. Make the naming consistent:

* freeze_bdev() -> bdev_freeze()
* thaw_bdev()   -> bdev_thaw()

Also document the return code.

Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-2-599c19f4faac@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blkdev.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 51fa7ffdee83..7a3da7f44afb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1541,8 +1541,8 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev)
 }
 #endif /* CONFIG_BLOCK */
 
-int freeze_bdev(struct block_device *bdev);
-int thaw_bdev(struct block_device *bdev);
+int bdev_freeze(struct block_device *bdev);
+int bdev_thaw(struct block_device *bdev);
 
 struct io_comp_batch {
 	struct request *req_list;
-- 
cgit v1.2.3


From a30561a9be69d446d8d542a4f9735fe5ca9573df Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Oct 2023 15:01:10 +0200
Subject: bdev: add freeze and thaw holder operations

Add block device freeze and thaw holder operations. Follow-up patches
will implement block device freeze and thaw based on stuct
blk_holder_ops.

Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-4-599c19f4faac@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blkdev.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7a3da7f44afb..1bc776335ff8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1468,6 +1468,16 @@ struct blk_holder_ops {
 	 * Sync the file system mounted on the block device.
 	 */
 	void (*sync)(struct block_device *bdev);
+
+	/*
+	 * Freeze the file system mounted on the block device.
+	 */
+	int (*freeze)(struct block_device *bdev);
+
+	/*
+	 * Thaw the file system mounted on the block device.
+	 */
+	int (*thaw)(struct block_device *bdev);
 };
 
 extern const struct blk_holder_ops fs_holder_ops;
-- 
cgit v1.2.3


From 49ef8832fb1a9e0da0020eb17480fd286433bc13 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 27 Sep 2023 15:21:16 +0200
Subject: bdev: implement freeze and thaw holder operations

The old method of implementing block device freeze and thaw operations
required us to rely on get_active_super() to walk the list of all
superblocks on the system to find any superblock that might use the
block device. This is wasteful and not very pleasant overall.

Now that we can finally go straight from block device to owning
superblock things become way simpler.

Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-5-599c19f4faac@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blk_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d5c5e59ddbd2..88e1848b0869 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -57,7 +57,7 @@ struct block_device {
 	const struct blk_holder_ops *bd_holder_ops;
 	struct mutex		bd_holder_lock;
 	/* The counter of freeze processes */
-	int			bd_fsfreeze_count;
+	atomic_t		bd_fsfreeze_count;
 	int			bd_holders;
 	struct kobject		*bd_holder_dir;
 
-- 
cgit v1.2.3


From 434f8d8299f2a0c97578f77ab23a70cd0ae56544 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Oct 2023 15:01:12 +0200
Subject: fs: remove get_active_super()

This function is now unused so remove it. One less function that uses
the global superblock list.

Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-6-599c19f4faac@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..7dc6c1bf5f55 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3121,7 +3121,6 @@ extern int vfs_readlink(struct dentry *, char __user *, int);
 extern struct file_system_type *get_filesystem(struct file_system_type *fs);
 extern void put_filesystem(struct file_system_type *fs);
 extern struct file_system_type *get_fs_type(const char *name);
-extern struct super_block *get_active_super(struct block_device *bdev);
 extern void drop_super(struct super_block *sb);
 extern void drop_super_exclusive(struct super_block *sb);
 extern void iterate_supers(void (*)(struct super_block *, void *), void *);
-- 
cgit v1.2.3


From 90f95dc415de23267b888f8238c4a19fa0f66b89 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Oct 2023 15:01:13 +0200
Subject: super: remove bd_fsfreeze_sb

Remove bd_fsfreeze_sb as it's now unused and can be removed. Also move
bd_fsfreeze_count down to not have it weirdly placed in the middle of
the holder fields.

Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-7-599c19f4faac@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Suggested-by: Jan Kara <jack@suse.cz>
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blk_types.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 88e1848b0869..749203277fee 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -56,14 +56,11 @@ struct block_device {
 	void *			bd_holder;
 	const struct blk_holder_ops *bd_holder_ops;
 	struct mutex		bd_holder_lock;
-	/* The counter of freeze processes */
-	atomic_t		bd_fsfreeze_count;
 	int			bd_holders;
 	struct kobject		*bd_holder_dir;
 
-	/* Mutex for freeze */
-	struct mutex		bd_fsfreeze_mutex;
-	struct super_block	*bd_fsfreeze_sb;
+	atomic_t		bd_fsfreeze_count; /* number of freeze requests */
+	struct mutex		bd_fsfreeze_mutex; /* serialize freeze/thaw */
 
 	struct partition_meta_info *bd_meta_info;
 #ifdef CONFIG_FAIL_MAKE_REQUEST
-- 
cgit v1.2.3


From e419cf3ebaee694a826ddcfb350f1b1ebaf1e599 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Oct 2023 15:01:16 +0200
Subject: blkdev: comment fs_holder_ops

Add a comment to @fs_holder_ops that @holder must point to a superblock.

Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-10-599c19f4faac@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blkdev.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1bc776335ff8..abf71cce785c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1480,6 +1480,11 @@ struct blk_holder_ops {
 	int (*thaw)(struct block_device *bdev);
 };
 
+/*
+ * For filesystems using @fs_holder_ops, the @holder argument passed to
+ * helpers used to open and claim block devices via
+ * bd_prepare_to_claim() must point to a superblock.
+ */
 extern const struct blk_holder_ops fs_holder_ops;
 
 /*
-- 
cgit v1.2.3


From 7366f8b6fc6aa21c4199cb5d337b023df69745b0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sat, 4 Nov 2023 15:00:13 +0100
Subject: fs: handle freezing from multiple devices

Before [1] freezing a filesystems through the block layer only worked
for the main block device as the owning superblock of additional block
devices could not be found. Any filesystem that made use of multiple
block devices would only be freezable via it's main block device.

For example, consider xfs over device mapper with /dev/dm-0 as main
block device and /dev/dm-1 as external log device. Two freeze requests
before [1]:

(1) dmsetup suspend /dev/dm-0 on the main block device

    bdev_freeze(dm-0)
    -> dm-0->bd_fsfreeze_count++
    -> freeze_super(xfs-sb)

    The owning superblock is found and the filesystem gets frozen.
    Returns 0.

(2) dmsetup suspend /dev/dm-1 on the log device

    bdev_freeze(dm-1)
    -> dm-1->bd_fsfreeze_count++

    The owning superblock isn't found and only the block device freeze
    count is incremented. Returns 0.

Two freeze requests after [1]:

(1') dmsetup suspend /dev/dm-0 on the main block device

    bdev_freeze(dm-0)
    -> dm-0->bd_fsfreeze_count++
    -> freeze_super(xfs-sb)

    The owning superblock is found and the filesystem gets frozen.
    Returns 0.

(2') dmsetup suspend /dev/dm-1 on the log device

    bdev_freeze(dm-0)
    -> dm-0->bd_fsfreeze_count++
    -> freeze_super(xfs-sb)

    The owning superblock is found and the filesystem gets frozen.
    Returns -EBUSY.

When (2') is called we initiate a freeze from another block device of
the same superblock. So we increment the bd_fsfreeze_count for that
additional block device. But we now also find the owning superblock for
additional block devices and call freeze_super() again which reports
-EBUSY.

This can be reproduced through xfstests via:

    mkfs.xfs -f -m crc=1,reflink=1,rmapbt=1, -i sparse=1 -lsize=1g,logdev=/dev/nvme1n1p4 /dev/nvme1n1p3
    mkfs.xfs -f -m crc=1,reflink=1,rmapbt=1, -i sparse=1 -lsize=1g,logdev=/dev/nvme1n1p6 /dev/nvme1n1p5

    FSTYP=xfs
    export TEST_DEV=/dev/nvme1n1p3
    export TEST_DIR=/mnt/test
    export TEST_LOGDEV=/dev/nvme1n1p4
    export SCRATCH_DEV=/dev/nvme1n1p5
    export SCRATCH_MNT=/mnt/scratch
    export SCRATCH_LOGDEV=/dev/nvme1n1p6
    export USE_EXTERNAL=yes

    sudo ./check generic/311

Current semantics allow two concurrent freezers: one initiated from
userspace via FREEZE_HOLDER_USERSPACE and one initiated from the kernel
via FREEZE_HOLDER_KERNEL. If there are multiple concurrent freeze
requests from either FREEZE_HOLDER_USERSPACE or FREEZE_HOLDER_KERNEL
-EBUSY is returned.

We need to preserve these semantics because as they are uapi via
FIFREEZE and FITHAW ioctl()s. IOW, freezes don't nest for FIFREEZE and
FITHAW. Other kernels consumers rely on non-nesting freezes as well.

With freezes initiated from the block layer freezes need to nest if the
same superblock is frozen via multiple devices. So we need to start
counting the number of freeze requests.

If FREEZE_MAY_NEST is passed alongside FREEZE_HOLDER_KERNEL or
FREEZE_HOLDER_USERSPACE we allow the caller to nest freeze calls.

To accommodate the old semantics we split the freeze counter into two
counting kernel initiated and userspace initiated freezes separately. We
can then also stop recording FREEZE_HOLDER_* in struct sb_writers.

We also simplify freezing by making all concurrent freezers share a
single active superblock reference count instead of having separate
references for kernel and userspace. I don't see why we would need two
active reference counts. Neither FREEZE_HOLDER_KERNEL nor
FREEZE_HOLDER_USERSPACE can put the active reference as long as they are
concurrent freezers anwyay. That was already true before we allowed
nesting freezes.

Survives various fstests runs with different options including the
reproducer, online scrub, and online repair, fsfreze, and so on. Also
survives blktests.

Link: https://lore.kernel.org/linux-block/87bkccnwxc.fsf@debian-BULLSEYE-live-builder-AMD64
Link: https://lore.kernel.org/r/20231104-vfs-multi-device-freeze-v2-2-5b5b69626eac@kernel.org
Fixes: 288d8706abfc ("bdev: implement freeze and thaw holder operations") [1] # no backport needed
Tested-by: Chandan Babu R <chandanbabu@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Reported-by: Chandan Babu R <chandanbabu@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7dc6c1bf5f55..b2a3f1c61c19 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1185,7 +1185,8 @@ enum {
 
 struct sb_writers {
 	unsigned short			frozen;		/* Is sb frozen? */
-	unsigned short			freeze_holders;	/* Who froze fs? */
+	int				freeze_kcount;	/* How many kernel freeze requests? */
+	int				freeze_ucount;	/* How many userspace freeze requests? */
 	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
 };
 
@@ -2051,9 +2052,24 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
 					struct file *dst_file, loff_t dst_pos,
 					loff_t len, unsigned int remap_flags);
 
+/**
+ * enum freeze_holder - holder of the freeze
+ * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
+ * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
+ * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
+ *
+ * Indicate who the owner of the freeze or thaw request is and whether
+ * the freeze needs to be exclusive or can nest.
+ * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the
+ * same holder aren't allowed. It is however allowed to hold a single
+ * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at
+ * the same time. This is relied upon by some filesystems during online
+ * repair or similar.
+ */
 enum freeze_holder {
 	FREEZE_HOLDER_KERNEL	= (1U << 0),
 	FREEZE_HOLDER_USERSPACE	= (1U << 1),
+	FREEZE_MAY_NEST		= (1U << 2),
 };
 
 struct super_operations {
-- 
cgit v1.2.3


From cd34758c5238ae6976b10fe15bba7031b409c969 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2023 18:43:07 +0100
Subject: block: Remove blkdev_get_by_*() functions

blkdev_get_by_*() and blkdev_put() functions are now unused. Remove
them.

Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20231101174325.10596-2-jack@suse.cz
Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blkdev.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index abf71cce785c..7afc10315dd5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1500,10 +1500,6 @@ struct bdev_handle {
 	blk_mode_t mode;
 };
 
-struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
-		const struct blk_holder_ops *hops);
-struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
-		void *holder, const struct blk_holder_ops *hops);
 struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 		const struct blk_holder_ops *hops);
 struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
@@ -1511,7 +1507,6 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
 int bd_prepare_to_claim(struct block_device *bdev, void *holder,
 		const struct blk_holder_ops *hops);
 void bd_abort_claiming(struct block_device *bdev, void *holder);
-void blkdev_put(struct block_device *bdev, void *holder);
 void bdev_release(struct bdev_handle *handle);
 
 /* just for blk-cgroup, don't use elsewhere */
-- 
cgit v1.2.3


From ed5cc702d311c14b653323d76062b0294effa66e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2023 18:43:08 +0100
Subject: block: Add config option to not allow writing to mounted devices

Writing to mounted devices is dangerous and can lead to filesystem
corruption as well as crashes. Furthermore syzbot comes with more and
more involved examples how to corrupt block device under a mounted
filesystem leading to kernel crashes and reports we can do nothing
about. Add tracking of writers to each block device and a kernel cmdline
argument which controls whether other writeable opens to block devices
open with BLK_OPEN_RESTRICT_WRITES flag are allowed. We will make
filesystems use this flag for used devices.

Note that this effectively only prevents modification of the particular
block device's page cache by other writers. The actual device content
can still be modified by other means - e.g. by issuing direct scsi
commands, by doing writes through devices lower in the storage stack
(e.g. in case loop devices, DM, or MD are involved) etc. But blocking
direct modifications of the block device page cache is enough to give
filesystems a chance to perform data validation when loading data from
the underlying storage and thus prevent kernel crashes.

Syzbot can use this cmdline argument option to avoid uninteresting
crashes. Also users whose userspace setup does not need writing to
mounted block devices can set this option for hardening.

Link: https://lore.kernel.org/all/60788e5d-5c7c-1142-e554-c21d709acfd9@linaro.org
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20231101174325.10596-3-jack@suse.cz
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blk_types.h | 1 +
 include/linux/blkdev.h    | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 749203277fee..52e264d5a830 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -66,6 +66,7 @@ struct block_device {
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	bool			bd_make_it_fail;
 #endif
+	int			bd_writers;
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
 	 * path
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7afc10315dd5..0e0c0186aa32 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -124,6 +124,8 @@ typedef unsigned int __bitwise blk_mode_t;
 #define BLK_OPEN_NDELAY		((__force blk_mode_t)(1 << 3))
 /* open for "writes" only for ioctls (specialy hack for floppy.c) */
 #define BLK_OPEN_WRITE_IOCTL	((__force blk_mode_t)(1 << 4))
+/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
+#define BLK_OPEN_RESTRICT_WRITES	((__force blk_mode_t)(1 << 5))
 
 struct gendisk {
 	/*
-- 
cgit v1.2.3


From 6f861765464f43a71462d52026fbddfc858239a5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2023 18:43:10 +0100
Subject: fs: Block writes to mounted block devices

Ask block layer to block writes to block devices mounted by filesystems.

Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20231101174325.10596-5-jack@suse.cz
Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blkdev.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0e0c0186aa32..9f6c3373f9fc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1494,7 +1494,8 @@ extern const struct blk_holder_ops fs_holder_ops;
  * as stored in sb->s_flags.
  */
 #define sb_open_mode(flags) \
-	(BLK_OPEN_READ | (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))
+	(BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \
+	 (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))
 
 struct bdev_handle {
 	struct block_device *bdev;
-- 
cgit v1.2.3


From db3db63b1d17c98f69e894edaa2b0b364ecde7a9 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Sat, 4 Nov 2023 23:11:17 +0100
Subject: vfs: remove a redundant might_sleep in wait_on_inode

wait_on_bit already does it.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20231104221117.2584708-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/writeback.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 083387c00f0c..6d0a14f7019d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -193,7 +193,6 @@ void inode_io_list_del(struct inode *inode);
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
 {
-	might_sleep();
 	wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
 }
 
-- 
cgit v1.2.3


From 446e2305827b76e8081057ce56bbd2703b4da8a9 Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Tue, 14 Nov 2023 12:28:29 +0100
Subject: net: Convert PHYs hwtstamp callback to use kernel_hwtstamp_config

The PHYs hwtstamp callback are still getting the timestamp config from
ifreq and using copy_from/to_user.
Get rid of these functions by using timestamp configuration in parameter.
This also allow to move on to kernel_hwtstamp_config and be similar to
net devices using the new ndo_hwstamp_get/set.

This adds the possibility to manipulate the timestamp configuration
from the kernel which was not possible with the copy_from/to_user.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mii_timestamper.h | 4 +++-
 include/linux/phy.h             | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h
index fa940bbaf8ae..26b04f73f214 100644
--- a/include/linux/mii_timestamper.h
+++ b/include/linux/mii_timestamper.h
@@ -9,6 +9,7 @@
 #include <linux/device.h>
 #include <linux/ethtool.h>
 #include <linux/skbuff.h>
+#include <linux/net_tstamp.h>
 
 struct phy_device;
 
@@ -51,7 +52,8 @@ struct mii_timestamper {
 			 struct sk_buff *skb, int type);
 
 	int  (*hwtstamp)(struct mii_timestamper *mii_ts,
-			 struct ifreq *ifreq);
+			 struct kernel_hwtstamp_config *kernel_config,
+			 struct netlink_ext_ack *extack);
 
 	void (*link_state)(struct mii_timestamper *mii_ts,
 			   struct phy_device *phydev);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 3cc52826f18e..e5f1f41e399c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1560,9 +1560,11 @@ static inline bool phy_has_txtstamp(struct phy_device *phydev)
 	return phydev && phydev->mii_ts && phydev->mii_ts->txtstamp;
 }
 
-static inline int phy_hwtstamp(struct phy_device *phydev, struct ifreq *ifr)
+static inline int phy_hwtstamp(struct phy_device *phydev,
+			       struct kernel_hwtstamp_config *cfg,
+			       struct netlink_ext_ack *extack)
 {
-	return phydev->mii_ts->hwtstamp(phydev->mii_ts, ifr);
+	return phydev->mii_ts->hwtstamp(phydev->mii_ts, cfg, extack);
 }
 
 static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb,
-- 
cgit v1.2.3


From b8768dc4077712915f045ba1b198f521493c7914 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Tue, 14 Nov 2023 12:28:31 +0100
Subject: net: ethtool: Refactor identical get_ts_info implementations.

The vlan, macvlan and the bonding drivers call their "real" device driver
in order to report the time stamping capabilities.  Provide a core
ethtool helper function to avoid copy/paste in the stack.

Signed-off-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 689028257fcc..c2bb74143eda 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -1043,6 +1043,14 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add,
 	return -EINVAL;
 }
 
+/**
+ * ethtool_get_ts_info_by_layer - Obtains time stamping capabilities from the MAC or PHY layer.
+ * @dev: pointer to net_device structure
+ * @info: buffer to hold the result
+ * Returns zero on success, non-zero otherwise.
+ */
+int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info *info);
+
 /**
  * ethtool_sprintf - Write formatted string to ethtool string data
  * @data: Pointer to a pointer to the start of string to update
-- 
cgit v1.2.3


From 011dd3b3f83f9c89605c640424e05845b84f2dad Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Tue, 14 Nov 2023 12:28:33 +0100
Subject: net: Make dev_set_hwtstamp_phylib accessible

Make the dev_set_hwtstamp_phylib function accessible in prevision to use
it from ethtool to reset the tstamp current configuration.

Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a16c9cc063fe..2d840d7056f2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3942,6 +3942,9 @@ int generic_hwtstamp_get_lower(struct net_device *dev,
 int generic_hwtstamp_set_lower(struct net_device *dev,
 			       struct kernel_hwtstamp_config *kernel_cfg,
 			       struct netlink_ext_ack *extack);
+int dev_set_hwtstamp_phylib(struct net_device *dev,
+			    struct kernel_hwtstamp_config *cfg,
+			    struct netlink_ext_ack *extack);
 int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata);
 unsigned int dev_get_flags(const struct net_device *);
 int __dev_change_flags(struct net_device *dev, unsigned int flags,
-- 
cgit v1.2.3


From 51bdf3165f012827644c474a6d905baa3de3f1ea Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Tue, 14 Nov 2023 12:28:40 +0100
Subject: net: Replace hwtstamp_source by timestamping layer

Replace hwtstamp_source which is only used by the kernel_hwtstamp_config
structure by the more widely use timestamp_layer structure. This is done
to prepare the support of selectable timestamping source.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net_tstamp.h | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h
index eb01c37e71e0..bb289c2ad376 100644
--- a/include/linux/net_tstamp.h
+++ b/include/linux/net_tstamp.h
@@ -5,11 +5,6 @@
 
 #include <uapi/linux/net_tstamp.h>
 
-enum hwtstamp_source {
-	HWTSTAMP_SOURCE_NETDEV,
-	HWTSTAMP_SOURCE_PHYLIB,
-};
-
 /**
  * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config
  *
@@ -20,8 +15,8 @@ enum hwtstamp_source {
  *	a legacy implementation of a lower driver
  * @copied_to_user: request was passed to a legacy implementation which already
  *	copied the ioctl request back to user space
- * @source: indication whether timestamps should come from the netdev or from
- *	an attached phylib PHY
+ * @source: indication whether timestamps should come from software, the netdev
+ *	or from an attached phylib PHY
  *
  * Prefer using this structure for in-kernel processing of hardware
  * timestamping configuration, over the inextensible struct hwtstamp_config
@@ -33,7 +28,7 @@ struct kernel_hwtstamp_config {
 	int rx_filter;
 	struct ifreq *ifr;
 	bool copied_to_user;
-	enum hwtstamp_source source;
+	enum timestamping_layer source;
 };
 
 static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg,
-- 
cgit v1.2.3


From 0f7f463d4821a4f52fa5c0a961389e651d50c384 Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Tue, 14 Nov 2023 12:28:41 +0100
Subject: net: Change the API of PHY default timestamp to MAC

Change the API to select MAC default time stamping instead of the PHY.
Indeed the PHY is closer to the wire therefore theoretically it has less
delay than the MAC timestamping but the reality is different. Due to lower
time stamping clock frequency, latency in the MDIO bus and no PHC hardware
synchronization between different PHY, the PHY PTP is often less precise
than the MAC. The exception is for PHY designed specially for PTP case but
these devices are not very widespread. For not breaking the compatibility I
introduce a default_timestamp flag in phy_device that is set by the phy
driver to know we are using the old API behavior.

The phy_set_timestamp function is called at each call of phy_attach_direct.
In case of MAC driver using phylink this function is called when the
interface is turned up. Then if the interface goes down and up again the
last choice of timestamp will be overwritten by the default choice.
A solution could be to cache the timestamp status but it can bring other
issues. In case of SFP, if we change the module, it doesn't make sense to
blindly re-set the timestamp back to PHY, if the new module has a PHY with
mediocre timestamping capabilities.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 5 +++++
 include/linux/phy.h       | 4 ++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2d840d7056f2..f020d2790c12 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -47,6 +47,7 @@
 #include <uapi/linux/if_bonding.h>
 #include <uapi/linux/pkt_cls.h>
 #include <uapi/linux/netdev.h>
+#include <uapi/linux/net_tstamp.h>
 #include <linux/hashtable.h>
 #include <linux/rbtree.h>
 #include <net/net_trackers.h>
@@ -2074,6 +2075,8 @@ enum netdev_ml_priv_type {
  *
  *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
  *		   where the clock is recovered.
+ *	@ts_layer:	Tracks which network device
+ *			performs packet	time stamping.
  *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
@@ -2435,6 +2438,8 @@ struct net_device {
 #if IS_ENABLED(CONFIG_DPLL)
 	struct dpll_pin		*dpll_pin;
 #endif
+
+	enum timestamping_layer	ts_layer;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e5f1f41e399c..317def2a7843 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -604,6 +604,8 @@ struct macsec_ops;
  *                 handling shall be postponed until PHY has resumed
  * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended,
  *             requiring a rerun of the interrupt handler after resume
+ * @default_timestamp: Flag indicating whether we are using the phy
+ *		       timestamp as the default one
  * @interface: enum phy_interface_t value
  * @skb: Netlink message for cable diagnostics
  * @nest: Netlink nest used for cable diagnostics
@@ -667,6 +669,8 @@ struct phy_device {
 	unsigned irq_suspended:1;
 	unsigned irq_rerun:1;
 
+	unsigned default_timestamp:1;
+
 	int rate_matching;
 
 	enum phy_state state;
-- 
cgit v1.2.3


From db840d389bad60ce6f3aadc1079da13e7e993a16 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 19:46:16 -0800
Subject: bpf: move verbose_linfo() into kernel/bpf/log.c

verifier.c is huge. Let's try to move out parts that are logging-related
into log.c, as we previously did with bpf_log() and other related stuff.
This patch moves line info verbose output routines: it's pretty
self-contained and isolated code, so there is no problem with this.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231118034623.3320920-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 52a4012b8255..d896f3db6a22 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -680,6 +680,10 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
 void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos);
 int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual);
 
+__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
+				  u32 insn_off,
+				  const char *prefix_fmt, ...);
+
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state *cur = env->cur_state;
-- 
cgit v1.2.3


From 42feb6620accded89cad5f455665e21281813d79 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 19:46:17 -0800
Subject: bpf: move verifier state printing code to kernel/bpf/log.c

Move a good chunk of code from verifier.c to log.c: verifier state
verbose printing logic. This is an important and very much
logging/debugging oriented code. It fits the overlall log.c's focus on
verifier logging, and moving it allows to keep growing it without
unnecessarily adding to verifier.c code that otherwise contains a core
verification logic.

There are not many shared dependencies between this code and the rest of
verifier.c code, except a few single-line helpers for various register
type checks and a bit of state "scratching" helpers. We move all such
trivial helpers into include/bpf/bpf_verifier.h as static inlines.

No functional changes in this patch.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231118034623.3320920-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 72 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d896f3db6a22..39edc76f436e 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -783,4 +783,76 @@ static inline bool bpf_type_has_unsafe_modifiers(u32 type)
 	return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS;
 }
 
+static inline bool type_is_ptr_alloc_obj(u32 type)
+{
+	return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
+}
+
+static inline bool type_is_non_owning_ref(u32 type)
+{
+	return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
+}
+
+static inline bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+	type = base_type(type);
+	return type == PTR_TO_PACKET ||
+	       type == PTR_TO_PACKET_META;
+}
+
+static inline bool type_is_sk_pointer(enum bpf_reg_type type)
+{
+	return type == PTR_TO_SOCKET ||
+		type == PTR_TO_SOCK_COMMON ||
+		type == PTR_TO_TCP_SOCK ||
+		type == PTR_TO_XDP_SOCK;
+}
+
+static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
+{
+	env->scratched_regs |= 1U << regno;
+}
+
+static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
+{
+	env->scratched_stack_slots |= 1ULL << spi;
+}
+
+static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
+{
+	return (env->scratched_regs >> regno) & 1;
+}
+
+static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
+{
+	return (env->scratched_stack_slots >> regno) & 1;
+}
+
+static inline bool verifier_state_scratched(const struct bpf_verifier_env *env)
+{
+	return env->scratched_regs || env->scratched_stack_slots;
+}
+
+static inline void mark_verifier_state_clean(struct bpf_verifier_env *env)
+{
+	env->scratched_regs = 0U;
+	env->scratched_stack_slots = 0ULL;
+}
+
+/* Used for printing the entire verifier state. */
+static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env)
+{
+	env->scratched_regs = ~0U;
+	env->scratched_stack_slots = ~0ULL;
+}
+
+const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type);
+const char *dynptr_type_str(enum bpf_dynptr_type type);
+const char *iter_type_str(const struct btf *btf, u32 btf_id);
+const char *iter_state_str(enum bpf_iter_state state);
+
+void print_verifier_state(struct bpf_verifier_env *env,
+			  const struct bpf_func_state *state, bool print_all);
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state);
+
 #endif /* _LINUX_BPF_VERIFIER_H */
-- 
cgit v1.2.3


From acfde6e8abee6b23e53b08606f861d9124288030 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 4 Nov 2023 00:10:18 -0400
Subject: struct dentry: get rid of randomize_layout idiocy

This is beyond ridiculous.  There is a reason why that thing is
cacheline-aligned...

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 3da2f0545d5d..1d9f7f132055 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -111,7 +111,7 @@ struct dentry {
 		struct hlist_bl_node d_in_lookup_hash;	/* only for in-lookup ones */
 	 	struct rcu_head d_rcu;
 	} d_u;
-} __randomize_layout;
+};
 
 /*
  * dentry->d_lock spinlock nesting subclasses:
-- 
cgit v1.2.3


From 641c3ef5cb68a1426d42e6d3aba16db9bdfbe94f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 12:46:30 -0500
Subject: DCACHE_... ->d_flags bits: switch to BIT()

For bits 20..22 (inode type cached in ->d_flags) turn the definitions into
expressions like (5 << 20); everything else turns into straight use of
BIT()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 76 +++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 1d9f7f132055..d9c314cc93b8 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -151,13 +151,13 @@ struct dentry_operations {
  */
 
 /* d_flags entries */
-#define DCACHE_OP_HASH			0x00000001
-#define DCACHE_OP_COMPARE		0x00000002
-#define DCACHE_OP_REVALIDATE		0x00000004
-#define DCACHE_OP_DELETE		0x00000008
-#define DCACHE_OP_PRUNE			0x00000010
+#define DCACHE_OP_HASH			BIT(0)
+#define DCACHE_OP_COMPARE		BIT(1)
+#define DCACHE_OP_REVALIDATE		BIT(2)
+#define DCACHE_OP_DELETE		BIT(3)
+#define DCACHE_OP_PRUNE			BIT(4)
 
-#define	DCACHE_DISCONNECTED		0x00000020
+#define	DCACHE_DISCONNECTED		BIT(5)
      /* This dentry is possibly not currently connected to the dcache tree, in
       * which case its parent will either be itself, or will have this flag as
       * well.  nfsd will not use a dentry with this bit set, but will first
@@ -168,50 +168,50 @@ struct dentry_operations {
       * dentry into place and return that dentry rather than the passed one,
       * typically using d_splice_alias. */
 
-#define DCACHE_REFERENCED		0x00000040 /* Recently used, don't discard. */
+#define DCACHE_REFERENCED		BIT(6) /* Recently used, don't discard. */
 
-#define DCACHE_DONTCACHE		0x00000080 /* Purge from memory on final dput() */
+#define DCACHE_DONTCACHE		BIT(7) /* Purge from memory on final dput() */
 
-#define DCACHE_CANT_MOUNT		0x00000100
-#define DCACHE_GENOCIDE			0x00000200
-#define DCACHE_SHRINK_LIST		0x00000400
+#define DCACHE_CANT_MOUNT		BIT(8)
+#define DCACHE_GENOCIDE			BIT(9)
+#define DCACHE_SHRINK_LIST		BIT(10)
 
-#define DCACHE_OP_WEAK_REVALIDATE	0x00000800
+#define DCACHE_OP_WEAK_REVALIDATE	BIT(11)
 
-#define DCACHE_NFSFS_RENAMED		0x00001000
+#define DCACHE_NFSFS_RENAMED		BIT(12)
      /* this dentry has been "silly renamed" and has to be deleted on the last
       * dput() */
-#define DCACHE_COOKIE			0x00002000 /* For use by dcookie subsystem */
-#define DCACHE_FSNOTIFY_PARENT_WATCHED	0x00004000
+#define DCACHE_COOKIE			BIT(13) /* For use by dcookie subsystem */
+#define DCACHE_FSNOTIFY_PARENT_WATCHED	BIT(14)
      /* Parent inode is watched by some fsnotify listener */
 
-#define DCACHE_DENTRY_KILLED		0x00008000
+#define DCACHE_DENTRY_KILLED		BIT(15)
 
-#define DCACHE_MOUNTED			0x00010000 /* is a mountpoint */
-#define DCACHE_NEED_AUTOMOUNT		0x00020000 /* handle automount on this dir */
-#define DCACHE_MANAGE_TRANSIT		0x00040000 /* manage transit from this dirent */
+#define DCACHE_MOUNTED			BIT(16) /* is a mountpoint */
+#define DCACHE_NEED_AUTOMOUNT		BIT(17) /* handle automount on this dir */
+#define DCACHE_MANAGE_TRANSIT		BIT(18) /* manage transit from this dirent */
 #define DCACHE_MANAGED_DENTRY \
 	(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)
 
-#define DCACHE_LRU_LIST			0x00080000
-
-#define DCACHE_ENTRY_TYPE		0x00700000
-#define DCACHE_MISS_TYPE		0x00000000 /* Negative dentry (maybe fallthru to nowhere) */
-#define DCACHE_WHITEOUT_TYPE		0x00100000 /* Whiteout dentry (stop pathwalk) */
-#define DCACHE_DIRECTORY_TYPE		0x00200000 /* Normal directory */
-#define DCACHE_AUTODIR_TYPE		0x00300000 /* Lookupless directory (presumed automount) */
-#define DCACHE_REGULAR_TYPE		0x00400000 /* Regular file type (or fallthru to such) */
-#define DCACHE_SPECIAL_TYPE		0x00500000 /* Other file type (or fallthru to such) */
-#define DCACHE_SYMLINK_TYPE		0x00600000 /* Symlink (or fallthru to such) */
-
-#define DCACHE_MAY_FREE			0x00800000
-#define DCACHE_FALLTHRU			0x01000000 /* Fall through to lower layer */
-#define DCACHE_NOKEY_NAME		0x02000000 /* Encrypted name encoded without key */
-#define DCACHE_OP_REAL			0x04000000
-
-#define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */
-#define DCACHE_DENTRY_CURSOR		0x20000000
-#define DCACHE_NORCU			0x40000000 /* No RCU delay for freeing */
+#define DCACHE_LRU_LIST			BIT(19)
+
+#define DCACHE_ENTRY_TYPE		(7 << 20) /* bits 20..22 are for storing type: */
+#define DCACHE_MISS_TYPE		(0 << 20) /* Negative dentry (maybe fallthru to nowhere) */
+#define DCACHE_WHITEOUT_TYPE		(1 << 20) /* Whiteout dentry (stop pathwalk) */
+#define DCACHE_DIRECTORY_TYPE		(2 << 20) /* Normal directory */
+#define DCACHE_AUTODIR_TYPE		(3 << 20) /* Lookupless directory (presumed automount) */
+#define DCACHE_REGULAR_TYPE		(4 << 20) /* Regular file type (or fallthru to such) */
+#define DCACHE_SPECIAL_TYPE		(5 << 20) /* Other file type (or fallthru to such) */
+#define DCACHE_SYMLINK_TYPE		(6 << 20) /* Symlink (or fallthru to such) */
+
+#define DCACHE_MAY_FREE			BIT(23)
+#define DCACHE_FALLTHRU			BIT(24) /* Fall through to lower layer */
+#define DCACHE_NOKEY_NAME		BIT(25) /* Encrypted name encoded without key */
+#define DCACHE_OP_REAL			BIT(26)
+
+#define DCACHE_PAR_LOOKUP		BIT(28) /* being looked up (with parent locked shared) */
+#define DCACHE_DENTRY_CURSOR		BIT(29)
+#define DCACHE_NORCU			BIT(30) /* No RCU delay for freeing */
 
 extern seqlock_t rename_lock;
 
-- 
cgit v1.2.3


From 0bec65a80f1b1ebcda05286e539a204713b70353 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 12:50:29 -0500
Subject: DCACHE_COOKIE: RIP

the last user gone in 2021...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index d9c314cc93b8..92c0b2a1ae2e 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -181,7 +181,6 @@ struct dentry_operations {
 #define DCACHE_NFSFS_RENAMED		BIT(12)
      /* this dentry has been "silly renamed" and has to be deleted on the last
       * dput() */
-#define DCACHE_COOKIE			BIT(13) /* For use by dcookie subsystem */
 #define DCACHE_FSNOTIFY_PARENT_WATCHED	BIT(14)
      /* Parent inode is watched by some fsnotify listener */
 
-- 
cgit v1.2.3


From 8219cb58feddcf28909072015f4e17e29f68c41a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 14:32:05 -0500
Subject: kill d_{is,set}_fallthru()

Introduced in 2015 and never had any in-tree users...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 92c0b2a1ae2e..8cd937bb2292 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -195,16 +195,15 @@ struct dentry_operations {
 #define DCACHE_LRU_LIST			BIT(19)
 
 #define DCACHE_ENTRY_TYPE		(7 << 20) /* bits 20..22 are for storing type: */
-#define DCACHE_MISS_TYPE		(0 << 20) /* Negative dentry (maybe fallthru to nowhere) */
+#define DCACHE_MISS_TYPE		(0 << 20) /* Negative dentry */
 #define DCACHE_WHITEOUT_TYPE		(1 << 20) /* Whiteout dentry (stop pathwalk) */
 #define DCACHE_DIRECTORY_TYPE		(2 << 20) /* Normal directory */
 #define DCACHE_AUTODIR_TYPE		(3 << 20) /* Lookupless directory (presumed automount) */
-#define DCACHE_REGULAR_TYPE		(4 << 20) /* Regular file type (or fallthru to such) */
-#define DCACHE_SPECIAL_TYPE		(5 << 20) /* Other file type (or fallthru to such) */
-#define DCACHE_SYMLINK_TYPE		(6 << 20) /* Symlink (or fallthru to such) */
+#define DCACHE_REGULAR_TYPE		(4 << 20) /* Regular file type */
+#define DCACHE_SPECIAL_TYPE		(5 << 20) /* Other file type */
+#define DCACHE_SYMLINK_TYPE		(6 << 20) /* Symlink */
 
 #define DCACHE_MAY_FREE			BIT(23)
-#define DCACHE_FALLTHRU			BIT(24) /* Fall through to lower layer */
 #define DCACHE_NOKEY_NAME		BIT(25) /* Encrypted name encoded without key */
 #define DCACHE_OP_REAL			BIT(26)
 
@@ -489,14 +488,6 @@ static inline int simple_positive(const struct dentry *dentry)
 	return d_really_is_positive(dentry) && !d_unhashed(dentry);
 }
 
-extern void d_set_fallthru(struct dentry *dentry);
-
-static inline bool d_is_fallthru(const struct dentry *dentry)
-{
-	return dentry->d_flags & DCACHE_FALLTHRU;
-}
-
-
 extern int sysctl_vfs_cache_pressure;
 
 static inline unsigned long vfs_pressure_ratio(unsigned long val)
-- 
cgit v1.2.3


From 0d486510f86eb8162022ed61e6dc424a10909a10 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 15:22:40 -0500
Subject: dentry.h: trim externs

d_instantiate_unique() had been gone for 7 years; __d_lookup...()
and shrink_dcache_for_umount() are fs/internal.h fodder.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 8cd937bb2292..9706bf1dc5de 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -218,7 +218,6 @@ extern seqlock_t rename_lock;
  */
 extern void d_instantiate(struct dentry *, struct inode *);
 extern void d_instantiate_new(struct dentry *, struct inode *);
-extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
 extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *);
 extern void __d_drop(struct dentry *dentry);
 extern void d_drop(struct dentry *dentry);
@@ -240,7 +239,6 @@ extern struct dentry * d_obtain_alias(struct inode *);
 extern struct dentry * d_obtain_root(struct inode *);
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
-extern void shrink_dcache_for_umount(struct super_block *);
 extern void d_invalidate(struct dentry *);
 
 /* only used at mount-time */
@@ -275,9 +273,6 @@ extern struct dentry *d_ancestor(struct dentry *, struct dentry *);
 /* appendix may either be NULL or be used for transname suffixes */
 extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
 extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
-extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
-extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
-				const struct qstr *name, unsigned *seq);
 
 static inline unsigned d_count(const struct dentry *dentry)
 {
-- 
cgit v1.2.3


From 2fcd38f4de7256e2b5cb23ad22a6e3ebfea7dd18 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 15:24:45 -0500
Subject: [software coproarchaeology] dentry.h: kill a mysterious comment

there's a strange comment in front of d_lookup() declaration:

/* appendix may either be NULL or be used for transname suffixes */

Looks like nobody had been curious enough to track its history;
it predates git, it predates bitkeeper and if you look through
the pre-BK trees, you finally arrive at this in 2.1.44-for-davem:
  /* appendix may either be NULL or be used for transname suffixes */
 -extern struct dentry * d_lookup(struct inode * dir, struct qstr * name,
 -                               struct qstr * appendix);
 +extern struct dentry * d_lookup(struct dentry * dir, struct qstr * name);
In other words, it refers to the third argument d_lookup() used to have
back then.  It had been introduced in 2.1.43-pre, on June 12 1997,
along with d_lookup(), only to be removed by July 4 1997, presumably
when the Cthulhu-awful thing it used to be used for (look for
CONFIG_TRANS_NAMES in 2.1.43-pre, and keep a heavy-duty barfbag
ready) had been, er, noticed and recognized for what it had been.

Despite the appendectomy, the comment remained.  Some things really
need to be put out of their misery...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 9706bf1dc5de..a5e5e274eee0 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -270,7 +270,6 @@ extern void d_move(struct dentry *, struct dentry *);
 extern void d_exchange(struct dentry *, struct dentry *);
 extern struct dentry *d_ancestor(struct dentry *, struct dentry *);
 
-/* appendix may either be NULL or be used for transname suffixes */
 extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
 extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
 
-- 
cgit v1.2.3


From 698f1e2b71736977b04f951e2e2ef1c9a80696ff Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 16:19:59 -0500
Subject: kill d_backing_dentry()

no users left

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index a5e5e274eee0..fa0414cff85c 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -530,21 +530,6 @@ static inline struct inode *d_backing_inode(const struct dentry *upper)
 	return inode;
 }
 
-/**
- * d_backing_dentry - Get upper or lower dentry we should be using
- * @upper: The upper layer
- *
- * This is the helper that should be used to get the dentry of the inode that
- * will be used if this dentry were opened as a file.  It may be the upper
- * dentry or it may be a lower dentry pinned by the upper.
- *
- * Normal filesystems should not use this to access their own dentries.
- */
-static inline struct dentry *d_backing_dentry(struct dentry *upper)
-{
-	return upper;
-}
-
 /**
  * d_real - Return the real dentry
  * @dentry: the dentry to query
-- 
cgit v1.2.3


From 289354f21b2c3fac93e956efd45f256a88a4d997 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 18 Nov 2023 18:38:05 -0800
Subject: net: partial revert of the "Make timestamping selectable: series

Revert following commits:

commit acec05fb78ab ("net_tstamp: Add TIMESTAMPING SOFTWARE and HARDWARE mask")
commit 11d55be06df0 ("net: ethtool: Add a command to expose current time stamping layer")
commit bb8645b00ced ("netlink: specs: Introduce new netlink command to get current timestamp")
commit d905f9c75329 ("net: ethtool: Add a command to list available time stamping layers")
commit aed5004ee7a0 ("netlink: specs: Introduce new netlink command to list available time stamping layers")
commit 51bdf3165f01 ("net: Replace hwtstamp_source by timestamping layer")
commit 0f7f463d4821 ("net: Change the API of PHY default timestamp to MAC")
commit 091fab122869 ("net: ethtool: ts: Update GET_TS to reply the current selected timestamp")
commit 152c75e1d002 ("net: ethtool: ts: Let the active time stamping layer be selectable")
commit ee60ea6be0d3 ("netlink: specs: Introduce time stamping set command")

They need more time for reviews.

Link: https://lore.kernel.org/all/20231118183529.6e67100c@kernel.org/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/net_tstamp.h | 11 ++++++++---
 include/linux/netdevice.h  |  5 -----
 include/linux/phy.h        |  4 ----
 3 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h
index bb289c2ad376..eb01c37e71e0 100644
--- a/include/linux/net_tstamp.h
+++ b/include/linux/net_tstamp.h
@@ -5,6 +5,11 @@
 
 #include <uapi/linux/net_tstamp.h>
 
+enum hwtstamp_source {
+	HWTSTAMP_SOURCE_NETDEV,
+	HWTSTAMP_SOURCE_PHYLIB,
+};
+
 /**
  * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config
  *
@@ -15,8 +20,8 @@
  *	a legacy implementation of a lower driver
  * @copied_to_user: request was passed to a legacy implementation which already
  *	copied the ioctl request back to user space
- * @source: indication whether timestamps should come from software, the netdev
- *	or from an attached phylib PHY
+ * @source: indication whether timestamps should come from the netdev or from
+ *	an attached phylib PHY
  *
  * Prefer using this structure for in-kernel processing of hardware
  * timestamping configuration, over the inextensible struct hwtstamp_config
@@ -28,7 +33,7 @@ struct kernel_hwtstamp_config {
 	int rx_filter;
 	struct ifreq *ifr;
 	bool copied_to_user;
-	enum timestamping_layer source;
+	enum hwtstamp_source source;
 };
 
 static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f020d2790c12..2d840d7056f2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -47,7 +47,6 @@
 #include <uapi/linux/if_bonding.h>
 #include <uapi/linux/pkt_cls.h>
 #include <uapi/linux/netdev.h>
-#include <uapi/linux/net_tstamp.h>
 #include <linux/hashtable.h>
 #include <linux/rbtree.h>
 #include <net/net_trackers.h>
@@ -2075,8 +2074,6 @@ enum netdev_ml_priv_type {
  *
  *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
  *		   where the clock is recovered.
- *	@ts_layer:	Tracks which network device
- *			performs packet	time stamping.
  *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
@@ -2438,8 +2435,6 @@ struct net_device {
 #if IS_ENABLED(CONFIG_DPLL)
 	struct dpll_pin		*dpll_pin;
 #endif
-
-	enum timestamping_layer	ts_layer;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 317def2a7843..e5f1f41e399c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -604,8 +604,6 @@ struct macsec_ops;
  *                 handling shall be postponed until PHY has resumed
  * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended,
  *             requiring a rerun of the interrupt handler after resume
- * @default_timestamp: Flag indicating whether we are using the phy
- *		       timestamp as the default one
  * @interface: enum phy_interface_t value
  * @skb: Netlink message for cable diagnostics
  * @nest: Netlink nest used for cable diagnostics
@@ -669,8 +667,6 @@ struct phy_device {
 	unsigned irq_suspended:1;
 	unsigned irq_rerun:1;
 
-	unsigned default_timestamp:1;
-
 	int rate_matching;
 
 	enum phy_state state;
-- 
cgit v1.2.3


From ac40916a3f7243efbe6e129ebf495b5c33a3adfe Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Wed, 15 Nov 2023 20:01:08 +0800
Subject: rtnetlink: introduce nlmsg_new_large and use it in rtnl_getlink

if a PF has 256 or more VFs, ip link command will allocate an order 3
memory or more, and maybe trigger OOM due to memory fragment,
the VFs needed memory size is computed in rtnl_vfinfo_size.

so introduce nlmsg_new_large which calls netlink_alloc_large_skb in
which vmalloc is used for large memory, to avoid the failure of
allocating memory

    ip invoked oom-killer: gfp_mask=0xc2cc0(GFP_KERNEL|__GFP_NOWARN|\
	__GFP_COMP|__GFP_NOMEMALLOC), order=3, oom_score_adj=0
    CPU: 74 PID: 204414 Comm: ip Kdump: loaded Tainted: P           OE
    Call Trace:
    dump_stack+0x57/0x6a
    dump_header+0x4a/0x210
    oom_kill_process+0xe4/0x140
    out_of_memory+0x3e8/0x790
    __alloc_pages_slowpath.constprop.116+0x953/0xc50
    __alloc_pages_nodemask+0x2af/0x310
    kmalloc_large_node+0x38/0xf0
    __kmalloc_node_track_caller+0x417/0x4d0
    __kmalloc_reserve.isra.61+0x2e/0x80
    __alloc_skb+0x82/0x1c0
    rtnl_getlink+0x24f/0x370
    rtnetlink_rcv_msg+0x12c/0x350
    netlink_rcv_skb+0x50/0x100
    netlink_unicast+0x1b2/0x280
    netlink_sendmsg+0x355/0x4a0
    sock_sendmsg+0x5b/0x60
    ____sys_sendmsg+0x1ea/0x250
    ___sys_sendmsg+0x88/0xd0
    __sys_sendmsg+0x5e/0xa0
    do_syscall_64+0x33/0x40
    entry_SYSCALL_64_after_hwframe+0x44/0xa9
    RIP: 0033:0x7f95a65a5b70

Cc: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Link: https://lore.kernel.org/r/20231115120108.3711-1-lirongqing@baidu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netlink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 75d7de34c908..abe91ed6b9aa 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -351,5 +351,6 @@ bool netlink_ns_capable(const struct sk_buff *skb,
 			struct user_namespace *ns, int cap);
 bool netlink_capable(const struct sk_buff *skb, int cap);
 bool netlink_net_capable(const struct sk_buff *skb, int cap);
+struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast);
 
 #endif	/* __LINUX_NETLINK_H */
-- 
cgit v1.2.3


From f25d34646bd01505a0989ca67bc9a37390cae755 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Fri, 3 Nov 2023 19:25:23 +0100
Subject: platform/x86: wmi: Add wmidev_block_set()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, WMI drivers have to use the deprecated GUID-based
interface when setting data blocks. This prevents those
drivers from fully moving away from this interface.

Provide wmidev_block_set() so drivers using wmi_set_block() can
fully migrate to the modern bus-based interface.

Tested with a custom SSDT from the Intel Slim Bootloader project.

Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20231103182526.3524-1-W_Armin@gmx.de
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/wmi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 763bd382cf2d..207544968268 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -35,6 +35,8 @@ extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev,
 extern union acpi_object *wmidev_block_query(struct wmi_device *wdev,
 					     u8 instance);
 
+acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct acpi_buffer *in);
+
 u8 wmidev_instance_count(struct wmi_device *wdev);
 
 extern int set_required_buffer_size(struct wmi_device *wdev, u64 length);
-- 
cgit v1.2.3


From 7275bf3e09578e1761157e7683f2e898c5c235a6 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Fri, 3 Nov 2023 19:25:24 +0100
Subject: platform/x86: wmi: Add to_wmi_device() helper macro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a helper macro for WMI drivers to cast a device to
the corresponding WMI device. This should replace some
boilerplate code.

Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20231103182526.3524-2-W_Armin@gmx.de
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/wmi.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 207544968268..8a643c39fcce 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -27,6 +27,14 @@ struct wmi_device {
 	bool setable;
 };
 
+/**
+ * to_wmi_device() - Helper macro to cast a device to a wmi_device
+ * @device: device struct
+ *
+ * Cast a struct device to a struct wmi_device.
+ */
+#define to_wmi_device(device)	container_of(device, struct wmi_device, dev)
+
 extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev,
 					  u8 instance, u32 method_id,
 					  const struct acpi_buffer *in,
-- 
cgit v1.2.3


From 762321dab9a72760bf9aec48362f932717c9424d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 25 Oct 2023 16:10:17 +0200
Subject: filemap: add a per-mapping stable writes flag

folio_wait_stable waits for writeback to finish before modifying the
contents of a folio again, e.g. to support check summing of the data
in the block integrity code.

Currently this behavior is controlled by the SB_I_STABLE_WRITES flag
on the super_block, which means it is uniform for the entire file system.
This is wrong for the block device pseudofs which is shared by all
block devices, or file systems that can use multiple devices like XFS
witht the RT subvolume or btrfs (although btrfs currently reimplements
folio_wait_stable anyway).

Add a per-address_space AS_STABLE_WRITES flag to control the behavior
in a more fine grained way.  The existing SB_I_STABLE_WRITES is kept
to initialize AS_STABLE_WRITES to the existing default which covers
most cases.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231025141020.192413-2-hch@lst.de
Tested-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pagemap.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bcc1ea44b4e8..06142ff7f9ce 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -204,6 +204,8 @@ enum mapping_flags {
 	AS_NO_WRITEBACK_TAGS = 5,
 	AS_LARGE_FOLIO_SUPPORT = 6,
 	AS_RELEASE_ALWAYS,	/* Call ->release_folio(), even if no private data */
+	AS_STABLE_WRITES,	/* must wait for writeback before modifying
+				   folio contents */
 };
 
 /**
@@ -289,6 +291,21 @@ static inline void mapping_clear_release_always(struct address_space *mapping)
 	clear_bit(AS_RELEASE_ALWAYS, &mapping->flags);
 }
 
+static inline bool mapping_stable_writes(const struct address_space *mapping)
+{
+	return test_bit(AS_STABLE_WRITES, &mapping->flags);
+}
+
+static inline void mapping_set_stable_writes(struct address_space *mapping)
+{
+	set_bit(AS_STABLE_WRITES, &mapping->flags);
+}
+
+static inline void mapping_clear_stable_writes(struct address_space *mapping)
+{
+	clear_bit(AS_STABLE_WRITES, &mapping->flags);
+}
+
 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 {
 	return mapping->gfp_mask;
-- 
cgit v1.2.3


From 48c9996f1dfe92bd7318472651c9ad538d6d53b5 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 6 Nov 2023 17:16:37 +0100
Subject: device property: Add SOFTWARE_NODE() macro for defining software
 nodes

Add SOFTWARE_NODE() macro in order to make defining software nodes look
nicer. This is analogous to different PROPERTY_ENTRY_*() macros for
defining properties.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Tested-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/property.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 9f2585d705a8..97f901c0914e 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -489,6 +489,13 @@ struct software_node {
 	const struct property_entry *properties;
 };
 
+#define SOFTWARE_NODE(_name_, _properties_, _parent_)	\
+	(struct software_node) {			\
+		.name = _name_,				\
+		.properties = _properties_,		\
+		.parent = _parent_,			\
+	}
+
 bool is_software_node(const struct fwnode_handle *fwnode);
 const struct software_node *
 to_software_node(const struct fwnode_handle *fwnode);
-- 
cgit v1.2.3


From 44844db91397d3d94589f3c0c855be02daeebdb3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 9 Nov 2023 16:01:48 +0100
Subject: thermal: core: Add trip thresholds for trip crossing detection

The trip crossing detection in handle_thermal_trip() does not work
correctly in the cases when a trip point is crossed on the way up and
then the zone temperature stays above its low temperature (that is, its
temperature decreased by its hysteresis).  The trip temperature may
be passed by the zone temperature subsequently in that case, even
multiple times, but that does not count as the trip crossing as long as
the zone temperature does not fall below the trip's low temperature or,
in other words, until the trip is crossed on the way down.

|-----------low--------high------------|
             |<--------->|
             |    hyst   |
             |           |
             |          -|--> crossed on the way up
             |
         <---|-- crossed on the way down

However, handle_thermal_trip() will invoke thermal_notify_tz_trip_up()
every time the trip temperature is passed by the zone temperature on
the way up regardless of whether or not the trip has been crossed on
the way down yet.  Moreover, it will not call thermal_notify_tz_trip_down()
if the last zone temperature was between the trip's temperature and its
low temperature, so some "trip crossed on the way down" events may not
be reported.

To address this issue, introduce trip thresholds equal to either the
temperature of the given trip, or its low temperature, such that if
the trip's threshold is passed by the zone temperature on the way up,
its value will be set to the trip's low temperature and
thermal_notify_tz_trip_up() will be called, and if the trip's threshold
is passed by the zone temperature on the way down, its value will be set
to the trip's temperature (high) and thermal_notify_tz_trip_down() will
be called.  Accordingly, if the threshold is passed on the way up, it
cannot be passed on the way up again until its passed on the way down
and if it is passed on the way down, it cannot be passed on the way down
again until it is passed on the way up which guarantees correct
triggering of trip crossing notifications.

If the last temperature of the zone is invalid, the trip's threshold
will be set depending of the zone's current temperature: If that
temperature is above the trip's temperature, its threshold will be
set to its low temperature or otherwise its threshold will be set to
its (high) temperature.  Because the zone temperature is initially
set to invalid and tz->last_temperature is only updated by
update_temperature(), this is sufficient to set the correct initial
threshold values for all trips.

Link: https://lore.kernel.org/all/20220718145038.1114379-4-daniel.lezcano@linaro.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/thermal.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index cee814d5d1ac..1f9ee869f9f9 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -57,12 +57,14 @@ enum thermal_notify_event {
  * struct thermal_trip - representation of a point in temperature domain
  * @temperature: temperature value in miliCelsius
  * @hysteresis: relative hysteresis in miliCelsius
+ * @threshold: trip crossing notification threshold miliCelsius
  * @type: trip point type
  * @priv: pointer to driver data associated with this trip
  */
 struct thermal_trip {
 	int temperature;
 	int hysteresis;
+	int threshold;
 	enum thermal_trip_type type;
 	void *priv;
 };
-- 
cgit v1.2.3


From c96b8175522a2c52e297ee0a49827a668f95e1e8 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 20 Nov 2023 16:06:11 +0900
Subject: block: Remove blk_set_runtime_active()

The function blk_set_runtime_active() is called only from
blk_post_runtime_resume(), so there is no need for that function to be
exported. Open-code this function directly in blk_post_runtime_resume()
and remove it.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20231120070611.33951-1-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-pm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-pm.h b/include/linux/blk-pm.h
index 2580e05a8ab6..004b38a538ff 100644
--- a/include/linux/blk-pm.h
+++ b/include/linux/blk-pm.h
@@ -15,7 +15,6 @@ extern int blk_pre_runtime_suspend(struct request_queue *q);
 extern void blk_post_runtime_suspend(struct request_queue *q, int err);
 extern void blk_pre_runtime_resume(struct request_queue *q);
 extern void blk_post_runtime_resume(struct request_queue *q);
-extern void blk_set_runtime_active(struct request_queue *q);
 #else
 static inline void blk_pm_runtime_init(struct request_queue *q,
 				       struct device *dev) {}
-- 
cgit v1.2.3


From 79e0c5be8c73a674c92bd4ba77b75f4f8c91d32e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 14 Nov 2023 01:42:13 +0100
Subject: net, vrf: Move dstats structure to core

Just move struct pcpu_dstats out of the vrf into the core, and streamline
the field names slightly, so they better align with the {t,l}stats ones.

No functional change otherwise. A conversion of the u64s to u64_stats_t
could be done at a separate point in future. This move is needed as we are
moving the {t,l,d}stats allocation/freeing to the core.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20231114004220.6495-2-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/netdevice.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a16c9cc063fe..98082113156e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2755,6 +2755,16 @@ struct pcpu_sw_netstats {
 	struct u64_stats_sync   syncp;
 } __aligned(4 * sizeof(u64));
 
+struct pcpu_dstats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			rx_drops;
+	u64			tx_packets;
+	u64			tx_bytes;
+	u64			tx_drops;
+	struct u64_stats_sync	syncp;
+} __aligned(8 * sizeof(u64));
+
 struct pcpu_lstats {
 	u64_stats_t packets;
 	u64_stats_t bytes;
-- 
cgit v1.2.3


From 34d21de99cea9cb17967874313e5b0262527833c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 14 Nov 2023 01:42:14 +0100
Subject: net: Move {l,t,d}stats allocation to core and convert veth & vrf

Move {l,t,d}stats allocation to the core and let netdevs pick the stats
type they need. That way the driver doesn't have to bother with error
handling (allocation failure checking, making sure free happens in the
right spot, etc) - all happening in the core.

Co-developed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Cc: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20231114004220.6495-3-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/netdevice.h | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 98082113156e..2564e209465e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1797,6 +1797,13 @@ enum netdev_ml_priv_type {
 	ML_PRIV_CAN,
 };
 
+enum netdev_stat_type {
+	NETDEV_PCPU_STAT_NONE,
+	NETDEV_PCPU_STAT_LSTATS, /* struct pcpu_lstats */
+	NETDEV_PCPU_STAT_TSTATS, /* struct pcpu_sw_netstats */
+	NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */
+};
+
 /**
  *	struct net_device - The DEVICE structure.
  *
@@ -1991,10 +1998,14 @@ enum netdev_ml_priv_type {
  *
  * 	@ml_priv:	Mid-layer private
  *	@ml_priv_type:  Mid-layer private type
- * 	@lstats:	Loopback statistics
- * 	@tstats:	Tunnel statistics
- * 	@dstats:	Dummy statistics
- * 	@vstats:	Virtual ethernet statistics
+ *
+ *	@pcpu_stat_type:	Type of device statistics which the core should
+ *				allocate/free: none, lstats, tstats, dstats. none
+ *				means the driver is handling statistics allocation/
+ *				freeing internally.
+ *	@lstats:		Loopback statistics: packets, bytes
+ *	@tstats:		Tunnel statistics: RX/TX packets, RX/TX bytes
+ *	@dstats:		Dummy statistics: RX/TX/drop packets, RX/TX bytes
  *
  *	@garp_port:	GARP
  *	@mrp_port:	MRP
@@ -2354,6 +2365,7 @@ struct net_device {
 	void				*ml_priv;
 	enum netdev_ml_priv_type	ml_priv_type;
 
+	enum netdev_stat_type		pcpu_stat_type:8;
 	union {
 		struct pcpu_lstats __percpu		*lstats;
 		struct pcpu_sw_netstats __percpu	*tstats;
-- 
cgit v1.2.3


From d9dcdb4531fe39ce48919ef8c2c9369ee49f3ad2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Fri, 20 Oct 2023 11:21:07 +0200
Subject: PCI: host-generic: Convert to platform remove callback returning void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The .remove() callback for a platform driver returns an int which makes
many driver authors wrongly assume it's possible to do error handling by
returning an error code.  However the value returned is (mostly) ignored
and this typically results in resource leaks. To improve here there is a
quest to make the remove callback return void. In the first step of this
quest all drivers are converted to .remove_new() which already returns
void.

pci_host_common_remove() returned zero unconditionally. With that converted
to return void instead, the generic pci host driver can be switched to
.remove_new() trivially.

Link: https://lore.kernel.org/r/20231020092107.2148311-1-u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Will Deacon <will@kernel.org>
---
 include/linux/pci-ecam.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h
index 6b1301e2498e..3a4860bd2758 100644
--- a/include/linux/pci-ecam.h
+++ b/include/linux/pci-ecam.h
@@ -93,6 +93,6 @@ extern const struct pci_ecam_ops loongson_pci_ecam_ops; /* Loongson PCIe */
 #if IS_ENABLED(CONFIG_PCI_HOST_COMMON)
 /* for DT-based PCI controllers that support ECAM */
 int pci_host_common_probe(struct platform_device *pdev);
-int pci_host_common_remove(struct platform_device *pdev);
+void pci_host_common_remove(struct platform_device *pdev);
 #endif
 #endif
-- 
cgit v1.2.3


From 3171e46d677a668eed3086da78671f1e4f5b8405 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 30 Oct 2023 13:42:18 +0200
Subject: PCI: Avoid potential out-of-bounds read in
 pci_dev_for_each_resource()

Coverity complains that pointer in the pci_dev_for_each_resource() may be
wrong, i.e., might be used for the out-of-bounds read.

There is no actual issue right now because we have another check afterwards
and the out-of-bounds read is not being performed. In any case it's better
code with this fixed, hence the proposed change.

As Jonas pointed out "It probably makes the code slightly less performant
as res will now be checked for being not NULL (which will always be true),
but I doubt it will be significant (or in any hot paths)."

Fixes: 09cc90063240 ("PCI: Introduce pci_dev_for_each_resource()")
Reported-by: Bjorn Helgaas <bhelgaas@google.com>
Closes: https://lore.kernel.org/r/20230509182122.GA1259567@bhelgaas
Suggested-by: Jonas Gorski <jonas.gorski@gmail.com>
Link: https://lore.kernel.org/r/20231030114218.2752236-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/pci.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60ca768bc867..4ebecc7896ef 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2127,14 +2127,14 @@ int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma);
 	(pci_resource_end((dev), (bar)) ? 				\
 	 resource_size(pci_resource_n((dev), (bar))) : 0)
 
-#define __pci_dev_for_each_res0(dev, res, ...)				\
-	for (unsigned int __b = 0;					\
-	     res = pci_resource_n(dev, __b), __b < PCI_NUM_RESOURCES;	\
+#define __pci_dev_for_each_res0(dev, res, ...)				  \
+	for (unsigned int __b = 0;					  \
+	     __b < PCI_NUM_RESOURCES && (res = pci_resource_n(dev, __b)); \
 	     __b++)
 
-#define __pci_dev_for_each_res1(dev, res, __b)				\
-	for (__b = 0;							\
-	     res = pci_resource_n(dev, __b), __b < PCI_NUM_RESOURCES;	\
+#define __pci_dev_for_each_res1(dev, res, __b)				  \
+	for (__b = 0;							  \
+	     __b < PCI_NUM_RESOURCES && (res = pci_resource_n(dev, __b)); \
 	     __b++)
 
 #define pci_dev_for_each_resource(dev, res, ...)			\
-- 
cgit v1.2.3


From ab5cfac139ab8576fb54630d4cca23c3e690ee90 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 21 Nov 2023 04:06:56 +0200
Subject: bpf: verify callbacks as if they are called unknown number of times

Prior to this patch callbacks were handled as regular function calls,
execution of callback body was modeled exactly once.
This patch updates callbacks handling logic as follows:
- introduces a function push_callback_call() that schedules callback
  body verification in env->head stack;
- updates prepare_func_exit() to reschedule callback body verification
  upon BPF_EXIT;
- as calls to bpf_*_iter_next(), calls to callback invoking functions
  are marked as checkpoints;
- is_state_visited() is updated to stop callback based iteration when
  some identical parent state is found.

Paths with callback function invoked zero times are now verified first,
which leads to necessity to modify some selftests:
- the following negative tests required adding release/unlock/drop
  calls to avoid previously masked unrelated error reports:
  - cb_refs.c:underflow_prog
  - exceptions_fail.c:reject_rbtree_add_throw
  - exceptions_fail.c:reject_with_cp_reference
- the following precision tracking selftests needed change in expected
  log trace:
  - verifier_subprog_precision.c:callback_result_precise
    (note: r0 precision is no longer propagated inside callback and
           I think this is a correct behavior)
  - verifier_subprog_precision.c:parent_callee_saved_reg_precise_with_callback
  - verifier_subprog_precision.c:parent_stack_slot_precise_with_callback

Reported-by: Andrew Werner <awerner32@gmail.com>
Closes: https://lore.kernel.org/bpf/CA+vRuzPChFNXmouzGG+wsy=6eMcfr1mFG0F3g7rbg-sedGKW3w@mail.gmail.com/
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231121020701.26440-7-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 24213a99cc79..dd326936dd6f 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -400,6 +400,7 @@ struct bpf_verifier_state {
 	struct bpf_idx_pair *jmp_history;
 	u32 jmp_history_cnt;
 	u32 dfs_depth;
+	u32 callback_unroll_depth;
 };
 
 #define bpf_get_spilled_reg(slot, frame, mask)				\
@@ -511,6 +512,10 @@ struct bpf_insn_aux_data {
 	 * this instruction, regardless of any heuristics
 	 */
 	bool force_checkpoint;
+	/* true if instruction is a call to a helper function that
+	 * accepts callback function as a parameter.
+	 */
+	bool calls_callback;
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
-- 
cgit v1.2.3


From bb124da69c47dd98d69361ec13244ece50bec63e Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 21 Nov 2023 04:07:00 +0200
Subject: bpf: keep track of max number of bpf_loop callback iterations

In some cases verifier can't infer convergence of the bpf_loop()
iteration. E.g. for the following program:

    static int cb(__u32 idx, struct num_context* ctx)
    {
        ctx->i++;
        return 0;
    }

    SEC("?raw_tp")
    int prog(void *_)
    {
        struct num_context ctx = { .i = 0 };
        __u8 choice_arr[2] = { 0, 1 };

        bpf_loop(2, cb, &ctx, 0);
        return choice_arr[ctx.i];
    }

Each 'cb' simulation would eventually return to 'prog' and reach
'return choice_arr[ctx.i]' statement. At which point ctx.i would be
marked precise, thus forcing verifier to track multitude of separate
states with {.i=0}, {.i=1}, ... at bpf_loop() callback entry.

This commit allows "brute force" handling for such cases by limiting
number of callback body simulations using 'umax' value of the first
bpf_loop() parameter.

For this, extend bpf_func_state with 'callback_depth' field.
Increment this field when callback visiting state is pushed to states
traversal stack. For frame #N it's 'callback_depth' field counts how
many times callback with frame depth N+1 had been executed.
Use bpf_func_state specifically to allow independent tracking of
callback depths when multiple nested bpf_loop() calls are present.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231121020701.26440-11-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index dd326936dd6f..aa4d19d0bc94 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -301,6 +301,17 @@ struct bpf_func_state {
 	struct tnum callback_ret_range;
 	bool in_async_callback_fn;
 	bool in_exception_callback_fn;
+	/* For callback calling functions that limit number of possible
+	 * callback executions (e.g. bpf_loop) keeps track of current
+	 * simulated iteration number.
+	 * Value in frame N refers to number of times callback with frame
+	 * N+1 was simulated, e.g. for the following call:
+	 *
+	 *   bpf_loop(..., fn, ...); | suppose current frame is N
+	 *                           | fn would be simulated in frame N+1
+	 *                           | number of simulations is tracked in frame N
+	 */
+	u32 callback_depth;
 
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
-- 
cgit v1.2.3


From fc43e9c857b7aa55efba9398419b14d9e35dcc7d Mon Sep 17 00:00:00 2001
From: Charles Yi <be286@163.com>
Date: Tue, 31 Oct 2023 12:32:39 +0800
Subject: HID: fix HID device resource race between HID core and debugging
 support

hid_debug_events_release releases resources bound to the HID device instance.
hid_device_release releases the underlying HID device instance potentially
before hid_debug_events_release has completed releasing debug resources bound
to the same HID device instance.

Reference count to prevent the HID device instance from being torn down
preemptively when HID debugging support is used. When count reaches zero,
release core resources of HID device instance using hiddev_free.

The crash:

[  120.728477][ T4396] kernel BUG at lib/list_debug.c:53!
[  120.728505][ T4396] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
[  120.739806][ T4396] Modules linked in: bcmdhd dhd_static_buf 8822cu pcie_mhi r8168
[  120.747386][ T4396] CPU: 1 PID: 4396 Comm: hidt_bridge Not tainted 5.10.110 #257
[  120.754771][ T4396] Hardware name: Rockchip RK3588 EVB4 LP4 V10 Board (DT)
[  120.761643][ T4396] pstate: 60400089 (nZCv daIf +PAN -UAO -TCO BTYPE=--)
[  120.768338][ T4396] pc : __list_del_entry_valid+0x98/0xac
[  120.773730][ T4396] lr : __list_del_entry_valid+0x98/0xac
[  120.779120][ T4396] sp : ffffffc01e62bb60
[  120.783126][ T4396] x29: ffffffc01e62bb60 x28: ffffff818ce3a200
[  120.789126][ T4396] x27: 0000000000000009 x26: 0000000000980000
[  120.795126][ T4396] x25: ffffffc012431000 x24: ffffff802c6d4e00
[  120.801125][ T4396] x23: ffffff8005c66f00 x22: ffffffc01183b5b8
[  120.807125][ T4396] x21: ffffff819df2f100 x20: 0000000000000000
[  120.813124][ T4396] x19: ffffff802c3f0700 x18: ffffffc01d2cd058
[  120.819124][ T4396] x17: 0000000000000000 x16: 0000000000000000
[  120.825124][ T4396] x15: 0000000000000004 x14: 0000000000003fff
[  120.831123][ T4396] x13: ffffffc012085588 x12: 0000000000000003
[  120.837123][ T4396] x11: 00000000ffffbfff x10: 0000000000000003
[  120.843123][ T4396] x9 : 455103d46b329300 x8 : 455103d46b329300
[  120.849124][ T4396] x7 : 74707572726f6320 x6 : ffffffc0124b8cb5
[  120.855124][ T4396] x5 : ffffffffffffffff x4 : 0000000000000000
[  120.861123][ T4396] x3 : ffffffc011cf4f90 x2 : ffffff81fee7b948
[  120.867122][ T4396] x1 : ffffffc011cf4f90 x0 : 0000000000000054
[  120.873122][ T4396] Call trace:
[  120.876259][ T4396]  __list_del_entry_valid+0x98/0xac
[  120.881304][ T4396]  hid_debug_events_release+0x48/0x12c
[  120.886617][ T4396]  full_proxy_release+0x50/0xbc
[  120.891323][ T4396]  __fput+0xdc/0x238
[  120.895075][ T4396]  ____fput+0x14/0x24
[  120.898911][ T4396]  task_work_run+0x90/0x148
[  120.903268][ T4396]  do_exit+0x1bc/0x8a4
[  120.907193][ T4396]  do_group_exit+0x8c/0xa4
[  120.911458][ T4396]  get_signal+0x468/0x744
[  120.915643][ T4396]  do_signal+0x84/0x280
[  120.919650][ T4396]  do_notify_resume+0xd0/0x218
[  120.924262][ T4396]  work_pending+0xc/0x3f0

[ Rahul Rameshbabu <sergeantsagara@protonmail.com>: rework changelog ]
Fixes: cd667ce24796 ("HID: use debugfs for events/reports dumping")
Signed-off-by: Charles Yi <be286@163.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/hid.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 5a8387a4a712..bf43f3ff6664 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -679,6 +679,7 @@ struct hid_device {							/* device report descriptor */
 	struct list_head debug_list;
 	spinlock_t  debug_list_lock;
 	wait_queue_head_t debug_wait;
+	struct kref			ref;
 
 	unsigned int id;						/* system unique id */
 
@@ -687,6 +688,8 @@ struct hid_device {							/* device report descriptor */
 #endif /* CONFIG_BPF */
 };
 
+void hiddev_free(struct kref *ref);
+
 #define to_hid_device(pdev) \
 	container_of(pdev, struct hid_device, dev)
 
-- 
cgit v1.2.3


From 600f111ef51dc2cbdb330b09d09f1856efa64912 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Nov 2023 21:58:23 +0000
Subject: fs: Rename mapping private members

It is hard to find where mapping->private_lock, mapping->private_list and
mapping->private_data are used, due to private_XXX being a relatively
common name for variables and structure members in the kernel.  To fit
with other members of struct address_space, rename them all to have an
i_ prefix.  Tested with an allmodconfig build.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://lore.kernel.org/r/20231117215823.2821906-1-willy@infradead.org
Acked-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..f171505940ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -463,9 +463,9 @@ extern const struct address_space_operations empty_aops;
  * @a_ops: Methods.
  * @flags: Error bits and flags (AS_*).
  * @wb_err: The most recent error which has occurred.
- * @private_lock: For use by the owner of the address_space.
- * @private_list: For use by the owner of the address_space.
- * @private_data: For use by the owner of the address_space.
+ * @i_private_lock: For use by the owner of the address_space.
+ * @i_private_list: For use by the owner of the address_space.
+ * @i_private_data: For use by the owner of the address_space.
  */
 struct address_space {
 	struct inode		*host;
@@ -484,9 +484,9 @@ struct address_space {
 	unsigned long		flags;
 	struct rw_semaphore	i_mmap_rwsem;
 	errseq_t		wb_err;
-	spinlock_t		private_lock;
-	struct list_head	private_list;
-	void			*private_data;
+	spinlock_t		i_private_lock;
+	struct list_head	i_private_list;
+	void *			i_private_data;
 } __attribute__((aligned(sizeof(long)))) __randomize_layout;
 	/*
 	 * On most architectures that alignment is already the case; but
-- 
cgit v1.2.3


From f47507988145185aef5d0e7a0e28dbf6e7776f29 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 17 Oct 2023 22:05:23 +0200
Subject: thermal: ACPI: Move the ACPI thermal library to drivers/acpi/

The ACPI thermal library contains functions that can be used to
retrieve trip point temperature values through the platform firmware
for various types of trip points.  Each of these functions basically
evaluates a specific ACPI object, checks if the value produced by it
is reasonable and returns it (or THERMAL_TEMP_INVALID if anything
fails).

It made sense to hold it in drivers/thermal/ so long as it was only used
by the code in that directory, but since it is also going to be used by
the ACPI thermal driver located in drivers/acpi/, move it to the latter
in order to keep the code related to evaluating ACPI objects defined in
the specification proper together.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h    | 7 +++++++
 include/linux/thermal.h | 7 -------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 54189e0e5f41..b63d7811c728 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -424,6 +424,13 @@ extern int acpi_blacklisted(void);
 extern void acpi_osi_setup(char *str);
 extern bool acpi_osi_is_win8(void);
 
+#ifdef CONFIG_ACPI_THERMAL_LIB
+int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp);
+int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp);
+int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp);
+int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp);
+#endif
+
 #ifdef CONFIG_ACPI_NUMA
 int acpi_map_pxm_to_node(int pxm);
 int acpi_get_node(acpi_handle handle);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index cee814d5d1ac..35f620059456 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -294,13 +294,6 @@ int thermal_zone_get_num_trips(struct thermal_zone_device *tz);
 
 int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp);
 
-#ifdef CONFIG_THERMAL_ACPI
-int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp);
-int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp);
-int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp);
-int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp);
-#endif
-
 #ifdef CONFIG_THERMAL
 struct thermal_zone_device *thermal_zone_device_register_with_trips(
 					const char *type,
-- 
cgit v1.2.3


From 35732699f5d2922ff674e711e566cf44a4bd86d2 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Wed, 22 Nov 2023 08:33:53 -0700
Subject: ACPI: Fix ARM32 platforms compile issue introduced by fw_table
 changes

Linus reported that:
After commit a103f46633fd the kernel stopped compiling for
several ARM32 platforms that I am building with a bare metal
compiler. Bare metal compilers (arm-none-eabi-) don't
define __linux__.

This is because the header <acpi/platform/acenv.h> is now
in the include path for <linux/irq.h>:

  CC      arch/arm/kernel/irq.o
  CC      kernel/sysctl.o
  CC      crypto/api.o
In file included from ../include/acpi/acpi.h:22,
                 from ../include/linux/fw_table.h:29,
                 from ../include/linux/acpi.h:18,
                 from ../include/linux/irqchip.h:14,
                 from ../arch/arm/kernel/irq.c:25:
../include/acpi/platform/acenv.h:218:2: error: #error Unknown target environment
  218 | #error Unknown target environment
      |  ^~~~~

The issue is caused by the introducing of splitting out the ACPI code to
support the new generic fw_table code.

Rafael suggested [1] moving the fw_table.h include in linux/acpi.h to below
the linux/mutex.h. Remove the two includes in fw_table.h. Replace
linux/fw_table.h include in fw_table.c with linux/acpi.h.

Link: https://lore.kernel.org/linux-acpi/CAJZ5v0idWdJq3JSqQWLG5q+b+b=zkEdWR55rGYEoxh7R6N8kFQ@mail.gmail.com/
Fixes: a103f46633fd ("acpi: Move common tables helper functions to common lib")
Closes: https://lore.kernel.org/linux-acpi/20231114-arm-build-bug-v1-1-458745fe32a4@linaro.org/
Reported-by: Linus Walleij <linus.walleij@linaro.org>
Suggested-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h     | 22 +++++++++++-----------
 include/linux/fw_table.h |  3 ---
 2 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 54189e0e5f41..4db54e928b36 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -15,7 +15,6 @@
 #include <linux/mod_devicetable.h>
 #include <linux/property.h>
 #include <linux/uuid.h>
-#include <linux/fw_table.h>
 
 struct irq_domain;
 struct irq_domain_ops;
@@ -25,22 +24,13 @@ struct irq_domain_ops;
 #endif
 #include <acpi/acpi.h>
 
-#ifdef CONFIG_ACPI_TABLE_LIB
-#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI)
-#define __init_or_acpilib
-#define __initdata_or_acpilib
-#else
-#define EXPORT_SYMBOL_ACPI_LIB(x)
-#define __init_or_acpilib __init
-#define __initdata_or_acpilib __initdata
-#endif
-
 #ifdef	CONFIG_ACPI
 
 #include <linux/list.h>
 #include <linux/dynamic_debug.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/fw_table.h>
 
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
@@ -48,6 +38,16 @@ struct irq_domain_ops;
 #include <acpi/acpi_io.h>
 #include <asm/acpi.h>
 
+#ifdef CONFIG_ACPI_TABLE_LIB
+#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI)
+#define __init_or_acpilib
+#define __initdata_or_acpilib
+#else
+#define EXPORT_SYMBOL_ACPI_LIB(x)
+#define __init_or_acpilib __init
+#define __initdata_or_acpilib __initdata
+#endif
+
 static inline acpi_handle acpi_device_handle(struct acpi_device *adev)
 {
 	return adev ? adev->handle : NULL;
diff --git a/include/linux/fw_table.h b/include/linux/fw_table.h
index ff8fa58d5818..ca49947f0a77 100644
--- a/include/linux/fw_table.h
+++ b/include/linux/fw_table.h
@@ -25,9 +25,6 @@ struct acpi_subtable_proc {
 	int count;
 };
 
-#include <linux/acpi.h>
-#include <acpi/acpi.h>
-
 union acpi_subtable_headers {
 	struct acpi_subtable_header common;
 	struct acpi_hmat_structure hmat;
-- 
cgit v1.2.3


From 21f4c443731fdb064c0dd31a743aafd0b075156c Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Mon, 20 Nov 2023 18:47:20 +0100
Subject: soundwire: stream: constify sdw_port_config when adding devices

sdw_stream_add_master() and sdw_stream_add_slave() do not modify
contents of passed sdw_port_config, so it can be made const for code
safety and as documentation of expected usage.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/r/20231120174720.239610-1-krzysztof.kozlowski@linaro.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 4f3d14bb1538..904004d8b562 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -1040,7 +1040,7 @@ int sdw_compute_params(struct sdw_bus *bus);
 
 int sdw_stream_add_master(struct sdw_bus *bus,
 		struct sdw_stream_config *stream_config,
-		struct sdw_port_config *port_config,
+		const struct sdw_port_config *port_config,
 		unsigned int num_ports,
 		struct sdw_stream_runtime *stream);
 int sdw_stream_remove_master(struct sdw_bus *bus,
@@ -1062,7 +1062,7 @@ void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id
 
 int sdw_stream_add_slave(struct sdw_slave *slave,
 			 struct sdw_stream_config *stream_config,
-			 struct sdw_port_config *port_config,
+			 const struct sdw_port_config *port_config,
 			 unsigned int num_ports,
 			 struct sdw_stream_runtime *stream);
 int sdw_stream_remove_slave(struct sdw_slave *slave,
@@ -1084,7 +1084,7 @@ int sdw_update_no_pm(struct sdw_slave *slave, u32 addr, u8 mask, u8 val);
 
 static inline int sdw_stream_add_slave(struct sdw_slave *slave,
 				       struct sdw_stream_config *stream_config,
-				       struct sdw_port_config *port_config,
+				       const struct sdw_port_config *port_config,
 				       unsigned int num_ports,
 				       struct sdw_stream_runtime *stream)
 {
-- 
cgit v1.2.3


From 66b73e9a402d822723b3af5263eb12d735628eac Mon Sep 17 00:00:00 2001
From: Matt Coster <matt.coster@imgtec.com>
Date: Wed, 22 Nov 2023 16:34:22 +0000
Subject: sizes.h: Add entries between SZ_32G and SZ_64T

sizes.h has a gap in defines between SZ_32G and SZ_64T. Add the missing
defines so they can be used in drivers.

Signed-off-by: Matt Coster <matt.coster@imgtec.com>
Signed-off-by: Sarah Walker <sarah.walker@imgtec.com>
Signed-off-by: Donald Robson <donald.robson@imgtec.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/58b227d96f27859b453caf0ceaaac81a6616304b.1700668843.git.donald.robson@imgtec.com
Signed-off-by: Maxime Ripard <mripard@kernel.org>
---
 include/linux/sizes.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sizes.h b/include/linux/sizes.h
index 84aa448d8bb3..c3a00b967d18 100644
--- a/include/linux/sizes.h
+++ b/include/linux/sizes.h
@@ -47,8 +47,17 @@
 #define SZ_8G				_AC(0x200000000, ULL)
 #define SZ_16G				_AC(0x400000000, ULL)
 #define SZ_32G				_AC(0x800000000, ULL)
+#define SZ_64G				_AC(0x1000000000, ULL)
+#define SZ_128G				_AC(0x2000000000, ULL)
+#define SZ_256G				_AC(0x4000000000, ULL)
+#define SZ_512G				_AC(0x8000000000, ULL)
 
 #define SZ_1T				_AC(0x10000000000, ULL)
+#define SZ_2T				_AC(0x20000000000, ULL)
+#define SZ_4T				_AC(0x40000000000, ULL)
+#define SZ_8T				_AC(0x80000000000, ULL)
+#define SZ_16T				_AC(0x100000000000, ULL)
+#define SZ_32T				_AC(0x200000000000, ULL)
 #define SZ_64T				_AC(0x400000000000, ULL)
 
 #endif /* __LINUX_SIZES_H__ */
-- 
cgit v1.2.3


From 57b79ac9f43dc71fc8b55af51d1c9f469cb7a0de Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan@marcan.st>
Date: Thu, 5 May 2022 01:25:16 +0900
Subject: soc: apple: rtkit: Get rid of apple_rtkit_send_message_wait

It is fundamentally broken and has no users. Just remove it.

Acked-by: Eric Curtin <ecurtin@redhat.com>
Acked-by: Neal Gompa <neal@gompa.dev>
Acked-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Signed-off-by: Hector Martin <marcan@marcan.st>
---
 include/linux/soc/apple/rtkit.h | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/apple/rtkit.h b/include/linux/soc/apple/rtkit.h
index fc456f75c131..8c9ca857ccf6 100644
--- a/include/linux/soc/apple/rtkit.h
+++ b/include/linux/soc/apple/rtkit.h
@@ -160,24 +160,6 @@ int apple_rtkit_start_ep(struct apple_rtkit *rtk, u8 endpoint);
 int apple_rtkit_send_message(struct apple_rtkit *rtk, u8 ep, u64 message,
 			     struct completion *completion, bool atomic);
 
-/*
- * Send a message to the given endpoint and wait until it has been submitted
- * to the hardware FIFO.
- * Will return zero on success and a negative error code on failure
- * (e.g. -ETIME when the message couldn't be written within the given
- * timeout)
- *
- * @rtk:            RTKit reference
- * @ep:             target endpoint
- * @message:        message to be sent
- * @timeout:        timeout in milliseconds to allow the message transmission
- *                  to be completed
- * @atomic:         if set to true this function can be called from atomic
- *                  context.
- */
-int apple_rtkit_send_message_wait(struct apple_rtkit *rtk, u8 ep, u64 message,
-				  unsigned long timeout, bool atomic);
-
 /*
  * Process incoming messages in atomic context.
  * This only guarantees that messages arrive as far as the recv_message_early
-- 
cgit v1.2.3


From 143897c4fa976d02bfafe5ae32b9ffc60dc6145a Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan@marcan.st>
Date: Tue, 14 Mar 2023 19:47:32 +0900
Subject: mailbox: apple: Delete driver

This driver is now orphaned and superseded by
drivers/soc/apple/mailbox.c.

Acked-by: Eric Curtin <ecurtin@redhat.com>
Acked-by: Neal Gompa <neal@gompa.dev>
Acked-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Signed-off-by: Hector Martin <marcan@marcan.st>
---
 include/linux/apple-mailbox.h | 19 -------------------
 1 file changed, 19 deletions(-)
 delete mode 100644 include/linux/apple-mailbox.h

(limited to 'include/linux')

diff --git a/include/linux/apple-mailbox.h b/include/linux/apple-mailbox.h
deleted file mode 100644
index 720fbb70294a..000000000000
--- a/include/linux/apple-mailbox.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only OR MIT */
-/*
- * Apple mailbox message format
- *
- * Copyright (C) 2021 The Asahi Linux Contributors
- */
-
-#ifndef _LINUX_APPLE_MAILBOX_H_
-#define _LINUX_APPLE_MAILBOX_H_
-
-#include <linux/types.h>
-
-/* encodes a single 96bit message sent over the single channel */
-struct apple_mbox_msg {
-	u64 msg0;
-	u32 msg1;
-};
-
-#endif
-- 
cgit v1.2.3


From 9c0b4bb7f6303c9c4e2e34984c46f5a86478f84d Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 22 Nov 2023 14:39:03 +0100
Subject: sched/cpufreq: Rework schedutil governor performance estimation

The current method to take into account uclamp hints when estimating the
target frequency can end in a situation where the selected target
frequency is finally higher than uclamp hints, whereas there are no real
needs. Such cases mainly happen because we are currently mixing the
traditional scheduler utilization signal with the uclamp performance
hints. By adding these 2 metrics, we loose an important information when
it comes to select the target frequency, and we have to make some
assumptions which can't fit all cases.

Rework the interface between the scheduler and schedutil governor in order
to propagate all information down to the cpufreq governor.

effective_cpu_util() interface changes and now returns the actual
utilization of the CPU with 2 optional inputs:

- The minimum performance for this CPU; typically the capacity to handle
  the deadline task and the interrupt pressure. But also uclamp_min
  request when available.

- The maximum targeting performance for this CPU which reflects the
  maximum level that we would like to not exceed. By default it will be
  the CPU capacity but can be reduced because of some performance hints
  set with uclamp. The value can be lower than actual utilization and/or
  min performance level.

A new sugov_effective_cpu_perf() interface is also available to compute
the final performance level that is targeted for the CPU, after applying
some cpufreq headroom and taking into account all inputs.

With these 2 functions, schedutil is now able to decide when it must go
above uclamp hints. It now also has a generic way to get the min
performance level.

The dependency between energy model and cpufreq governor and its headroom
policy doesn't exist anymore.

eenv_pd_max_util() asks schedutil for the targeted performance after
applying the impact of the waking task.

[ mingo: Refined the changelog & C comments. ]

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231122133904.446032-2-vincent.guittot@linaro.org
---
 include/linux/energy_model.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index b9caa01dfac4..adec808b371a 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -243,7 +243,6 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	scale_cpu = arch_scale_cpu_capacity(cpu);
 	ps = &pd->table[pd->nr_perf_states - 1];
 
-	max_util = map_util_perf(max_util);
 	max_util = min(max_util, allowed_cpu_cap);
 	freq = map_util_freq(max_util, ps->frequency, scale_cpu);
 
-- 
cgit v1.2.3


From 64bac5ea17d527872121adddfee869c7a0618f8f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 8 Nov 2023 13:58:29 +0100
Subject: arch: consolidate arch_irq_work_raise prototypes

The prototype was hidden in an #ifdef on x86, which causes a warning:

kernel/irq_work.c:72:13: error: no previous prototype for 'arch_irq_work_raise' [-Werror=missing-prototypes]

Some architectures have a working prototype, while others don't.
Fix this by providing it in only one place that is always visible.

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Acked-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/irq_work.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 8cd11a223260..136f2980cba3 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -66,6 +66,9 @@ void irq_work_sync(struct irq_work *work);
 void irq_work_run(void);
 bool irq_work_needs_cpu(void);
 void irq_work_single(void *arg);
+
+void arch_irq_work_raise(void);
+
 #else
 static inline bool irq_work_needs_cpu(void) { return false; }
 static inline void irq_work_run(void) { }
-- 
cgit v1.2.3


From a769154c7cac037914ba375ae88aae55b2c853e0 Mon Sep 17 00:00:00 2001
From: Hardik Gajjar <hgajjar@de.adit-jv.com>
Date: Fri, 27 Oct 2023 17:20:28 +0200
Subject: usb: xhci: Add timeout argument in address_device USB HCD callback

- The HCD address_device callback now accepts a user-defined timeout value
  in milliseconds, providing better control over command execution times.
- The default timeout value for the address_device command has been set
  to 5000 ms, aligning with the USB 3.2 specification. However, this
  timeout can be adjusted as needed.
- The xhci_setup_device function has been updated to accept the timeout
  value, allowing it to specify the maximum wait time for the command
  operation to complete.
- The hub driver has also been updated to accommodate the newly added
  timeout parameter during the SET_ADDRESS request.

Signed-off-by: Hardik Gajjar <hgajjar@de.adit-jv.com>
Reviewed-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Link: https://lore.kernel.org/r/20231027152029.104363-1-hgajjar@de.adit-jv.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/hcd.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 00724b4f6e12..cd77fc6095a1 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -372,8 +372,9 @@ struct hc_driver {
 		 * or bandwidth constraints.
 		 */
 	void	(*reset_bandwidth)(struct usb_hcd *, struct usb_device *);
-		/* Returns the hardware-chosen device address */
-	int	(*address_device)(struct usb_hcd *, struct usb_device *udev);
+		/* Set the hardware-chosen device address */
+	int	(*address_device)(struct usb_hcd *, struct usb_device *udev,
+				  unsigned int timeout_ms);
 		/* prepares the hardware to send commands to the device */
 	int	(*enable_device)(struct usb_hcd *, struct usb_device *udev);
 		/* Notifies the HCD after a hub descriptor is fetched.
-- 
cgit v1.2.3


From 5a1ccf0c72cf917ff3ccc131d1bb8d19338ffe52 Mon Sep 17 00:00:00 2001
From: Hardik Gajjar <hgajjar@de.adit-jv.com>
Date: Fri, 27 Oct 2023 17:20:29 +0200
Subject: usb: new quirk to reduce the SET_ADDRESS request timeout

This patch introduces a new USB quirk,
USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT, which modifies the timeout value
for the SET_ADDRESS request. The standard timeout for USB request/command
is 5000 ms, as recommended in the USB 3.2 specification (section 9.2.6.1).

However, certain scenarios, such as connecting devices through an APTIV
hub, can lead to timeout errors when the device enumerates as full speed
initially and later switches to high speed during chirp negotiation.

In such cases, USB analyzer logs reveal that the bus suspends for
5 seconds due to incorrect chirp parsing and resumes only after two
consecutive timeout errors trigger a hub driver reset.

Packet(54) Dir(?) Full Speed J(997.100 us) Idle(  2.850 us)
_______| Time Stamp(28 . 105 910 682)
_______|_____________________________________________________________Ch0
Packet(55) Dir(?) Full Speed J(997.118 us) Idle(  2.850 us)
_______| Time Stamp(28 . 106 910 632)
_______|_____________________________________________________________Ch0
Packet(56) Dir(?) Full Speed J(399.650 us) Idle(222.582 us)
_______| Time Stamp(28 . 107 910 600)
_______|_____________________________________________________________Ch0
Packet(57) Dir Chirp J( 23.955 ms) Idle(115.169 ms)
_______| Time Stamp(28 . 108 532 832)
_______|_____________________________________________________________Ch0
Packet(58) Dir(?) Full Speed J (Suspend)( 5.347 sec) Idle(  5.366 us)
_______| Time Stamp(28 . 247 657 600)
_______|_____________________________________________________________Ch0

This 5-second delay in device enumeration is undesirable, particularly
in automotive applications where quick enumeration is crucial
(ideally within 3 seconds).

The newly introduced quirks provide the flexibility to align with a
3-second time limit, as required in specific contexts like automotive
applications.

By reducing the SET_ADDRESS request timeout to 500 ms, the
system can respond more swiftly to errors, initiate rapid recovery, and
ensure efficient device enumeration. This change is vital for scenarios
where rapid smartphone enumeration and screen projection are essential.

To use the quirk, please write "vendor_id:product_id:p" to
/sys/bus/usb/drivers/hub/module/parameter/quirks

For example,
echo "0x2c48:0x0132:p" > /sys/bus/usb/drivers/hub/module/parameters/quirks"

Signed-off-by: Hardik Gajjar <hgajjar@de.adit-jv.com>
Reviewed-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/20231027152029.104363-2-hgajjar@de.adit-jv.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/quirks.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h
index eeb7c2157c72..59409c1fc3de 100644
--- a/include/linux/usb/quirks.h
+++ b/include/linux/usb/quirks.h
@@ -72,4 +72,7 @@
 /* device has endpoints that should be ignored */
 #define USB_QUIRK_ENDPOINT_IGNORE		BIT(15)
 
+/* short SET_ADDRESS request timeout */
+#define USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT	BIT(16)
+
 #endif /* __LINUX_USB_QUIRKS_H */
-- 
cgit v1.2.3


From d4e3b928ab487a8aecd1f6a140b40ac365116cfb Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 17 Nov 2023 19:13:27 -0500
Subject: closures: CLOSURE_CALLBACK() to fix type punning

Control flow integrity is now checking that type signatures match on
indirect function calls. That breaks closures, which embed a work_struct
in a closure in such a way that a closure_fn may also be used as a
workqueue fn by the underlying closure code.

So we have to change closure fns to take a work_struct as their
argument - but that results in a loss of clarity, as closure fns have
different semantics from normal workqueue functions (they run owning a
ref on the closure, which must be released with continue_at() or
closure_return()).

Thus, this patc introduces CLOSURE_CALLBACK() and closure_type() macros
as suggested by Kees, to smooth things over a bit.

Suggested-by: Kees Cook <keescook@chromium.org>
Cc: Coly Li <colyli@suse.de>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/closure.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/closure.h b/include/linux/closure.h
index de7bb47d8a46..c554c6a08768 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -104,7 +104,7 @@
 
 struct closure;
 struct closure_syncer;
-typedef void (closure_fn) (struct closure *);
+typedef void (closure_fn) (struct work_struct *);
 extern struct dentry *bcache_debug;
 
 struct closure_waitlist {
@@ -254,7 +254,7 @@ static inline void closure_queue(struct closure *cl)
 		INIT_WORK(&cl->work, cl->work.func);
 		BUG_ON(!queue_work(wq, &cl->work));
 	} else
-		cl->fn(cl);
+		cl->fn(&cl->work);
 }
 
 /**
@@ -309,6 +309,11 @@ static inline void closure_wake_up(struct closure_waitlist *list)
 	__closure_wake_up(list);
 }
 
+#define CLOSURE_CALLBACK(name)	void name(struct work_struct *ws)
+#define closure_type(name, type, member)				\
+	struct closure *cl = container_of(ws, struct closure, work);	\
+	type *name = container_of(cl, type, member)
+
 /**
  * continue_at - jump to another function with barrier
  *
-- 
cgit v1.2.3


From 6543ac13c623f906200dfd3f1c407d8d333b6995 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 17 Oct 2023 11:09:32 -0500
Subject: soundwire: bus: introduce controller_id

The existing SoundWire support misses a clear Controller/Manager
hiearchical definition to deal with all variants across SOC vendors.

a) Intel platforms have one controller with 4 or more Managers.
b) AMD platforms have two controllers with one Manager each, but due
to BIOS issues use two different link_id values within the scope of a
single controller.
c) QCOM platforms have one or more controller with one Manager each.

This patch adds a 'controller_id' which can be set by higher
levels. If assigned to -1, the controller_id will be set to the
system-unique IDA-assigned bus->id.

The main change is that the bus->id is no longer used for any device
name, which makes the definition completely predictable and not
dependent on any enumeration order. The bus->id is only used to insert
the Managers in the stream rt context.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Tested-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/stable/20231017160933.12624-2-pierre-louis.bossart%40linux.intel.com
Tested-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231017160933.12624-2-pierre-louis.bossart@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 904004d8b562..66f814b63a43 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -886,7 +886,8 @@ struct sdw_master_ops {
  * struct sdw_bus - SoundWire bus
  * @dev: Shortcut to &bus->md->dev to avoid changing the entire code.
  * @md: Master device
- * @link_id: Link id number, can be 0 to N, unique for each Master
+ * @controller_id: system-unique controller ID. If set to -1, the bus @id will be used.
+ * @link_id: Link id number, can be 0 to N, unique for each Controller
  * @id: bus system-wide unique id
  * @slaves: list of Slaves on this bus
  * @assigned: Bitmap for Slave device numbers.
@@ -918,6 +919,7 @@ struct sdw_master_ops {
 struct sdw_bus {
 	struct device *dev;
 	struct sdw_master_device *md;
+	int controller_id;
 	unsigned int link_id;
 	int id;
 	struct list_head slaves;
-- 
cgit v1.2.3


From 8802e580ee643e3f63c6b39ff64e7c7baa4a55ba Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 22 Nov 2023 14:27:13 +0200
Subject: fs: create __sb_write_started() helper

Similar to sb_write_started() for use by other sb freeze levels.

Unlike the boolean sb_write_started(), this helper returns a tristate
to distiguish the cases of lockdep disabled or unknown lock state.

This is needed for fanotify "pre content" events.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231122122715.2561213-15-amir73il@gmail.com
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..ac8b5a9b467b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1645,9 +1645,23 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
 #define __sb_writers_release(sb, lev)	\
 	percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
 
+/**
+ * __sb_write_started - check if sb freeze level is held
+ * @sb: the super we write to
+ * @level: the freeze level
+ *
+ * > 0 sb freeze level is held
+ *   0 sb freeze level is not held
+ * < 0 !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
+ */
+static inline int __sb_write_started(const struct super_block *sb, int level)
+{
+	return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1);
+}
+
 static inline bool sb_write_started(const struct super_block *sb)
 {
-	return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1);
+	return __sb_write_started(sb, SB_FREEZE_WRITE);
 }
 
 /**
-- 
cgit v1.2.3


From 3d5cd4911e04683df8f4439fddd788e00a2510a8 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 22 Nov 2023 14:27:14 +0200
Subject: fs: create file_write_started() helper

Convenience wrapper for sb_write_started(file_inode(inode)->i_sb)), which
has a single occurrence in the code right now.

Document the false negatives of those helpers, which makes them unusable
to assert that sb_start_write() is not held.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231122122715.2561213-16-amir73il@gmail.com
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ac8b5a9b467b..75a10b632edd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1659,11 +1659,32 @@ static inline int __sb_write_started(const struct super_block *sb, int level)
 	return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1);
 }
 
+/**
+ * sb_write_started - check if SB_FREEZE_WRITE is held
+ * @sb: the super we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ */
 static inline bool sb_write_started(const struct super_block *sb)
 {
 	return __sb_write_started(sb, SB_FREEZE_WRITE);
 }
 
+/**
+ * file_write_started - check if SB_FREEZE_WRITE is held
+ * @file: the file we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ * May be false positive with !S_ISREG, because file_start_write() has
+ * no effect on !S_ISREG.
+ */
+static inline bool file_write_started(const struct file *file)
+{
+	if (!S_ISREG(file_inode(file)->i_mode))
+		return true;
+	return sb_write_started(file_inode(file)->i_sb);
+}
+
 /**
  * sb_end_write - drop write access to a superblock
  * @sb: the super we wrote to
-- 
cgit v1.2.3


From 21b32e6a0ab5b174fa1ca2fb4c212577cf405d83 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 22 Nov 2023 14:27:15 +0200
Subject: fs: create {sb,file}_write_not_started() helpers

Create new helpers {sb,file}_write_not_started() that can be used
to assert that sb_start_write() is not held.

This is needed for fanotify "pre content" events.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231122122715.2561213-17-amir73il@gmail.com
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75a10b632edd..ae0e2fb7bcea 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1670,6 +1670,17 @@ static inline bool sb_write_started(const struct super_block *sb)
 	return __sb_write_started(sb, SB_FREEZE_WRITE);
 }
 
+/**
+ * sb_write_not_started - check if SB_FREEZE_WRITE is not held
+ * @sb: the super we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ */
+static inline bool sb_write_not_started(const struct super_block *sb)
+{
+	return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0;
+}
+
 /**
  * file_write_started - check if SB_FREEZE_WRITE is held
  * @file: the file we write to
@@ -1685,6 +1696,21 @@ static inline bool file_write_started(const struct file *file)
 	return sb_write_started(file_inode(file)->i_sb);
 }
 
+/**
+ * file_write_not_started - check if SB_FREEZE_WRITE is not held
+ * @file: the file we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ * May be false positive with !S_ISREG, because file_start_write() has
+ * no effect on !S_ISREG.
+ */
+static inline bool file_write_not_started(const struct file *file)
+{
+	if (!S_ISREG(file_inode(file)->i_mode))
+		return true;
+	return sb_write_not_started(file_inode(file)->i_sb);
+}
+
 /**
  * sb_end_write - drop write access to a superblock
  * @sb: the super we wrote to
-- 
cgit v1.2.3


From 2afae08c9dcb8ac648414277cec70c2fe6a34d9e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 23 Nov 2023 19:59:36 -0800
Subject: bpf: Validate global subprogs lazily

Slightly change BPF verifier logic around eagerness and order of global
subprog validation. Instead of going over every global subprog eagerly
and validating it before main (entry) BPF program is verified, turn it
around. Validate main program first, mark subprogs that were called from
main program for later verification, but otherwise assume it is valid.
Afterwards, go over marked global subprogs and validate those,
potentially marking some more global functions as being called. Continue
this process until all (transitively) callable global subprogs are
validated. It's a BFS traversal at its heart and will always converge.

This is an important change because it allows to feature-gate some
subprograms that might not be verifiable on some older kernel, depending
on supported set of features.

E.g., at some point, global functions were allowed to accept a pointer
to memory, which size is identified by user-provided type.
Unfortunately, older kernels don't support this feature. With BPF CO-RE
approach, the natural way would be to still compile BPF object file once
and guard calls to this global subprog with some CO-RE check or using
.rodata variables. That's what people do to guard usage of new helpers
or kfuncs, and any other new BPF-side feature that might be missing on
old kernels.

That's currently impossible to do with global subprogs, unfortunately,
because they are eagerly and unconditionally validated. This patch set
aims to change this, so that in the future when global funcs gain new
features, those can be guarded using BPF CO-RE techniques in the same
fashion as any other new kernel feature.

Two selftests had to be adjusted in sync with these changes.

test_global_func12 relied on eager global subprog validation failing
before main program failure is detected (unknown return value). Fix by
making sure that main program is always valid.

verifier_subprog_precision's parent_stack_slot_precise subtest relied on
verifier checkpointing heuristic to do a checkpoint at instruction #5,
but that's no longer true because we don't have enough jumps validated
before reaching insn #5 due to global subprogs being validated later.

Other than that, no changes, as one would expect.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20231124035937.403208-3-andrii@kernel.org
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 258ba232e302..eb447b0a9423 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1347,6 +1347,8 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 struct bpf_func_info_aux {
 	u16 linkage;
 	bool unreliable;
+	bool called : 1;
+	bool verified : 1;
 };
 
 enum bpf_jit_poke_reason {
-- 
cgit v1.2.3


From ef5828805842204dd0259ecfc132b5916c8a77ae Mon Sep 17 00:00:00 2001
From: Michael-CY Lee <michael-cy.lee@mediatek.com>
Date: Wed, 22 Nov 2023 11:02:37 +0800
Subject: wifi: avoid offset calculation on NULL pointer

ieee80211_he_6ghz_oper() can be passed a NULL pointer
and checks for that, but already did the calculation
to inside of it before. Move it after the check.

Signed-off-by: Michael-CY Lee <michael-cy.lee@mediatek.com>
Link: https://lore.kernel.org/r/20231122030237.31276-1-michael-cy.lee@mediatek.com
[rewrite commit message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 958771bac9c0..c2ac9e9e7ee9 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2830,12 +2830,14 @@ ieee80211_he_oper_size(const u8 *he_oper_ie)
 static inline const struct ieee80211_he_6ghz_oper *
 ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper)
 {
-	const u8 *ret = (const void *)&he_oper->optional;
+	const u8 *ret;
 	u32 he_oper_params;
 
 	if (!he_oper)
 		return NULL;
 
+	ret = (const void *)&he_oper->optional;
+
 	he_oper_params = le32_to_cpu(he_oper->he_oper_params);
 
 	if (!(he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO))
-- 
cgit v1.2.3


From d3ca4ab4f16eb81dc3e7721251adcba49b229d54 Mon Sep 17 00:00:00 2001
From: Liam Kearney <liam.kearney@morsemicro.com>
Date: Wed, 25 Oct 2023 11:27:55 +1100
Subject: wifi: ieee80211: fix PV1 frame control field name

Update PV1 frame control field TODS to FROMDS to match 802.11 standard

Signed-off-by: Liam Kearney <liam.kearney@morsemicro.com>
Reviewed-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://lore.kernel.org/r/20231025002755.1752983-1-liam.kearney@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 958771bac9c0..5e5ea216f341 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -172,11 +172,11 @@
 #define IEEE80211_SN_MODULO		(IEEE80211_MAX_SN + 1)
 
 
-/* PV1 Layout 11ah 9.8.3.1 */
+/* PV1 Layout IEEE 802.11-2020 9.8.3.1 */
 #define IEEE80211_PV1_FCTL_VERS		0x0003
 #define IEEE80211_PV1_FCTL_FTYPE	0x001c
 #define IEEE80211_PV1_FCTL_STYPE	0x00e0
-#define IEEE80211_PV1_FCTL_TODS		0x0100
+#define IEEE80211_PV1_FCTL_FROMDS		0x0100
 #define IEEE80211_PV1_FCTL_MOREFRAGS	0x0200
 #define IEEE80211_PV1_FCTL_PM		0x0400
 #define IEEE80211_PV1_FCTL_MOREDATA	0x0800
-- 
cgit v1.2.3


From d1f7728259ef02ac20b7afb6e7eb5a9eb1696c25 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 15 Nov 2023 17:49:59 +0100
Subject: gpiolib: provide gpio_device_get_label()

Provide a getter for the GPIO device label string so that users don't
have to dereference struct gpio_chip directly.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 0aed62f0c633..100c329dc986 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -786,6 +786,7 @@ struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc);
 
 /* struct gpio_device getters */
 int gpio_device_get_base(struct gpio_device *gdev);
+const char *gpio_device_get_label(struct gpio_device *gdev);
 
 #else /* CONFIG_GPIOLIB */
 
-- 
cgit v1.2.3


From a066f906ba396ab00d4af19fc5fad42b2605582a Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Wed, 22 Nov 2023 14:52:43 +0100
Subject: firmware_loader: Expand Firmware upload error codes with firmware
 invalid error

No error code are available to signal an invalid firmware content.
Drivers that can check the firmware content validity can not return this
specific failure to the user-space

Expand the firmware error code with an additional code:
- "firmware invalid" code which can be used when the provided firmware
  is invalid

Sync lib/test_firmware.c file accordingly.

Acked-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20231122-feature_firmware_error_code-v3-1-04ec753afb71@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/firmware.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware.h b/include/linux/firmware.h
index de7fea3bca51..0311858b46ce 100644
--- a/include/linux/firmware.h
+++ b/include/linux/firmware.h
@@ -27,6 +27,7 @@ struct firmware {
  * @FW_UPLOAD_ERR_INVALID_SIZE: invalid firmware image size
  * @FW_UPLOAD_ERR_RW_ERROR: read or write to HW failed, see kernel log
  * @FW_UPLOAD_ERR_WEAROUT: FLASH device is approaching wear-out, wait & retry
+ * @FW_UPLOAD_ERR_FW_INVALID: invalid firmware file
  * @FW_UPLOAD_ERR_MAX: Maximum error code marker
  */
 enum fw_upload_err {
@@ -38,6 +39,7 @@ enum fw_upload_err {
 	FW_UPLOAD_ERR_INVALID_SIZE,
 	FW_UPLOAD_ERR_RW_ERROR,
 	FW_UPLOAD_ERR_WEAROUT,
+	FW_UPLOAD_ERR_FW_INVALID,
 	FW_UPLOAD_ERR_MAX
 };
 
-- 
cgit v1.2.3


From b286f4e87e325b76789f30337c98ba72e00532e2 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 13 Nov 2023 10:07:52 +0200
Subject: serial: core: Move tty and serdev to be children of serial core port
 device

Let's move tty and serdev controller to be children of the serial core port
device. This way the runtime PM usage count of a child device propagates
to the serial hardware device.

The tty and serdev devices are associated with a specific serial port of
a serial hardware controller device, and we now have serial core hierarchy
of controllers and ports.

The tty device moves happily with just a change of the parent device and
update of device_find_child() handling. The serdev device init needs some
changes to separate the serial hardware controller device from the parent
device.

With this change the tty devices move under sysfs similar to this x86_64
qemu example of a diff of "find /sys -name ttyS*":

 /sys/class/tty/ttyS0
 /sys/class/tty/ttyS3
 /sys/class/tty/ttyS1
-/sys/devices/pnp0/00:04/tty/ttyS0
-/sys/devices/platform/serial8250/tty/ttyS2
-/sys/devices/platform/serial8250/tty/ttyS3
-/sys/devices/platform/serial8250/tty/ttyS1
+/sys/devices/pnp0/00:04/00:04:0/00:04:0.0/tty/ttyS0
+/sys/devices/platform/serial8250/serial8250:0/serial8250:0.3/tty/ttyS3
+/sys/devices/platform/serial8250/serial8250:0/serial8250:0.1/tty/ttyS1
+/sys/devices/platform/serial8250/serial8250:0/serial8250:0.2/tty/ttyS2

If a serdev device is used instead of a tty, it moves in a similar way.

Suggested-by: Johan Hovold <johan@kernel.org>
Cc: Maximilian Luz <luzmaximilian@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20231113080758.30346-1-tony@atomide.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serdev.h   | 8 +++++++-
 include/linux/tty_port.h | 4 ++--
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index f5f97fa25e8a..0ebf53bb254f 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -99,12 +99,14 @@ struct serdev_controller_ops {
 /**
  * struct serdev_controller - interface to the serdev controller
  * @dev:	Driver model representation of the device.
+ * @host:	Serial port hardware controller device
  * @nr:		number identifier for this controller/bus.
  * @serdev:	Pointer to slave device for this controller.
  * @ops:	Controller operations.
  */
 struct serdev_controller {
 	struct device		dev;
+	struct device		*host;
 	unsigned int		nr;
 	struct serdev_device	*serdev;
 	const struct serdev_controller_ops *ops;
@@ -167,7 +169,9 @@ struct serdev_device *serdev_device_alloc(struct serdev_controller *);
 int serdev_device_add(struct serdev_device *);
 void serdev_device_remove(struct serdev_device *);
 
-struct serdev_controller *serdev_controller_alloc(struct device *, size_t);
+struct serdev_controller *serdev_controller_alloc(struct device *host,
+						  struct device *parent,
+						  size_t size);
 int serdev_controller_add(struct serdev_controller *);
 void serdev_controller_remove(struct serdev_controller *);
 
@@ -311,11 +315,13 @@ struct tty_driver;
 
 #ifdef CONFIG_SERIAL_DEV_CTRL_TTYPORT
 struct device *serdev_tty_port_register(struct tty_port *port,
+					struct device *host,
 					struct device *parent,
 					struct tty_driver *drv, int idx);
 int serdev_tty_port_unregister(struct tty_port *port);
 #else
 static inline struct device *serdev_tty_port_register(struct tty_port *port,
+					   struct device *host,
 					   struct device *parent,
 					   struct tty_driver *drv, int idx)
 {
diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h
index 6b367eb17979..3276311a7f38 100644
--- a/include/linux/tty_port.h
+++ b/include/linux/tty_port.h
@@ -149,10 +149,10 @@ struct device *tty_port_register_device_attr(struct tty_port *port,
 		const struct attribute_group **attr_grp);
 struct device *tty_port_register_device_serdev(struct tty_port *port,
 		struct tty_driver *driver, unsigned index,
-		struct device *device);
+		struct device *host, struct device *parent);
 struct device *tty_port_register_device_attr_serdev(struct tty_port *port,
 		struct tty_driver *driver, unsigned index,
-		struct device *device, void *drvdata,
+		struct device *host, struct device *parent, void *drvdata,
 		const struct attribute_group **attr_grp);
 void tty_port_unregister_device(struct tty_port *port,
 		struct tty_driver *driver, unsigned index);
-- 
cgit v1.2.3


From 358779dd18c1e8531bd6d78c19ed802958d7c677 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Tue, 21 Nov 2023 10:22:44 +0100
Subject: tty: fix tty_operations types in documentation

Commits 95713967ba52 ("tty: make tty_operations::write()'s count
size_t") and dcaafbe6ee3b ("tty: propagate u8 data to
tty_operations::put_char()") changed types of characters to u8, but
omitted to fix the documentation.

Fix the latter now.

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20231121092258.9334-4-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_driver.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 18beff0cec1a..f428c1b784a2 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -72,8 +72,7 @@ struct serial_struct;
  *	is closed for the last time freeing up the resources. This is
  *	actually the second part of shutdown for routines that might sleep.
  *
- * @write: ``ssize_t ()(struct tty_struct *tty, const unsigned char *buf,
- *		    size_t count)``
+ * @write: ``ssize_t ()(struct tty_struct *tty, const u8 *buf, size_t count)``
  *
  *	This routine is called by the kernel to write a series (@count) of
  *	characters (@buf) to the @tty device. The characters may come from
@@ -85,7 +84,7 @@ struct serial_struct;
  *
  *	Optional: Required for writable devices. May not sleep.
  *
- * @put_char: ``int ()(struct tty_struct *tty, unsigned char ch)``
+ * @put_char: ``int ()(struct tty_struct *tty, u8 ch)``
  *
  *	This routine is called by the kernel to write a single character @ch to
  *	the @tty device. If the kernel uses this routine, it must call the
-- 
cgit v1.2.3


From 239123e7e8ec4d35c8591c48f5de44925a88391d Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Tue, 21 Nov 2023 10:22:45 +0100
Subject: tty: move locking docs out of Returns for functions in tty.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both tty_kref_get() and tty_get_baud_rate() note about locking in their
Return kernel-doc clause. Extract this info into a separate "Locking"
paragraph -- the same as we do for other tty functions.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Suggested-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20231121092258.9334-5-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 4b6340ac2af2..7625fc98fef3 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -393,8 +393,10 @@ extern const struct class tty_class;
  * tty_kref_get - get a tty reference
  * @tty: tty device
  *
- * Returns: a new reference to a tty object. The caller must hold sufficient
- * locks/counts to ensure that their existing reference cannot go away
+ * Returns: a new reference to a tty object
+ *
+ * Locking: The caller must hold sufficient locks/counts to ensure that their
+ * existing reference cannot go away.
  */
 static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
 {
@@ -436,10 +438,10 @@ void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud,
  * tty_get_baud_rate - get tty bit rates
  * @tty: tty to query
  *
- * Returns: the baud rate as an integer for this terminal. The termios lock
- * must be held by the caller and the terminal bit flags may be updated.
+ * Returns: the baud rate as an integer for this terminal
  *
- * Locking: none
+ * Locking: The termios lock must be held by the caller and the terminal bit
+ * flags may be updated.
  */
 static inline speed_t tty_get_baud_rate(struct tty_struct *tty)
 {
-- 
cgit v1.2.3


From da549bdd15c295c24b2ee7ffe7ad0f3877fa8a87 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Nov 2023 02:00:39 -0500
Subject: dentry: switch the lists of children to hlist

Saves a pointer per struct dentry and actually makes the things less
clumsy.  Cleaned the d_walk() and dcache_readdir() a bit by use
of hlist_for_... iterators.

A couple of new helpers - d_first_child() and d_next_sibling(),
to make the expressions less awful.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 3da2f0545d5d..0e397a0c519c 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -68,12 +68,12 @@ extern const struct qstr dotdot_name;
  * large memory footprint increase).
  */
 #ifdef CONFIG_64BIT
-# define DNAME_INLINE_LEN 32 /* 192 bytes */
+# define DNAME_INLINE_LEN 40 /* 192 bytes */
 #else
 # ifdef CONFIG_SMP
-#  define DNAME_INLINE_LEN 36 /* 128 bytes */
-# else
 #  define DNAME_INLINE_LEN 40 /* 128 bytes */
+# else
+#  define DNAME_INLINE_LEN 44 /* 128 bytes */
 # endif
 #endif
 
@@ -101,8 +101,8 @@ struct dentry {
 		struct list_head d_lru;		/* LRU list */
 		wait_queue_head_t *d_wait;	/* in-lookup ones only */
 	};
-	struct list_head d_child;	/* child of parent list */
-	struct list_head d_subdirs;	/* our children */
+	struct hlist_node d_sib;	/* child of parent list */
+	struct hlist_head d_children;	/* our children */
 	/*
 	 * d_alias and d_rcu can share memory
 	 */
@@ -600,4 +600,14 @@ struct name_snapshot {
 void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
 void release_dentry_name_snapshot(struct name_snapshot *);
 
+static inline struct dentry *d_first_child(const struct dentry *dentry)
+{
+	return hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib);
+}
+
+static inline struct dentry *d_next_sibling(const struct dentry *dentry)
+{
+	return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib);
+}
+
 #endif	/* __LINUX_DCACHE_H */
-- 
cgit v1.2.3


From f2824db1b49f947ba6e208ddf02edf4b1391480a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Nov 2023 16:42:43 -0500
Subject: kill d_instantate_anon(), fold __d_instantiate_anon() into remaining
 caller

now that the only user of d_instantiate_anon() is gone...
[braino fix folded - kudos to Dan Carpenter]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index fa0414cff85c..8c5e3bdf1147 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -218,7 +218,6 @@ extern seqlock_t rename_lock;
  */
 extern void d_instantiate(struct dentry *, struct inode *);
 extern void d_instantiate_new(struct dentry *, struct inode *);
-extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *);
 extern void __d_drop(struct dentry *dentry);
 extern void d_drop(struct dentry *dentry);
 extern void d_delete(struct dentry *);
-- 
cgit v1.2.3


From 8a54b38f3e5ced6cc4b246b8e54bd0f50deceaa8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 11 Nov 2023 16:01:27 -0500
Subject: d_genocide(): move the extern into fs/internal.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 8c5e3bdf1147..b4324d47f249 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -243,9 +243,6 @@ extern void d_invalidate(struct dentry *);
 /* only used at mount-time */
 extern struct dentry * d_make_root(struct inode *);
 
-/* <clickety>-<click> the ramfs-type tree */
-extern void d_genocide(struct dentry *);
-
 extern void d_mark_tmpfile(struct file *, struct inode *);
 extern void d_tmpfile(struct file *, struct inode *);
 
-- 
cgit v1.2.3


From 57851607326a2beef21e67f83f4f53a90df8445a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Nov 2023 21:38:48 -0500
Subject: get rid of DCACHE_GENOCIDE

... now that we never call d_genocide() other than from kill_litter_super()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index b4324d47f249..981f529c6cb5 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -173,7 +173,6 @@ struct dentry_operations {
 #define DCACHE_DONTCACHE		BIT(7) /* Purge from memory on final dput() */
 
 #define DCACHE_CANT_MOUNT		BIT(8)
-#define DCACHE_GENOCIDE			BIT(9)
 #define DCACHE_SHRINK_LIST		BIT(10)
 
 #define DCACHE_OP_WEAK_REVALIDATE	BIT(11)
-- 
cgit v1.2.3


From 1b327b5ac57cf83e3d015de45d0142852f475375 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 14:07:43 -0500
Subject: kill DCACHE_MAY_FREE

With the new ordering in __dentry_kill() it has become redundant -
it's set if and only if both DCACHE_DENTRY_KILLED and DCACHE_SHRINK_LIST
are set.

We set it in __dentry_kill(), after having set DCACHE_DENTRY_KILLED
with the only condition being that DCACHE_SHRINK_LIST is there;
all of that is done without dropping ->d_lock and the only place
that checks that flag (shrink_dentry_list()) does so under ->d_lock,
after having found the victim on its shrink list.  Since DCACHE_SHRINK_LIST
is set only when placing dentry into shrink list and removed only by
shrink_dentry_list() itself, a check for DCACHE_DENTRY_KILLED in
there would be equivalent to check for DCACHE_MAY_FREE.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index b4449a1a47ff..48b393545ec2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -202,7 +202,6 @@ struct dentry_operations {
 #define DCACHE_SPECIAL_TYPE		(5 << 20) /* Other file type */
 #define DCACHE_SYMLINK_TYPE		(6 << 20) /* Symlink */
 
-#define DCACHE_MAY_FREE			BIT(23)
 #define DCACHE_NOKEY_NAME		BIT(25) /* Encrypted name encoded without key */
 #define DCACHE_OP_REAL			BIT(26)
 
-- 
cgit v1.2.3


From 1b6ae9f6e6c3e3c35aad0f11b116a81780b8aa03 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Mon, 6 Nov 2023 14:44:17 +0100
Subject: dcache: remove unnecessary NULL check in dget_dlock()

dget_dlock() requires dentry->d_lock to be held when called, yet
contains a NULL check for dentry.

An audit of all calls to dget_dlock() shows that it is never called
with a NULL pointer (as spin_lock()/spin_unlock() would crash in these
cases):

  $ git grep -W '\<dget_dlock\>'

  arch/powerpc/platforms/cell/spufs/inode.c-              spin_lock(&dentry->d_lock);
  arch/powerpc/platforms/cell/spufs/inode.c-              if (simple_positive(dentry)) {
  arch/powerpc/platforms/cell/spufs/inode.c:                      dget_dlock(dentry);

  fs/autofs/expire.c-             spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
  fs/autofs/expire.c-             if (simple_positive(child)) {
  fs/autofs/expire.c:                     dget_dlock(child);

  fs/autofs/root.c:                       dget_dlock(active);
  fs/autofs/root.c-                       spin_unlock(&active->d_lock);

  fs/autofs/root.c:                       dget_dlock(expiring);
  fs/autofs/root.c-                       spin_unlock(&expiring->d_lock);

  fs/ceph/dir.c-          if (!spin_trylock(&dentry->d_lock))
  fs/ceph/dir.c-                  continue;
  [...]
  fs/ceph/dir.c:                          dget_dlock(dentry);

  fs/ceph/mds_client.c-           spin_lock(&alias->d_lock);
  [...]
  fs/ceph/mds_client.c:                   dn = dget_dlock(alias);

  fs/configfs/inode.c-            spin_lock(&dentry->d_lock);
  fs/configfs/inode.c-            if (simple_positive(dentry)) {
  fs/configfs/inode.c:                    dget_dlock(dentry);

  fs/libfs.c:                             found = dget_dlock(d);
  fs/libfs.c-                     spin_unlock(&d->d_lock);

  fs/libfs.c:             found = dget_dlock(child);
  fs/libfs.c-     spin_unlock(&child->d_lock);

  fs/libfs.c:                             child = dget_dlock(d);
  fs/libfs.c-                     spin_unlock(&d->d_lock);

  fs/ocfs2/dcache.c:                      dget_dlock(dentry);
  fs/ocfs2/dcache.c-                      spin_unlock(&dentry->d_lock);

  include/linux/dcache.h:static inline struct dentry *dget_dlock(struct dentry *dentry)

After taking out the NULL check, dget_dlock() becomes almost identical
to __dget_dlock(); the only difference is that dget_dlock() returns the
dentry that was passed in. These are static inline helpers, so we can
rely on the compiler to discard unused return values. We can therefore
also remove __dget_dlock() and replace calls to it by dget_dlock().

Also fix up and improve the kerneldoc comments while we're at it.

Al Viro pointed out that we can also clean up some of the callers to
make use of the returned value and provided a bit more info for the
kerneldoc.

While preparing v2 I also noticed that the tabs used in the kerneldoc
comments were causing the kerneldoc to get parsed incorrectly so I also
fixed this up (including for d_unhashed, which is otherwise unrelated).

Testing: x86 defconfig build + boot; make htmldocs for the kerneldoc
warning. objdump shows there are code generation changes.

Link: https://lore.kernel.org/all/20231022164520.915013-1-vegard.nossum@oracle.com/
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: linux-fsdevel@vger.kernel.org
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Waiman Long <Waiman.Long@hp.com>
Cc: linux-doc@vger.kernel.org
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 48b393545ec2..1666c387861f 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -287,20 +287,40 @@ extern char *dentry_path(const struct dentry *, char *, int);
 /* Allocation counts.. */
 
 /**
- *	dget, dget_dlock -	get a reference to a dentry
- *	@dentry: dentry to get a reference to
+ * dget_dlock -	get a reference to a dentry
+ * @dentry: dentry to get a reference to
  *
- *	Given a dentry or %NULL pointer increment the reference count
- *	if appropriate and return the dentry. A dentry will not be 
- *	destroyed when it has references.
+ * Given a live dentry, increment the reference count and return the dentry.
+ * Caller must hold @dentry->d_lock.  Making sure that dentry is alive is
+ * caller's resonsibility.  There are many conditions sufficient to guarantee
+ * that; e.g. anything with non-negative refcount is alive, so's anything
+ * hashed, anything positive, anyone's parent, etc.
  */
 static inline struct dentry *dget_dlock(struct dentry *dentry)
 {
-	if (dentry)
-		dentry->d_lockref.count++;
+	dentry->d_lockref.count++;
 	return dentry;
 }
 
+
+/**
+ * dget - get a reference to a dentry
+ * @dentry: dentry to get a reference to
+ *
+ * Given a dentry or %NULL pointer increment the reference count
+ * if appropriate and return the dentry.  A dentry will not be
+ * destroyed when it has references.  Conversely, a dentry with
+ * no references can disappear for any number of reasons, starting
+ * with memory pressure.  In other words, that primitive is
+ * used to clone an existing reference; using it on something with
+ * zero refcount is a bug.
+ *
+ * NOTE: it will spin if @dentry->d_lock is held.  From the deadlock
+ * avoidance point of view it is equivalent to spin_lock()/increment
+ * refcount/spin_unlock(), so calling it under @dentry->d_lock is
+ * always a bug; so's calling it under ->d_lock on any of its descendents.
+ *
+ */
 static inline struct dentry *dget(struct dentry *dentry)
 {
 	if (dentry)
@@ -311,12 +331,11 @@ static inline struct dentry *dget(struct dentry *dentry)
 extern struct dentry *dget_parent(struct dentry *dentry);
 
 /**
- *	d_unhashed -	is dentry hashed
- *	@dentry: entry to check
+ * d_unhashed - is dentry hashed
+ * @dentry: entry to check
  *
- *	Returns true if the dentry passed is not currently hashed.
+ * Returns true if the dentry passed is not currently hashed.
  */
- 
 static inline int d_unhashed(const struct dentry *dentry)
 {
 	return hlist_bl_unhashed(&dentry->d_hash);
-- 
cgit v1.2.3


From 18caaedaf4c3712ab6821f292598a8f86e6d7972 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sun, 26 Nov 2023 09:56:29 +0100
Subject: locking/lockdep: Slightly reorder 'struct lock_class' to save some
 memory

Based on pahole, 2 holes can be combined in the 'struct lock_class'. This
saves 8 bytes in the structure on my x86_64.

On a x86_64 configured with allmodconfig, this saves ~64kb of memory in
'kernel/locking/lockdep.o':

                text         data           bss           dec     filename
  Before:    102,501    1,912,490    11,531,636    13,546,627     kernel/locking/lockdep.o
  After:     102,181    1,912,490    11,466,100    13,480,771     kernel/locking/lockdep.o

because of:

  struct lock_class lock_classes[MAX_LOCKDEP_KEYS];

After the reorder, pahole gives:

  struct lock_class {
          struct hlist_node          hash_entry;           /*     0    16 */
          struct list_head           lock_entry;           /*    16    16 */
          struct list_head           locks_after;          /*    32    16 */
          struct list_head           locks_before;         /*    48    16 */
          /* --- cacheline 1 boundary (64 bytes) --- */
          const struct lockdep_subclass_key  * key;        /*    64     8 */
          lock_cmp_fn                cmp_fn;               /*    72     8 */
          lock_print_fn              print_fn;             /*    80     8 */
          unsigned int               subclass;             /*    88     4 */
          unsigned int               dep_gen_id;           /*    92     4 */
          long unsigned int          usage_mask;           /*    96     8 */
          const struct lock_trace  * usage_traces[10];     /*   104    80 */
          /* --- cacheline 2 boundary (128 bytes) was 56 bytes ago --- */
          const char  *              name;                 /*   184     8 */
          /* --- cacheline 3 boundary (192 bytes) --- */
          int                        name_version;         /*   192     4 */
          u8                         wait_type_inner;      /*   196     1 */
          u8                         wait_type_outer;      /*   197     1 */
          u8                         lock_type;            /*   198     1 */

          /* XXX 1 byte hole, try to pack */

          long unsigned int          contention_point[4];  /*   200    32 */
          long unsigned int          contending_point[4];  /*   232    32 */

          /* size: 264, cachelines: 5, members: 18 */
          /* sum members: 263, holes: 1, sum holes: 1 */
          /* last cacheline: 8 bytes */
  };

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://lore.kernel.org/r/801258371fc4101f96495a5aaecef638d6cbd8d3.1700988869.git.christophe.jaillet@wanadoo.fr
---
 include/linux/lockdep_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 2ebc323d345a..857d785e89e6 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -127,12 +127,12 @@ struct lock_class {
 	unsigned long			usage_mask;
 	const struct lock_trace		*usage_traces[LOCK_TRACE_STATES];
 
+	const char			*name;
 	/*
 	 * Generation counter, when doing certain classes of graph walking,
 	 * to ensure that we check one node only once:
 	 */
 	int				name_version;
-	const char			*name;
 
 	u8				wait_type_inner;
 	u8				wait_type_outer;
-- 
cgit v1.2.3


From a2e7e59a94269484a83386972ca07c22fd188854 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Wed, 15 Nov 2023 18:25:44 +0000
Subject: iommu: Avoid more races around device probe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It turns out there are more subtle races beyond just the main part of
__iommu_probe_device() itself running in parallel - the dev_iommu_free()
on the way out of an unsuccessful probe can still manage to trip up
concurrent accesses to a device's fwspec. Thus, extend the scope of
iommu_probe_device_lock() to also serialise fwspec creation and initial
retrieval.

Reported-by: Zhenhua Huang <quic_zhenhuah@quicinc.com>
Link: https://lore.kernel.org/linux-iommu/e2e20e1c-6450-4ac5-9804-b0000acdf7de@quicinc.com/
Fixes: 01657bc14a39 ("iommu: Avoid races around device probe")
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: André Draszik <andre.draszik@linaro.org>
Tested-by: André Draszik <andre.draszik@linaro.org>
Link: https://lore.kernel.org/r/16f433658661d7cadfea51e7c65da95826112a2b.1700071477.git.robin.murphy@arm.com
Cc: stable@vger.kernel.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ec289c1016f5..6291aa7b079b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -845,6 +845,7 @@ static inline void dev_iommu_priv_set(struct device *dev, void *priv)
 	dev->iommu->priv = priv;
 }
 
+extern struct mutex iommu_probe_device_lock;
 int iommu_probe_device(struct device *dev);
 
 int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f);
-- 
cgit v1.2.3


From a9c362db39207c4934c9125e56ed730c5297c37c Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 21 Nov 2023 18:03:59 +0000
Subject: iommu: Validate that devices match domains

Before we can allow drivers to coexist, we need to make sure that one
driver's domain ops can't misinterpret another driver's dev_iommu_priv
data. To that end, add a token to the domain so we can remember how it
was allocated - for now this may as well be the device ops, since they
still correlate 1:1 with drivers. We can trust ourselves for internal
default domain attachment, so add checks to cover all the public attach
interfaces.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/097c6f30480e4efe12195d00ba0e84ea4837fb4c.1700589539.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ec289c1016f5..077bf8cae2f7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -106,7 +106,7 @@ struct iommu_domain {
 	unsigned type;
 	const struct iommu_domain_ops *ops;
 	const struct iommu_dirty_ops *dirty_ops;
-
+	const struct iommu_ops *owner; /* Whose domain_alloc we came from */
 	unsigned long pgsize_bitmap;	/* Bitmap of page sizes in use */
 	struct iommu_domain_geometry geometry;
 	struct iommu_dma_cookie *iova_cookie;
-- 
cgit v1.2.3


From 17de3f5fdd35676b0e3d41c7c9bf4e3032eb3673 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 21 Nov 2023 18:04:02 +0000
Subject: iommu: Retire bus ops

With the rest of the API internals converted, it's time to finally
tackle probe_device and how we bootstrap the per-device ops association
to begin with. This ends up being disappointingly straightforward, since
fwspec users are already doing it in order to find their of_xlate
callback, and it works out that we can easily do the equivalent for
other drivers too. Then shuffle the remaining awareness of iommu_ops
into the couple of core headers that still need it, and breathe a sigh
of relief.

Ding dong the bus ops are gone!

CC: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/a59011ef65b4b6657cb0b7a388d786b779b61305.1700589539.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/device.h      | 1 -
 include/linux/device/bus.h  | 5 -----
 include/linux/dma-map-ops.h | 1 +
 3 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index d7a72a8749ea..0314dbbdb534 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -42,7 +42,6 @@ struct class;
 struct subsys_private;
 struct device_node;
 struct fwnode_handle;
-struct iommu_ops;
 struct iommu_group;
 struct dev_pin_info;
 struct dev_iommu;
diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index ae10c4322754..e25aab08f873 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -62,9 +62,6 @@ struct fwnode_handle;
  *			this bus.
  * @pm:		Power management operations of this bus, callback the specific
  *		device driver's pm-ops.
- * @iommu_ops:  IOMMU specific operations for this bus, used to attach IOMMU
- *              driver implementations to a bus and allow the driver to do
- *              bus-specific setup
  * @need_parent_lock:	When probing or removing a device on this bus, the
  *			device core should lock the device's parent.
  *
@@ -104,8 +101,6 @@ struct bus_type {
 
 	const struct dev_pm_ops *pm;
 
-	const struct iommu_ops *iommu_ops;
-
 	bool need_parent_lock;
 };
 
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index f2fc203fb8a1..a52e508d1869 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 
 struct cma;
+struct iommu_ops;
 
 /*
  * Values for struct dma_map_ops.flags:
-- 
cgit v1.2.3


From 17b226dcf80ce79d02f4f0b08813d8848885b986 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@collabora.com>
Date: Fri, 24 Nov 2023 15:24:33 +0100
Subject: iommu: Allow passing custom allocators to pgtable drivers

This will be useful for GPU drivers who want to keep page tables in a
pool so they can:

- keep freed page tables in a free pool and speed-up upcoming page
  table allocations
- batch page table allocation instead of allocating one page at a time
- pre-reserve pages for page tables needed for map/unmap operations,
  to ensure map/unmap operations don't try to allocate memory in paths
  they're allowed to block or fail

It might also be valuable for other aspects of GPU and similar
use-cases, like fine-grained memory accounting and resource limiting.

We will extend the Arm LPAE format to support custom allocators in a
separate commit.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20231124142434.1577550-2-boris.brezillon@collabora.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/io-pgtable.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 25142a0e2fc2..86cf1f7ae389 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -100,6 +100,30 @@ struct io_pgtable_cfg {
 	const struct iommu_flush_ops	*tlb;
 	struct device			*iommu_dev;
 
+	/**
+	 * @alloc: Custom page allocator.
+	 *
+	 * Optional hook used to allocate page tables. If this function is NULL,
+	 * @free must be NULL too.
+	 *
+	 * Memory returned should be zeroed and suitable for dma_map_single() and
+	 * virt_to_phys().
+	 *
+	 * Not all formats support custom page allocators. Before considering
+	 * passing a non-NULL value, make sure the chosen page format supports
+	 * this feature.
+	 */
+	void *(*alloc)(void *cookie, size_t size, gfp_t gfp);
+
+	/**
+	 * @free: Custom page de-allocator.
+	 *
+	 * Optional hook used to free page tables allocated with the @alloc
+	 * hook. Must be non-NULL if @alloc is not NULL, must be NULL
+	 * otherwise.
+	 */
+	void (*free)(void *cookie, void *pages, size_t size);
+
 	/* Low-level data specific to the table format */
 	union {
 		struct {
@@ -241,16 +265,26 @@ io_pgtable_tlb_add_page(struct io_pgtable *iop,
 		iop->cfg.tlb->tlb_add_page(gather, iova, granule, iop->cookie);
 }
 
+/**
+ * enum io_pgtable_caps - IO page table backend capabilities.
+ */
+enum io_pgtable_caps {
+	/** @IO_PGTABLE_CAP_CUSTOM_ALLOCATOR: Backend accepts custom page table allocators. */
+	IO_PGTABLE_CAP_CUSTOM_ALLOCATOR = BIT(0),
+};
+
 /**
  * struct io_pgtable_init_fns - Alloc/free a set of page tables for a
  *                              particular format.
  *
  * @alloc: Allocate a set of page tables described by cfg.
  * @free:  Free the page tables associated with iop.
+ * @caps:  Combination of @io_pgtable_caps flags encoding the backend capabilities.
  */
 struct io_pgtable_init_fns {
 	struct io_pgtable *(*alloc)(struct io_pgtable_cfg *cfg, void *cookie);
 	void (*free)(struct io_pgtable *iop);
+	u32 caps;
 };
 
 extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
-- 
cgit v1.2.3


From 8c88a474357ead632b07c70bf7f119ace8c3b39e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 24 Nov 2023 17:25:26 +0100
Subject: debugfs: add API to allow debugfs operations cancellation

In some cases there might be longer-running hardware accesses
in debugfs files, or attempts to acquire locks, and we want
to still be able to quickly remove the files.

Introduce a cancellations API to use inside the debugfs handler
functions to be able to cancel such operations on a per-file
basis.

Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/debugfs.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index ea2d919fd9c7..c9c65b132c0f 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -171,6 +171,25 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
 ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
 			      size_t count, loff_t *ppos);
 
+/**
+ * struct debugfs_cancellation - cancellation data
+ * @list: internal, for keeping track
+ * @cancel: callback to call
+ * @cancel_data: extra data for the callback to call
+ */
+struct debugfs_cancellation {
+	struct list_head list;
+	void (*cancel)(struct dentry *, void *);
+	void *cancel_data;
+};
+
+void __acquires(cancellation)
+debugfs_enter_cancellation(struct file *file,
+			   struct debugfs_cancellation *cancellation);
+void __releases(cancellation)
+debugfs_leave_cancellation(struct file *file,
+			   struct debugfs_cancellation *cancellation);
+
 #else
 
 #include <linux/err.h>
-- 
cgit v1.2.3


From 668bfeeabb5e402e3b36992f7859c284cc6e594d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 27 Nov 2023 08:20:02 +0100
Subject: block: move a few definitions out of CONFIG_BLK_DEV_ZONED

Allow using a few symbols with IS_ENABLED instead of #idef by moving
the declarations out of #idef CONFIG_BLK_DEV_ZONED, and move
bdev_nr_zones into the remaining  #idef CONFIG_BLK_DEV_ZONED, #else
block below.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231127072002.1332685-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 51fa7ffdee83..17c0a7d0d319 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -331,22 +331,13 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 
 void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model);
 
-#ifdef CONFIG_BLK_DEV_ZONED
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
-			unsigned int nr_zones, report_zones_cb cb, void *data);
-unsigned int bdev_nr_zones(struct block_device *bdev);
-extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
-			    sector_t sectors, sector_t nr_sectors,
-			    gfp_t gfp_mask);
+		unsigned int nr_zones, report_zones_cb cb, void *data);
+int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
+		sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask);
 int blk_revalidate_disk_zones(struct gendisk *disk,
-			      void (*update_driver_data)(struct gendisk *disk));
-#else /* CONFIG_BLK_DEV_ZONED */
-static inline unsigned int bdev_nr_zones(struct block_device *bdev)
-{
-	return 0;
-}
-#endif /* CONFIG_BLK_DEV_ZONED */
+		void (*update_driver_data)(struct gendisk *disk));
 
 /*
  * Independent access ranges: struct blk_independent_access_range describes
@@ -643,6 +634,8 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
+unsigned int bdev_nr_zones(struct block_device *bdev);
+
 static inline unsigned int disk_nr_zones(struct gendisk *disk)
 {
 	return blk_queue_is_zoned(disk->queue) ? disk->nr_zones : 0;
@@ -687,6 +680,11 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
 }
 
 #else /* CONFIG_BLK_DEV_ZONED */
+static inline unsigned int bdev_nr_zones(struct block_device *bdev)
+{
+	return 0;
+}
+
 static inline unsigned int disk_nr_zones(struct gendisk *disk)
 {
 	return 0;
-- 
cgit v1.2.3


From 95ba893c9f4feb836ddce627efd0bb6af6667031 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Tue, 14 Nov 2023 13:37:09 +0100
Subject: dma-buf: fix check in dma_resv_add_fence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's valid to add the same fence multiple times to a dma-resv object and
we shouldn't need one extra slot for each.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Fixes: a3f7c10a269d5 ("dma-buf/dma-resv: check if the new fence is really later")
Cc: stable@vger.kernel.org # v5.19+
Link: https://patchwork.freedesktop.org/patch/msgid/20231115093035.1889-1-christian.koenig@amd.com
---
 include/linux/dma-fence.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index ebe78bd3d121..b3772edca2e6 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -498,6 +498,21 @@ static inline bool dma_fence_is_later(struct dma_fence *f1,
 	return __dma_fence_is_later(f1->seqno, f2->seqno, f1->ops);
 }
 
+/**
+ * dma_fence_is_later_or_same - return true if f1 is later or same as f2
+ * @f1: the first fence from the same context
+ * @f2: the second fence from the same context
+ *
+ * Returns true if f1 is chronologically later than f2 or the same fence. Both
+ * fences must be from the same context, since a seqno is not re-used across
+ * contexts.
+ */
+static inline bool dma_fence_is_later_or_same(struct dma_fence *f1,
+					      struct dma_fence *f2)
+{
+	return f1 == f2 || dma_fence_is_later(f1, f2);
+}
+
 /**
  * dma_fence_later - return the chronologically later fence
  * @f1:	the first fence from the same context
-- 
cgit v1.2.3


From 243ad8df7a1bd24c2e01bd99d9f0bb88844dae91 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 24 Nov 2023 12:27:52 +0000
Subject: net: phy: add possible interfaces

Add a possible_interfaces member to struct phy_device to indicate which
interfaces a clause 45 PHY may switch between depending on the media.
This must be populated by the PHY driver by the time the .config_init()
method completes according to the PHYs host-side configuration.

For example, the Marvell 88x3310 PHY can switch between 10GBASE-R,
5GBASE-R, 2500BASE-X, and SGMII on the host side depending on the media
side speed, so all these interface modes are set in the
possible_interfaces member.

This allows phylib users (such as phylink) to know in advance which
interface modes to expect, which allows them to appropriately restrict
the advertised link modes according to the capabilities of other parts
of the link.

Tested-by: Luo Jie <quic_luoj@quicinc.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/E1r6VHk-00DDLN-I7@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index e5f1f41e399c..6e7ebcc50b85 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -605,6 +605,8 @@ struct macsec_ops;
  * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended,
  *             requiring a rerun of the interrupt handler after resume
  * @interface: enum phy_interface_t value
+ * @possible_interfaces: bitmap if interface modes that the attached PHY
+ *			 will switch between depending on media speed.
  * @skb: Netlink message for cable diagnostics
  * @nest: Netlink nest used for cable diagnostics
  * @ehdr: nNtlink header for cable diagnostics
@@ -674,6 +676,7 @@ struct phy_device {
 	u32 dev_flags;
 
 	phy_interface_t interface;
+	DECLARE_PHY_INTERFACE_MASK(possible_interfaces);
 
 	/*
 	 * forced speed & duplex (no autoneg)
-- 
cgit v1.2.3


From 073d3d2ca7d462afc8159ca0175675b9b7b4f162 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 27 Oct 2023 12:40:04 +0530
Subject: OPP: Level zero is valid

The level zero can be used by some OPPs to drop performance state vote
for the device. It is perfectly fine to allow the same.

_set_opp_level() considers it as an invalid value currently and returns
early.

In order to support this properly, initialize the level field with
U32_MAX, which denotes unused level field.

Reported-by: Stephan Gerhold <stephan.gerhold@kernkonzept.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Stephan Gerhold <stephan.gerhold@kernkonzept.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/pm_opp.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index ccd97bcef269..af53101a1383 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -92,9 +92,12 @@ struct dev_pm_opp_config {
 	struct device ***virt_devs;
 };
 
+#define OPP_LEVEL_UNSET			U32_MAX
+
 /**
  * struct dev_pm_opp_data - The data to use to initialize an OPP.
- * @level: The performance level for the OPP.
+ * @level: The performance level for the OPP. Set level to OPP_LEVEL_UNSET if
+ * level field isn't used.
  * @freq: The clock rate in Hz for the OPP.
  * @u_volt: The voltage in uV for the OPP.
  */
-- 
cgit v1.2.3


From e37440e7e2c2760475d60c5556b59c8880a7fd63 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 27 Oct 2023 14:17:48 +0530
Subject: OPP: Call dev_pm_opp_set_opp() for required OPPs

Configuring the required OPP was never properly implemented, we just
took an exception for genpds and configured them directly, while leaving
out all other required OPP types.

Now that a standard call to dev_pm_opp_set_opp() takes care of
configuring the opp->level too, the special handling for genpds can be
avoided by simply calling dev_pm_opp_set_opp() for the required OPPs,
which shall eventually configure the corresponding level for genpds.

This also makes it possible for us to configure other type of required
OPPs (no concrete users yet though), via the same path. This is how
other frameworks take care of parent nodes, like clock, regulators, etc,
where we recursively call the same helper.

In order to call dev_pm_opp_set_opp() for the virtual genpd devices,
they must share the OPP table of the genpd. Call _add_opp_dev() for them
to get that done.

This commit also extends the struct dev_pm_opp_config to pass required
devices, for non-genpd cases, which can be used to call
dev_pm_opp_set_opp() for the non-genpd required devices.

Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Stephan Gerhold <stephan.gerhold@kernkonzept.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/pm_opp.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index af53101a1383..81dff7facdc9 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -74,8 +74,10 @@ typedef int (*config_clks_t)(struct device *dev, struct opp_table *opp_table,
  * @supported_hw_count: Number of elements in the array.
  * @regulator_names: Array of pointers to the names of the regulator, NULL terminated.
  * @genpd_names: Null terminated array of pointers containing names of genpd to
- *		 attach.
- * @virt_devs: Pointer to return the array of virtual devices.
+ *		attach. Mutually exclusive with required_devs.
+ * @virt_devs: Pointer to return the array of genpd virtual devices. Mutually
+ *		exclusive with required_devs.
+ * @required_devs: Required OPP devices. Mutually exclusive with genpd_names/virt_devs.
  *
  * This structure contains platform specific OPP configurations for the device.
  */
@@ -90,6 +92,7 @@ struct dev_pm_opp_config {
 	const char * const *regulator_names;
 	const char * const *genpd_names;
 	struct device ***virt_devs;
+	struct device **required_devs;
 };
 
 #define OPP_LEVEL_UNSET			U32_MAX
-- 
cgit v1.2.3


From 3652117f854819a148ff0fbe4492587d3520b5e5 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 22 Nov 2023 13:48:23 +0100
Subject: eventfd: simplify eventfd_signal()

Ever since the eventfd type was introduced back in 2007 in commit
e1ad7468c77d ("signal/timer/event: eventfd core") the eventfd_signal()
function only ever passed 1 as a value for @n. There's no point in
keeping that additional argument.

Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-2-bd549b14ce0c@kernel.org
Acked-by: Xu Yilun <yilun.xu@intel.com>
Acked-by: Andrew Donnellan <ajd@linux.ibm.com> # ocxl
Acked-by: Eric Farman <farman@linux.ibm.com>  # s390
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/eventfd.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index b9d83652c097..562089431551 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -35,7 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx);
 struct file *eventfd_fget(int fd);
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
-__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
+__u64 eventfd_signal(struct eventfd_ctx *ctx);
 __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask);
 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 				  __u64 *cnt);
@@ -58,7 +58,7 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
 	return ERR_PTR(-ENOSYS);
 }
 
-static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+static inline int eventfd_signal(struct eventfd_ctx *ctx)
 {
 	return -ENOSYS;
 }
-- 
cgit v1.2.3


From 120ae58593630819209a011a3f9c89f73bcc9894 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 22 Nov 2023 13:48:24 +0100
Subject: eventfd: simplify eventfd_signal_mask()

The eventfd_signal_mask() helper was introduced for io_uring and similar
to eventfd_signal() it always passed 1 for @n. So don't bother with that
argument at all.

Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-3-bd549b14ce0c@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/eventfd.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 562089431551..971943ecb2a6 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -36,7 +36,7 @@ struct file *eventfd_fget(int fd);
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
 __u64 eventfd_signal(struct eventfd_ctx *ctx);
-__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask);
+__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask);
 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 				  __u64 *cnt);
 void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
@@ -63,8 +63,7 @@ static inline int eventfd_signal(struct eventfd_ctx *ctx)
 	return -ENOSYS;
 }
 
-static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n,
-				      unsigned mask)
+static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
 {
 	return -ENOSYS;
 }
-- 
cgit v1.2.3


From b7638ad0c7802ea854599ce753d0e6d20690f7e2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 22 Nov 2023 13:48:25 +0100
Subject: eventfd: make eventfd_signal{_mask}() void

No caller care about the return value.

Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-4-bd549b14ce0c@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/eventfd.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 971943ecb2a6..e32bee4345fb 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -35,8 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx);
 struct file *eventfd_fget(int fd);
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
-__u64 eventfd_signal(struct eventfd_ctx *ctx);
-__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask);
+void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask);
 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 				  __u64 *cnt);
 void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
@@ -58,14 +57,8 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
 	return ERR_PTR(-ENOSYS);
 }
 
-static inline int eventfd_signal(struct eventfd_ctx *ctx)
+static inline void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
 {
-	return -ENOSYS;
-}
-
-static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
-{
-	return -ENOSYS;
 }
 
 static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
@@ -91,5 +84,10 @@ static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
 
 #endif
 
+static inline void eventfd_signal(struct eventfd_ctx *ctx)
+{
+	eventfd_signal_mask(ctx, 0);
+}
+
 #endif /* _LINUX_EVENTFD_H */
 
-- 
cgit v1.2.3


From e65a29f0235a438ece414d2d99bbf0d31aa97d04 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 22 Nov 2023 13:44:37 +0100
Subject: mnt_idmapping: remove check_fsmapping()

The helper is a bit pointless. Just open-code the check.

Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-1-dae4abdde5bd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/mnt_idmapping.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index b8da2db4ecd2..cd4d5c8781f5 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -244,7 +244,4 @@ static inline kgid_t mapped_fsgid(struct mnt_idmap *idmap,
 	return from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(current_fsgid()));
 }
 
-bool check_fsmapping(const struct mnt_idmap *idmap,
-		     const struct super_block *sb);
-
 #endif /* _LINUX_MNT_IDMAPPING_H */
-- 
cgit v1.2.3


From 783822e44594639848b78d4bb61dde26fba04e05 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 22 Nov 2023 13:44:39 +0100
Subject: mnt_idmapping: decouple from namespaces

There's no reason we need to couple mnt idmapping to namespaces in the
way we currently do. Copy the idmapping when an idmapped mount is
created and don't take any reference on the namespace at all.

We also can't easily refcount struct uid_gid_map because it needs to
stay the size of a cacheline otherwise we risk performance regressions
(Ignoring for a second that right now struct uid_gid_map isn't actually
 64 byte but 72 but that's a fix for another patch series.).

Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-3-dae4abdde5bd@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uidgid.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index b0542cd11aeb..415a7ca2b882 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -17,6 +17,7 @@
 
 struct user_namespace;
 extern struct user_namespace init_user_ns;
+struct uid_gid_map;
 
 typedef struct {
 	uid_t val;
@@ -138,6 +139,9 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
 	return from_kgid(ns, gid) != (gid_t) -1;
 }
 
+u32 map_id_down(struct uid_gid_map *map, u32 id);
+u32 map_id_up(struct uid_gid_map *map, u32 id);
+
 #else
 
 static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid)
@@ -186,6 +190,15 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
 	return gid_valid(gid);
 }
 
+static inline u32 map_id_down(struct uid_gid_map *map, u32 id)
+{
+	return id;
+}
+
+static inline u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+	return id;
+}
 #endif /* CONFIG_USER_NS */
 
 #endif /* _LINUX_UIDGID_H */
-- 
cgit v1.2.3


From e0894ff038d86f30614ec16ec26dacb88c8d2bd4 Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Mon, 27 Nov 2023 12:05:21 +1300
Subject: platform/x86: asus-wmi: disable USB0 hub on ROG Ally before suspend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ASUS have worked around an issue in XInput where it doesn't support USB
selective suspend, which causes suspend issues in Windows. They worked
around this by adjusting the MCU firmware to disable the USB0 hub when
the screen is switched off during the Microsoft DSM suspend path in ACPI.

The issue we have with this however is one of timing - the call the tells
the MCU to this isn't able to complete before suspend is done so we call
this in a prepare() and add a small msleep() to ensure it is done. This
must be done before the screen is switched off to prevent a variety of
possible races.

Further to this the MCU powersave option must also be disabled as it can
cause a number of issues such as:
- unreliable resume connection of N-Key
- complete loss of N-Key if the power is plugged in while suspended
Disabling the powersave option prevents this.

Without this the MCU is unable to initialise itself correctly on resume.

Signed-off-by: "Luke D. Jones" <luke@ljones.dev>
Tested-by: Philip Mueller <philm@manjaro.org>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20231126230521.125708-2-luke@ljones.dev
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/platform_data/x86/asus-wmi.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 63e630276499..ab1c7deff118 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -114,6 +114,9 @@
 /* Charging mode - 1=Barrel, 2=USB */
 #define ASUS_WMI_DEVID_CHARGE_MODE	0x0012006C
 
+/* MCU powersave mode */
+#define ASUS_WMI_DEVID_MCU_POWERSAVE   0x001200E2
+
 /* epu is connected? 1 == true */
 #define ASUS_WMI_DEVID_EGPU_CONNECTED	0x00090018
 /* egpu on/off */
-- 
cgit v1.2.3


From 083772c9f972dcc248913b52a0dec1025baa1e16 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sun, 26 Nov 2023 15:07:30 -0800
Subject: net: page_pool: record pools per netdev

Link the page pools with netdevs. This needs to be netns compatible
so we have two options. Either we record the pools per netns and
have to worry about moving them as the netdev gets moved.
Or we record them directly on the netdev so they move with the netdev
without any extra work.

Implement the latter option. Since pools may outlast netdev we need
a place to store orphans. In time honored tradition use loopback
for this purpose.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/list.h      | 20 ++++++++++++++++++++
 include/linux/netdevice.h |  4 ++++
 include/linux/poison.h    |  2 ++
 3 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 1837caedf723..059aa1fff41e 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -1119,6 +1119,26 @@ static inline void hlist_move_list(struct hlist_head *old,
 	old->first = NULL;
 }
 
+/**
+ * hlist_splice_init() - move all entries from one list to another
+ * @from: hlist_head from which entries will be moved
+ * @last: last entry on the @from list
+ * @to:   hlist_head to which entries will be moved
+ *
+ * @to can be empty, @from must contain at least @last.
+ */
+static inline void hlist_splice_init(struct hlist_head *from,
+				     struct hlist_node *last,
+				     struct hlist_head *to)
+{
+	if (to->first)
+		to->first->pprev = &last->next;
+	last->next = to->first;
+	to->first = from->first;
+	from->first->pprev = &to->first;
+	from->first = NULL;
+}
+
 #define hlist_entry(ptr, type, member) container_of(ptr,type,member)
 
 #define hlist_for_each(pos, head) \
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e87caa81f70c..998c7aaa98b8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2447,6 +2447,10 @@ struct net_device {
 #if IS_ENABLED(CONFIG_DPLL)
 	struct dpll_pin		*dpll_pin;
 #endif
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+	/** @page_pools: page pools created for this netdevice */
+	struct hlist_head	page_pools;
+#endif
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 851a855d3868..27a7dad17eef 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -83,6 +83,8 @@
 
 /********** net/core/skbuff.c **********/
 #define SKB_LIST_POISON_NEXT	((void *)(0x800 + POISON_POINTER_DELTA))
+/********** net/ **********/
+#define NET_PTR_POISON		((void *)(0x801 + POISON_POINTER_DELTA))
 
 /********** kernel/bpf/ **********/
 #define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA))
-- 
cgit v1.2.3


From c392cbecd8eca4c53f2bf508731257d9d0a21c2d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 27 Nov 2023 16:47:04 -0700
Subject: io_uring/kbuf: defer release of mapped buffer rings

If a provided buffer ring is setup with IOU_PBUF_RING_MMAP, then the
kernel allocates the memory for it and the application is expected to
mmap(2) this memory. However, io_uring uses remap_pfn_range() for this
operation, so we cannot rely on normal munmap/release on freeing them
for us.

Stash an io_buf_free entry away for each of these, if any, and provide
a helper to free them post ->release().

Cc: stable@vger.kernel.org
Fixes: c56e022c0a27 ("io_uring: add support for user mapped provided buffer ring")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d3009d56af0b..805bb635cdf5 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -340,6 +340,9 @@ struct io_ring_ctx {
 
 	struct list_head	io_buffers_cache;
 
+	/* deferred free list, protected by ->uring_lock */
+	struct hlist_head	io_buf_list;
+
 	/* Keep this last, we don't need it for the fast path */
 	struct wait_queue_head		poll_wq;
 	struct io_restriction		restrictions;
-- 
cgit v1.2.3


From 9a5f580c1c71b6aedba696c4898a7a7184cef8ad Mon Sep 17 00:00:00 2001
From: Muralidhara M K <muralidhara.mk@amd.com>
Date: Thu, 2 Nov 2023 11:42:24 +0000
Subject: EDAC/mc: Add support for HBM3 memory type

AMD MI300A models use HBM3 (High Bandwidth Memory Gen 3) memory. HBM is
a high-speed computer memory interface for 3D-stacked synchronous
dynamic random-access memory (SDRAM).

Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20231102114225.2006878-4-muralimk@amd.com
---
 include/linux/edac.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/edac.h b/include/linux/edac.h
index fa4bda2a70f6..1174beb94ab6 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -187,6 +187,7 @@ static inline char *mc_event_error_type(const unsigned int err_type)
  * @MEM_NVDIMM:		Non-volatile RAM
  * @MEM_WIO2:		Wide I/O 2.
  * @MEM_HBM2:		High bandwidth Memory Gen 2.
+ * @MEM_HBM3:		High bandwidth Memory Gen 3.
  */
 enum mem_type {
 	MEM_EMPTY = 0,
@@ -218,6 +219,7 @@ enum mem_type {
 	MEM_NVDIMM,
 	MEM_WIO2,
 	MEM_HBM2,
+	MEM_HBM3,
 };
 
 #define MEM_FLAG_EMPTY		BIT(MEM_EMPTY)
@@ -248,6 +250,7 @@ enum mem_type {
 #define MEM_FLAG_NVDIMM		BIT(MEM_NVDIMM)
 #define MEM_FLAG_WIO2		BIT(MEM_WIO2)
 #define MEM_FLAG_HBM2		BIT(MEM_HBM2)
+#define MEM_FLAG_HBM3		BIT(MEM_HBM3)
 
 /**
  * enum edac_type - Error Detection and Correction capabilities and mode
-- 
cgit v1.2.3


From fad907cffd4bde7384812cf32fcf69becab805cc Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 28 Nov 2023 20:30:26 +0800
Subject: block: move .bd_inode into 1st cacheline of block_device

The .bd_inode field of block_device is used in IO fast path of
blkdev_write_iter() and blkdev_llseek(), so it is more efficient to keep
it into the 1st cacheline.

.bd_openers is only touched in open()/close(), and .bd_size_lock is only
for updating bdev capacity, which is in slow path too.

So swap .bd_inode layout with .bd_openers & .bd_size_lock to move
.bd_inode into the 1st cache line.

Cc: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231128123027.971610-2-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d5c5e59ddbd2..f7d40692dd94 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -49,9 +49,10 @@ struct block_device {
 	bool			bd_write_holder;
 	bool			bd_has_submit_bio;
 	dev_t			bd_dev;
+	struct inode		*bd_inode;	/* will die */
+
 	atomic_t		bd_openers;
 	spinlock_t		bd_size_lock; /* for bd_inode->i_size updates */
-	struct inode *		bd_inode;	/* will die */
 	void *			bd_claiming;
 	void *			bd_holder;
 	const struct blk_holder_ops *bd_holder_ops;
-- 
cgit v1.2.3


From 67d995e069535c32829f5d368d919063492cec6e Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 28 Nov 2023 20:30:27 +0800
Subject: block: warn once for each partition in bio_check_ro()

Commit 1b0a151c10a6 ("blk-core: use pr_warn_ratelimited() in
bio_check_ro()") fix message storm by limit the rate, however, there
will still be lots of message in the long term. Fix it better by warn
once for each partition.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231128123027.971610-3-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f7d40692dd94..b29ebd53417d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -70,6 +70,7 @@ struct block_device {
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	bool			bd_make_it_fail;
 #endif
+	bool			bd_ro_warned;
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
 	 * path
-- 
cgit v1.2.3


From a3db64c575ca201c9783f100c70b82d52bd78a93 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Mon, 27 Nov 2023 13:37:09 +0100
Subject: tty: make tty const in tty_get_baud_rate()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After commit 87888fb9ac0c ("tty: Remove baudrate dead code & make
ktermios params const"), the 'tty' parameter is only read in
tty_get_baud_rate(). Therefore, we can make 'tty' accepted in the
function 'const' for clarity.

The "the terminal bit flags may be updated." part of the
tty_get_baud_rate()'s kernel-doc is dropped as it is no longer true.
Because of the same commit above. And it was misplaced anyway.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20231127123713.14504-1-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 7625fc98fef3..e96c85f4f91e 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -440,10 +440,9 @@ void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud,
  *
  * Returns: the baud rate as an integer for this terminal
  *
- * Locking: The termios lock must be held by the caller and the terminal bit
- * flags may be updated.
+ * Locking: The termios lock must be held by the caller.
  */
-static inline speed_t tty_get_baud_rate(struct tty_struct *tty)
+static inline speed_t tty_get_baud_rate(const struct tty_struct *tty)
 {
 	return tty_termios_baud_rate(&tty->termios);
 }
-- 
cgit v1.2.3


From eec4954b81c3d9a38b99e78afb553c359db40093 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 28 Nov 2023 10:28:15 +0000
Subject: driver core: make device_is_dependent() static

The function device_is_dependent() is only called by the driver core
internally and should not, at this time, be called by anyone else
outside of it, so mark it as static so as not to give driver authors the
wrong idea.

Cc: Saravana Kannan <saravanak@google.com>
Acked-by: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/2023112815-faculty-thud-add8@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index d7a72a8749ea..4aa34c8d1361 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1071,7 +1071,6 @@ int device_rename(struct device *dev, const char *new_name);
 int device_move(struct device *dev, struct device *new_parent,
 		enum dpm_order dpm_order);
 int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid);
-int device_is_dependent(struct device *dev, void *target);
 
 static inline bool device_supports_offline(struct device *dev)
 {
-- 
cgit v1.2.3


From 33253d9e01d40542f07aaf718a655893cd2949e5 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Mon, 27 Nov 2023 14:15:59 +0100
Subject: fbdev: Move default fb_mmap code into helper function

Move the default fb_mmap code for I/O address spaces into the helper
function fb_io_mmap(). The helper can either be called via struct
fb_ops.fb_mmap or as the default if no fb_mmap has been set. Also
set the new helper in __FB_DEFAULT_IOMEM_OPS_MMAP.

In the mid-term, fb_io_mmap() is supposed to become optional. Fbdev
drivers will initialize their struct fb_ops.fb_mmap to the helper
and select a corresponding Kconfig token. The helper can then be made
optional at compile time.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231127131655.4020-31-tzimmermann@suse.de
---
 include/linux/fb.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fb.h b/include/linux/fb.h
index 94e2c44c6569..a36d05b576b0 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -536,6 +536,7 @@ extern ssize_t fb_io_read(struct fb_info *info, char __user *buf,
 			  size_t count, loff_t *ppos);
 extern ssize_t fb_io_write(struct fb_info *info, const char __user *buf,
 			   size_t count, loff_t *ppos);
+int fb_io_mmap(struct fb_info *info, struct vm_area_struct *vma);
 
 #define __FB_DEFAULT_IOMEM_OPS_RDWR \
 	.fb_read	= fb_io_read, \
@@ -547,7 +548,7 @@ extern ssize_t fb_io_write(struct fb_info *info, const char __user *buf,
 	.fb_imageblit	= cfb_imageblit
 
 #define __FB_DEFAULT_IOMEM_OPS_MMAP \
-	.fb_mmap	= NULL /* default implementation */
+	.fb_mmap	= fb_io_mmap
 
 #define FB_DEFAULT_IOMEM_OPS \
 	__FB_DEFAULT_IOMEM_OPS_RDWR, \
-- 
cgit v1.2.3


From b3e8813773c568fd2d65e9752abfda27442e502e Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Mon, 27 Nov 2023 14:16:00 +0100
Subject: fbdev: Warn on incorrect framebuffer access

Test in framebuffer read, write and drawing helpers if FBINFO_VIRTFB
has been set correctly. Framebuffers in I/O memory should only be
accessed with the architecture's respective helpers. Framebuffers
in system memory should be accessed with the regular load and
store operations. Presumably not all drivers get this right, so we
now warn about it.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231127131655.4020-32-tzimmermann@suse.de
---
 include/linux/fb.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fb.h b/include/linux/fb.h
index a36d05b576b0..24f0ec366235 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -849,7 +849,10 @@ static inline bool fb_modesetting_disabled(const char *drvname)
 }
 #endif
 
-/* Convenience logging macros */
+/*
+ * Convenience logging macros
+ */
+
 #define fb_err(fb_info, fmt, ...)					\
 	pr_err("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__)
 #define fb_notice(info, fmt, ...)					\
@@ -861,4 +864,7 @@ static inline bool fb_modesetting_disabled(const char *drvname)
 #define fb_dbg(fb_info, fmt, ...)					\
 	pr_debug("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__)
 
+#define fb_warn_once(fb_info, fmt, ...)					\
+	pr_warn_once("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__)
+
 #endif /* _LINUX_FB_H */
-- 
cgit v1.2.3


From 8813e86f6d82a7931446c3cbc5d596f77d0f1ba6 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Mon, 27 Nov 2023 14:16:01 +0100
Subject: fbdev: Remove default file-I/O implementations

Drop the default implementations for file read, write and mmap
operations. Each fbdev driver must now provide an implementation
and select any necessary helpers. If no implementation has been
set, fbdev returns an errno code to user space. The code is the
same as if the operation had not been set in the file_operations
struct.

This change makes the fbdev helpers for I/O memory optional. Most
systems only use system-memory framebuffers via DRM's fbdev emulation.

v2:
	* warn once if I/O callbacks are missing (Javier)

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231127131655.4020-33-tzimmermann@suse.de
---
 include/linux/fb.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fb.h b/include/linux/fb.h
index 24f0ec366235..05dc9624897d 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -867,4 +867,9 @@ static inline bool fb_modesetting_disabled(const char *drvname)
 #define fb_warn_once(fb_info, fmt, ...)					\
 	pr_warn_once("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__)
 
+#define fb_WARN_ONCE(fb_info, condition, fmt, ...) \
+	WARN_ONCE(condition, "fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__)
+#define fb_WARN_ON_ONCE(fb_info, x) \
+	fb_WARN_ONCE(fb_info, (x), "%s", "fb_WARN_ON_ONCE(" __stringify(x) ")")
+
 #endif /* _LINUX_FB_H */
-- 
cgit v1.2.3


From 5431fdd2c181dd2eac218e45b44deb2925fa48f0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sun, 17 Sep 2023 13:24:21 +0200
Subject: ptrace: Convert ptrace_attach() to use lock guards

Created as testing for the conditional guard infrastructure.
Specifically this makes use of the following form:

  scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR,
		     &task->signal->cred_guard_mutex) {
    ...
  }
  ...
  return 0;

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20231102110706.568467727%40infradead.org
---
 include/linux/sched/task.h |  2 ++
 include/linux/spinlock.h   | 26 ++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a23af225c898..4f3dca353556 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -226,4 +226,6 @@ static inline void task_unlock(struct task_struct *p)
 	spin_unlock(&p->alloc_lock);
 }
 
+DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))
+
 #endif /* _LINUX_SCHED_TASK_H */
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index ceb56b39c70f..90bc853cafb6 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -548,5 +548,31 @@ DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t,
 DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try,
 			 spin_trylock_irqsave(_T->lock, _T->flags))
 
+DEFINE_LOCK_GUARD_1(read_lock, rwlock_t,
+		    read_lock(_T->lock),
+		    read_unlock(_T->lock))
+
+DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t,
+		    read_lock_irq(_T->lock),
+		    read_unlock_irq(_T->lock))
+
+DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t,
+		    read_lock_irqsave(_T->lock, _T->flags),
+		    read_unlock_irqrestore(_T->lock, _T->flags),
+		    unsigned long flags)
+
+DEFINE_LOCK_GUARD_1(write_lock, rwlock_t,
+		    write_lock(_T->lock),
+		    write_unlock(_T->lock))
+
+DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t,
+		    write_lock_irq(_T->lock),
+		    write_unlock_irq(_T->lock))
+
+DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t,
+		    write_lock_irqsave(_T->lock, _T->flags),
+		    write_unlock_irqrestore(_T->lock, _T->flags),
+		    unsigned long flags)
+
 #undef __LINUX_INSIDE_SPINLOCK_H
 #endif /* __LINUX_SPINLOCK_H */
-- 
cgit v1.2.3


From febab20caebac959fdc3d7520bc52de8b1184455 Mon Sep 17 00:00:00 2001
From: Wyes Karny <wyes.karny@amd.com>
Date: Fri, 17 Nov 2023 06:38:39 +0000
Subject: cpufreq/amd-pstate: Fix scaling_min_freq and scaling_max_freq update

When amd_pstate is running, writing to scaling_min_freq and
scaling_max_freq has no effect. These values are only passed to the
policy level, but not to the platform level. This means that the
platform does not know about the frequency limits set by the user.

To fix this, update the min_perf and max_perf values at the platform
level whenever the user changes the scaling_min_freq and scaling_max_freq
values.

Fixes: ffa5096a7c33 ("cpufreq: amd-pstate: implement Pstate EPP support for the AMD processors")
Acked-by: Huang Rui <ray.huang@amd.com>
Signed-off-by: Wyes Karny <wyes.karny@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/amd-pstate.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index 446394f84606..6ad02ad9c7b4 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -70,6 +70,10 @@ struct amd_cpudata {
 	u32	nominal_perf;
 	u32	lowest_nonlinear_perf;
 	u32	lowest_perf;
+	u32     min_limit_perf;
+	u32     max_limit_perf;
+	u32     min_limit_freq;
+	u32     max_limit_freq;
 
 	u32	max_freq;
 	u32	min_freq;
-- 
cgit v1.2.3


From 48eb03dd26304c24f03bdbb9382e89c8564e71df Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 27 Nov 2023 11:03:08 -0800
Subject: xsk: Add TX timestamp and TX checksum offload support

This change actually defines the (initial) metadata layout
that should be used by AF_XDP userspace (xsk_tx_metadata).
The first field is flags which requests appropriate offloads,
followed by the offload-specific fields. The supported per-device
offloads are exported via netlink (new xsk-flags).

The offloads themselves are still implemented in a bit of a
framework-y fashion that's left from my initial kfunc attempt.
I'm introducing new xsk_tx_metadata_ops which drivers are
supposed to implement. The drivers are also supposed
to call xsk_tx_metadata_request/xsk_tx_metadata_complete in
the right places. Since xsk_tx_metadata_{request,_complete}
are static inline, we don't incur any extra overhead doing
indirect calls.

The benefit of this scheme is as follows:
- keeps all metadata layout parsing away from driver code
- makes it easy to grep and see which drivers implement what
- don't need any extra flags to maintain to keep track of what
  offloads are implemented; if the callback is implemented - the offload
  is supported (used by netlink reporting code)

Two offloads are defined right now:
1. XDP_TXMD_FLAGS_CHECKSUM: skb-style csum_start+csum_offset
2. XDP_TXMD_FLAGS_TIMESTAMP: writes TX timestamp back into metadata
   area upon completion (tx_timestamp field)

XDP_TXMD_FLAGS_TIMESTAMP is also implemented for XDP_COPY mode: it writes
SW timestamp from the skb destructor (note I'm reusing hwtstamps to pass
metadata pointer).

The struct is forward-compatible and can be extended in the future
by appending more fields.

Reviewed-by: Song Yoong Siang <yoong.siang.song@intel.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20231127190319.1190813-3-sdf@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/netdevice.h |  2 ++
 include/linux/skbuff.h    | 14 +++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e87caa81f70c..08da8b28c816 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1865,6 +1865,7 @@ enum netdev_stat_type {
  *	@netdev_ops:	Includes several pointers to callbacks,
  *			if one wants to override the ndo_*() functions
  *	@xdp_metadata_ops:	Includes pointers to XDP metadata callbacks.
+ *	@xsk_tx_metadata_ops:	Includes pointers to AF_XDP TX metadata callbacks.
  *	@ethtool_ops:	Management operations
  *	@l3mdev_ops:	Layer 3 master device operations
  *	@ndisc_ops:	Includes callbacks for different IPv6 neighbour
@@ -2128,6 +2129,7 @@ struct net_device {
 	unsigned long long	priv_flags;
 	const struct net_device_ops *netdev_ops;
 	const struct xdp_metadata_ops *xdp_metadata_ops;
+	const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops;
 	int			ifindex;
 	unsigned short		gflags;
 	unsigned short		hard_header_len;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 27998f73183e..b370eb8d70f7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -566,6 +566,15 @@ struct ubuf_info_msgzc {
 int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
 void mm_unaccount_pinned_pages(struct mmpin *mmp);
 
+/* Preserve some data across TX submission and completion.
+ *
+ * Note, this state is stored in the driver. Extending the layout
+ * might need some special care.
+ */
+struct xsk_tx_metadata_compl {
+	__u64 *tx_timestamp;
+};
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -578,7 +587,10 @@ struct skb_shared_info {
 	/* Warning: this field is not always filled in (UFO)! */
 	unsigned short	gso_segs;
 	struct sk_buff	*frag_list;
-	struct skb_shared_hwtstamps hwtstamps;
+	union {
+		struct skb_shared_hwtstamps hwtstamps;
+		struct xsk_tx_metadata_compl xsk_meta;
+	};
 	unsigned int	gso_type;
 	u32		tskey;
 
-- 
cgit v1.2.3


From 8866730aed5100f06d3d965c22f1c61f74942541 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 28 Nov 2023 17:25:56 -0800
Subject: bpf, sockmap: af_unix stream sockets need to hold ref for pair sock

AF_UNIX stream sockets are a paired socket. So sending on one of the pairs
will lookup the paired socket as part of the send operation. It is possible
however to put just one of the pairs in a BPF map. This currently increments
the refcnt on the sock in the sockmap to ensure it is not free'd by the
stack before sockmap cleans up its state and stops any skbs being sent/recv'd
to that socket.

But we missed a case. If the peer socket is closed it will be free'd by the
stack. However, the paired socket can still be referenced from BPF sockmap
side because we hold a reference there. Then if we are sending traffic through
BPF sockmap to that socket it will try to dereference the free'd pair in its
send logic creating a use after free. And following splat:

   [59.900375] BUG: KASAN: slab-use-after-free in sk_wake_async+0x31/0x1b0
   [59.901211] Read of size 8 at addr ffff88811acbf060 by task kworker/1:2/954
   [...]
   [59.905468] Call Trace:
   [59.905787]  <TASK>
   [59.906066]  dump_stack_lvl+0x130/0x1d0
   [59.908877]  print_report+0x16f/0x740
   [59.910629]  kasan_report+0x118/0x160
   [59.912576]  sk_wake_async+0x31/0x1b0
   [59.913554]  sock_def_readable+0x156/0x2a0
   [59.914060]  unix_stream_sendmsg+0x3f9/0x12a0
   [59.916398]  sock_sendmsg+0x20e/0x250
   [59.916854]  skb_send_sock+0x236/0xac0
   [59.920527]  sk_psock_backlog+0x287/0xaa0

To fix let BPF sockmap hold a refcnt on both the socket in the sockmap and its
paired socket. It wasn't obvious how to contain the fix to bpf_unix logic. The
primarily problem with keeping this logic in bpf_unix was: In the sock close()
we could handle the deref by having a close handler. But, when we are destroying
the psock through a map delete operation we wouldn't have gotten any signal
thorugh the proto struct other than it being replaced. If we do the deref from
the proto replace its too early because we need to deref the sk_pair after the
backlog worker has been stopped.

Given all this it seems best to just cache it at the end of the psock and eat 8B
for the af_unix and vsock users. Notice dgram sockets are OK because they handle
locking already.

Fixes: 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20231129012557.95371-2-john.fastabend@gmail.com
---
 include/linux/skmsg.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index c1637515a8a4..c953b8c0d2f4 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -106,6 +106,7 @@ struct sk_psock {
 	struct mutex			work_mutex;
 	struct sk_psock_work_state	work_state;
 	struct delayed_work		work;
+	struct sock			*sk_pair;
 	struct rcu_work			rwork;
 };
 
-- 
cgit v1.2.3


From 7577bc8249c3fc86096ef1b1c9a8f4b6232231e7 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 28 Nov 2023 18:29:20 -0800
Subject: tcp: Don't pass cookie to __cookie_v[46]_check().

tcp_hdr(skb) and SYN Cookie are passed to __cookie_v[46]_check(), but
none of the callers passes cookie other than ntohl(th->ack_seq) - 1.

Let's fetch it in __cookie_v[46]_check() instead of passing the cookie
over and over.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20231129022924.96156-5-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netfilter_ipv6.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 7834c0be2831..61aa48f46dd7 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -51,7 +51,7 @@ struct nf_ipv6_ops {
 	u32 (*cookie_init_sequence)(const struct ipv6hdr *iph,
 				    const struct tcphdr *th, u16 *mssp);
 	int (*cookie_v6_check)(const struct ipv6hdr *iph,
-			       const struct tcphdr *th, __u32 cookie);
+			       const struct tcphdr *th);
 #endif
 	void (*route_input)(struct sk_buff *skb);
 	int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb,
@@ -179,16 +179,16 @@ static inline u32 nf_ipv6_cookie_init_sequence(const struct ipv6hdr *iph,
 }
 
 static inline int nf_cookie_v6_check(const struct ipv6hdr *iph,
-				     const struct tcphdr *th, __u32 cookie)
+				     const struct tcphdr *th)
 {
 #if IS_ENABLED(CONFIG_SYN_COOKIES)
 #if IS_MODULE(CONFIG_IPV6)
 	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
 
 	if (v6_ops)
-		return v6_ops->cookie_v6_check(iph, th, cookie);
+		return v6_ops->cookie_v6_check(iph, th);
 #elif IS_BUILTIN(CONFIG_IPV6)
-	return __cookie_v6_check(iph, th, cookie);
+	return __cookie_v6_check(iph, th);
 #endif
 #endif
 	return 0;
-- 
cgit v1.2.3


From 39cefd85098d12439586824c39f8e1948fac186d Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Wed, 29 Nov 2023 17:31:55 +0100
Subject: spi: introduce SPI_TRANS_FAIL_IO for error reporting

The default message transfer implementation - spi_transfer_one_message -
invokes the specific device driver's transfer_one(), then waits for
completion. However, there is no mechanism for the device driver to
report failure in the middle of the transfer.

Introduce SPI_TRANS_FAIL_IO for drivers to report transfer failure.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/4b420dac528e60f122adde16851da88e4798c1ea.1701274975.git.namcao@linutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 255a0562aea5..aa25ae04c5c3 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -461,10 +461,13 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  *                  - return 1 if the transfer is still in progress. When
  *                    the driver is finished with this transfer it must
  *                    call spi_finalize_current_transfer() so the subsystem
- *                    can issue the next transfer. Note: transfer_one and
- *                    transfer_one_message are mutually exclusive; when both
- *                    are set, the generic subsystem does not call your
- *                    transfer_one callback.
+ *                    can issue the next transfer. If the transfer fails, the
+ *                    driver must set the flag SPI_TRANS_FAIL_IO to
+ *                    spi_transfer->error first, before calling
+ *                    spi_finalize_current_transfer().
+ *                    Note: transfer_one and transfer_one_message are mutually
+ *                    exclusive; when both are set, the generic subsystem does
+ *                    not call your transfer_one callback.
  * @handle_err: the subsystem calls the driver to handle an error that occurs
  *		in the generic implementation of transfer_one_message().
  * @mem_ops: optimized/dedicated operations for interactions with SPI memory.
@@ -1040,6 +1043,7 @@ struct spi_transfer {
 	unsigned	len;
 
 #define SPI_TRANS_FAIL_NO_START	BIT(0)
+#define SPI_TRANS_FAIL_IO	BIT(1)
 	u16		error;
 
 	dma_addr_t	tx_dma;
-- 
cgit v1.2.3


From cff49d58f57e5667c10a0db85d7461790bb85cf8 Mon Sep 17 00:00:00 2001
From: "Chia-Lin Kao (AceLan)" <acelan.kao@canonical.com>
Date: Wed, 29 Nov 2023 14:43:10 +0800
Subject: spi: Unify error codes by replacing -ENOTSUPP with -EOPNOTSUPP

This commit updates the SPI subsystem, particularly affecting "SPI MEM"
drivers and core parts, by replacing the -ENOTSUPP error code with
-EOPNOTSUPP.

The key motivations for this change are as follows:
1. The spi-nor driver currently uses EOPNOTSUPP, whereas calls to spi-mem
might return ENOTSUPP. This update aims to unify the error reporting
within the SPI subsystem for clarity and consistency.

2. The use of ENOTSUPP has been flagged by checkpatch as inappropriate,
mainly being reserved for NFS-related errors. To align with kernel coding
standards and recommendations, this change is being made.

3. By using EOPNOTSUPP, we provide more specific context to the error,
indicating that a particular operation is not supported. This helps
differentiate from the more generic ENOTSUPP error, allowing drivers to
better handle and respond to different error scenarios.

Risks and Considerations:
While this change is primarily intended as a code cleanup and error code
unification, there is a minor risk of breaking user-space applications
that rely on specific return codes for unsupported operations. However,
this risk is considered low, as such use-cases are unlikely to be common
or critical. Nevertheless, developers and users should be aware of this
change, especially if they have scripts or tools that specifically handle
SPI error codes.

This commit does not introduce any functional changes to the SPI subsystem
or the affected drivers.

Signed-off-by: "Chia-Lin Kao (AceLan)" <acelan.kao@canonical.com>
Acked-by: Tudor Ambarus <tudor.ambarus@linaro.org>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Michael Walle <michael@walle.cc>
Link: https://lore.kernel.org/r/20231129064311.272422-1-acelan.kao@canonical.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi-mem.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
index 6b0a7dc48a4b..f866d5c8ed32 100644
--- a/include/linux/spi/spi-mem.h
+++ b/include/linux/spi/spi-mem.h
@@ -233,6 +233,8 @@ static inline void *spi_mem_get_drvdata(struct spi_mem *mem)
  *		    limitations)
  * @supports_op: check if an operation is supported by the controller
  * @exec_op: execute a SPI memory operation
+ *           not all driver provides supports_op(), so it can return -EOPNOTSUPP
+ *           if the op is not supported by the driver/controller
  * @get_name: get a custom name for the SPI mem device from the controller.
  *	      This might be needed if the controller driver has been ported
  *	      to use the SPI mem layer and a custom name is used to keep
-- 
cgit v1.2.3


From 3d1ff9dfdc168722f570144aba0ce29d28d7f483 Mon Sep 17 00:00:00 2001
From: Ramesh Errabolu <Ramesh.Errabolu@amd.com>
Date: Wed, 22 Nov 2023 10:05:56 -0600
Subject: dma-buf: Correct the documentation of name and exp_name symbols
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the documentation of struct dma_buf members name and exp_name
as to how these members are to be used and accessed.

Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231122160556.24948-1-Ramesh.Errabolu@amd.com
Signed-off-by: Christian König <christian.koenig@amd.com>
---
 include/linux/dma-buf.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 3f31baa3293f..8ff4add71f88 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -343,16 +343,19 @@ struct dma_buf {
 	/**
 	 * @exp_name:
 	 *
-	 * Name of the exporter; useful for debugging. See the
-	 * DMA_BUF_SET_NAME IOCTL.
+	 * Name of the exporter; useful for debugging. Must not be NULL
 	 */
 	const char *exp_name;
 
 	/**
 	 * @name:
 	 *
-	 * Userspace-provided name; useful for accounting and debugging,
-	 * protected by dma_resv_lock() on @resv and @name_lock for read access.
+	 * Userspace-provided name. Default value is NULL. If not NULL,
+	 * length cannot be longer than DMA_BUF_NAME_LEN, including NIL
+	 * char. Useful for accounting and debugging. Read/Write accesses
+	 * are protected by @name_lock
+	 *
+	 * See the IOCTLs DMA_BUF_SET_NAME or DMA_BUF_SET_NAME_A/B
 	 */
 	const char *name;
 
-- 
cgit v1.2.3


From 4ea95c04fa6b9043a1a301240996aeebe3cb28ec Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 29 Nov 2023 16:10:00 -0800
Subject: vfio: Drop vfio_file_iommu_group() stub to fudge around a KVM wart

Drop the vfio_file_iommu_group() stub and instead unconditionally declare
the function to fudge around a KVM wart where KVM tries to do symbol_get()
on vfio_file_iommu_group() (and other VFIO symbols) even if CONFIG_VFIO=n.

Ensuring the symbol is always declared fixes a PPC build error when
modules are also disabled, in which case symbol_get() simply points at the
address of the symbol (with some attributes shenanigans).  Because KVM
does symbol_get() instead of directly depending on VFIO, the lack of a
fully defined symbol is not problematic (ugly, but "fine").

   arch/powerpc/kvm/../../../virt/kvm/vfio.c:89:7:
   error: attribute declaration must precede definition [-Werror,-Wignored-attributes]
           fn = symbol_get(vfio_file_iommu_group);
                ^
   include/linux/module.h:805:60: note: expanded from macro 'symbol_get'
   #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); })
                                                              ^
   include/linux/vfio.h:294:35: note: previous definition is here
   static inline struct iommu_group *vfio_file_iommu_group(struct file *file)
                                     ^
   arch/powerpc/kvm/../../../virt/kvm/vfio.c:89:7:
   error: attribute declaration must precede definition [-Werror,-Wignored-attributes]
           fn = symbol_get(vfio_file_iommu_group);
                ^
   include/linux/module.h:805:65: note: expanded from macro 'symbol_get'
   #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); })
                                                                   ^
   include/linux/vfio.h:294:35: note: previous definition is here
   static inline struct iommu_group *vfio_file_iommu_group(struct file *file)
                                     ^
   2 errors generated.

Although KVM is firmly in the wrong (there is zero reason for KVM to build
virt/kvm/vfio.c when VFIO is disabled), fudge around the error in VFIO as
the stub is unnecessary and doesn't serve its intended purpose (KVM is the
only external user of vfio_file_iommu_group()), and there is an in-flight
series to clean up the entire KVM<->VFIO interaction, i.e. fixing this in
KVM would result in more churn in the long run, and the stub needs to go
away regardless.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202308251949.5IiaV0sz-lkp@intel.com
Closes: https://lore.kernel.org/oe-kbuild-all/202309030741.82aLACDG-lkp@intel.com
Closes: https://lore.kernel.org/oe-kbuild-all/202309110914.QLH0LU6L-lkp@intel.com
Link: https://lore.kernel.org/all/0-v1-08396538817d+13c5-vfio_kvm_kconfig_jgg@nvidia.com
Link: https://lore.kernel.org/all/20230916003118.2540661-1-seanjc@google.com
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Fixes: c1cce6d079b8 ("vfio: Compile vfio_group infrastructure optionally")
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231130001000.543240-1-seanjc@google.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/vfio.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 454e9295970c..a65b2513f8cd 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -289,16 +289,12 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
 /*
  * External user API
  */
-#if IS_ENABLED(CONFIG_VFIO_GROUP)
 struct iommu_group *vfio_file_iommu_group(struct file *file);
+
+#if IS_ENABLED(CONFIG_VFIO_GROUP)
 bool vfio_file_is_group(struct file *file);
 bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
 #else
-static inline struct iommu_group *vfio_file_iommu_group(struct file *file)
-{
-	return NULL;
-}
-
 static inline bool vfio_file_is_group(struct file *file)
 {
 	return false;
-- 
cgit v1.2.3


From 3717194f249227a3dfd8433bd9374cc7e0cf823d Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Wed, 29 Nov 2023 13:06:15 +0200
Subject: Input: gpio-keys - add system suspend support for dedicated wakeirqs

Some SoCs have a separate dedicated wake-up interrupt controller that can
be used to wake up the system from deeper idle states. We already support
configuring a separate interrupt for a gpio-keys button to be used with a
gpio line. However, we are lacking support system suspend for cases where
a separate interrupt needs to be used in deeper sleep modes.

Because of it's nature, gpio-keys does not know about the runtime PM state
of the button gpios, and may have several gpio buttons configured for each
gpio-keys device instance. Implementing runtime PM support for gpio-keys
does not help, and we cannot use drivers/base/power/wakeirq.c support. We
need to implement custom wakeirq support for gpio-keys.

For handling a dedicated wakeirq for system suspend, we enable and disable
it with gpio_keys_enable_wakeup() and gpio_keys_disable_wakeup() that we
already use based on device_may_wakeup().

Some systems may have a dedicated wakeirq that can also be used as the
main interrupt, this is already working for gpio-keys. Let's add some
wakeirq related comments while at it as the usage with a gpio line and
separate interrupt line may not be obvious.

Tested-by: Dhruva Gole <d-gole@ti.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20231129110618.27551-2-tony@atomide.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/gpio_keys.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio_keys.h b/include/linux/gpio_keys.h
index 3f84aeb81e48..80fa930b04c6 100644
--- a/include/linux/gpio_keys.h
+++ b/include/linux/gpio_keys.h
@@ -21,6 +21,7 @@ struct device;
  *			disable button via sysfs
  * @value:		axis value for %EV_ABS
  * @irq:		Irq number in case of interrupt keys
+ * @wakeirq:		Optional dedicated wake-up interrupt
  */
 struct gpio_keys_button {
 	unsigned int code;
@@ -34,6 +35,7 @@ struct gpio_keys_button {
 	bool can_disable;
 	int value;
 	unsigned int irq;
+	unsigned int wakeirq;
 };
 
 /**
-- 
cgit v1.2.3


From da2e08d4630ab04ee5b61515fe423c582b5c3be2 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 3 Oct 2023 16:18:38 -0700
Subject: i40e: Annotate struct i40e_qvlist_info with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for
array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct i40e_qvlist_info.

Cc: Tony Nguyen <anthony.l.nguyen@intel.com>
Cc: Shiraz Saleem <shiraz.saleem@intel.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: Gurucharan G <gurucharanx.g@intel.com>
Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci [1]
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20231003231838.work.510-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/net/intel/i40e_client.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/net/intel/i40e_client.h b/include/linux/net/intel/i40e_client.h
index ed42bd5f639f..0aa4411528fc 100644
--- a/include/linux/net/intel/i40e_client.h
+++ b/include/linux/net/intel/i40e_client.h
@@ -45,7 +45,7 @@ struct i40e_qv_info {
 
 struct i40e_qvlist_info {
 	u32 num_vectors;
-	struct i40e_qv_info qv_info[];
+	struct i40e_qv_info qv_info[] __counted_by(num_vectors);
 };
 
 
-- 
cgit v1.2.3


From 6a3afb6ac6dfab158ebdd4b87941178f58c8939f Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Wed, 29 Nov 2023 19:47:40 +0800
Subject: jbd2: increase the journal IO's priority

Current jbd2 only add REQ_SYNC for descriptor block, metadata log
buffer, commit buffer and superblock buffer, the submitted IO could be
throttled by writeback throttle in block layer, that could lead to
priority inversion in some cases. The log IO looks like a kind of high
priority metadata IO, so it should not be throttled by WBT like QOS
policies in block layer, let's add REQ_SYNC | REQ_IDLE to exempt from
writeback throttle, and also add REQ_META together indicates it's a
metadata IO.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20231129114740.2686201-2-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/jbd2.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 6dcbb4eb80fb..beb30719ee16 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1374,6 +1374,9 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum2,		CSUM_V2)
 JBD2_FEATURE_INCOMPAT_FUNCS(csum3,		CSUM_V3)
 JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
 
+/* Journal high priority write IO operation flags */
+#define JBD2_JOURNAL_REQ_FLAGS		(REQ_META | REQ_SYNC | REQ_IDLE)
+
 /*
  * Journal flag definitions
  */
-- 
cgit v1.2.3


From d839a656d0f3caca9f96e9bf912fd394ac6a11bc Mon Sep 17 00:00:00 2001
From: JP Kobryn <inwardvessel@gmail.com>
Date: Fri, 1 Dec 2023 14:53:55 +0900
Subject: kprobes: consistent rcu api usage for kretprobe holder

It seems that the pointer-to-kretprobe "rp" within the kretprobe_holder is
RCU-managed, based on the (non-rethook) implementation of get_kretprobe().
The thought behind this patch is to make use of the RCU API where possible
when accessing this pointer so that the needed barriers are always in place
and to self-document the code.

The __rcu annotation to "rp" allows for sparse RCU checking. Plain writes
done to the "rp" pointer are changed to make use of the RCU macro for
assignment. For the single read, the implementation of get_kretprobe()
is simplified by making use of an RCU macro which accomplishes the same,
but note that the log warning text will be more generic.

I did find that there is a difference in assembly generated between the
usage of the RCU macros vs without. For example, on arm64, when using
rcu_assign_pointer(), the corresponding store instruction is a
store-release (STLR) which has an implicit barrier. When normal assignment
is done, a regular store (STR) is found. In the macro case, this seems to
be a result of rcu_assign_pointer() using smp_store_release() when the
value to write is not NULL.

Link: https://lore.kernel.org/all/20231122132058.3359-1-inwardvessel@gmail.com/

Fixes: d741bf41d7c7 ("kprobes: Remove kretprobe hash")
Cc: stable@vger.kernel.org
Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/kprobes.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index ab1da3142b06..64672bace560 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -139,7 +139,7 @@ static inline bool kprobe_ftrace(struct kprobe *p)
  *
  */
 struct kretprobe_holder {
-	struct kretprobe	*rp;
+	struct kretprobe __rcu *rp;
 	struct objpool_head	pool;
 };
 
@@ -245,10 +245,7 @@ unsigned long kretprobe_trampoline_handler(struct pt_regs *regs,
 
 static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri)
 {
-	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
-		"Kretprobe is accessed from instance under preemptive context");
-
-	return READ_ONCE(ri->rph->rp);
+	return rcu_dereference_check(ri->rph->rp, rcu_read_lock_any_held());
 }
 
 static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri)
-- 
cgit v1.2.3


From a1461f1fd6cfdc4b8917c9d4a91e92605d1f28dc Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Fri, 1 Dec 2023 14:53:56 +0900
Subject: rethook: Use __rcu pointer for rethook::handler

Since the rethook::handler is an RCU-maganged pointer so that it will
notice readers the rethook is stopped (unregistered) or not, it should
be an __rcu pointer and use appropriate functions to be accessed. This
will use appropriate memory barrier when accessing it. OTOH,
rethook::data is never changed, so we don't need to check it in
get_kretprobe().

NOTE: To avoid sparse warning, rethook::handler is defined by a raw
function pointer type with __rcu instead of rethook_handler_t.

Link: https://lore.kernel.org/all/170126066201.398836.837498688669005979.stgit@devnote2/

Fixes: 54ecbe6f1ed5 ("rethook: Add a generic return hook")
Cc: stable@vger.kernel.org
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202311241808.rv9ceuAh-lkp@intel.com/
Tested-by: JP Kobryn <inwardvessel@gmail.com>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/kprobes.h | 6 ++----
 include/linux/rethook.h | 7 ++++++-
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 64672bace560..0ff44d6633e3 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -197,10 +197,8 @@ extern int arch_trampoline_kprobe(struct kprobe *p);
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
 static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri)
 {
-	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
-		"Kretprobe is accessed from instance under preemptive context");
-
-	return (struct kretprobe *)READ_ONCE(ri->node.rethook->data);
+	/* rethook::data is non-changed field, so that you can access it freely. */
+	return (struct kretprobe *)ri->node.rethook->data;
 }
 static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri)
 {
diff --git a/include/linux/rethook.h b/include/linux/rethook.h
index ce69b2b7bc35..ba60962805f6 100644
--- a/include/linux/rethook.h
+++ b/include/linux/rethook.h
@@ -28,7 +28,12 @@ typedef void (*rethook_handler_t) (struct rethook_node *, void *, unsigned long,
  */
 struct rethook {
 	void			*data;
-	rethook_handler_t	handler;
+	/*
+	 * To avoid sparse warnings, this uses a raw function pointer with
+	 * __rcu, instead of rethook_handler_t. But this must be same as
+	 * rethook_handler_t.
+	 */
+	void (__rcu *handler) (struct rethook_node *, void *, unsigned long, struct pt_regs *);
 	struct objpool_head	pool;
 	struct rcu_head		rcu;
 };
-- 
cgit v1.2.3


From df16c1c51d8166958f533c0c886766f7ee9dd50f Mon Sep 17 00:00:00 2001
From: Andrew Halaney <ahalaney@redhat.com>
Date: Mon, 27 Nov 2023 15:41:10 -0600
Subject: net: phy: mdio_device: Reset device only when necessary

Currently the phy reset sequence is as shown below for a
devicetree described mdio phy on boot:

1. Assert the phy_device's reset as part of registering
2. Deassert the phy_device's reset as part of registering
3. Deassert the phy_device's reset as part of phy_probe
4. Deassert the phy_device's reset as part of phy_hw_init

The extra two deasserts include waiting the deassert delay afterwards,
which is adding unnecessary delay.

This applies to both possible types of resets (reset controller
reference and a reset gpio) that can be used.

Here's some snipped tracing output using the following command line
params "trace_event=gpio:* trace_options=stacktrace" illustrating
the reset handling and where its coming from:

    /* Assert */
       systemd-udevd-283     [002] .....     6.780434: gpio_value: 544 set 0
       systemd-udevd-283     [002] .....     6.783849: <stack trace>
     => gpiod_set_raw_value_commit
     => gpiod_set_value_nocheck
     => gpiod_set_value_cansleep
     => mdio_device_reset
     => mdiobus_register_device
     => phy_device_register
     => fwnode_mdiobus_phy_device_register
     => fwnode_mdiobus_register_phy
     => __of_mdiobus_register
     => stmmac_mdio_register
     => stmmac_dvr_probe
     => stmmac_pltfr_probe
     => devm_stmmac_pltfr_probe
     => qcom_ethqos_probe
     => platform_probe

    /* Deassert */
       systemd-udevd-283     [002] .....     6.802480: gpio_value: 544 set 1
       systemd-udevd-283     [002] .....     6.805886: <stack trace>
     => gpiod_set_raw_value_commit
     => gpiod_set_value_nocheck
     => gpiod_set_value_cansleep
     => mdio_device_reset
     => phy_device_register
     => fwnode_mdiobus_phy_device_register
     => fwnode_mdiobus_register_phy
     => __of_mdiobus_register
     => stmmac_mdio_register
     => stmmac_dvr_probe
     => stmmac_pltfr_probe
     => devm_stmmac_pltfr_probe
     => qcom_ethqos_probe
     => platform_probe

    /* Deassert */
       systemd-udevd-283     [002] .....     6.882601: gpio_value: 544 set 1
       systemd-udevd-283     [002] .....     6.886014: <stack trace>
     => gpiod_set_raw_value_commit
     => gpiod_set_value_nocheck
     => gpiod_set_value_cansleep
     => mdio_device_reset
     => phy_probe
     => really_probe
     => __driver_probe_device
     => driver_probe_device
     => __device_attach_driver
     => bus_for_each_drv
     => __device_attach
     => device_initial_probe
     => bus_probe_device
     => device_add
     => phy_device_register
     => fwnode_mdiobus_phy_device_register
     => fwnode_mdiobus_register_phy
     => __of_mdiobus_register
     => stmmac_mdio_register
     => stmmac_dvr_probe
     => stmmac_pltfr_probe
     => devm_stmmac_pltfr_probe
     => qcom_ethqos_probe
     => platform_probe

    /* Deassert */
      NetworkManager-477     [000] .....     7.023144: gpio_value: 544 set 1
      NetworkManager-477     [000] .....     7.026596: <stack trace>
     => gpiod_set_raw_value_commit
     => gpiod_set_value_nocheck
     => gpiod_set_value_cansleep
     => mdio_device_reset
     => phy_init_hw
     => phy_attach_direct
     => phylink_fwnode_phy_connect
     => __stmmac_open
     => stmmac_open

There's a lot of paths where the device is getting its reset
asserted and deasserted. Let's track the state and only actually
do the assert/deassert when it changes.

Reported-by: Sagar Cheluvegowda <quic_scheluve@quicinc.com>
Signed-off-by: Andrew Halaney <ahalaney@redhat.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20231127-net-phy-reset-once-v2-1-448e8658779e@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mdio.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 007fd9c3e4b6..79ceee3c8673 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -38,6 +38,7 @@ struct mdio_device {
 	/* Bus address of the MDIO device (0-31) */
 	int addr;
 	int flags;
+	int reset_state;
 	struct gpio_desc *reset_gpio;
 	struct reset_control *reset_ctrl;
 	unsigned int reset_assert_delay;
-- 
cgit v1.2.3


From 7232522e6cafdf466ed7649c14546fd07ccc1978 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 30 Nov 2023 18:56:18 +0200
Subject: fanotify: store fsid in mark instead of in connector

Some filesystems like fuse and nfs have zero or non-unique fsid.
We would like to avoid reporting ambiguous fsid in events, so we need
to avoid marking objects with same fsid and different sb.

To make this easier to enforce, store the fsid in the marks of the group
instead of in the shared conenctor.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20231130165619.3386452-2-amir73il@gmail.com>
---
 include/linux/fsnotify_backend.h | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index c0892d75ce33..a80b525ca653 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -472,10 +472,8 @@ typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;
 struct fsnotify_mark_connector {
 	spinlock_t lock;
 	unsigned short type;	/* Type of object [lock] */
-#define FSNOTIFY_CONN_FLAG_HAS_FSID	0x01
 #define FSNOTIFY_CONN_FLAG_HAS_IREF	0x02
 	unsigned short flags;	/* flags [lock] */
-	__kernel_fsid_t fsid;	/* fsid of filesystem containing object */
 	union {
 		/* Object pointer [lock] */
 		fsnotify_connp_t *obj;
@@ -530,6 +528,7 @@ struct fsnotify_mark {
 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY	0x0100
 #define FSNOTIFY_MARK_FLAG_NO_IREF		0x0200
 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS	0x0400
+#define FSNOTIFY_MARK_FLAG_HAS_FSID		0x0800
 	unsigned int flags;		/* flags [mark->lock] */
 };
 
@@ -763,11 +762,10 @@ extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
 /* attach the mark to the object */
 extern int fsnotify_add_mark(struct fsnotify_mark *mark,
 			     fsnotify_connp_t *connp, unsigned int obj_type,
-			     int add_flags, __kernel_fsid_t *fsid);
+			     int add_flags);
 extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
 				    fsnotify_connp_t *connp,
-				    unsigned int obj_type, int add_flags,
-				    __kernel_fsid_t *fsid);
+				    unsigned int obj_type, int add_flags);
 
 /* attach the mark to the inode */
 static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
@@ -775,15 +773,14 @@ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 					  int add_flags)
 {
 	return fsnotify_add_mark(mark, &inode->i_fsnotify_marks,
-				 FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL);
+				 FSNOTIFY_OBJ_TYPE_INODE, add_flags);
 }
 static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
 						 struct inode *inode,
 						 int add_flags)
 {
 	return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks,
-					FSNOTIFY_OBJ_TYPE_INODE, add_flags,
-					NULL);
+					FSNOTIFY_OBJ_TYPE_INODE, add_flags);
 }
 
 /* given a group and a mark, flag mark to be freed when all references are dropped */
-- 
cgit v1.2.3


From 30ad1938326bf9303ca38090339d948975a626f5 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 30 Nov 2023 18:56:19 +0200
Subject: fanotify: allow "weak" fsid when watching a single filesystem

So far, fanotify returns -ENODEV or -EXDEV when trying to set a mark
on a filesystem with a "weak" fsid, namely, zero fsid (e.g. fuse), or
non-uniform fsid (e.g. btrfs non-root subvol).

When group is watching inodes all from the same filesystem (or subvol),
allow adding inode marks with "weak" fsid, because there is no ambiguity
regarding which filesystem reports the event.

The first mark added to a group determines if this group is single or
multi filesystem, depending on the fsid at the path of the added mark.

If the first mark added has a "strong" fsid, marks with "weak" fsid
cannot be added and vice versa.

If the first mark added has a "weak" fsid, following marks must have
the same "weak" fsid and the same sb as the first mark.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20231130165619.3386452-3-amir73il@gmail.com>
---
 include/linux/fsnotify_backend.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index a80b525ca653..7f63be5ca0f1 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -529,6 +529,7 @@ struct fsnotify_mark {
 #define FSNOTIFY_MARK_FLAG_NO_IREF		0x0200
 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS	0x0400
 #define FSNOTIFY_MARK_FLAG_HAS_FSID		0x0800
+#define FSNOTIFY_MARK_FLAG_WEAK_FSID		0x1000
 	unsigned int flags;		/* flags [mark->lock] */
 };
 
-- 
cgit v1.2.3


From c66272a4c9932d6c585eef99039747617d48d662 Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Sat, 25 Nov 2023 19:50:10 +0800
Subject: crypto: hisilicon/qm - simplify the status of qm

The 'QM_INIT' and 'QM_CLOSE' status of qm and 'QP_INIT'
and 'QP_CLOSE' status of queue are not actually used. Currently,
driver only needs to switch status when the device or queue
is enabled or stopped, Therefore, remove unneeded status to
simplify driver. In addition, rename'QM_START to'QM_WORK' for
ease to understand.

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/hisi_acc_qm.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index ddc7ebb70523..e3c0a1297b2c 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -108,17 +108,13 @@ enum qm_stop_reason {
 };
 
 enum qm_state {
-	QM_INIT = 0,
-	QM_START,
-	QM_CLOSE,
+	QM_WORK = 0,
 	QM_STOP,
 };
 
 enum qp_state {
-	QP_INIT = 1,
-	QP_START,
+	QP_START = 1,
 	QP_STOP,
-	QP_CLOSE,
 };
 
 enum qm_hw_ver {
-- 
cgit v1.2.3


From 488e8f685207e0758398963d6834f81e5e61c162 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 30 Nov 2023 16:16:22 +0200
Subject: fs: fork splice_file_range() from do_splice_direct()

In preparation of calling do_splice_direct() without file_start_write()
held, create a new helper splice_file_range(), to be called from context
of ->copy_file_range() methods instead of do_splice_direct().

Currently, the only difference is that splice_file_range() does not take
flags argument and that it asserts that file_start_write() is held, but
we factor out a common helper do_splice_direct_actor() that will be used
later.

Use the new helper from __ceph_copy_file_range(), that was incorrectly
passing to do_splice_direct() the copy flags argument as splice flags.
The value of copy flags in ceph is always 0, so it is a smenatic bug fix.

Move the declaration of both helpers to linux/splice.h.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231130141624.3338942-2-amir73il@gmail.com
Acked-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h     |  2 --
 include/linux/splice.h | 13 ++++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae0e2fb7bcea..04422a0eccdd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3052,8 +3052,6 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos,
 			 size_t len, unsigned int flags);
 extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
 		struct file *, loff_t *, size_t, unsigned int);
-extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
-		loff_t *opos, size_t len, unsigned int flags);
 
 
 extern void
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 6c461573434d..49532d5dda52 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -80,11 +80,14 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *,
 long vfs_splice_read(struct file *in, loff_t *ppos,
 		     struct pipe_inode_info *pipe, size_t len,
 		     unsigned int flags);
-extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
-				      splice_direct_actor *);
-extern long do_splice(struct file *in, loff_t *off_in,
-		      struct file *out, loff_t *off_out,
-		      size_t len, unsigned int flags);
+ssize_t splice_direct_to_actor(struct file *file, struct splice_desc *sd,
+			       splice_direct_actor *actor);
+long do_splice(struct file *in, loff_t *off_in, struct file *out,
+	       loff_t *off_out, size_t len, unsigned int flags);
+long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+		      loff_t *opos, size_t len, unsigned int flags);
+long splice_file_range(struct file *in, loff_t *ppos, struct file *out,
+		       loff_t *opos, size_t len);
 
 extern long do_tee(struct file *in, struct file *out, size_t len,
 		   unsigned int flags);
-- 
cgit v1.2.3


From 77070eeb882124614a40616f01bfe60947be5778 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 30 Nov 2023 15:43:27 -0500
Subject: cgroup: Avoid false cacheline sharing of read mostly rstat_cpu

The rstat_cpu and also rstat_css_list of the cgroup structure are read
mostly variables. However, they may share the same cacheline as the
subsequent rstat_flush_next and *bstat variables which can be updated
frequently.  That will slow down the cgroup_rstat_cpu() call which is
called pretty frequently in the rstat code. Add a CACHELINE_PADDING()
line in between them to avoid false cacheline sharing.

A parallel kernel build on a 2-socket x86-64 server is used as the
benchmarking tool for measuring the lock hold time. Below were the lock
hold time frequency distribution before and after the patch:

      Run time        Before patch       After patch
      --------        ------------       -----------
       0-01 us         9,928,562          9,820,428
      01-05 us           110,151             50,935
      05-10 us               270                 93
      10-15 us               273                146
      15-20 us               135                 76
      20-25 us                 0                  2
      25-30 us                 1                  0

It can be seen that the patch further pushes the lock hold time towards
the lower end.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 37518436cfe7..5a97ea95b564 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -496,6 +496,13 @@ struct cgroup {
 	struct cgroup_rstat_cpu __percpu *rstat_cpu;
 	struct list_head rstat_css_list;
 
+	/*
+	 * Add padding to separate the read mostly rstat_cpu and
+	 * rstat_css_list into a different cacheline from the following
+	 * rstat_flush_next and *bstat fields which can have frequent updates.
+	 */
+	CACHELINE_PADDING(_pad_);
+
 	/*
 	 * A singly-linked list of cgroup structures to be rstat flushed.
 	 * This is a scratch field to be used exclusively by
-- 
cgit v1.2.3


From aabf7c37dfbce3e5fe24f0c86a34bc8f2f63cee8 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Wed, 29 Nov 2023 13:44:04 -0800
Subject: lkdtm: Add kfence read after free crash type

Add the ability to allocate memory from kfence and trigger a read after
free on that memory to validate that kfence is working properly. This is
used by ChromeOS integration tests to validate that kfence errors can be
collected on user devices and parsed properly.

Cc: Alexander Potapenko <glider@google.com>
Acked-by: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: kasan-dev@googlegroups.com
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Link: https://lore.kernel.org/r/20231129214413.3156334-1-swboyd@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/kfence.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 401af4757514..88100cc9caba 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -223,6 +223,8 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla
 
 #else /* CONFIG_KFENCE */
 
+#define kfence_sample_interval	(0)
+
 static inline bool is_kfence_address(const void *addr) { return false; }
 static inline void kfence_alloc_pool_and_metadata(void) { }
 static inline void kfence_init(void) { }
-- 
cgit v1.2.3


From 12cd3cd8c797e07afcc47bc4afa760e4ec75e9d7 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 20 Nov 2023 17:11:42 +0200
Subject: params: Introduce the param_unknown_fn type

Introduce a new type for the callback to parse an unknown argument.
This unifies function prototypes which takes that as a parameter.

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231120151419.1661807-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/moduleparam.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 4fa9726bc328..bfb85fd13e1f 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -385,6 +385,8 @@ extern bool parameq(const char *name1, const char *name2);
  */
 extern bool parameqn(const char *name1, const char *name2, size_t n);
 
+typedef int (*parse_unknown_fn)(char *param, char *val, const char *doing, void *arg);
+
 /* Called on module insert or kernel boot */
 extern char *parse_args(const char *name,
 		      char *args,
@@ -392,9 +394,7 @@ extern char *parse_args(const char *name,
 		      unsigned num,
 		      s16 level_min,
 		      s16 level_max,
-		      void *arg,
-		      int (*unknown)(char *param, char *val,
-				     const char *doing, void *arg));
+		      void *arg, parse_unknown_fn unknown);
 
 /* Called by module remove. */
 #ifdef CONFIG_SYSFS
-- 
cgit v1.2.3


From 492c5d455969fc2e829f26ed4c83487b068f0dd7 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Thu, 30 Nov 2023 13:53:06 -0800
Subject: block: bio-integrity: directly map user buffers

Passthrough commands that utilize metadata currently need to bounce the
user space buffer through the kernel. Add support for mapping user space
directly so that we can avoid this costly overhead. This is similar to
how the normal bio data payload utilizes user addresses with
bio_map_user_iov().

If the user address can't directly be used for reason, like too many
segments or address unalignement, fallback to a copy of the user vec
while keeping the user address pinned for the IO duration so that it
can safely be copied on completion in any process context.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20231130215309.2923568-2-kbusch@meta.com
[axboe: fold in fix from Kanchan Joshi]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 41d417ee1349..ec4db73e5f4e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -324,6 +324,8 @@ enum bip_flags {
 	BIP_CTRL_NOCHECK	= 1 << 2, /* disable HBA integrity checking */
 	BIP_DISK_NOCHECK	= 1 << 3, /* disable disk integrity checking */
 	BIP_IP_CHECKSUM		= 1 << 4, /* IP checksum */
+	BIP_INTEGRITY_USER	= 1 << 5, /* Integrity payload is user address */
+	BIP_COPY_USER		= 1 << 6, /* Kernel bounce buffer in use */
 };
 
 /*
@@ -718,6 +720,7 @@ static inline bool bioset_initialized(struct bio_set *bs)
 	for_each_bio(_bio)						\
 		bip_for_each_vec(_bvl, _bio->bi_integrity, _iter)
 
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed);
 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
 extern bool bio_integrity_prep(struct bio *);
@@ -789,6 +792,12 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
 	return 0;
 }
 
+static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
+					 ssize_t len, u32 seed)
+{
+	return -EINVAL;
+}
+
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 /*
-- 
cgit v1.2.3


From e5da71f1e373f36c7506ffa9a60ef7ec6e84674d Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Thu, 30 Nov 2023 13:53:08 -0800
Subject: iouring: remove IORING_URING_CMD_POLLED

No more users of this flag.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20231130215309.2923568-4-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index aefb73eeeebf..fe23bf88f86f 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -28,7 +28,6 @@ enum io_uring_cmd_flags {
 
 /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
 #define IORING_URING_CMD_CANCELABLE	(1U << 30)
-#define IORING_URING_CMD_POLLED		(1U << 31)
 
 struct io_uring_cmd {
 	struct file	*file;
-- 
cgit v1.2.3


From 8fadb86d4ced8b8349a3b227d6d66736ff150819 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Thu, 30 Nov 2023 13:53:09 -0800
Subject: io_uring: remove uring_cmd cookie

No more users of this field.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20231130215309.2923568-5-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index fe23bf88f86f..9e6ce6d4ab51 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -32,12 +32,8 @@ enum io_uring_cmd_flags {
 struct io_uring_cmd {
 	struct file	*file;
 	const struct io_uring_sqe *sqe;
-	union {
-		/* callback to defer completions to task context */
-		void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
-		/* used for polled completion */
-		void *cookie;
-	};
+	/* callback to defer completions to task context */
+	void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
 	u32		cmd_op;
 	u32		flags;
 	u8		pdu[32]; /* available inline for free use */
-- 
cgit v1.2.3


From 45b5623f2d721c25d1a2fdc8c4600fb4b7b61c75 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 2 Dec 2023 09:56:55 -0800
Subject: bpf: rearrange bpf_func_state fields to save a bit of memory

It's a trivial rearrangement saving 8 bytes. We have 4 bytes of padding
at the end which can be filled with another field without increasing
struct bpf_func_state.

copy_func_state() logic remains correct without any further changes.

BEFORE
======
struct bpf_func_state {
        struct bpf_reg_state       regs[11];             /*     0  1320 */
        /* --- cacheline 20 boundary (1280 bytes) was 40 bytes ago --- */
        int                        callsite;             /*  1320     4 */
        u32                        frameno;              /*  1324     4 */
        u32                        subprogno;            /*  1328     4 */
        u32                        async_entry_cnt;      /*  1332     4 */
        bool                       in_callback_fn;       /*  1336     1 */

        /* XXX 7 bytes hole, try to pack */

        /* --- cacheline 21 boundary (1344 bytes) --- */
        struct tnum                callback_ret_range;   /*  1344    16 */
        bool                       in_async_callback_fn; /*  1360     1 */
        bool                       in_exception_callback_fn; /*  1361     1 */

        /* XXX 2 bytes hole, try to pack */

        int                        acquired_refs;        /*  1364     4 */
        struct bpf_reference_state * refs;               /*  1368     8 */
        int                        allocated_stack;      /*  1376     4 */

        /* XXX 4 bytes hole, try to pack */

        struct bpf_stack_state *   stack;                /*  1384     8 */

        /* size: 1392, cachelines: 22, members: 13 */
        /* sum members: 1379, holes: 3, sum holes: 13 */
        /* last cacheline: 48 bytes */
};

AFTER
=====
struct bpf_func_state {
        struct bpf_reg_state       regs[11];             /*     0  1320 */
        /* --- cacheline 20 boundary (1280 bytes) was 40 bytes ago --- */
        int                        callsite;             /*  1320     4 */
        u32                        frameno;              /*  1324     4 */
        u32                        subprogno;            /*  1328     4 */
        u32                        async_entry_cnt;      /*  1332     4 */
        struct tnum                callback_ret_range;   /*  1336    16 */
        /* --- cacheline 21 boundary (1344 bytes) was 8 bytes ago --- */
        bool                       in_callback_fn;       /*  1352     1 */
        bool                       in_async_callback_fn; /*  1353     1 */
        bool                       in_exception_callback_fn; /*  1354     1 */

        /* XXX 1 byte hole, try to pack */

        int                        acquired_refs;        /*  1356     4 */
        struct bpf_reference_state * refs;               /*  1360     8 */
        struct bpf_stack_state *   stack;                /*  1368     8 */
        int                        allocated_stack;      /*  1376     4 */

        /* size: 1384, cachelines: 22, members: 13 */
        /* sum members: 1379, holes: 1, sum holes: 1 */
        /* padding: 4 */
        /* last cacheline: 40 bytes */
};

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231202175705.885270-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d99a636d36a7..0c0e1bccad45 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -297,8 +297,8 @@ struct bpf_func_state {
 	 * void foo(void) { bpf_timer_set_callback(,foo); }
 	 */
 	u32 async_entry_cnt;
-	bool in_callback_fn;
 	struct tnum callback_ret_range;
+	bool in_callback_fn;
 	bool in_async_callback_fn;
 	bool in_exception_callback_fn;
 	/* For callback calling functions that limit number of possible
@@ -316,8 +316,8 @@ struct bpf_func_state {
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
 	struct bpf_reference_state *refs;
-	int allocated_stack;
 	struct bpf_stack_state *stack;
+	int allocated_stack;
 };
 
 struct bpf_idx_pair {
-- 
cgit v1.2.3


From 8fa4ecd49b81ccd9d1d87f1c8b2260e218644878 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 2 Dec 2023 09:56:58 -0800
Subject: bpf: enforce exact retval range on subprog/callback exit

Instead of relying on potentially imprecise tnum representation of
expected return value range for callbacks and subprogs, validate that
smin/smax range satisfy exact expected range of return values.

E.g., if callback would need to return [0, 2] range, tnum can't
represent this precisely and instead will allow [0, 3] range. By
checking smin/smax range, we can make sure that subprog/callback indeed
returns only valid [0, 2] range.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231202175705.885270-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0c0e1bccad45..3378cc753061 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -275,6 +275,11 @@ struct bpf_reference_state {
 	int callback_ref;
 };
 
+struct bpf_retval_range {
+	s32 minval;
+	s32 maxval;
+};
+
 /* state of the program:
  * type of all registers and stack info
  */
@@ -297,7 +302,7 @@ struct bpf_func_state {
 	 * void foo(void) { bpf_timer_set_callback(,foo); }
 	 */
 	u32 async_entry_cnt;
-	struct tnum callback_ret_range;
+	struct bpf_retval_range callback_ret_range;
 	bool in_callback_fn;
 	bool in_async_callback_fn;
 	bool in_exception_callback_fn;
-- 
cgit v1.2.3


From aeb9ce058d7c6193dc41e06b3a5b29d22c446b14 Mon Sep 17 00:00:00 2001
From: Coco Li <lixiaoyan@google.com>
Date: Wed, 29 Nov 2023 07:27:53 +0000
Subject: cache: enforce cache groups

Set up build time warnings to safeguard against future header changes of
organized structs.

Warning includes:

1) whether all variables are still in the same cache group
2) whether all the cache groups have the sum of the members size (in the
   maximum condition, including all members defined in configs)

The __cache_group* variables are ignored in kernel-doc check in the
various header files they appear in to enforce the cache groups.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Coco Li <lixiaoyan@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cache.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cache.h b/include/linux/cache.h
index 9900d20b76c2..0ecb17bb6883 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -85,6 +85,31 @@
 #define cache_line_size()	L1_CACHE_BYTES
 #endif
 
+#ifndef __cacheline_group_begin
+#define __cacheline_group_begin(GROUP) \
+	__u8 __cacheline_group_begin__##GROUP[0]
+#endif
+
+#ifndef __cacheline_group_end
+#define __cacheline_group_end(GROUP) \
+	__u8 __cacheline_group_end__##GROUP[0]
+#endif
+
+#ifndef CACHELINE_ASSERT_GROUP_MEMBER
+#define CACHELINE_ASSERT_GROUP_MEMBER(TYPE, GROUP, MEMBER) \
+	BUILD_BUG_ON(!(offsetof(TYPE, MEMBER) >= \
+		       offsetofend(TYPE, __cacheline_group_begin__##GROUP) && \
+		       offsetofend(TYPE, MEMBER) <= \
+		       offsetof(TYPE, __cacheline_group_end__##GROUP)))
+#endif
+
+#ifndef CACHELINE_ASSERT_GROUP_SIZE
+#define CACHELINE_ASSERT_GROUP_SIZE(TYPE, GROUP, SIZE) \
+	BUILD_BUG_ON(offsetof(TYPE, __cacheline_group_end__##GROUP) - \
+		     offsetofend(TYPE, __cacheline_group_begin__##GROUP) > \
+		     SIZE)
+#endif
+
 /*
  * Helper to add padding within a struct to ensure data fall into separate
  * cachelines.
-- 
cgit v1.2.3


From a9c8c738066b7ba9e208cfc3200a6f60593982b4 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 6 Nov 2023 09:31:41 +0200
Subject: device property: Add fwnode_name_eq()

Add fwnode_name_eq() to implement the functionality of of_node_name_eq()
on fwnode property API. The same convention of ending the comparison at
'@' (besides NUL) is applied on also both ACPI and swnode. The function
is intended for comparing unit address-less node names on DT and firmware
or swnodes compliant with DT bindings.

Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Tested-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/linux/property.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 9f2585d705a8..4ebbb169df20 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -109,6 +109,7 @@ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode,
 
 const char *fwnode_get_name(const struct fwnode_handle *fwnode);
 const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode);
+bool fwnode_name_eq(const struct fwnode_handle *fwnode, const char *name);
 
 struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode);
 struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode);
-- 
cgit v1.2.3


From 578dc962ff2000ba4bf52d50717aea0819615634 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <bbrezillon@kernel.org>
Date: Fri, 24 Nov 2023 17:24:35 -0800
Subject: mtd: rawnand: Add destructive operation

Erase and program operations need the write protect (wp) pin to be
de-asserted to take effect. Add the concept of destructive
operation and pass the information to exec_op() so controllers know
when they should de-assert this pin without having to decode
the command opcode.

Signed-off-by: Boris Brezillon <bbrezillon@kernel.org>
Signed-off-by: David Regan <dregan@broadcom.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20231125012438.15191-1-dregan@broadcom.com
---
 include/linux/mtd/rawnand.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index c29ace15a053..bd02aba5e6e3 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1003,6 +1003,8 @@ struct nand_op_parser {
 /**
  * struct nand_operation - NAND operation descriptor
  * @cs: the CS line to select for this NAND operation
+ * @deassert_wp: set to true when the operation requires the WP pin to be
+ *		 de-asserted (ERASE, PROG, ...)
  * @instrs: array of instructions to execute
  * @ninstrs: length of the @instrs array
  *
@@ -1010,6 +1012,7 @@ struct nand_op_parser {
  */
 struct nand_operation {
 	unsigned int cs;
+	bool deassert_wp;
 	const struct nand_op_instr *instrs;
 	unsigned int ninstrs;
 };
@@ -1021,6 +1024,14 @@ struct nand_operation {
 		.ninstrs = ARRAY_SIZE(_instrs),			\
 	}
 
+#define NAND_DESTRUCTIVE_OPERATION(_cs, _instrs)		\
+	{							\
+		.cs = _cs,					\
+		.deassert_wp = true,				\
+		.instrs = _instrs,				\
+		.ninstrs = ARRAY_SIZE(_instrs),			\
+	}
+
 int nand_op_parser_exec_op(struct nand_chip *chip,
 			   const struct nand_op_parser *parser,
 			   const struct nand_operation *op, bool check_only);
-- 
cgit v1.2.3


From 68cce21e3cc5fea8d955a62394454149270c98bc Mon Sep 17 00:00:00 2001
From: David Regan <dregan@broadcom.com>
Date: Fri, 24 Nov 2023 17:24:36 -0800
Subject: mtd: rawnand: NAND controller write protect

Allow NAND controller to be responsible for write protect pin
handling during fast path and exec_op destructive operation
when controller_wp flag is set.

Signed-off-by: David Regan <dregan@broadcom.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20231125012438.15191-2-dregan@broadcom.com
---
 include/linux/mtd/rawnand.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index bd02aba5e6e3..a17f795070d8 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1115,6 +1115,7 @@ struct nand_controller_ops {
  *			the bus without restarting an entire read operation nor
  *			changing the column.
  * @supported_op.cont_read: The controller supports sequential cache reads.
+ * @controller_wp:	the controller is in charge of handling the WP pin.
  */
 struct nand_controller {
 	struct mutex lock;
@@ -1123,6 +1124,7 @@ struct nand_controller {
 		unsigned int data_only_read: 1;
 		unsigned int cont_read: 1;
 	} supported_op;
+	bool controller_wp;
 };
 
 static inline void nand_controller_init(struct nand_controller *nfc)
-- 
cgit v1.2.3


From a87b8e3be926af0fc3b9b1af42b1127bd1ff077c Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Fri, 1 Dec 2023 10:29:51 -0800
Subject: usb: core: Allow subclassed USB drivers to override
 usb_choose_configuration()

For some USB devices we might want to do something different for
usb_choose_configuration(). One example here is the r8152 driver where
we want to end up using the vendor driver with the preferred
interface.

The r8152 driver tried to make things work by implementing a USB
generic_subclass driver and then overriding the normal config
selection after it happened. This is less than ideal and also caused
breakage if someone deauthorized and re-authorized the USB device
because the USB core ended up going back to it's default logic for
choosing the best config. I made an attempt to fix this [1] but it was
a bit ugly.

Let's do this better and allow USB generic_subclass drivers to
override usb_choose_configuration().

[1] https://lore.kernel.org/r/20231130154337.1.Ie00e07f07f87149c9ce0b27ae4e26991d307e14b@changeid

Suggested-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/20231201102946.v2.2.Iade5fa31997f1a0ca3e1dec0591633b02471df12@changeid
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 8c61643acd49..618e5a0b1a22 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1264,6 +1264,9 @@ struct usb_driver {
  *	module is being unloaded.
  * @suspend: Called when the device is going to be suspended by the system.
  * @resume: Called when the device is being resumed by the system.
+ * @choose_configuration: If non-NULL, called instead of the default
+ *	usb_choose_configuration(). If this returns an error then we'll go
+ *	on to call the normal usb_choose_configuration().
  * @dev_groups: Attributes attached to the device that will be created once it
  *	is bound to the driver.
  * @drvwrap: Driver-model core structure wrapper.
@@ -1287,6 +1290,9 @@ struct usb_device_driver {
 
 	int (*suspend) (struct usb_device *udev, pm_message_t message);
 	int (*resume) (struct usb_device *udev, pm_message_t message);
+
+	int (*choose_configuration) (struct usb_device *udev);
+
 	const struct attribute_group **dev_groups;
 	struct usbdrv_wrap drvwrap;
 	const struct usb_device_id *id_table;
-- 
cgit v1.2.3


From db9e54709895241dda23f9347f619afb15291353 Mon Sep 17 00:00:00 2001
From: RD Babiera <rdbabiera@google.com>
Date: Tue, 21 Nov 2023 20:38:47 +0000
Subject: usb: typec: tcpm: add tcpm_port_error_recovery symbol

Add tcpm_port_error_recovery symbol and corresponding event that runs in
tcpm_pd_event handler to set the port to the ERROR_RECOVERY state. tcpci
drivers can use the symbol to reset the port when tcpc faults affect port
functionality.

Signed-off-by: RD Babiera <rdbabiera@google.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20231121203845.170234-5-rdbabiera@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/tcpm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h
index ab7ca872950b..65fac5e1f317 100644
--- a/include/linux/usb/tcpm.h
+++ b/include/linux/usb/tcpm.h
@@ -173,5 +173,6 @@ void tcpm_pd_hard_reset(struct tcpm_port *port);
 void tcpm_tcpc_reset(struct tcpm_port *port);
 void tcpm_port_clean(struct tcpm_port *port);
 bool tcpm_port_is_toggling(struct tcpm_port *port);
+void tcpm_port_error_recovery(struct tcpm_port *port);
 
 #endif /* __LINUX_USB_TCPM_H */
-- 
cgit v1.2.3


From 5e4c8814a431d21bfaf20b464134f40f2f81e152 Mon Sep 17 00:00:00 2001
From: RD Babiera <rdbabiera@google.com>
Date: Tue, 21 Nov 2023 20:38:48 +0000
Subject: usb: typec: tcpci: add vconn over current fault handling to
 maxim_core

Add TCPC_FAULT_STATUS_VCONN_OC constant and corresponding mask definition.
Maxim TCPC is capable of detecting VConn over current faults, so add
fault to alert mask. When a Vconn over current fault is triggered, put the
port in an error recovery state via tcpm_port_error_recovery.

Signed-off-by: RD Babiera <rdbabiera@google.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20231121203845.170234-6-rdbabiera@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/tcpci.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h
index 83376473ac76..467e8045e9f8 100644
--- a/include/linux/usb/tcpci.h
+++ b/include/linux/usb/tcpci.h
@@ -36,7 +36,9 @@
 
 #define TCPC_ALERT_MASK			0x12
 #define TCPC_POWER_STATUS_MASK		0x14
-#define TCPC_FAULT_STATUS_MASK		0x15
+
+#define TCPC_FAULT_STATUS_MASK			0x15
+#define TCPC_FAULT_STATUS_MASK_VCONN_OC		BIT(1)
 
 #define TCPC_EXTENDED_STATUS_MASK		0x16
 #define TCPC_EXTENDED_STATUS_MASK_VSAFE0V	BIT(0)
@@ -104,6 +106,7 @@
 
 #define TCPC_FAULT_STATUS		0x1f
 #define TCPC_FAULT_STATUS_ALL_REG_RST_TO_DEFAULT BIT(7)
+#define TCPC_FAULT_STATUS_VCONN_OC	BIT(1)
 
 #define TCPC_ALERT_EXTENDED		0x21
 
-- 
cgit v1.2.3


From 7cc4e6b0e4ddf610477fcec8e3d2a9caae7e8a6c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 29 Nov 2023 18:06:46 +0200
Subject: pinctrl: Convert unsigned to unsigned int

Simple type conversion with no functional change implied.
While at it, adjust indentation where it makes sense.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231129161459.1002323-24-andriy.shevchenko@linux.intel.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/machine.h         |  6 +++---
 include/linux/pinctrl/pinconf-generic.h | 10 +++++-----
 include/linux/pinctrl/pinconf.h         | 16 ++++++++--------
 include/linux/pinctrl/pinctrl.h         | 24 ++++++++++++------------
 include/linux/pinctrl/pinmux.h          | 22 +++++++++++-----------
 5 files changed, 39 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/machine.h b/include/linux/pinctrl/machine.h
index ee8803f6ad07..673e96df453b 100644
--- a/include/linux/pinctrl/machine.h
+++ b/include/linux/pinctrl/machine.h
@@ -47,7 +47,7 @@ struct pinctrl_map_mux {
 struct pinctrl_map_configs {
 	const char *group_or_pin;
 	unsigned long *configs;
-	unsigned num_configs;
+	unsigned int num_configs;
 };
 
 /**
@@ -154,13 +154,13 @@ struct pinctrl_map;
 #ifdef CONFIG_PINCTRL
 
 extern int pinctrl_register_mappings(const struct pinctrl_map *map,
-				     unsigned num_maps);
+				     unsigned int num_maps);
 extern void pinctrl_unregister_mappings(const struct pinctrl_map *map);
 extern void pinctrl_provide_dummies(void);
 #else
 
 static inline int pinctrl_register_mappings(const struct pinctrl_map *map,
-					    unsigned num_maps)
+					    unsigned int num_maps)
 {
 	return 0;
 }
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index d74b7a4ea154..a65d3d078e58 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -193,17 +193,17 @@ struct pinconf_generic_params {
 
 int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev,
 		struct device_node *np, struct pinctrl_map **map,
-		unsigned *reserved_maps, unsigned *num_maps,
+		unsigned int *reserved_maps, unsigned int *num_maps,
 		enum pinctrl_map_type type);
 int pinconf_generic_dt_node_to_map(struct pinctrl_dev *pctldev,
 		struct device_node *np_config, struct pinctrl_map **map,
-		unsigned *num_maps, enum pinctrl_map_type type);
+		unsigned int *num_maps, enum pinctrl_map_type type);
 void pinconf_generic_dt_free_map(struct pinctrl_dev *pctldev,
-		struct pinctrl_map *map, unsigned num_maps);
+		struct pinctrl_map *map, unsigned int num_maps);
 
 static inline int pinconf_generic_dt_node_to_map_group(struct pinctrl_dev *pctldev,
 		struct device_node *np_config, struct pinctrl_map **map,
-		unsigned *num_maps)
+		unsigned int *num_maps)
 {
 	return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps,
 			PIN_MAP_TYPE_CONFIGS_GROUP);
@@ -211,7 +211,7 @@ static inline int pinconf_generic_dt_node_to_map_group(struct pinctrl_dev *pctld
 
 static inline int pinconf_generic_dt_node_to_map_pin(struct pinctrl_dev *pctldev,
 		struct device_node *np_config, struct pinctrl_map **map,
-		unsigned *num_maps)
+		unsigned int *num_maps)
 {
 	return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps,
 			PIN_MAP_TYPE_CONFIGS_PIN);
diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h
index f8a8215e9021..770ec2221156 100644
--- a/include/linux/pinctrl/pinconf.h
+++ b/include/linux/pinctrl/pinconf.h
@@ -40,25 +40,25 @@ struct pinconf_ops {
 	bool is_generic;
 #endif
 	int (*pin_config_get) (struct pinctrl_dev *pctldev,
-			       unsigned pin,
+			       unsigned int pin,
 			       unsigned long *config);
 	int (*pin_config_set) (struct pinctrl_dev *pctldev,
-			       unsigned pin,
+			       unsigned int pin,
 			       unsigned long *configs,
-			       unsigned num_configs);
+			       unsigned int num_configs);
 	int (*pin_config_group_get) (struct pinctrl_dev *pctldev,
-				     unsigned selector,
+				     unsigned int selector,
 				     unsigned long *config);
 	int (*pin_config_group_set) (struct pinctrl_dev *pctldev,
-				     unsigned selector,
+				     unsigned int selector,
 				     unsigned long *configs,
-				     unsigned num_configs);
+				     unsigned int num_configs);
 	void (*pin_config_dbg_show) (struct pinctrl_dev *pctldev,
 				     struct seq_file *s,
-				     unsigned offset);
+				     unsigned int offset);
 	void (*pin_config_group_dbg_show) (struct pinctrl_dev *pctldev,
 					   struct seq_file *s,
-					   unsigned selector);
+					   unsigned int selector);
 	void (*pin_config_config_dbg_show) (struct pinctrl_dev *pctldev,
 					    struct seq_file *s,
 					    unsigned long config);
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 4d252ea00ed1..9a8189ffd0f2 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -54,7 +54,7 @@ struct pingroup {
  * @drv_data: driver-defined per-pin data. pinctrl core does not touch this
  */
 struct pinctrl_pin_desc {
-	unsigned number;
+	unsigned int number;
 	const char *name;
 	void *drv_data;
 };
@@ -82,7 +82,7 @@ struct pinctrl_gpio_range {
 	unsigned int base;
 	unsigned int pin_base;
 	unsigned int npins;
-	unsigned const *pins;
+	unsigned int const *pins;
 	struct gpio_chip *gc;
 };
 
@@ -108,18 +108,18 @@ struct pinctrl_gpio_range {
 struct pinctrl_ops {
 	int (*get_groups_count) (struct pinctrl_dev *pctldev);
 	const char *(*get_group_name) (struct pinctrl_dev *pctldev,
-				       unsigned selector);
+				       unsigned int selector);
 	int (*get_group_pins) (struct pinctrl_dev *pctldev,
-			       unsigned selector,
-			       const unsigned **pins,
-			       unsigned *num_pins);
+			       unsigned int selector,
+			       const unsigned int **pins,
+			       unsigned int *num_pins);
 	void (*pin_dbg_show) (struct pinctrl_dev *pctldev, struct seq_file *s,
-			  unsigned offset);
+			      unsigned int offset);
 	int (*dt_node_to_map) (struct pinctrl_dev *pctldev,
 			       struct device_node *np_config,
-			       struct pinctrl_map **map, unsigned *num_maps);
+			       struct pinctrl_map **map, unsigned int *num_maps);
 	void (*dt_free_map) (struct pinctrl_dev *pctldev,
-			     struct pinctrl_map *map, unsigned num_maps);
+			     struct pinctrl_map *map, unsigned int num_maps);
 };
 
 /**
@@ -193,7 +193,7 @@ extern void pinctrl_add_gpio_range(struct pinctrl_dev *pctldev,
 				struct pinctrl_gpio_range *range);
 extern void pinctrl_add_gpio_ranges(struct pinctrl_dev *pctldev,
 				struct pinctrl_gpio_range *ranges,
-				unsigned nranges);
+				unsigned int nranges);
 extern void pinctrl_remove_gpio_range(struct pinctrl_dev *pctldev,
 				struct pinctrl_gpio_range *range);
 
@@ -203,8 +203,8 @@ extern struct pinctrl_gpio_range *
 pinctrl_find_gpio_range_from_pin(struct pinctrl_dev *pctldev,
 				 unsigned int pin);
 extern int pinctrl_get_group_pins(struct pinctrl_dev *pctldev,
-				const char *pin_group, const unsigned **pins,
-				unsigned *num_pins);
+				  const char *pin_group, const unsigned int **pins,
+				  unsigned int *num_pins);
 
 /**
  * struct pinfunction - Description about a function
diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h
index a7e370965c53..d6f7b58d6ad0 100644
--- a/include/linux/pinctrl/pinmux.h
+++ b/include/linux/pinctrl/pinmux.h
@@ -57,26 +57,26 @@ struct pinctrl_gpio_range;
  *	the pin request.
  */
 struct pinmux_ops {
-	int (*request) (struct pinctrl_dev *pctldev, unsigned offset);
-	int (*free) (struct pinctrl_dev *pctldev, unsigned offset);
+	int (*request) (struct pinctrl_dev *pctldev, unsigned int offset);
+	int (*free) (struct pinctrl_dev *pctldev, unsigned int offset);
 	int (*get_functions_count) (struct pinctrl_dev *pctldev);
 	const char *(*get_function_name) (struct pinctrl_dev *pctldev,
-					  unsigned selector);
+					  unsigned int selector);
 	int (*get_function_groups) (struct pinctrl_dev *pctldev,
-				  unsigned selector,
-				  const char * const **groups,
-				  unsigned *num_groups);
-	int (*set_mux) (struct pinctrl_dev *pctldev, unsigned func_selector,
-			unsigned group_selector);
+				    unsigned int selector,
+				    const char * const **groups,
+				    unsigned int *num_groups);
+	int (*set_mux) (struct pinctrl_dev *pctldev, unsigned int func_selector,
+			unsigned int group_selector);
 	int (*gpio_request_enable) (struct pinctrl_dev *pctldev,
 				    struct pinctrl_gpio_range *range,
-				    unsigned offset);
+				    unsigned int offset);
 	void (*gpio_disable_free) (struct pinctrl_dev *pctldev,
 				   struct pinctrl_gpio_range *range,
-				   unsigned offset);
+				   unsigned int offset);
 	int (*gpio_set_direction) (struct pinctrl_dev *pctldev,
 				   struct pinctrl_gpio_range *range,
-				   unsigned offset,
+				   unsigned int offset,
 				   bool input);
 	bool strict;
 };
-- 
cgit v1.2.3


From 2202844e4468c7539dba0c0b06577c93735af952 Mon Sep 17 00:00:00 2001
From: Longfang Liu <liulongfang@huawei.com>
Date: Mon, 6 Nov 2023 15:22:23 +0800
Subject: vfio/migration: Add debugfs to live migration driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are multiple devices, software and operational steps involved
in the process of live migration. An error occurred on any node may
cause the live migration operation to fail.
This complex process makes it very difficult to locate and analyze
the cause when the function fails.

In order to quickly locate the cause of the problem when the
live migration fails, I added a set of debugfs to the vfio
live migration driver.

    +-------------------------------------------+
    |                                           |
    |                                           |
    |                  QEMU                     |
    |                                           |
    |                                           |
    +---+----------------------------+----------+
        |      ^                     |      ^
        |      |                     |      |
        |      |                     |      |
        v      |                     v      |
     +---------+--+               +---------+--+
     |src vfio_dev|               |dst vfio_dev|
     +--+---------+               +--+---------+
        |      ^                     |      ^
        |      |                     |      |
        v      |                     |      |
   +-----------+----+           +-----------+----+
   |src dev debugfs |           |dst dev debugfs |
   +----------------+           +----------------+

The entire debugfs directory will be based on the definition of
the CONFIG_DEBUG_FS macro. If this macro is not enabled, the
interfaces in vfio.h will be empty definitions, and the creation
and initialization of the debugfs directory will not be executed.

   vfio
    |
    +---<dev_name1>
    |    +---migration
    |        +--state
    |
    +---<dev_name2>
         +---migration
             +--state

debugfs will create a public root directory "vfio" file.
then create a dev_name() file for each live migration device.
First, create a unified state acquisition file of "migration"
in this device directory.
Then, create a public live migration state lookup file "state".

Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Link: https://lore.kernel.org/r/20231106072225.28577-2-liulongfang@huawei.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/vfio.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index a65b2513f8cd..89b265bc6ec3 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -69,6 +69,13 @@ struct vfio_device {
 	u8 iommufd_attached:1;
 #endif
 	u8 cdev_opened:1;
+#ifdef CONFIG_DEBUG_FS
+	/*
+	 * debug_root is a static property of the vfio_device
+	 * which must be set prior to registering the vfio_device.
+	 */
+	struct dentry *debug_root;
+#endif
 };
 
 /**
-- 
cgit v1.2.3


From 20c20bd11a0702ce4dc9300c3da58acf551d9725 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 4 Dec 2023 22:04:20 +0800
Subject: bpf: Add map and need_defer parameters to .map_fd_put_ptr()

map is the pointer of outer map, and need_defer needs some explanation.
need_defer tells the implementation to defer the reference release of
the passed element and ensure that the element is still alive before
the bpf program, which may manipulate it, exits.

The following three cases will invoke map_fd_put_ptr() and different
need_defer values will be passed to these callers:

1) release the reference of the old element in the map during map update
   or map deletion. The release must be deferred, otherwise the bpf
   program may incur use-after-free problem, so need_defer needs to be
   true.
2) release the reference of the to-be-added element in the error path of
   map update. The to-be-added element is not visible to any bpf
   program, so it is OK to pass false for need_defer parameter.
3) release the references of all elements in the map during map release.
   Any bpf program which has access to the map must have been exited and
   released, so need_defer=false will be OK.

These two parameters will be used by the following patches to fix the
potential use-after-free problem for map-in-map.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231204140425.1480317-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index eb447b0a9423..d273348cfb2f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -106,7 +106,11 @@ struct bpf_map_ops {
 	/* funcs called by prog_array and perf_event_array map */
 	void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
 				int fd);
-	void (*map_fd_put_ptr)(void *ptr);
+	/* If need_defer is true, the implementation should guarantee that
+	 * the to-be-put element is still alive before the bpf program, which
+	 * may manipulate it, exists.
+	 */
+	void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer);
 	int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
 	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
-- 
cgit v1.2.3


From 876673364161da50eed6b472d746ef88242b2368 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 4 Dec 2023 22:04:22 +0800
Subject: bpf: Defer the free of inner map when necessary

When updating or deleting an inner map in map array or map htab, the map
may still be accessed by non-sleepable program or sleepable program.
However bpf_map_fd_put_ptr() decreases the ref-counter of the inner map
directly through bpf_map_put(), if the ref-counter is the last one
(which is true for most cases), the inner map will be freed by
ops->map_free() in a kworker. But for now, most .map_free() callbacks
don't use synchronize_rcu() or its variants to wait for the elapse of a
RCU grace period, so after the invocation of ops->map_free completes,
the bpf program which is accessing the inner map may incur
use-after-free problem.

Fix the free of inner map by invoking bpf_map_free_deferred() after both
one RCU grace period and one tasks trace RCU grace period if the inner
map has been removed from the outer map before. The deferment is
accomplished by using call_rcu() or call_rcu_tasks_trace() when
releasing the last ref-counter of bpf map. The newly-added rcu_head
field in bpf_map shares the same storage space with work field to
reduce the size of bpf_map.

Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.")
Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231204140425.1480317-5-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d273348cfb2f..de3bd03cbeea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -276,7 +276,11 @@ struct bpf_map {
 	 */
 	atomic64_t refcnt ____cacheline_aligned;
 	atomic64_t usercnt;
-	struct work_struct work;
+	/* rcu is used before freeing and work is only used during freeing */
+	union {
+		struct work_struct work;
+		struct rcu_head rcu;
+	};
 	struct mutex freeze_mutex;
 	atomic64_t writecnt;
 	/* 'Ownership' of program-containing map is claimed by the first program
@@ -292,6 +296,7 @@ struct bpf_map {
 	} owner;
 	bool bypass_spec_v1;
 	bool frozen; /* write-once; write-protected by freeze_mutex */
+	bool free_after_mult_rcu_gp;
 	s64 __percpu *elem_count;
 };
 
-- 
cgit v1.2.3


From af66bfd3c8538ed21cf72af18426fc4a408665cf Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 4 Dec 2023 22:04:23 +0800
Subject: bpf: Optimize the free of inner map

When removing the inner map from the outer map, the inner map will be
freed after one RCU grace period and one RCU tasks trace grace
period, so it is certain that the bpf program, which may access the
inner map, has exited before the inner map is freed.

However there is no need to wait for one RCU tasks trace grace period if
the outer map is only accessed by non-sleepable program. So adding
sleepable_refcnt in bpf_map and increasing sleepable_refcnt when adding
the outer map into env->used_maps for sleepable program. Although the
max number of bpf program is INT_MAX - 1, the number of bpf programs
which are being loaded may be greater than INT_MAX, so using atomic64_t
instead of atomic_t for sleepable_refcnt. When removing the inner map
from the outer map, using sleepable_refcnt to decide whether or not a
RCU tasks trace grace period is needed before freeing the inner map.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231204140425.1480317-6-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index de3bd03cbeea..10e5e4d8a00f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -297,6 +297,8 @@ struct bpf_map {
 	bool bypass_spec_v1;
 	bool frozen; /* write-once; write-protected by freeze_mutex */
 	bool free_after_mult_rcu_gp;
+	bool free_after_rcu_gp;
+	atomic64_t sleepable_refcnt;
 	s64 __percpu *elem_count;
 };
 
-- 
cgit v1.2.3


From 2a502ff0c4e42a739b5aa550c901bf3852795532 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Fri, 1 Dec 2023 15:28:34 -0800
Subject: net: Add queue and napi association

Add the napi pointer in netdev queue for tracking the napi
instance for each queue. This achieves the queue<->napi mapping.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Link: https://lore.kernel.org/r/170147331483.5260.15723438819994285695.stgit@anambiarhost.jf.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c2d74bc112dd..5ddff11cbe26 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -665,6 +665,10 @@ struct netdev_queue {
 #ifdef CONFIG_XDP_SOCKETS
 	struct xsk_buff_pool    *pool;
 #endif
+	/* NAPI instance for the queue
+	 * Readers and writers must hold RTNL
+	 */
+	struct napi_struct      *napi;
 /*
  * write-mostly part
  */
@@ -2657,6 +2661,10 @@ static inline void *netdev_priv(const struct net_device *dev)
  */
 #define SET_NETDEV_DEVTYPE(net, devtype)	((net)->dev.type = (devtype))
 
+void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
+			  enum netdev_queue_type type,
+			  struct napi_struct *napi);
+
 /* Default NAPI poll() weight
  * Device drivers are strongly advised to not use bigger value
  */
-- 
cgit v1.2.3


From 26793bfb5d6072326d1465343e7cbf6156abca4f Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Fri, 1 Dec 2023 15:29:07 -0800
Subject: net: Add NAPI IRQ support

Add support to associate the interrupt vector number for a
NAPI instance.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Link: https://lore.kernel.org/r/170147334728.5260.13221803396905901904.stgit@anambiarhost.jf.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5ddff11cbe26..5551177e024e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -382,6 +382,7 @@ struct napi_struct {
 	/* control-path-only fields follow */
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
+	int			irq;
 };
 
 enum {
@@ -2665,6 +2666,11 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
 			  enum netdev_queue_type type,
 			  struct napi_struct *napi);
 
+static inline void netif_napi_set_irq(struct napi_struct *napi, int irq)
+{
+	napi->irq = irq;
+}
+
 /* Default NAPI poll() weight
  * Device drivers are strongly advised to not use bigger value
  */
-- 
cgit v1.2.3


From 37e4b8df27bc68340f3fc80dbb27e3549c7f881c Mon Sep 17 00:00:00 2001
From: Jianheng Zhang <Jianheng.Zhang@synopsys.com>
Date: Fri, 1 Dec 2023 03:22:03 +0000
Subject: net: stmmac: fix FPE events losing

The status bits of register MAC_FPE_CTRL_STS are clear on read. Using
32-bit read for MAC_FPE_CTRL_STS in dwmac5_fpe_configure() and
dwmac5_fpe_send_mpacket() clear the status bits. Then the stmmac interrupt
handler missing FPE event status and leads to FPE handshaking failure and
retries.
To avoid clear status bits of MAC_FPE_CTRL_STS in dwmac5_fpe_configure()
and dwmac5_fpe_send_mpacket(), add fpe_csr to stmmac_fpe_cfg structure to
cache the control bits of MAC_FPE_CTRL_STS and to avoid reading
MAC_FPE_CTRL_STS in those methods.

Fixes: 5a5586112b92 ("net: stmmac: support FPE link partner hand-shaking procedure")
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Signed-off-by: Jianheng Zhang <Jianheng.Zhang@synopsys.com>
Link: https://lore.kernel.org/r/CY5PR12MB637225A7CF529D5BE0FBE59CBF81A@CY5PR12MB6372.namprd12.prod.outlook.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/stmmac.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 0b4658a7eceb..dee5ad6e48c5 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -175,6 +175,7 @@ struct stmmac_fpe_cfg {
 	bool hs_enable;				/* FPE handshake enable */
 	enum stmmac_fpe_state lp_fpe_state;	/* Link Partner FPE state */
 	enum stmmac_fpe_state lo_fpe_state;	/* Local station FPE state */
+	u32 fpe_csr;				/* MAC_FPE_CTRL_STS reg cache */
 };
 
 struct stmmac_safety_feature_cfg {
-- 
cgit v1.2.3


From a5e400a985df8041ed4659ed1462aa9134318130 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Sun, 20 Aug 2023 20:58:56 +0300
Subject: net/mlx5e: Honor user choice of IPsec replay window size

Users can configure IPsec replay window size, but mlx5 driver didn't
honor their choice and set always 32bits. Fix assignment logic to
configure right size from the beginning.

Fixes: 7db21ef4566e ("net/mlx5e: Set IPsec replay sequence numbers")
Reviewed-by: Patrisious Haddad <phaddad@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6f3631425f38..90ca63f4bf63 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -12001,6 +12001,13 @@ enum {
 	MLX5_IPSEC_ASO_INC_SN            = 0x2,
 };
 
+enum {
+	MLX5_IPSEC_ASO_REPLAY_WIN_32BIT  = 0x0,
+	MLX5_IPSEC_ASO_REPLAY_WIN_64BIT  = 0x1,
+	MLX5_IPSEC_ASO_REPLAY_WIN_128BIT = 0x2,
+	MLX5_IPSEC_ASO_REPLAY_WIN_256BIT = 0x3,
+};
+
 struct mlx5_ifc_ipsec_aso_bits {
 	u8         valid[0x1];
 	u8         reserved_at_201[0x1];
-- 
cgit v1.2.3


From c2bf84f1d1a1595dcc45fe867f0e02b331993fee Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Sun, 12 Nov 2023 13:50:00 +0200
Subject: net/mlx5e: Tidy up IPsec NAT-T SA discovery

IPsec NAT-T packets are UDP encapsulated packets over ESP normal ones.
In case they arrive to RX, the SPI and ESP are located in inner header,
while the check was performed on outer header instead.

That wrong check caused to the situation where received rekeying request
was missed and caused to rekey timeout, which "compensated" this failure
by completing rekeying.

Fixes: d65954934937 ("net/mlx5e: Support IPsec NAT-T functionality")
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 90ca63f4bf63..3f7b664d625b 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -621,7 +621,7 @@ struct mlx5_ifc_fte_match_set_misc_bits {
 
 	u8         reserved_at_140[0x8];
 	u8         bth_dst_qp[0x18];
-	u8	   reserved_at_160[0x20];
+	u8	   inner_esp_spi[0x20];
 	u8	   outer_esp_spi[0x20];
 	u8         reserved_at_1a0[0x60];
 };
-- 
cgit v1.2.3


From 70da1d01edf6da3fde1df98b2125a77083a0fb82 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 2 Oct 2023 16:36:55 +0200
Subject: cpu/hotplug: remove CPUHP_SLAB_PREPARE hooks

The CPUHP_SLAB_PREPARE hooks are only used by SLAB which is removed.
SLUB defines them as NULL, so we can remove those altogether.

Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/cpuhotplug.h | 1 -
 include/linux/slab.h       | 8 --------
 2 files changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index d305db70674b..07cb8f7030b6 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -108,7 +108,6 @@ enum cpuhp_state {
 	CPUHP_X2APIC_PREPARE,
 	CPUHP_SMPCFD_PREPARE,
 	CPUHP_RELAY_PREPARE,
-	CPUHP_SLAB_PREPARE,
 	CPUHP_MD_RAID5_PREPARE,
 	CPUHP_RCUTREE_PREP,
 	CPUHP_CPUIDLE_COUPLED_PREPARE,
diff --git a/include/linux/slab.h b/include/linux/slab.h
index d6d6ffeeb9a2..34e43cddc520 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -788,12 +788,4 @@ size_t kmalloc_size_roundup(size_t size);
 
 void __init kmem_cache_init_late(void);
 
-#if defined(CONFIG_SMP) && defined(CONFIG_SLAB)
-int slab_prepare_cpu(unsigned int cpu);
-int slab_dead_cpu(unsigned int cpu);
-#else
-#define slab_prepare_cpu	NULL
-#define slab_dead_cpu		NULL
-#endif
-
 #endif	/* _LINUX_SLAB_H */
-- 
cgit v1.2.3


From a9e0b9f27266d46ed6e73aac8d0844602cd0cb93 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 2 Oct 2023 17:43:38 +0200
Subject: mm/slab: remove CONFIG_SLAB code from slab common code

In slab_common.c and slab.h headers, we can now remove all code behind
CONFIG_SLAB and CONFIG_DEBUG_SLAB ifdefs, and remove all CONFIG_SLUB
ifdefs.

Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 34e43cddc520..b2015d0e01ad 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -24,7 +24,7 @@
 
 /*
  * Flags to pass to kmem_cache_create().
- * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set.
+ * The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op
  */
 /* DEBUG: Perform (expensive) checks on alloc/free */
 #define SLAB_CONSISTENCY_CHECKS	((slab_flags_t __force)0x00000100U)
@@ -302,25 +302,15 @@ static inline unsigned int arch_slab_minalign(void)
  * Kmalloc array related definitions
  */
 
-#ifdef CONFIG_SLAB
 /*
- * SLAB and SLUB directly allocates requests fitting in to an order-1 page
+ * SLUB directly allocates requests fitting in to an order-1 page
  * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
  */
 #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
 #define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
 #ifndef KMALLOC_SHIFT_LOW
-#define KMALLOC_SHIFT_LOW	5
-#endif
-#endif
-
-#ifdef CONFIG_SLUB
-#define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
-#ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	3
 #endif
-#endif
 
 /* Maximum allocatable size */
 #define KMALLOC_MAX_SIZE	(1UL << KMALLOC_SHIFT_MAX)
-- 
cgit v1.2.3


From 6ac805d13870925c787a28e3fe5cc73610cacd03 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 4 Dec 2023 10:47:49 -0700
Subject: iov_iter: remove unused 'iov' argument from import_single_range()

It is entirely unused, just get rid of it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/20231204174827.1258875-2-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index b6214cbf2a43..bfafd3542fa7 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -348,7 +348,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
 		 struct iov_iter *i, bool compat);
 int import_single_range(int type, void __user *buf, size_t len,
-		 struct iovec *iov, struct iov_iter *i);
+		 struct iov_iter *i);
 int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);
 
 static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
-- 
cgit v1.2.3


From 9fd7874c0e5c89d7da0b4442271696ec0f8edcba Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 4 Dec 2023 10:47:50 -0700
Subject: iov_iter: replace import_single_range() with import_ubuf()

With the removal of the 'iov' argument to import_single_range(), the two
functions are now fully identical. Convert the import_single_range()
callers to import_ubuf(), and remove the former fully.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/20231204174827.1258875-3-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uio.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index bfafd3542fa7..bea9c89922d9 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -347,8 +347,6 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec,
 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
 		 struct iov_iter *i, bool compat);
-int import_single_range(int type, void __user *buf, size_t len,
-		 struct iov_iter *i);
 int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);
 
 static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
-- 
cgit v1.2.3


From 118eb89b1e7f6807776c012cffc5c9b07fd26164 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Wed, 15 Nov 2023 14:58:05 +0530
Subject: drivers: perf: arm_pmu: Drop 'pmu_lock' element from 'struct
 pmu_hw_events'

As 'pmu_lock' element is not being used in any ARM PMU implementation, just
drop this from 'struct pmu_hw_events'.

Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20231115092805.737822-3-anshuman.khandual@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmu.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 143fbc10ecfe..e2503d48ddee 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -59,12 +59,6 @@ struct pmu_hw_events {
 	 */
 	DECLARE_BITMAP(used_mask, ARMPMU_MAX_HWEVENTS);
 
-	/*
-	 * Hardware lock to serialize accesses to PMU registers. Needed for the
-	 * read/modify/write sequences.
-	 */
-	raw_spinlock_t		pmu_lock;
-
 	/*
 	 * When using percpu IRQs, we need a percpu dev_id. Place it here as we
 	 * already have to allocate this struct per cpu.
-- 
cgit v1.2.3


From 41f6f64e6999a837048b1bd13a2f8742964eca6b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 5 Dec 2023 10:42:39 -0800
Subject: bpf: support non-r10 register spill/fill to/from stack in precision
 tracking

Use instruction (jump) history to record instructions that performed
register spill/fill to/from stack, regardless if this was done through
read-only r10 register, or any other register after copying r10 into it
*and* potentially adjusting offset.

To make this work reliably, we push extra per-instruction flags into
instruction history, encoding stack slot index (spi) and stack frame
number in extra 10 bit flags we take away from prev_idx in instruction
history. We don't touch idx field for maximum performance, as it's
checked most frequently during backtracking.

This change removes basically the last remaining practical limitation of
precision backtracking logic in BPF verifier. It fixes known
deficiencies, but also opens up new opportunities to reduce number of
verified states, explored in the subsequent patches.

There are only three differences in selftests' BPF object files
according to veristat, all in the positive direction (less states).

File                                    Program        Insns (A)  Insns (B)  Insns  (DIFF)  States (A)  States (B)  States (DIFF)
--------------------------------------  -------------  ---------  ---------  -------------  ----------  ----------  -------------
test_cls_redirect_dynptr.bpf.linked3.o  cls_redirect        2987       2864  -123 (-4.12%)         240         231    -9 (-3.75%)
xdp_synproxy_kern.bpf.linked3.o         syncookie_tc       82848      82661  -187 (-0.23%)        5107        5073   -34 (-0.67%)
xdp_synproxy_kern.bpf.linked3.o         syncookie_xdp      85116      84964  -152 (-0.18%)        5162        5130   -32 (-0.62%)

Note, I avoided renaming jmp_history to more generic insn_hist to
minimize number of lines changed and potential merge conflicts between
bpf and bpf-next trees.

Notice also cur_hist_entry pointer reset to NULL at the beginning of
instruction verification loop. This pointer avoids the problem of
relying on last jump history entry's insn_idx to determine whether we
already have entry for current instruction or not. It can happen that we
added jump history entry because current instruction is_jmp_point(), but
also we need to add instruction flags for stack access. In this case, we
don't want to entries, so we need to reuse last added entry, if it is
present.

Relying on insn_idx comparison has the same ambiguity problem as the one
that was fixed recently in [0], so we avoid that.

  [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231110002638.4168352-3-andrii@kernel.org/

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reported-by: Tao Lyu <tao.lyu@epfl.ch>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231205184248.1502704-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3378cc753061..bada59812e00 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -325,12 +325,34 @@ struct bpf_func_state {
 	int allocated_stack;
 };
 
-struct bpf_idx_pair {
-	u32 prev_idx;
+#define MAX_CALL_FRAMES 8
+
+/* instruction history flags, used in bpf_jmp_history_entry.flags field */
+enum {
+	/* instruction references stack slot through PTR_TO_STACK register;
+	 * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8)
+	 * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512,
+	 * 8 bytes per slot, so slot index (spi) is [0, 63])
+	 */
+	INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */
+
+	INSN_F_SPI_MASK = 0x3f, /* 6 bits */
+	INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */
+
+	INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */
+};
+
+static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES);
+static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8);
+
+struct bpf_jmp_history_entry {
 	u32 idx;
+	/* insn idx can't be bigger than 1 million */
+	u32 prev_idx : 22;
+	/* special flags, e.g., whether insn is doing register stack spill/load */
+	u32 flags : 10;
 };
 
-#define MAX_CALL_FRAMES 8
 /* Maximum number of register states that can exist at once */
 #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES)
 struct bpf_verifier_state {
@@ -413,7 +435,7 @@ struct bpf_verifier_state {
 	 * For most states jmp_history_cnt is [0-3].
 	 * For loops can go up to ~40.
 	 */
-	struct bpf_idx_pair *jmp_history;
+	struct bpf_jmp_history_entry *jmp_history;
 	u32 jmp_history_cnt;
 	u32 dfs_depth;
 	u32 callback_unroll_depth;
@@ -656,6 +678,7 @@ struct bpf_verifier_env {
 		int cur_stack;
 	} cfg;
 	struct backtrack_state bt;
+	struct bpf_jmp_history_entry *cur_hist_ent;
 	u32 pass_cnt; /* number of times do_check() was called */
 	u32 subprog_cnt;
 	/* number of instructions analyzed by the verifier */
-- 
cgit v1.2.3


From 0949dd96dffec39683c6066cf8d0877cebc321ec Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Tue, 21 Nov 2023 13:44:25 +0000
Subject: drivers: base: Allow parts of GENERIC_CPU_DEVICES to be overridden

Architectures often have extra per-cpu work that needs doing
before a CPU is registered, often to determine if a CPU is
hotpluggable.

To allow the ACPI architectures to use GENERIC_CPU_DEVICES, move
the cpu_register() call into arch_register_cpu(), which is made __weak
so architectures with extra work can override it.
This aligns with the way x86, ia64 and loongarch register hotplug CPUs
when they become present.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Signed-off-by: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/E1r5R3B-00Csz6-Uh@rmk-PC.armlinux.org.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/cpu.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index fc8094419084..1e982d63eae8 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -88,6 +88,10 @@ extern ssize_t arch_cpu_probe(const char *, size_t);
 extern ssize_t arch_cpu_release(const char *, size_t);
 #endif
 
+#ifdef CONFIG_GENERIC_CPU_DEVICES
+DECLARE_PER_CPU(struct cpu, cpu_devices);
+#endif
+
 /*
  * These states are not related to the core CPU hotplug mechanism. They are
  * used by various (sub)architectures to track internal state
-- 
cgit v1.2.3


From bb5e44fb3be685ecb3feb120aca4269a92cc84cf Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 21 Nov 2023 13:44:36 +0000
Subject: drivers: base: add arch_cpu_is_hotpluggable()

The differences between architecture specific implementations of
arch_register_cpu() are down to whether the CPU is hotpluggable or not.
Rather than overriding the weak version of arch_register_cpu(), provide
a function that can be used to provide this detail instead.

Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Signed-off-by: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/E1r5R3M-00CszH-6r@rmk-PC.armlinux.org.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/cpu.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 1e982d63eae8..dcb89c987164 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -80,6 +80,7 @@ extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
 				 const struct attribute_group **groups,
 				 const char *fmt, ...);
+extern bool arch_cpu_is_hotpluggable(int cpu);
 extern int arch_register_cpu(int cpu);
 extern void arch_unregister_cpu(int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
-- 
cgit v1.2.3


From 43a71cd66b9c0a4af3d15d8644359fde35bdbed0 Mon Sep 17 00:00:00 2001
From: Coco Li <lixiaoyan@google.com>
Date: Mon, 4 Dec 2023 20:12:30 +0000
Subject: net-device: reorganize net_device fast path variables

Reorganize fast path variables on tx-txrx-rx order
Fastpath variables end after npinfo.

Below data generated with pahole on x86 architecture.

Fast path variables span cache lines before change: 12
Fast path variables span cache lines after change: 4

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Coco Li <lixiaoyan@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20231204201232.520025-2-lixiaoyan@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 117 +++++++++++++++++++++++++---------------------
 1 file changed, 64 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5551177e024e..cb96aad6a6ee 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2097,6 +2097,70 @@ enum netdev_stat_type {
  */
 
 struct net_device {
+	/* Cacheline organization can be found documented in
+	 * Documentation/networking/net_cachelines/net_device.rst.
+	 * Please update the document when adding new fields.
+	 */
+
+	/* TX read-mostly hotpath */
+	__cacheline_group_begin(net_device_read_tx);
+	unsigned long long	priv_flags;
+	const struct net_device_ops *netdev_ops;
+	const struct header_ops *header_ops;
+	struct netdev_queue	*_tx;
+	unsigned int		real_num_tx_queues;
+	unsigned int		gso_max_size;
+	unsigned int		gso_ipv4_max_size;
+	u16			gso_max_segs;
+	s16			num_tc;
+	/* Note : dev->mtu is often read without holding a lock.
+	 * Writers usually hold RTNL.
+	 * It is recommended to use READ_ONCE() to annotate the reads,
+	 * and to use WRITE_ONCE() to annotate the writes.
+	 */
+	unsigned int		mtu;
+	unsigned short		needed_headroom;
+	struct netdev_tc_txq	tc_to_txq[TC_MAX_QUEUE];
+#ifdef CONFIG_XPS
+	struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+	struct nf_hook_entries __rcu *nf_hooks_egress;
+#endif
+#ifdef CONFIG_NET_XGRESS
+	struct bpf_mprog_entry __rcu *tcx_egress;
+#endif
+	__cacheline_group_end(net_device_read_tx);
+
+	/* TXRX read-mostly hotpath */
+	__cacheline_group_begin(net_device_read_txrx);
+	unsigned int		flags;
+	unsigned short		hard_header_len;
+	netdev_features_t	features;
+	struct inet6_dev __rcu	*ip6_ptr;
+	__cacheline_group_end(net_device_read_txrx);
+
+	/* RX read-mostly hotpath */
+	__cacheline_group_begin(net_device_read_rx);
+	struct list_head	ptype_specific;
+	int			ifindex;
+	unsigned int		real_num_rx_queues;
+	struct netdev_rx_queue	*_rx;
+	unsigned long		gro_flush_timeout;
+	int			napi_defer_hard_irqs;
+	unsigned int		gro_max_size;
+	unsigned int		gro_ipv4_max_size;
+	rx_handler_func_t __rcu	*rx_handler;
+	void __rcu		*rx_handler_data;
+	possible_net_t			nd_net;
+#ifdef CONFIG_NETPOLL
+	struct netpoll_info __rcu	*npinfo;
+#endif
+#ifdef CONFIG_NET_XGRESS
+	struct bpf_mprog_entry __rcu *tcx_ingress;
+#endif
+	__cacheline_group_end(net_device_read_rx);
+
 	char			name[IFNAMSIZ];
 	struct netdev_name_node	*name_node;
 	struct dev_ifalias	__rcu *ifalias;
@@ -2121,7 +2185,6 @@ struct net_device {
 	struct list_head	unreg_list;
 	struct list_head	close_list;
 	struct list_head	ptype_all;
-	struct list_head	ptype_specific;
 
 	struct {
 		struct list_head upper;
@@ -2129,26 +2192,13 @@ struct net_device {
 	} adj_list;
 
 	/* Read-mostly cache-line for fast-path access */
-	unsigned int		flags;
 	xdp_features_t		xdp_features;
-	unsigned long long	priv_flags;
-	const struct net_device_ops *netdev_ops;
 	const struct xdp_metadata_ops *xdp_metadata_ops;
 	const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops;
-	int			ifindex;
 	unsigned short		gflags;
-	unsigned short		hard_header_len;
 
-	/* Note : dev->mtu is often read without holding a lock.
-	 * Writers usually hold RTNL.
-	 * It is recommended to use READ_ONCE() to annotate the reads,
-	 * and to use WRITE_ONCE() to annotate the writes.
-	 */
-	unsigned int		mtu;
-	unsigned short		needed_headroom;
 	unsigned short		needed_tailroom;
 
-	netdev_features_t	features;
 	netdev_features_t	hw_features;
 	netdev_features_t	wanted_features;
 	netdev_features_t	vlan_features;
@@ -2192,8 +2242,6 @@ struct net_device {
 	const struct tlsdev_ops *tlsdev_ops;
 #endif
 
-	const struct header_ops *header_ops;
-
 	unsigned char		operstate;
 	unsigned char		link_mode;
 
@@ -2234,9 +2282,7 @@ struct net_device {
 
 
 	/* Protocol-specific pointers */
-
 	struct in_device __rcu	*ip_ptr;
-	struct inet6_dev __rcu	*ip6_ptr;
 #if IS_ENABLED(CONFIG_VLAN_8021Q)
 	struct vlan_info __rcu	*vlan_info;
 #endif
@@ -2271,26 +2317,14 @@ struct net_device {
 	/* Interface address info used in eth_type_trans() */
 	const unsigned char	*dev_addr;
 
-	struct netdev_rx_queue	*_rx;
 	unsigned int		num_rx_queues;
-	unsigned int		real_num_rx_queues;
-
 	struct bpf_prog __rcu	*xdp_prog;
-	unsigned long		gro_flush_timeout;
-	int			napi_defer_hard_irqs;
 #define GRO_LEGACY_MAX_SIZE	65536u
 /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
  * and shinfo->gso_segs is a 16bit field.
  */
 #define GRO_MAX_SIZE		(8 * 65535u)
-	unsigned int		gro_max_size;
-	unsigned int		gro_ipv4_max_size;
 	unsigned int		xdp_zc_max_segs;
-	rx_handler_func_t __rcu	*rx_handler;
-	void __rcu		*rx_handler_data;
-#ifdef CONFIG_NET_XGRESS
-	struct bpf_mprog_entry __rcu *tcx_ingress;
-#endif
 	struct netdev_queue __rcu *ingress_queue;
 #ifdef CONFIG_NETFILTER_INGRESS
 	struct nf_hook_entries __rcu *nf_hooks_ingress;
@@ -2305,25 +2339,13 @@ struct net_device {
 /*
  * Cache lines mostly used on transmit path
  */
-	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
 	unsigned int		num_tx_queues;
-	unsigned int		real_num_tx_queues;
 	struct Qdisc __rcu	*qdisc;
 	unsigned int		tx_queue_len;
 	spinlock_t		tx_global_lock;
 
 	struct xdp_dev_bulk_queue __percpu *xdp_bulkq;
 
-#ifdef CONFIG_XPS
-	struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
-#endif
-#ifdef CONFIG_NET_XGRESS
-	struct bpf_mprog_entry __rcu *tcx_egress;
-#endif
-#ifdef CONFIG_NETFILTER_EGRESS
-	struct nf_hook_entries __rcu *nf_hooks_egress;
-#endif
-
 #ifdef CONFIG_NET_SCHED
 	DECLARE_HASHTABLE	(qdisc_hash, 4);
 #endif
@@ -2362,12 +2384,6 @@ struct net_device {
 	bool needs_free_netdev;
 	void (*priv_destructor)(struct net_device *dev);
 
-#ifdef CONFIG_NETPOLL
-	struct netpoll_info __rcu	*npinfo;
-#endif
-
-	possible_net_t			nd_net;
-
 	/* mid-layer private */
 	void				*ml_priv;
 	enum netdev_ml_priv_type	ml_priv_type;
@@ -2402,20 +2418,15 @@ struct net_device {
  */
 #define GSO_MAX_SIZE		(8 * GSO_MAX_SEGS)
 
-	unsigned int		gso_max_size;
 #define TSO_LEGACY_MAX_SIZE	65536
 #define TSO_MAX_SIZE		UINT_MAX
 	unsigned int		tso_max_size;
-	u16			gso_max_segs;
 #define TSO_MAX_SEGS		U16_MAX
 	u16			tso_max_segs;
-	unsigned int		gso_ipv4_max_size;
 
 #ifdef CONFIG_DCB
 	const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
-	s16			num_tc;
-	struct netdev_tc_txq	tc_to_txq[TC_MAX_QUEUE];
 	u8			prio_tc_map[TC_BITMASK + 1];
 
 #if IS_ENABLED(CONFIG_FCOE)
-- 
cgit v1.2.3


From d5fed5addb2b6bc13035de4338b7ea2052a2e006 Mon Sep 17 00:00:00 2001
From: Coco Li <lixiaoyan@google.com>
Date: Mon, 4 Dec 2023 20:12:31 +0000
Subject: tcp: reorganize tcp_sock fast path variables

The variables are organized according in the following way:

- TX read-mostly hotpath cache lines
- TXRX read-mostly hotpath cache lines
- RX read-mostly hotpath cache lines
- TX read-write hotpath cache line
- TXRX read-write hotpath cache line
- RX read-write hotpath cache line

Fastpath cachelines end after rcvq_space.

Cache line boundaries are enforced only between read-mostly and
read-write. That is, if read-mostly tx cachelines bleed into
read-mostly txrx cachelines, we do not care. We care about the
boundaries between read and write cachelines because we want
to prevent false sharing.

Fast path variables span cache lines before change: 12
Fast path variables span cache lines after change: 8

Suggested-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Wei Wang <weiwan@google.com>
Signed-off-by: Coco Li <lixiaoyan@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20231204201232.520025-3-lixiaoyan@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/tcp.h | 248 ++++++++++++++++++++++++++++------------------------
 1 file changed, 134 insertions(+), 114 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 68f3d315d2e1..f55ec155f5b7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -194,23 +194,121 @@ static inline bool tcp_rsk_used_ao(const struct request_sock *req)
 #define TCP_RMEM_TO_WIN_SCALE 8
 
 struct tcp_sock {
+	/* Cacheline organization can be found documented in
+	 * Documentation/networking/net_cachelines/tcp_sock.rst.
+	 * Please update the document when adding new fields.
+	 */
+
 	/* inet_connection_sock has to be the first member of tcp_sock */
 	struct inet_connection_sock	inet_conn;
-	u16	tcp_header_len;	/* Bytes of tcp header to send		*/
+
+	/* TX read-mostly hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_read_tx);
+	/* timestamp of last sent data packet (for restart window) */
+	u32	max_window;	/* Maximal window ever seen from peer	*/
+	u32	rcv_ssthresh;	/* Current window clamp			*/
+	u32	reordering;	/* Packet reordering metric.		*/
+	u32	notsent_lowat;	/* TCP_NOTSENT_LOWAT */
 	u16	gso_segs;	/* Max number of segs per GSO packet	*/
+	/* from STCP, retrans queue hinting */
+	struct sk_buff *lost_skb_hint;
+	struct sk_buff *retransmit_skb_hint;
+	__cacheline_group_end(tcp_sock_read_tx);
+
+	/* TXRX read-mostly hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_read_txrx);
+	u32	tsoffset;	/* timestamp offset */
+	u32	snd_wnd;	/* The window we expect to receive	*/
+	u32	mss_cache;	/* Cached effective mss, not including SACKS */
+	u32	snd_cwnd;	/* Sending congestion window		*/
+	u32	prr_out;	/* Total number of pkts sent during Recovery. */
+	u32	lost_out;	/* Lost packets			*/
+	u32	sacked_out;	/* SACK'd packets			*/
+	u16	tcp_header_len;	/* Bytes of tcp header to send		*/
+	u8	chrono_type : 2,	/* current chronograph type */
+		repair      : 1,
+		is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
+		is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
+	__cacheline_group_end(tcp_sock_read_txrx);
+
+	/* RX read-mostly hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_read_rx);
+	u32	copied_seq;	/* Head of yet unread data */
+	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
+	u32	snd_wl1;	/* Sequence for window update		*/
+	u32	tlp_high_seq;	/* snd_nxt at the time of TLP */
+	u32	rttvar_us;	/* smoothed mdev_max			*/
+	u32	retrans_out;	/* Retransmitted packets out		*/
+	u16	advmss;		/* Advertised MSS			*/
+	u16	urg_data;	/* Saved octet of OOB data and control flags */
+	u32	lost;		/* Total data packets lost incl. rexmits */
+	struct  minmax rtt_min;
+	/* OOO segments go in this rbtree. Socket lock must be held. */
+	struct rb_root	out_of_order_queue;
+	u32	snd_ssthresh;	/* Slow start size threshold		*/
+	__cacheline_group_end(tcp_sock_read_rx);
 
+	/* TX read-write hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_write_tx) ____cacheline_aligned;
+	u32	segs_out;	/* RFC4898 tcpEStatsPerfSegsOut
+				 * The total number of segments sent.
+				 */
+	u32	data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
+				 * total number of data segments sent.
+				 */
+	u64	bytes_sent;	/* RFC4898 tcpEStatsPerfHCDataOctetsOut
+				 * total number of data bytes sent.
+				 */
+	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
+	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
+	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
+	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
+	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */
+	u32	lsndtime;
+	u32	mdev_us;	/* medium deviation			*/
+	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
+	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
+	u64	tcp_mstamp;	/* most recent packet received/sent */
+	u32	rtt_seq;	/* sequence number to update rttvar	*/
+	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
+	struct sk_buff *highest_sack;   /* skb just after the highest
+					 * skb with SACKed bit set
+					 * (validity guaranteed only if
+					 * sacked_out > 0)
+					 */
+	u8	ecn_flags;	/* ECN status bits.			*/
+	__cacheline_group_end(tcp_sock_write_tx);
+
+	/* TXRX read-write hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_write_txrx);
 /*
  *	Header prediction flags
  *	0x5?10 << 16 + snd_wnd in net byte order
  */
 	__be32	pred_flags;
-
+	u32	rcv_nxt;	/* What we want to receive next		*/
+	u32	snd_nxt;	/* Next sequence we send		*/
+	u32	snd_una;	/* First byte we want an ack for	*/
+	u32	window_clamp;	/* Maximal window to advertise		*/
+	u32	srtt_us;	/* smoothed round trip time << 3 in usecs */
+	u32	packets_out;	/* Packets which are "in flight"	*/
+	u32	snd_up;		/* Urgent pointer		*/
+	u32	delivered;	/* Total data packets delivered incl. rexmits */
+	u32	delivered_ce;	/* Like the above but only ECE marked packets */
+	u32	app_limited;	/* limited until "delivered" reaches this val */
+	u32	rcv_wnd;	/* Current receiver window		*/
 /*
- *	RFC793 variables by their proper names. This means you can
- *	read the code and the spec side by side (and laugh ...)
- *	See RFC793 and RFC1122. The RFC writes these in capitals.
+ *      Options received (usually on last packet, some only on SYN packets).
  */
-	u64	bytes_received;	/* RFC4898 tcpEStatsAppHCThruOctetsReceived
+	struct tcp_options_received rx_opt;
+	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
+		rate_app_limited:1;  /* rate_{delivered,interval_us} limited? */
+	__cacheline_group_end(tcp_sock_write_txrx);
+
+	/* RX read-write hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_write_rx);
+	u64	bytes_received;
+				/* RFC4898 tcpEStatsAppHCThruOctetsReceived
 				 * sum(delta(rcv_nxt)), or how many bytes
 				 * were acked.
 				 */
@@ -220,45 +318,44 @@ struct tcp_sock {
 	u32	data_segs_in;	/* RFC4898 tcpEStatsPerfDataSegsIn
 				 * total number of data segments in.
 				 */
- 	u32	rcv_nxt;	/* What we want to receive next 	*/
-	u32	copied_seq;	/* Head of yet unread data		*/
 	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
- 	u32	snd_nxt;	/* Next sequence we send		*/
-	u32	segs_out;	/* RFC4898 tcpEStatsPerfSegsOut
-				 * The total number of segments sent.
-				 */
-	u32	data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
-				 * total number of data segments sent.
-				 */
-	u64	bytes_sent;	/* RFC4898 tcpEStatsPerfHCDataOctetsOut
-				 * total number of data bytes sent.
-				 */
+	u32	max_packets_out;  /* max packets_out in last window */
+	u32	cwnd_usage_seq;  /* right edge of cwnd usage tracking flight */
+	u32	rate_delivered;    /* saved rate sample: packets delivered */
+	u32	rate_interval_us;  /* saved rate sample: time elapsed */
+	u32	rcv_rtt_last_tsecr;
+	u64	first_tx_mstamp;  /* start of window send phase */
+	u64	delivered_mstamp; /* time we reached "delivered" */
 	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
 				 */
+	struct {
+		u32	rtt_us;
+		u32	seq;
+		u64	time;
+	} rcv_rtt_est;
+/* Receiver queue space */
+	struct {
+		u32	space;
+		u32	seq;
+		u64	time;
+	} rcvq_space;
+	__cacheline_group_end(tcp_sock_write_rx);
+	/* End of Hot Path */
+
+/*
+ *	RFC793 variables by their proper names. This means you can
+ *	read the code and the spec side by side (and laugh ...)
+ *	See RFC793 and RFC1122. The RFC writes these in capitals.
+ */
 	u32	dsack_dups;	/* RFC4898 tcpEStatsStackDSACKDups
 				 * total number of DSACK blocks received
 				 */
- 	u32	snd_una;	/* First byte we want an ack for	*/
- 	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
-	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
-	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */
 	u32	last_oow_ack_time;  /* timestamp of last out-of-window ACK */
 	u32	compressed_ack_rcv_nxt;
-
-	u32	tsoffset;	/* timestamp offset */
-
 	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
-	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
-
-	u32	snd_wl1;	/* Sequence for window update		*/
-	u32	snd_wnd;	/* The window we expect to receive	*/
-	u32	max_window;	/* Maximal window ever seen from peer	*/
-	u32	mss_cache;	/* Cached effective mss, not including SACKS */
 
-	u32	window_clamp;	/* Maximal window to advertise		*/
-	u32	rcv_ssthresh;	/* Current window clamp			*/
 	u8	scaling_ratio;	/* see tcp_win_from_space() */
 	/* Information of the most recently (s)acked skb */
 	struct tcp_rack {
@@ -272,24 +369,16 @@ struct tcp_sock {
 		   dsack_seen:1, /* Whether DSACK seen after last adj */
 		   advanced:1;	 /* mstamp advanced since last lost marking */
 	} rack;
-	u16	advmss;		/* Advertised MSS			*/
 	u8	compressed_ack;
 	u8	dup_ack_counter:2,
 		tlp_retrans:1,	/* TLP is a retransmission */
 		tcp_usec_ts:1, /* TSval values in usec */
 		unused:4;
-	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
-	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
-	u8	chrono_type:2,	/* current chronograph type */
-		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
+	u8	thin_lto    : 1,/* Use linear timeouts for thin streams */
+		recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
 		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
 		fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
-		is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
-		fastopen_client_fail:2; /* reason why fastopen failed */
-	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
-		thin_lto    : 1,/* Use linear timeouts for thin streams */
-		recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
-		repair      : 1,
+		fastopen_client_fail:2, /* reason why fastopen failed */
 		frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
 	u8	repair_queue;
 	u8	save_syn:2,	/* Save headers of SYN packet */
@@ -297,45 +386,19 @@ struct tcp_sock {
 		syn_fastopen:1,	/* SYN includes Fast Open option */
 		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
 		syn_fastopen_ch:1, /* Active TFO re-enabling probe */
-		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
-		is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
-	u32	tlp_high_seq;	/* snd_nxt at the time of TLP */
+		syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
 
 	u32	tcp_tx_delay;	/* delay (in usec) added to TX packets */
-	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
-	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
 
 /* RTT measurement */
-	u64	tcp_mstamp;	/* most recent packet received/sent */
-	u32	srtt_us;	/* smoothed round trip time << 3 in usecs */
-	u32	mdev_us;	/* medium deviation			*/
 	u32	mdev_max_us;	/* maximal mdev for the last rtt period	*/
-	u32	rttvar_us;	/* smoothed mdev_max			*/
-	u32	rtt_seq;	/* sequence number to update rttvar	*/
-	struct  minmax rtt_min;
 
-	u32	packets_out;	/* Packets which are "in flight"	*/
-	u32	retrans_out;	/* Retransmitted packets out		*/
-	u32	max_packets_out;  /* max packets_out in last window */
-	u32	cwnd_usage_seq;  /* right edge of cwnd usage tracking flight */
-
-	u16	urg_data;	/* Saved octet of OOB data and control flags */
-	u8	ecn_flags;	/* ECN status bits.			*/
 	u8	keepalive_probes; /* num of allowed keep alive probes	*/
-	u32	reordering;	/* Packet reordering metric.		*/
 	u32	reord_seen;	/* number of data packet reordering events */
-	u32	snd_up;		/* Urgent pointer		*/
-
-/*
- *      Options received (usually on last packet, some only on SYN packets).
- */
-	struct tcp_options_received rx_opt;
 
 /*
  *	Slow start and congestion control (see also Nagle, and Karn & Partridge)
  */
- 	u32	snd_ssthresh;	/* Slow start size threshold		*/
- 	u32	snd_cwnd;	/* Sending congestion window		*/
 	u32	snd_cwnd_cnt;	/* Linear increase counter		*/
 	u32	snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
 	u32	snd_cwnd_used;
@@ -343,32 +406,10 @@ struct tcp_sock {
 	u32	prior_cwnd;	/* cwnd right before starting loss recovery */
 	u32	prr_delivered;	/* Number of newly delivered packets to
 				 * receiver in Recovery. */
-	u32	prr_out;	/* Total number of pkts sent during Recovery. */
-	u32	delivered;	/* Total data packets delivered incl. rexmits */
-	u32	delivered_ce;	/* Like the above but only ECE marked packets */
-	u32	lost;		/* Total data packets lost incl. rexmits */
-	u32	app_limited;	/* limited until "delivered" reaches this val */
-	u64	first_tx_mstamp;  /* start of window send phase */
-	u64	delivered_mstamp; /* time we reached "delivered" */
-	u32	rate_delivered;    /* saved rate sample: packets delivered */
-	u32	rate_interval_us;  /* saved rate sample: time elapsed */
-
- 	u32	rcv_wnd;	/* Current receiver window		*/
-	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
-	u32	notsent_lowat;	/* TCP_NOTSENT_LOWAT */
-	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */
-	u32	lost_out;	/* Lost packets			*/
-	u32	sacked_out;	/* SACK'd packets			*/
 
 	struct hrtimer	pacing_timer;
 	struct hrtimer	compressed_ack_timer;
 
-	/* from STCP, retrans queue hinting */
-	struct sk_buff* lost_skb_hint;
-	struct sk_buff *retransmit_skb_hint;
-
-	/* OOO segments go in this rbtree. Socket lock must be held. */
-	struct rb_root	out_of_order_queue;
 	struct sk_buff	*ooo_last_skb; /* cache rb_last(out_of_order_queue) */
 
 	/* SACKs data, these 2 need to be together (see tcp_options_write) */
@@ -377,12 +418,6 @@ struct tcp_sock {
 
 	struct tcp_sack_block recv_sack_cache[4];
 
-	struct sk_buff *highest_sack;   /* skb just after the highest
-					 * skb with SACKed bit set
-					 * (validity guaranteed only if
-					 * sacked_out > 0)
-					 */
-
 	int     lost_cnt_hint;
 
 	u32	prior_ssthresh; /* ssthresh saved at recovery start	*/
@@ -433,21 +468,6 @@ struct tcp_sock {
 
 	u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */
 
-/* Receiver side RTT estimation */
-	u32 rcv_rtt_last_tsecr;
-	struct {
-		u32	rtt_us;
-		u32	seq;
-		u64	time;
-	} rcv_rtt_est;
-
-/* Receiver queue space */
-	struct {
-		u32	space;
-		u32	seq;
-		u64	time;
-	} rcvq_space;
-
 /* TCP-specific MTU probe information. */
 	struct {
 		u32		  probe_seq_start;
-- 
cgit v1.2.3


From facd15dfd69122042502d99ab8c9f888b48ee994 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 4 Dec 2023 21:47:07 +0100
Subject: net: core: synchronize link-watch when carrier is queried

There are multiple ways to query for the carrier state: through
rtnetlink, sysfs, and (possibly) ethtool. Synchronize linkwatch
work before these operations so that we don't have a situation
where userspace queries the carrier state between the driver's
carrier off->on transition and linkwatch running and expects it
to work, when really (at least) TX cannot work until linkwatch
has run.

I previously posted a longer explanation of how this applies to
wireless [1] but with this wireless can simply query the state
before sending data, to ensure the kernel is ready for it.

[1] https://lore.kernel.org/all/346b21d87c69f817ea3c37caceb34f1f56255884.camel@sipsolutions.net/

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/20231204214706.303c62768415.I1caedccae72ee5a45c9085c5eb49c145ce1c0dd5@changeid
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb96aad6a6ee..1b935ee341b4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4229,6 +4229,15 @@ static inline void netdev_ref_replace(struct net_device *odev,
  */
 void linkwatch_fire_event(struct net_device *dev);
 
+/**
+ * linkwatch_sync_dev - sync linkwatch for the given device
+ * @dev: network device to sync linkwatch for
+ *
+ * Sync linkwatch for the given device, removing it from the
+ * pending work list (if queued).
+ */
+void linkwatch_sync_dev(struct net_device *dev);
+
 /**
  *	netif_carrier_ok - test if carrier present
  *	@dev: network device
-- 
cgit v1.2.3


From 7037d95a047cd89b1f680eed253c6ab586bef1ed Mon Sep 17 00:00:00 2001
From: Kelly Kane <kelly@hawknetworks.com>
Date: Sat, 2 Dec 2023 17:17:12 -0800
Subject: r8152: add vendor/device ID pair for ASUS USB-C2500

The ASUS USB-C2500 is an RTL8156 based 2.5G Ethernet controller.

Add the vendor and product ID values to the driver. This makes Ethernet
work with the adapter.

Signed-off-by: Kelly Kane <kelly@hawknetworks.com>
Link: https://lore.kernel.org/r/20231203011712.6314-1-kelly@hawknetworks.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/usb/r8152.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h
index 287e9d83fb8b..33a4c146dc19 100644
--- a/include/linux/usb/r8152.h
+++ b/include/linux/usb/r8152.h
@@ -30,6 +30,7 @@
 #define VENDOR_ID_NVIDIA		0x0955
 #define VENDOR_ID_TPLINK		0x2357
 #define VENDOR_ID_DLINK			0x2001
+#define VENDOR_ID_ASUS			0x0b05
 
 #if IS_REACHABLE(CONFIG_USB_RTL8152)
 extern u8 rtl8152_get_version(struct usb_interface *intf);
-- 
cgit v1.2.3


From a3a44d2d3a5c5ff6e73c711db5b1911b5a676bb0 Mon Sep 17 00:00:00 2001
From: Even Xu <even.xu@intel.com>
Date: Tue, 5 Dec 2023 09:50:30 +0800
Subject: HID: Intel-ish-hid: Ishtp: Add helper functions for client connection

For every ishtp client driver during initialization state, the flow is:
1 - Allocate an ISHTP client instance
2 - Reserve a host id and link the client instance
3 - Search a firmware client using UUID and get related
    client information
4 - Bind firmware client id to the ISHTP client instance
5 - Set the state the ISHTP client instance to CONNECTING
6 - Send connect request to firmware
7 - Register event callback for messages from the firmware

During deinitizalization state, the flow is:
9 - Set the state the ISHTP client instance to ISHTP_CL_DISCONNECTING
10 - Issue disconnect request to firmware
11 - Unlike the client instance
12 - Flush message queue
13 - Free ISHTP client instance

Step 2-7 are identical to the steps of client driver initialization
and driver reset flow, but reallocation of the RX/TX ring buffers
can be avoided in reset flow.

Also for step 9-12, they are identical to the steps of client driver
failure handling after connect request, driver reset flow and
driver removing.

So, add two helper functions to simplify client driver code.
ishtp_cl_establish_connection()
ishtp_cl_destroy_connection()

No functional changes are expected.

Signed-off-by: Even Xu <even.xu@intel.com>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 include/linux/intel-ish-client-if.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index f45f13304add..771622650247 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -94,6 +94,9 @@ int ishtp_cl_link(struct ishtp_cl *cl);
 void ishtp_cl_unlink(struct ishtp_cl *cl);
 int ishtp_cl_disconnect(struct ishtp_cl *cl);
 int ishtp_cl_connect(struct ishtp_cl *cl);
+int ishtp_cl_establish_connection(struct ishtp_cl *cl, const guid_t *uuid,
+				  int tx_size, int rx_size, bool reset);
+void ishtp_cl_destroy_connection(struct ishtp_cl *cl, bool reset);
 int ishtp_cl_send(struct ishtp_cl *cl, uint8_t *buf, size_t length);
 int ishtp_cl_flush_queues(struct ishtp_cl *cl);
 int ishtp_cl_io_rb_recycle(struct ishtp_cl_rb *rb);
-- 
cgit v1.2.3


From 16a1d968358aa9e897ce995fa45cb15d55a0e83d Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 2 Oct 2023 20:43:43 +0200
Subject: mm/slab: remove mm/slab.c and slab_def.h

Remove the SLAB implementation. Update CREDITS.
Also update and properly sort the SLOB entry there.

RIP SLAB allocator (1996 - 2024)

Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab_def.h | 124 -----------------------------------------------
 1 file changed, 124 deletions(-)
 delete mode 100644 include/linux/slab_def.h

(limited to 'include/linux')

diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
deleted file mode 100644
index a61e7d55d0d3..000000000000
--- a/include/linux/slab_def.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SLAB_DEF_H
-#define	_LINUX_SLAB_DEF_H
-
-#include <linux/kfence.h>
-#include <linux/reciprocal_div.h>
-
-/*
- * Definitions unique to the original Linux SLAB allocator.
- */
-
-struct kmem_cache {
-	struct array_cache __percpu *cpu_cache;
-
-/* 1) Cache tunables. Protected by slab_mutex */
-	unsigned int batchcount;
-	unsigned int limit;
-	unsigned int shared;
-
-	unsigned int size;
-	struct reciprocal_value reciprocal_buffer_size;
-/* 2) touched by every alloc & free from the backend */
-
-	slab_flags_t flags;		/* constant flags */
-	unsigned int num;		/* # of objs per slab */
-
-/* 3) cache_grow/shrink */
-	/* order of pgs per slab (2^n) */
-	unsigned int gfporder;
-
-	/* force GFP flags, e.g. GFP_DMA */
-	gfp_t allocflags;
-
-	size_t colour;			/* cache colouring range */
-	unsigned int colour_off;	/* colour offset */
-	unsigned int freelist_size;
-
-	/* constructor func */
-	void (*ctor)(void *obj);
-
-/* 4) cache creation/removal */
-	const char *name;
-	struct list_head list;
-	int refcount;
-	int object_size;
-	int align;
-
-/* 5) statistics */
-#ifdef CONFIG_DEBUG_SLAB
-	unsigned long num_active;
-	unsigned long num_allocations;
-	unsigned long high_mark;
-	unsigned long grown;
-	unsigned long reaped;
-	unsigned long errors;
-	unsigned long max_freeable;
-	unsigned long node_allocs;
-	unsigned long node_frees;
-	unsigned long node_overflow;
-	atomic_t allochit;
-	atomic_t allocmiss;
-	atomic_t freehit;
-	atomic_t freemiss;
-
-	/*
-	 * If debugging is enabled, then the allocator can add additional
-	 * fields and/or padding to every object. 'size' contains the total
-	 * object size including these internal fields, while 'obj_offset'
-	 * and 'object_size' contain the offset to the user object and its
-	 * size.
-	 */
-	int obj_offset;
-#endif /* CONFIG_DEBUG_SLAB */
-
-#ifdef CONFIG_KASAN_GENERIC
-	struct kasan_cache kasan_info;
-#endif
-
-#ifdef CONFIG_SLAB_FREELIST_RANDOM
-	unsigned int *random_seq;
-#endif
-
-#ifdef CONFIG_HARDENED_USERCOPY
-	unsigned int useroffset;	/* Usercopy region offset */
-	unsigned int usersize;		/* Usercopy region size */
-#endif
-
-	struct kmem_cache_node *node[MAX_NUMNODES];
-};
-
-static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab,
-				void *x)
-{
-	void *object = x - (x - slab->s_mem) % cache->size;
-	void *last_object = slab->s_mem + (cache->num - 1) * cache->size;
-
-	if (unlikely(object > last_object))
-		return last_object;
-	else
-		return object;
-}
-
-/*
- * We want to avoid an expensive divide : (offset / cache->size)
- *   Using the fact that size is a constant for a particular cache,
- *   we can replace (offset / cache->size) by
- *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
- */
-static inline unsigned int obj_to_index(const struct kmem_cache *cache,
-					const struct slab *slab, void *obj)
-{
-	u32 offset = (obj - slab->s_mem);
-	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
-}
-
-static inline int objs_per_slab(const struct kmem_cache *cache,
-				     const struct slab *slab)
-{
-	if (is_kfence_address(slab_address(slab)))
-		return 1;
-	return cache->num;
-}
-
-#endif	/* _LINUX_SLAB_DEF_H */
-- 
cgit v1.2.3


From 7ef08ae8277c66657127844179912214c67fb4bc Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 3 Oct 2023 09:54:15 +0200
Subject: mm/slab: move struct kmem_cache_cpu declaration to slub.c

Nothing outside SLUB itself accesses the struct kmem_cache_cpu fields so
it does not need to be declared in slub_def.h. This allows also to move
enum stat_item.

Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slub_def.h | 54 ------------------------------------------------
 1 file changed, 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index deb90cf4bffb..a0229ea42977 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -12,60 +12,6 @@
 #include <linux/reciprocal_div.h>
 #include <linux/local_lock.h>
 
-enum stat_item {
-	ALLOC_FASTPATH,		/* Allocation from cpu slab */
-	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
-	FREE_FASTPATH,		/* Free to cpu slab */
-	FREE_SLOWPATH,		/* Freeing not to cpu slab */
-	FREE_FROZEN,		/* Freeing to frozen slab */
-	FREE_ADD_PARTIAL,	/* Freeing moves slab to partial list */
-	FREE_REMOVE_PARTIAL,	/* Freeing removes last object */
-	ALLOC_FROM_PARTIAL,	/* Cpu slab acquired from node partial list */
-	ALLOC_SLAB,		/* Cpu slab acquired from page allocator */
-	ALLOC_REFILL,		/* Refill cpu slab from slab freelist */
-	ALLOC_NODE_MISMATCH,	/* Switching cpu slab */
-	FREE_SLAB,		/* Slab freed to the page allocator */
-	CPUSLAB_FLUSH,		/* Abandoning of the cpu slab */
-	DEACTIVATE_FULL,	/* Cpu slab was full when deactivated */
-	DEACTIVATE_EMPTY,	/* Cpu slab was empty when deactivated */
-	DEACTIVATE_TO_HEAD,	/* Cpu slab was moved to the head of partials */
-	DEACTIVATE_TO_TAIL,	/* Cpu slab was moved to the tail of partials */
-	DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
-	DEACTIVATE_BYPASS,	/* Implicit deactivation */
-	ORDER_FALLBACK,		/* Number of times fallback was necessary */
-	CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */
-	CMPXCHG_DOUBLE_FAIL,	/* Number of times that cmpxchg double did not match */
-	CPU_PARTIAL_ALLOC,	/* Used cpu partial on alloc */
-	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
-	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
-	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
-	NR_SLUB_STAT_ITEMS
-};
-
-#ifndef CONFIG_SLUB_TINY
-/*
- * When changing the layout, make sure freelist and tid are still compatible
- * with this_cpu_cmpxchg_double() alignment requirements.
- */
-struct kmem_cache_cpu {
-	union {
-		struct {
-			void **freelist;	/* Pointer to next available object */
-			unsigned long tid;	/* Globally unique transaction id */
-		};
-		freelist_aba_t freelist_tid;
-	};
-	struct slab *slab;	/* The slab from which we are allocating */
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	struct slab *partial;	/* Partially allocated frozen slabs */
-#endif
-	local_lock_t lock;	/* Protects the fields above */
-#ifdef CONFIG_SLUB_STATS
-	unsigned stat[NR_SLUB_STAT_ITEMS];
-#endif
-};
-#endif /* CONFIG_SLUB_TINY */
-
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 #define slub_percpu_partial(c)		((c)->partial)
 
-- 
cgit v1.2.3


From 19975f83412fbb9b1458f3dfbf16ca043a57788a Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 3 Oct 2023 09:59:48 +0200
Subject: mm/slab: move the rest of slub_def.h to mm/slab.h

mm/slab.h is the only place to include include/linux/slub_def.h which
has allowed switching between SLAB and SLUB. Now we can simply move the
contents over and remove slub_def.h.

Use this opportunity to fix up some whitespace (alignment) issues.

Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slub_def.h | 150 -----------------------------------------------
 1 file changed, 150 deletions(-)
 delete mode 100644 include/linux/slub_def.h

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
deleted file mode 100644
index a0229ea42977..000000000000
--- a/include/linux/slub_def.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SLUB_DEF_H
-#define _LINUX_SLUB_DEF_H
-
-/*
- * SLUB : A Slab allocator without object queues.
- *
- * (C) 2007 SGI, Christoph Lameter
- */
-#include <linux/kfence.h>
-#include <linux/kobject.h>
-#include <linux/reciprocal_div.h>
-#include <linux/local_lock.h>
-
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-#define slub_percpu_partial(c)		((c)->partial)
-
-#define slub_set_percpu_partial(c, p)		\
-({						\
-	slub_percpu_partial(c) = (p)->next;	\
-})
-
-#define slub_percpu_partial_read_once(c)     READ_ONCE(slub_percpu_partial(c))
-#else
-#define slub_percpu_partial(c)			NULL
-
-#define slub_set_percpu_partial(c, p)
-
-#define slub_percpu_partial_read_once(c)	NULL
-#endif // CONFIG_SLUB_CPU_PARTIAL
-
-/*
- * Word size structure that can be atomically updated or read and that
- * contains both the order and the number of objects that a slab of the
- * given order would contain.
- */
-struct kmem_cache_order_objects {
-	unsigned int x;
-};
-
-/*
- * Slab cache management.
- */
-struct kmem_cache {
-#ifndef CONFIG_SLUB_TINY
-	struct kmem_cache_cpu __percpu *cpu_slab;
-#endif
-	/* Used for retrieving partial slabs, etc. */
-	slab_flags_t flags;
-	unsigned long min_partial;
-	unsigned int size;	/* The size of an object including metadata */
-	unsigned int object_size;/* The size of an object without metadata */
-	struct reciprocal_value reciprocal_size;
-	unsigned int offset;	/* Free pointer offset */
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	/* Number of per cpu partial objects to keep around */
-	unsigned int cpu_partial;
-	/* Number of per cpu partial slabs to keep around */
-	unsigned int cpu_partial_slabs;
-#endif
-	struct kmem_cache_order_objects oo;
-
-	/* Allocation and freeing of slabs */
-	struct kmem_cache_order_objects min;
-	gfp_t allocflags;	/* gfp flags to use on each alloc */
-	int refcount;		/* Refcount for slab cache destroy */
-	void (*ctor)(void *);
-	unsigned int inuse;		/* Offset to metadata */
-	unsigned int align;		/* Alignment */
-	unsigned int red_left_pad;	/* Left redzone padding size */
-	const char *name;	/* Name (only for display!) */
-	struct list_head list;	/* List of slab caches */
-#ifdef CONFIG_SYSFS
-	struct kobject kobj;	/* For sysfs */
-#endif
-#ifdef CONFIG_SLAB_FREELIST_HARDENED
-	unsigned long random;
-#endif
-
-#ifdef CONFIG_NUMA
-	/*
-	 * Defragmentation by allocating from a remote node.
-	 */
-	unsigned int remote_node_defrag_ratio;
-#endif
-
-#ifdef CONFIG_SLAB_FREELIST_RANDOM
-	unsigned int *random_seq;
-#endif
-
-#ifdef CONFIG_KASAN_GENERIC
-	struct kasan_cache kasan_info;
-#endif
-
-#ifdef CONFIG_HARDENED_USERCOPY
-	unsigned int useroffset;	/* Usercopy region offset */
-	unsigned int usersize;		/* Usercopy region size */
-#endif
-
-	struct kmem_cache_node *node[MAX_NUMNODES];
-};
-
-#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY)
-#define SLAB_SUPPORTS_SYSFS
-void sysfs_slab_unlink(struct kmem_cache *);
-void sysfs_slab_release(struct kmem_cache *);
-#else
-static inline void sysfs_slab_unlink(struct kmem_cache *s)
-{
-}
-static inline void sysfs_slab_release(struct kmem_cache *s)
-{
-}
-#endif
-
-void *fixup_red_left(struct kmem_cache *s, void *p);
-
-static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab,
-				void *x) {
-	void *object = x - (x - slab_address(slab)) % cache->size;
-	void *last_object = slab_address(slab) +
-		(slab->objects - 1) * cache->size;
-	void *result = (unlikely(object > last_object)) ? last_object : object;
-
-	result = fixup_red_left(cache, result);
-	return result;
-}
-
-/* Determine object index from a given position */
-static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
-					  void *addr, void *obj)
-{
-	return reciprocal_divide(kasan_reset_tag(obj) - addr,
-				 cache->reciprocal_size);
-}
-
-static inline unsigned int obj_to_index(const struct kmem_cache *cache,
-					const struct slab *slab, void *obj)
-{
-	if (is_kfence_address(obj))
-		return 0;
-	return __obj_to_index(cache, slab_address(slab), obj);
-}
-
-static inline int objs_per_slab(const struct kmem_cache *cache,
-				     const struct slab *slab)
-{
-	return slab->objects;
-}
-#endif /* _LINUX_SLUB_DEF_H */
-- 
cgit v1.2.3


From 9396c4ee93f9ac03cd0cea0bb345fbc657772943 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 4 Dec 2023 19:00:44 +0000
Subject: net/tcp: Don't store TCP-AO maclen on reqsk

This extra check doesn't work for a handshake when SYN segment has
(current_key.maclen != rnext_key.maclen). It could be amended to
preserve rnext_key.maclen instead of current_key.maclen, but that
requires a lookup on listen socket.

Originally, this extra maclen check was introduced just because it was
cheap. Drop it and convert tcp_request_sock::maclen into boolean
tcp_request_sock::used_tcp_ao.

Fixes: 06b22ef29591 ("net/tcp: Wire TCP-AO to request sockets")
Signed-off-by: Dmitry Safonov <dima@arista.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/tcp.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 68f3d315d2e1..b646b574b060 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -169,7 +169,7 @@ struct tcp_request_sock {
 #ifdef CONFIG_TCP_AO
 	u8				ao_keyid;
 	u8				ao_rcv_next;
-	u8				maclen;
+	bool				used_tcp_ao;
 #endif
 };
 
@@ -180,14 +180,10 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
 
 static inline bool tcp_rsk_used_ao(const struct request_sock *req)
 {
-	/* The real length of MAC is saved in the request socket,
-	 * signing anything with zero-length makes no sense, so here is
-	 * a little hack..
-	 */
 #ifndef CONFIG_TCP_AO
 	return false;
 #else
-	return tcp_rsk(req)->maclen != 0;
+	return tcp_rsk(req)->used_tcp_ao;
 #endif
 }
 
-- 
cgit v1.2.3


From 16e5ac127d8d18adf85fe5ba847d77b58d1ed418 Mon Sep 17 00:00:00 2001
From: Naresh Solanki <naresh.solanki@9elements.com>
Date: Tue, 5 Dec 2023 16:22:04 +0530
Subject: regulator: event: Add regulator netlink event support

This commit introduces netlink event support to the regulator subsystem.

Changes:
- Introduce event.c and regnl.h for netlink event handling.
- Implement reg_generate_netlink_event to broadcast regulator events.
- Update Makefile to include the new event.c file.

Signed-off-by: Naresh Solanki <naresh.solanki@9elements.com>
Link: https://lore.kernel.org/r/20231205105207.1262928-1-naresh.solanki@9elements.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/consumer.h | 47 +-------------------------------------
 1 file changed, 1 insertion(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 39b666b40ea6..4660582a3302 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -33,6 +33,7 @@
 
 #include <linux/err.h>
 #include <linux/suspend.h>
+#include <regulator/regulator.h>
 
 struct device;
 struct notifier_block;
@@ -84,52 +85,6 @@ struct regulator_dev;
 #define REGULATOR_MODE_IDLE			0x4
 #define REGULATOR_MODE_STANDBY			0x8
 
-/*
- * Regulator notifier events.
- *
- * UNDER_VOLTAGE  Regulator output is under voltage.
- * OVER_CURRENT   Regulator output current is too high.
- * REGULATION_OUT Regulator output is out of regulation.
- * FAIL           Regulator output has failed.
- * OVER_TEMP      Regulator over temp.
- * FORCE_DISABLE  Regulator forcibly shut down by software.
- * VOLTAGE_CHANGE Regulator voltage changed.
- *                Data passed is old voltage cast to (void *).
- * DISABLE        Regulator was disabled.
- * PRE_VOLTAGE_CHANGE   Regulator is about to have voltage changed.
- *                      Data passed is "struct pre_voltage_change_data"
- * ABORT_VOLTAGE_CHANGE Regulator voltage change failed for some reason.
- *                      Data passed is old voltage cast to (void *).
- * PRE_DISABLE    Regulator is about to be disabled
- * ABORT_DISABLE  Regulator disable failed for some reason
- *
- * NOTE: These events can be OR'ed together when passed into handler.
- */
-
-#define REGULATOR_EVENT_UNDER_VOLTAGE		0x01
-#define REGULATOR_EVENT_OVER_CURRENT		0x02
-#define REGULATOR_EVENT_REGULATION_OUT		0x04
-#define REGULATOR_EVENT_FAIL			0x08
-#define REGULATOR_EVENT_OVER_TEMP		0x10
-#define REGULATOR_EVENT_FORCE_DISABLE		0x20
-#define REGULATOR_EVENT_VOLTAGE_CHANGE		0x40
-#define REGULATOR_EVENT_DISABLE			0x80
-#define REGULATOR_EVENT_PRE_VOLTAGE_CHANGE	0x100
-#define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE	0x200
-#define REGULATOR_EVENT_PRE_DISABLE		0x400
-#define REGULATOR_EVENT_ABORT_DISABLE		0x800
-#define REGULATOR_EVENT_ENABLE			0x1000
-/*
- * Following notifications should be emitted only if detected condition
- * is such that the HW is likely to still be working but consumers should
- * take a recovery action to prevent problems esacalating into errors.
- */
-#define REGULATOR_EVENT_UNDER_VOLTAGE_WARN	0x2000
-#define REGULATOR_EVENT_OVER_CURRENT_WARN	0x4000
-#define REGULATOR_EVENT_OVER_VOLTAGE_WARN	0x8000
-#define REGULATOR_EVENT_OVER_TEMP_WARN		0x10000
-#define REGULATOR_EVENT_WARN_MASK		0x1E000
-
 /*
  * Regulator errors that can be queried using regulator_get_error_flags
  *
-- 
cgit v1.2.3


From 15bece7bec0df91a8ed1c185483d67708425ca8e Mon Sep 17 00:00:00 2001
From: Zenghui Yu <yuzenghui@huawei.com>
Date: Fri, 24 Nov 2023 20:16:15 +0800
Subject: cpu/hotplug: Remove unused CPU hotplug states

There are unused hotplug states which either have never been used or the
removal of the usage did not remove the state constant.

Drop them to reduce the size of the cpuhp_hp_states array.

Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231124121615.1604-1-yuzenghui@huawei.com
---
 include/linux/cpuhotplug.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index efc0c0b07efb..af6c21aab985 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -66,15 +66,12 @@ enum cpuhp_state {
 	CPUHP_PERF_POWER,
 	CPUHP_PERF_SUPERH,
 	CPUHP_X86_HPET_DEAD,
-	CPUHP_X86_APB_DEAD,
 	CPUHP_X86_MCE_DEAD,
 	CPUHP_VIRT_NET_DEAD,
 	CPUHP_IBMVNIC_DEAD,
 	CPUHP_SLUB_DEAD,
 	CPUHP_DEBUG_OBJ_DEAD,
 	CPUHP_MM_WRITEBACK_DEAD,
-	/* Must be after CPUHP_MM_VMSTAT_DEAD */
-	CPUHP_MM_DEMOTION_DEAD,
 	CPUHP_MM_VMSTAT_DEAD,
 	CPUHP_SOFTIRQ_DEAD,
 	CPUHP_NET_MVNETA_DEAD,
@@ -96,7 +93,6 @@ enum cpuhp_state {
 	CPUHP_NET_DEV_DEAD,
 	CPUHP_PCI_XGENE_DEAD,
 	CPUHP_IOMMU_IOVA_DEAD,
-	CPUHP_LUSTRE_CFS_DEAD,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
 	CPUHP_PADATA_DEAD,
 	CPUHP_AP_DTPM_CPU_DEAD,
@@ -118,7 +114,6 @@ enum cpuhp_state {
 	CPUHP_XEN_EVTCHN_PREPARE,
 	CPUHP_ARM_SHMOBILE_SCU_PREPARE,
 	CPUHP_SH_SH3X_PREPARE,
-	CPUHP_NET_FLOW_PREPARE,
 	CPUHP_TOPOLOGY_PREPARE,
 	CPUHP_NET_IUCV_PREPARE,
 	CPUHP_ARM_BL_PREPARE,
@@ -151,18 +146,14 @@ enum cpuhp_state {
 	CPUHP_AP_IRQ_ARMADA_XP_STARTING,
 	CPUHP_AP_IRQ_BCM2836_STARTING,
 	CPUHP_AP_IRQ_MIPS_GIC_STARTING,
-	CPUHP_AP_IRQ_RISCV_STARTING,
 	CPUHP_AP_IRQ_LOONGARCH_STARTING,
 	CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
 	CPUHP_AP_ARM_MVEBU_COHERENCY,
-	CPUHP_AP_MICROCODE_LOADER,
 	CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
 	CPUHP_AP_PERF_X86_STARTING,
 	CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
-	CPUHP_AP_PERF_X86_CQM_STARTING,
 	CPUHP_AP_PERF_X86_CSTATE_STARTING,
 	CPUHP_AP_PERF_XTENSA_STARTING,
-	CPUHP_AP_MIPS_OP_LOONGSON3_STARTING,
 	CPUHP_AP_ARM_VFP_STARTING,
 	CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
 	CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING,
@@ -179,7 +170,6 @@ enum cpuhp_state {
 	CPUHP_AP_QCOM_TIMER_STARTING,
 	CPUHP_AP_TEGRA_TIMER_STARTING,
 	CPUHP_AP_ARMADA_TIMER_STARTING,
-	CPUHP_AP_MARCO_TIMER_STARTING,
 	CPUHP_AP_MIPS_GIC_TIMER_STARTING,
 	CPUHP_AP_ARC_TIMER_STARTING,
 	CPUHP_AP_RISCV_TIMER_STARTING,
@@ -217,9 +207,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
 	CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
 	CPUHP_AP_PERF_X86_RAPL_ONLINE,
-	CPUHP_AP_PERF_X86_CQM_ONLINE,
 	CPUHP_AP_PERF_X86_CSTATE_ONLINE,
-	CPUHP_AP_PERF_X86_IDXD_ONLINE,
 	CPUHP_AP_PERF_S390_CF_ONLINE,
 	CPUHP_AP_PERF_S390_SF_ONLINE,
 	CPUHP_AP_PERF_ARM_CCI_ONLINE,
@@ -252,8 +240,6 @@ enum cpuhp_state {
 	CPUHP_AP_BASE_CACHEINFO_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 30,
-	/* Must be after CPUHP_AP_ONLINE_DYN for node_states[N_CPU] update */
-	CPUHP_AP_MM_DEMOTION_ONLINE,
 	CPUHP_AP_X86_HPET_ONLINE,
 	CPUHP_AP_X86_KVM_CLK_ONLINE,
 	CPUHP_AP_ACTIVE,
-- 
cgit v1.2.3


From 57b8543ceee82ea72be1745a6dc3a9111d55a151 Mon Sep 17 00:00:00 2001
From: Raag Jadav <raag.jadav@intel.com>
Date: Thu, 23 Nov 2023 15:36:13 +0530
Subject: ACPI: bus: update acpi_dev_uid_match() to support multiple types

According to the ACPI specification, a _UID object can evaluate to
either a numeric value or a string.

Update acpi_dev_uid_match() to support _UID matching for both integer
and string types.

Suggested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Raag Jadav <raag.jadav@intel.com>
[ rjw: Rename auxiliary macros, relocate kerneldoc comment ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4db54e928b36..2abe81f074de 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -756,6 +756,9 @@ const char *acpi_get_subsystem_id(acpi_handle handle);
 #define ACPI_HANDLE(dev)		(NULL)
 #define ACPI_HANDLE_FWNODE(fwnode)	(NULL)
 
+/* Get rid of the -Wunused-variable for adev */
+#define acpi_dev_uid_match(adev, uid2)			(adev && false)
+
 #include <acpi/acpi_numa.h>
 
 struct fwnode_handle;
@@ -772,11 +775,6 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
 
 struct acpi_device;
 
-static inline bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2)
-{
-	return false;
-}
-
 static inline bool
 acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2)
 {
-- 
cgit v1.2.3


From b2b32a1738815155d4a0039bb7a6092d40f23e81 Mon Sep 17 00:00:00 2001
From: Raag Jadav <raag.jadav@intel.com>
Date: Thu, 23 Nov 2023 15:36:14 +0530
Subject: ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types

Now that we have _UID matching support for both integer and string types,
we can support them into acpi_dev_hid_uid_match() helper as well.

Signed-off-by: Raag Jadav <raag.jadav@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 2abe81f074de..75274585656c 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -758,6 +758,7 @@ const char *acpi_get_subsystem_id(acpi_handle handle);
 
 /* Get rid of the -Wunused-variable for adev */
 #define acpi_dev_uid_match(adev, uid2)			(adev && false)
+#define acpi_dev_hid_uid_match(adev, hid2, uid2)	(adev && false)
 
 #include <acpi/acpi_numa.h>
 
@@ -775,12 +776,6 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
 
 struct acpi_device;
 
-static inline bool
-acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2)
-{
-	return false;
-}
-
 static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From 40bba140c60fbb3ee8df6203c82fbd3de9f19d95 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:14 -0800
Subject: bpf: add BPF token delegation mount options to BPF FS

Add few new mount options to BPF FS that allow to specify that a given
BPF FS instance allows creation of BPF token (added in the next patch),
and what sort of operations are allowed under BPF token. As such, we get
4 new mount options, each is a bit mask
  - `delegate_cmds` allow to specify which bpf() syscall commands are
    allowed with BPF token derived from this BPF FS instance;
  - if BPF_MAP_CREATE command is allowed, `delegate_maps` specifies
    a set of allowable BPF map types that could be created with BPF token;
  - if BPF_PROG_LOAD command is allowed, `delegate_progs` specifies
    a set of allowable BPF program types that could be loaded with BPF token;
  - if BPF_PROG_LOAD command is allowed, `delegate_attachs` specifies
    a set of allowable BPF program attach types that could be loaded with
    BPF token; delegate_progs and delegate_attachs are meant to be used
    together, as full BPF program type is, in general, determined
    through both program type and program attach type.

Currently, these mount options accept the following forms of values:
  - a special value "any", that enables all possible values of a given
  bit set;
  - numeric value (decimal or hexadecimal, determined by kernel
  automatically) that specifies a bit mask value directly;
  - all the values for a given mount option are combined, if specified
  multiple times. E.g., `mount -t bpf nodev /path/to/mount -o
  delegate_maps=0x1 -o delegate_maps=0x2` will result in a combined 0x3
  mask.

Ideally, more convenient (for humans) symbolic form derived from
corresponding UAPI enums would be accepted (e.g., `-o
delegate_progs=kprobe|tracepoint`) and I intend to implement this, but
it requires a bunch of UAPI header churn, so I postponed it until this
feature lands upstream or at least there is a definite consensus that
this feature is acceptable and is going to make it, just to minimize
amount of wasted effort and not increase amount of non-essential code to
be reviewed.

Attentive reader will notice that BPF FS is now marked as
FS_USERNS_MOUNT, which theoretically makes it mountable inside non-init
user namespace as long as the process has sufficient *namespaced*
capabilities within that user namespace. But in reality we still
restrict BPF FS to be mountable only by processes with CAP_SYS_ADMIN *in
init userns* (extra check in bpf_fill_super()). FS_USERNS_MOUNT is added
to allow creating BPF FS context object (i.e., fsopen("bpf")) from
inside unprivileged process inside non-init userns, to capture that
userns as the owning userns. It will still be required to pass this
context object back to privileged process to instantiate and mount it.

This manipulation is important, because capturing non-init userns as the
owning userns of BPF FS instance (super block) allows to use that userns
to constraint BPF token to that userns later on (see next patch). So
creating BPF FS with delegation inside unprivileged userns will restrict
derived BPF token objects to only "work" inside that intended userns,
making it scoped to a intended "container". Also, setting these
delegation options requires capable(CAP_SYS_ADMIN), so unprivileged
process cannot set this up without involvement of a privileged process.

There is a set of selftests at the end of the patch set that simulates
this sequence of steps and validates that everything works as intended.
But careful review is requested to make sure there are no missed gaps in
the implementation and testing.

This somewhat subtle set of aspects is the result of previous
discussions ([0]) about various user namespace implications and
interactions with BPF token functionality and is necessary to contain
BPF token inside intended user namespace.

  [0] https://lore.kernel.org/bpf/20230704-hochverdient-lehne-eeb9eeef785e@brauner/

Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 10e5e4d8a00f..d3c9acc593ea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1581,6 +1581,16 @@ struct bpf_link_primer {
 	u32 id;
 };
 
+struct bpf_mount_opts {
+	umode_t mode;
+
+	/* BPF token-related delegation options */
+	u64 delegate_cmds;
+	u64 delegate_maps;
+	u64 delegate_progs;
+	u64 delegate_attachs;
+};
+
 struct bpf_struct_ops_value;
 struct btf_member;
 
-- 
cgit v1.2.3


From 4527358b76861dfd64ee34aba45d81648fbc8a61 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:15 -0800
Subject: bpf: introduce BPF token object

Add new kind of BPF kernel object, BPF token. BPF token is meant to
allow delegating privileged BPF functionality, like loading a BPF
program or creating a BPF map, from privileged process to a *trusted*
unprivileged process, all while having a good amount of control over which
privileged operations could be performed using provided BPF token.

This is achieved through mounting BPF FS instance with extra delegation
mount options, which determine what operations are delegatable, and also
constraining it to the owning user namespace (as mentioned in the
previous patch).

BPF token itself is just a derivative from BPF FS and can be created
through a new bpf() syscall command, BPF_TOKEN_CREATE, which accepts BPF
FS FD, which can be attained through open() API by opening BPF FS mount
point. Currently, BPF token "inherits" delegated command, map types,
prog type, and attach type bit sets from BPF FS as is. In the future,
having an BPF token as a separate object with its own FD, we can allow
to further restrict BPF token's allowable set of things either at the
creation time or after the fact, allowing the process to guard itself
further from unintentionally trying to load undesired kind of BPF
programs. But for now we keep things simple and just copy bit sets as is.

When BPF token is created from BPF FS mount, we take reference to the
BPF super block's owning user namespace, and then use that namespace for
checking all the {CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN}
capabilities that are normally only checked against init userns (using
capable()), but now we check them using ns_capable() instead (if BPF
token is provided). See bpf_token_capable() for details.

Such setup means that BPF token in itself is not sufficient to grant BPF
functionality. User namespaced process has to *also* have necessary
combination of capabilities inside that user namespace. So while
previously CAP_BPF was useless when granted within user namespace, now
it gains a meaning and allows container managers and sys admins to have
a flexible control over which processes can and need to use BPF
functionality within the user namespace (i.e., container in practice).
And BPF FS delegation mount options and derived BPF tokens serve as
a per-container "flag" to grant overall ability to use bpf() (plus further
restrict on which parts of bpf() syscalls are treated as namespaced).

Note also, BPF_TOKEN_CREATE command itself requires ns_capable(CAP_BPF)
within the BPF FS owning user namespace, rounding up the ns_capable()
story of BPF token.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d3c9acc593ea..aa9cf8e5fab1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -51,6 +51,10 @@ struct module;
 struct bpf_func_state;
 struct ftrace_ops;
 struct cgroup;
+struct bpf_token;
+struct user_namespace;
+struct super_block;
+struct inode;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -1591,6 +1595,13 @@ struct bpf_mount_opts {
 	u64 delegate_attachs;
 };
 
+struct bpf_token {
+	struct work_struct work;
+	atomic64_t refcnt;
+	struct user_namespace *userns;
+	u64 allowed_cmds;
+};
+
 struct bpf_struct_ops_value;
 struct btf_member;
 
@@ -2048,6 +2059,7 @@ static inline void bpf_enable_instrumentation(void)
 	migrate_enable();
 }
 
+extern const struct super_operations bpf_super_ops;
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 extern const struct file_operations bpf_iter_fops;
@@ -2182,6 +2194,8 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map)
 
 extern int sysctl_unprivileged_bpf_disabled;
 
+bool bpf_token_capable(const struct bpf_token *token, int cap);
+
 static inline bool bpf_allow_ptr_leaks(void)
 {
 	return perfmon_capable();
@@ -2216,8 +2230,17 @@ int bpf_link_new_fd(struct bpf_link *link);
 struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
 
+void bpf_token_inc(struct bpf_token *token);
+void bpf_token_put(struct bpf_token *token);
+int bpf_token_create(union bpf_attr *attr);
+struct bpf_token *bpf_token_get_from_fd(u32 ufd);
+
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
+struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir,
+			    umode_t mode);
 
 #define BPF_ITER_FUNC_PREFIX "bpf_iter_"
 #define DEFINE_BPF_ITER_FUNC(target, args...)			\
@@ -2580,6 +2603,24 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 	return -EOPNOTSUPP;
 }
 
+static inline bool bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
+}
+
+static inline void bpf_token_inc(struct bpf_token *token)
+{
+}
+
+static inline void bpf_token_put(struct bpf_token *token)
+{
+}
+
+static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline void __dev_flush(void)
 {
 }
-- 
cgit v1.2.3


From 688b7270b3cb75e8ac78123d719967db40336e5b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:16 -0800
Subject: bpf: add BPF token support to BPF_MAP_CREATE command

Allow providing token_fd for BPF_MAP_CREATE command to allow controlled
BPF map creation from unprivileged process through delegated BPF token.

Wire through a set of allowed BPF map types to BPF token, derived from
BPF FS at BPF token creation time. This, in combination with allowed_cmds
allows to create a narrowly-focused BPF token (controlled by privileged
agent) with a restrictive set of BPF maps that application can attempt
to create.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index aa9cf8e5fab1..e08e8436df38 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1600,6 +1600,7 @@ struct bpf_token {
 	atomic64_t refcnt;
 	struct user_namespace *userns;
 	u64 allowed_cmds;
+	u64 allowed_maps;
 };
 
 struct bpf_struct_ops_value;
@@ -2236,6 +2237,7 @@ int bpf_token_create(union bpf_attr *attr);
 struct bpf_token *bpf_token_get_from_fd(u32 ufd);
 
 bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
 
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
-- 
cgit v1.2.3


From e1cef620f598853a90f17701fcb1057a6768f7b8 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:18 -0800
Subject: bpf: add BPF token support to BPF_PROG_LOAD command

Add basic support of BPF token to BPF_PROG_LOAD. Wire through a set of
allowed BPF program types and attach types, derived from BPF FS at BPF
token creation time. Then make sure we perform bpf_token_capable()
checks everywhere where it's relevant.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-7-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e08e8436df38..20af87b59d70 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1461,6 +1461,7 @@ struct bpf_prog_aux {
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
+	struct bpf_token *token;
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
 	struct bpf_func_info *func_info;
@@ -1601,6 +1602,8 @@ struct bpf_token {
 	struct user_namespace *userns;
 	u64 allowed_cmds;
 	u64 allowed_maps;
+	u64 allowed_progs;
+	u64 allowed_attachs;
 };
 
 struct bpf_struct_ops_value;
@@ -2238,6 +2241,9 @@ struct bpf_token *bpf_token_get_from_fd(u32 ufd);
 
 bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
 bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
+bool bpf_token_allow_prog_type(const struct bpf_token *token,
+			       enum bpf_prog_type prog_type,
+			       enum bpf_attach_type attach_type);
 
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
-- 
cgit v1.2.3


From 4cbb270e115bc197ff2046aeb54cc951666b16ec Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:19 -0800
Subject: bpf: take into account BPF token when fetching helper protos

Instead of performing unconditional system-wide bpf_capable() and
perfmon_capable() calls inside bpf_base_func_proto() function (and other
similar ones) to determine eligibility of a given BPF helper for a given
program, use previously recorded BPF token during BPF_PROG_LOAD command
handling to inform the decision.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-8-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 20af87b59d70..2a3ab4f3dd8c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2492,7 +2492,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type
 struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
 
-const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
+const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id,
+						 const struct bpf_prog *prog);
 void bpf_task_storage_free(struct task_struct *task);
 void bpf_cgrp_storage_free(struct cgroup *cgroup);
 bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
@@ -2752,7 +2753,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log,
 }
 
 static inline const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From 8062fb12de99b2da33754c6a3be1bfc30d9a35f4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:20 -0800
Subject: bpf: consistently use BPF token throughout BPF verifier logic

Remove remaining direct queries to perfmon_capable() and bpf_capable()
in BPF verifier logic and instead use BPF token (if available) to make
decisions about privileges.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-9-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h    | 16 ++++++++--------
 include/linux/filter.h |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2a3ab4f3dd8c..435abad3cc61 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2200,24 +2200,24 @@ extern int sysctl_unprivileged_bpf_disabled;
 
 bool bpf_token_capable(const struct bpf_token *token, int cap);
 
-static inline bool bpf_allow_ptr_leaks(void)
+static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token)
 {
-	return perfmon_capable();
+	return bpf_token_capable(token, CAP_PERFMON);
 }
 
-static inline bool bpf_allow_uninit_stack(void)
+static inline bool bpf_allow_uninit_stack(const struct bpf_token *token)
 {
-	return perfmon_capable();
+	return bpf_token_capable(token, CAP_PERFMON);
 }
 
-static inline bool bpf_bypass_spec_v1(void)
+static inline bool bpf_bypass_spec_v1(const struct bpf_token *token)
 {
-	return cpu_mitigations_off() || perfmon_capable();
+	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
 }
 
-static inline bool bpf_bypass_spec_v4(void)
+static inline bool bpf_bypass_spec_v4(const struct bpf_token *token)
 {
-	return cpu_mitigations_off() || perfmon_capable();
+	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a4953fafc8cb..14354605ad26 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
 		return false;
 	if (!bpf_jit_harden)
 		return false;
-	if (bpf_jit_harden == 1 && bpf_capable())
+	if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF))
 		return false;
 
 	return true;
-- 
cgit v1.2.3


From c3dd6e94df7193f33f45d33303f5e85afb2a72dc Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:21 -0800
Subject: bpf,lsm: refactor bpf_prog_alloc/bpf_prog_free LSM hooks

Based on upstream discussion ([0]), rework existing
bpf_prog_alloc_security LSM hook. Rename it to bpf_prog_load and instead
of passing bpf_prog_aux, pass proper bpf_prog pointer for a full BPF
program struct. Also, we pass bpf_attr union with all the user-provided
arguments for BPF_PROG_LOAD command.  This will give LSMs as much
information as we can basically provide.

The hook is also BPF token-aware now, and optional bpf_token struct is
passed as a third argument. bpf_prog_load LSM hook is called after
a bunch of sanity checks were performed, bpf_prog and bpf_prog_aux were
allocated and filled out, but right before performing full-fledged BPF
verification step.

bpf_prog_free LSM hook is now accepting struct bpf_prog argument, for
consistency. SELinux code is adjusted to all new names, types, and
signatures.

Note, given that bpf_prog_load (previously bpf_prog_alloc) hook can be
used by some LSMs to allocate extra security blob, but also by other
LSMs to reject BPF program loading, we need to make sure that
bpf_prog_free LSM hook is called after bpf_prog_load/bpf_prog_alloc one
*even* if the hook itself returned error. If we don't do that, we run
the risk of leaking memory. This seems to be possible today when
combining SELinux and BPF LSM, as one example, depending on their
relative ordering.

Also, for BPF LSM setup, add bpf_prog_load and bpf_prog_free to
sleepable LSM hooks list, as they are both executed in sleepable
context. Also drop bpf_prog_load hook from untrusted, as there is no
issue with refcount or anything else anymore, that originally forced us
to add it to untrusted list in c0c852dd1876 ("bpf: Do not mark certain LSM
hook arguments as trusted"). We now trigger this hook much later and it
should not be an issue anymore.

  [0] https://lore.kernel.org/bpf/9fe88aef7deabbe87d3fc38c4aea3c69.paul@paul-moore.com/

Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-10-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/lsm_hook_defs.h |  5 +++--
 include/linux/security.h      | 12 +++++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ff217a5ce552..41ec4a7c070e 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -400,8 +400,9 @@ LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
 LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
 LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
 LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
-LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux)
-LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux)
+LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
+	 struct bpf_token *token)
+LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
 #endif /* CONFIG_BPF_SYSCALL */
 
 LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)
diff --git a/include/linux/security.h b/include/linux/security.h
index 1d1df326c881..65467eef6678 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2020,15 +2020,16 @@ static inline void securityfs_remove(struct dentry *dentry)
 union bpf_attr;
 struct bpf_map;
 struct bpf_prog;
-struct bpf_prog_aux;
+struct bpf_token;
 #ifdef CONFIG_SECURITY
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
 extern int security_bpf_prog(struct bpf_prog *prog);
 extern int security_bpf_map_alloc(struct bpf_map *map);
 extern void security_bpf_map_free(struct bpf_map *map);
-extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux);
-extern void security_bpf_prog_free(struct bpf_prog_aux *aux);
+extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+				  struct bpf_token *token);
+extern void security_bpf_prog_free(struct bpf_prog *prog);
 #else
 static inline int security_bpf(int cmd, union bpf_attr *attr,
 					     unsigned int size)
@@ -2054,12 +2055,13 @@ static inline int security_bpf_map_alloc(struct bpf_map *map)
 static inline void security_bpf_map_free(struct bpf_map *map)
 { }
 
-static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+					 struct bpf_token *token)
 {
 	return 0;
 }
 
-static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
+static inline void security_bpf_prog_free(struct bpf_prog *prog)
 { }
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_BPF_SYSCALL */
-- 
cgit v1.2.3


From 66d636d70a79c1d37e3eea67ab50969e6aaef983 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:22 -0800
Subject: bpf,lsm: refactor bpf_map_alloc/bpf_map_free LSM hooks

Similarly to bpf_prog_alloc LSM hook, rename and extend bpf_map_alloc
hook into bpf_map_create, taking not just struct bpf_map, but also
bpf_attr and bpf_token, to give a fuller context to LSMs.

Unlike bpf_prog_alloc, there is no need to move the hook around, as it
currently is firing right before allocating BPF map ID and FD, which
seems to be a sweet spot.

But like bpf_prog_alloc/bpf_prog_free combo, make sure that bpf_map_free
LSM hook is called even if bpf_map_create hook returned error, as if few
LSMs are combined together it could be that one LSM successfully
allocated security blob for its needs, while subsequent LSM rejected BPF
map creation. The former LSM would still need to free up LSM blob, so we
need to ensure security_bpf_map_free() is called regardless of the
outcome.

Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-11-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/lsm_hook_defs.h | 5 +++--
 include/linux/security.h      | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 41ec4a7c070e..adb25cc63ce3 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -398,8 +398,9 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule)
 LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size)
 LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
 LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
-LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
-LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
+LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr,
+	 struct bpf_token *token)
+LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
 LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
 	 struct bpf_token *token)
 LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
diff --git a/include/linux/security.h b/include/linux/security.h
index 65467eef6678..08fd777cbe94 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2025,7 +2025,8 @@ struct bpf_token;
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
 extern int security_bpf_prog(struct bpf_prog *prog);
-extern int security_bpf_map_alloc(struct bpf_map *map);
+extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+				   struct bpf_token *token);
 extern void security_bpf_map_free(struct bpf_map *map);
 extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 				  struct bpf_token *token);
@@ -2047,7 +2048,8 @@ static inline int security_bpf_prog(struct bpf_prog *prog)
 	return 0;
 }
 
-static inline int security_bpf_map_alloc(struct bpf_map *map)
+static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+					  struct bpf_token *token)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From d734ca7b33dbf60eb15dcf7c44f3da7073356777 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:23 -0800
Subject: bpf,lsm: add BPF token LSM hooks

Wire up bpf_token_create and bpf_token_free LSM hooks, which allow to
allocate LSM security blob (we add `void *security` field to struct
bpf_token for that), but also control who can instantiate BPF token.
This follows existing pattern for BPF map and BPF prog.

Also add security_bpf_token_allow_cmd() and security_bpf_token_capable()
LSM hooks that allow LSM implementation to control and negate (if
necessary) BPF token's delegation of a specific bpf_cmd and capability,
respectively.

Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-12-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h           |  3 +++
 include/linux/lsm_hook_defs.h |  5 +++++
 include/linux/security.h      | 25 +++++++++++++++++++++++++
 3 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 435abad3cc61..7a483f6b6d5f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1604,6 +1604,9 @@ struct bpf_token {
 	u64 allowed_maps;
 	u64 allowed_progs;
 	u64 allowed_attachs;
+#ifdef CONFIG_SECURITY
+	void *security;
+#endif
 };
 
 struct bpf_struct_ops_value;
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index adb25cc63ce3..3fdd00b452ac 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -404,6 +404,11 @@ LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
 LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
 	 struct bpf_token *token)
 LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
+LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr,
+	 struct path *path)
+LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token)
+LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd)
+LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap)
 #endif /* CONFIG_BPF_SYSCALL */
 
 LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)
diff --git a/include/linux/security.h b/include/linux/security.h
index 08fd777cbe94..00809d2d5c38 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/sockptr.h>
+#include <linux/bpf.h>
 
 struct linux_binprm;
 struct cred;
@@ -2031,6 +2032,11 @@ extern void security_bpf_map_free(struct bpf_map *map);
 extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 				  struct bpf_token *token);
 extern void security_bpf_prog_free(struct bpf_prog *prog);
+extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
+				     struct path *path);
+extern void security_bpf_token_free(struct bpf_token *token);
+extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+extern int security_bpf_token_capable(const struct bpf_token *token, int cap);
 #else
 static inline int security_bpf(int cmd, union bpf_attr *attr,
 					     unsigned int size)
@@ -2065,6 +2071,25 @@ static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *
 
 static inline void security_bpf_prog_free(struct bpf_prog *prog)
 { }
+
+static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
+				     struct path *path)
+{
+	return 0;
+}
+
+static inline void security_bpf_token_free(struct bpf_token *token)
+{ }
+
+static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+	return 0;
+}
+
+static inline int security_bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	return 0;
+}
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_BPF_SYSCALL */
 
-- 
cgit v1.2.3


From 3232e7aad11e541da86bbb1fa5ea5737b30bd006 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 5 Dec 2023 17:21:14 -0500
Subject: cgroup/cpuset: Include isolated cpuset CPUs in cpu_is_isolated()
 check

Currently, the cpu_is_isolated() function checks only the statically
isolated CPUs specified via the "isolcpus" and "nohz_full" kernel
command line options. This function is used by vmstat and memcg to
reduce interference with isolated CPUs by not doing stat flushing
or scheduling works on those CPUs.

Workloads running on isolated CPUs within isolated cpuset
partitions should receive the same treatment to reduce unnecessary
interference. This patch introduces a new cpuset_cpu_is_isolated()
function to be called by cpu_is_isolated() so that the set of dynamically
created cpuset isolated CPUs will be included in the check.

Assuming that testing a bit in a cpumask is atomic, no synchronization
primitive is currently used to synchronize access to the cpuset's
isolated_cpus mask.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cpuset.h          | 6 ++++++
 include/linux/sched/isolation.h | 4 +++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index d629094fac6e..875d12598bd2 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -77,6 +77,7 @@ extern void cpuset_lock(void);
 extern void cpuset_unlock(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
+extern bool cpuset_cpu_is_isolated(int cpu);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -207,6 +208,11 @@ static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
 	return false;
 }
 
+static inline bool cpuset_cpu_is_isolated(int cpu)
+{
+	return false;
+}
+
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 {
 	return node_possible_map;
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index fe1a46f30d24..2b461129d1fa 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -2,6 +2,7 @@
 #define _LINUX_SCHED_ISOLATION_H
 
 #include <linux/cpumask.h>
+#include <linux/cpuset.h>
 #include <linux/init.h>
 #include <linux/tick.h>
 
@@ -67,7 +68,8 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type)
 static inline bool cpu_is_isolated(int cpu)
 {
 	return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
-		 !housekeeping_test_cpu(cpu, HK_TYPE_TICK);
+	       !housekeeping_test_cpu(cpu, HK_TYPE_TICK) ||
+	       cpuset_cpu_is_isolated(cpu);
 }
 
 #endif /* _LINUX_SCHED_ISOLATION_H */
-- 
cgit v1.2.3


From be0a3600aa1ebe9d23243c91d41ab1a2d5091a9b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 5 Dec 2023 13:24:08 +0100
Subject: thermal: sysfs: Rework the handling of trip point updates

Both trip_point_temp_store() and trip_point_hyst_store() use
thermal_zone_set_trip() to update a given trip point, but none of them
actually needs to change more than one field in struct thermal_trip
representing it.  However, each of them effectively calls
__thermal_zone_get_trip() twice in a row for the same trip index value,
once directly and once via thermal_zone_set_trip(), which is not
particularly efficient, and the way in which thermal_zone_set_trip()
carries out the update is not particularly straightforward.

Moreover, input processing need not be done under the thermal zone lock
in any of these functions.

Rework trip_point_temp_store() and trip_point_hyst_store() to address
the above, move the part of thermal_zone_set_trip() that is still
useful to a new function called thermal_zone_trip_updated() and drop
the rest of it.

While at it, make trip_point_hyst_store() reject negative hysteresis
values.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 include/linux/thermal.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 1f9ee869f9f9..0ea99f50d57c 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -282,10 +282,6 @@ int __thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id,
 			    struct thermal_trip *trip);
 int thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id,
 			  struct thermal_trip *trip);
-
-int thermal_zone_set_trip(struct thermal_zone_device *tz, int trip_id,
-			  const struct thermal_trip *trip);
-
 int for_each_thermal_trip(struct thermal_zone_device *tz,
 			  int (*cb)(struct thermal_trip *, void *),
 			  void *data);
-- 
cgit v1.2.3


From 4b7de801606e504e69689df71475d27e35336fb3 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 6 Dec 2023 09:30:40 +0100
Subject: bpf: Fix prog_array_map_poke_run map poke update

Lee pointed out issue found by syscaller [0] hitting BUG in prog array
map poke update in prog_array_map_poke_run function due to error value
returned from bpf_arch_text_poke function.

There's race window where bpf_arch_text_poke can fail due to missing
bpf program kallsym symbols, which is accounted for with check for
-EINVAL in that BUG_ON call.

The problem is that in such case we won't update the tail call jump
and cause imbalance for the next tail call update check which will
fail with -EBUSY in bpf_arch_text_poke.

I'm hitting following race during the program load:

  CPU 0                             CPU 1

  bpf_prog_load
    bpf_check
      do_misc_fixups
        prog_array_map_poke_track

                                    map_update_elem
                                      bpf_fd_array_map_update_elem
                                        prog_array_map_poke_run

                                          bpf_arch_text_poke returns -EINVAL

    bpf_prog_kallsyms_add

After bpf_arch_text_poke (CPU 1) fails to update the tail call jump, the next
poke update fails on expected jump instruction check in bpf_arch_text_poke
with -EBUSY and triggers the BUG_ON in prog_array_map_poke_run.

Similar race exists on the program unload.

Fixing this by moving the update to bpf_arch_poke_desc_update function which
makes sure we call __bpf_arch_text_poke that skips the bpf address check.

Each architecture has slightly different approach wrt looking up bpf address
in bpf_arch_text_poke, so instead of splitting the function or adding new
'checkip' argument in previous version, it seems best to move the whole
map_poke_run update as arch specific code.

  [0] https://syzkaller.appspot.com/bug?extid=97a4fe20470e9bc30810

Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT")
Reported-by: syzbot+97a4fe20470e9bc30810@syzkaller.appspotmail.com
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Cc: Lee Jones <lee@kernel.org>
Cc: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/bpf/20231206083041.1306660-2-jolsa@kernel.org
---
 include/linux/bpf.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6762dac3ef76..cff5bb08820e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3175,6 +3175,9 @@ enum bpf_text_poke_type {
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *addr1, void *addr2);
 
+void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+			       struct bpf_prog *new, struct bpf_prog *old);
+
 void *bpf_arch_text_copy(void *dst, void *src, size_t len);
 int bpf_arch_text_invalidate(void *dst, size_t len);
 
-- 
cgit v1.2.3


From 187da0f8250aa94bd96266096aef6f694e0b4cd2 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Mon, 13 Nov 2023 17:20:33 -0800
Subject: hugetlb: fix null-ptr-deref in hugetlb_vma_lock_write

The routine __vma_private_lock tests for the existence of a reserve map
associated with a private hugetlb mapping.  A pointer to the reserve map
is in vma->vm_private_data.  __vma_private_lock was checking the pointer
for NULL.  However, it is possible that the low bits of the pointer could
be used as flags.  In such instances, vm_private_data is not NULL and not
a valid pointer.  This results in the null-ptr-deref reported by syzbot:

general protection fault, probably for non-canonical address 0xdffffc000000001d:
 0000 [#1] PREEMPT SMP KASAN
KASAN: null-ptr-deref in range [0x00000000000000e8-0x00000000000000ef]
CPU: 0 PID: 5048 Comm: syz-executor139 Not tainted 6.6.0-rc7-syzkaller-00142-g88
8cf78c29e2 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 1
0/09/2023
RIP: 0010:__lock_acquire+0x109/0x5de0 kernel/locking/lockdep.c:5004
...
Call Trace:
 <TASK>
 lock_acquire kernel/locking/lockdep.c:5753 [inline]
 lock_acquire+0x1ae/0x510 kernel/locking/lockdep.c:5718
 down_write+0x93/0x200 kernel/locking/rwsem.c:1573
 hugetlb_vma_lock_write mm/hugetlb.c:300 [inline]
 hugetlb_vma_lock_write+0xae/0x100 mm/hugetlb.c:291
 __hugetlb_zap_begin+0x1e9/0x2b0 mm/hugetlb.c:5447
 hugetlb_zap_begin include/linux/hugetlb.h:258 [inline]
 unmap_vmas+0x2f4/0x470 mm/memory.c:1733
 exit_mmap+0x1ad/0xa60 mm/mmap.c:3230
 __mmput+0x12a/0x4d0 kernel/fork.c:1349
 mmput+0x62/0x70 kernel/fork.c:1371
 exit_mm kernel/exit.c:567 [inline]
 do_exit+0x9ad/0x2a20 kernel/exit.c:861
 __do_sys_exit kernel/exit.c:991 [inline]
 __se_sys_exit kernel/exit.c:989 [inline]
 __x64_sys_exit+0x42/0x50 kernel/exit.c:989
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd

Mask off low bit flags before checking for NULL pointer.  In addition, the
reserve map only 'belongs' to the OWNER (parent in parent/child
relationships) so also check for the OWNER flag.

Link: https://lkml.kernel.org/r/20231114012033.259600-1-mike.kravetz@oracle.com
Reported-by: syzbot+6ada951e7c0f7bc8a71e@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-mm/00000000000078d1e00608d7878b@google.com/
Fixes: bf4916922c60 ("hugetlbfs: extend hugetlb_vma_lock to private VMAs")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Rik van Riel <riel@surriel.com>
Cc: Edward Adam Davis <eadavis@qq.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Tom Rix <trix@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d3acecc5db4b..236ec7b63c54 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1268,10 +1268,7 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
 	return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
 }
 
-static inline bool __vma_private_lock(struct vm_area_struct *vma)
-{
-	return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data;
-}
+bool __vma_private_lock(struct vm_area_struct *vma);
 
 /*
  * Safe version of huge_pte_offset() to check the locks.  See comments
-- 
cgit v1.2.3


From 8e92157d7f6190c86bfd6144a409001469827100 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 28 Nov 2023 19:44:03 +0200
Subject: units: add missing header

BITS_PER_BYTE is defined in bits.h.

Link: https://lkml.kernel.org/r/20231128174404.393393-1-andriy.shevchenko@linux.intel.com
Fixes: e8eed5f7366f ("units: Add BYTES_PER_*BIT")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Damian Muszynski <damian.muszynski@intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/units.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/units.h b/include/linux/units.h
index ff1bd6b5f5b3..45110daaf8d3 100644
--- a/include/linux/units.h
+++ b/include/linux/units.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_UNITS_H
 #define _LINUX_UNITS_H
 
+#include <linux/bits.h>
 #include <linux/math.h>
 
 /* Metric prefixes in accordance with Système international (d'unités) */
-- 
cgit v1.2.3


From 73424d00dc63ba681856e06cfb0a5abbdb62e2b5 Mon Sep 17 00:00:00 2001
From: Su Hui <suhui@nfschina.com>
Date: Thu, 30 Nov 2023 11:40:18 +0800
Subject: highmem: fix a memory copy problem in memcpy_from_folio

Clang static checker complains that value stored to 'from' is never read.
And memcpy_from_folio() only copy the last chunk memory from folio to
destination.  Use 'to += chunk' to replace 'from += chunk' to fix this
typo problem.

Link: https://lkml.kernel.org/r/20231130034017.1210429-1-suhui@nfschina.com
Fixes: b23d03ef7af5 ("highmem: add memcpy_to_folio() and memcpy_from_folio()")
Signed-off-by: Su Hui <suhui@nfschina.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jiaqi Yan <jiaqiyan@google.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Tom Rix <trix@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 4cacc0e43b51..be20cff4ba73 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -454,7 +454,7 @@ static inline void memcpy_from_folio(char *to, struct folio *folio,
 		memcpy(to, from, chunk);
 		kunmap_local(from);
 
-		from += chunk;
+		to += chunk;
 		offset += chunk;
 		len -= chunk;
 	} while (len > 0);
-- 
cgit v1.2.3


From f08a1c658257c73697a819c4ded3a84b6f0ead74 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:48 -0800
Subject: bpf: Let bpf_prog_pack_free handle any pointer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, bpf_prog_pack_free only can only free pointer to struct
bpf_binary_header, which is not flexible. Add a size argument to
bpf_prog_pack_free so that it can handle any pointer.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Reviewed-by: Björn Töpel <bjorn@rivosinc.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20231206224054.492250-2-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 14354605ad26..12d907f17d36 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1067,7 +1067,7 @@ struct bpf_binary_header *
 bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);
 
 void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
-void bpf_prog_pack_free(struct bpf_binary_header *hdr);
+void bpf_prog_pack_free(void *ptr, u32 size);
 
 static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
 {
-- 
cgit v1.2.3


From 7a3d9a159b178e87306a6e989071ed9a114a1a31 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:49 -0800
Subject: bpf: Adjust argument names of arch_prepare_bpf_trampoline()

We are using "im" for "struct bpf_tramp_image" and "tr" for "struct
bpf_trampoline" in most of the code base. The only exception is the
prototype and fallback version of arch_prepare_bpf_trampoline(). Update
them to match the rest of the code base.

We mix "orig_call" and "func_addr" for the argument in different versions
of arch_prepare_bpf_trampoline(). s/orig_call/func_addr/g so they match.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20231206224054.492250-3-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7a483f6b6d5f..17eb6d905204 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1098,10 +1098,10 @@ struct bpf_tramp_run_ctx;
  *      fexit = a set of program to run after original function
  */
 struct bpf_tramp_image;
-int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 				const struct btf_func_model *m, u32 flags,
 				struct bpf_tramp_links *tlinks,
-				void *orig_call);
+				void *func_addr);
 u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
-- 
cgit v1.2.3


From 82583daa2efc2e336962b231a46bad03a280b3e0 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:50 -0800
Subject: bpf: Add helpers for trampoline image management

As BPF trampoline of different archs moves from bpf_jit_[alloc|free]_exec()
to bpf_prog_pack_[alloc|free](), we need to use different _alloc, _free for
different archs during the transition. Add the following helpers for this
transition:

void *arch_alloc_bpf_trampoline(unsigned int size);
void arch_free_bpf_trampoline(void *image, unsigned int size);
void arch_protect_bpf_trampoline(void *image, unsigned int size);
void arch_unprotect_bpf_trampoline(void *image, unsigned int size);

The fallback version of these helpers require size <= PAGE_SIZE, but they
are only called with size == PAGE_SIZE. They will be called with size <
PAGE_SIZE when arch_bpf_trampoline_size() helper is introduced later.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20231206224054.492250-4-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 17eb6d905204..b7fca151cf1b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1102,6 +1102,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 				const struct btf_func_model *m, u32 flags,
 				struct bpf_tramp_links *tlinks,
 				void *func_addr);
+void *arch_alloc_bpf_trampoline(unsigned int size);
+void arch_free_bpf_trampoline(void *image, unsigned int size);
+void arch_protect_bpf_trampoline(void *image, unsigned int size);
+void arch_unprotect_bpf_trampoline(void *image, unsigned int size);
+
 u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
-- 
cgit v1.2.3


From 96d1b7c081c0c96cbe8901045f4ff15a2e9974a2 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:52 -0800
Subject: bpf: Add arch_bpf_trampoline_size()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This helper will be used to calculate the size of the trampoline before
allocating the memory.

arch_prepare_bpf_trampoline() for arm64 and riscv64 can use
arch_bpf_trampoline_size() to check the trampoline fits in the image.

OTOH, arch_prepare_bpf_trampoline() for s390 has to call the JIT process
twice, so it cannot use arch_bpf_trampoline_size().

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Björn Töpel <bjorn@rivosinc.com>
Tested-by: Björn Töpel <bjorn@rivosinc.com> # on riscv
Link: https://lore.kernel.org/r/20231206224054.492250-6-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b7fca151cf1b..2332ddeb396b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1106,6 +1106,8 @@ void *arch_alloc_bpf_trampoline(unsigned int size);
 void arch_free_bpf_trampoline(void *image, unsigned int size);
 void arch_protect_bpf_trampoline(void *image, unsigned int size);
 void arch_unprotect_bpf_trampoline(void *image, unsigned int size);
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+			     struct bpf_tramp_links *tlinks, void *func_addr);
 
 u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx);
-- 
cgit v1.2.3


From 26ef208c209a0e6eed8942a5d191b39dccfa6e38 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:53 -0800
Subject: bpf: Use arch_bpf_trampoline_size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of blindly allocating PAGE_SIZE for each trampoline, check the size
of the trampoline with arch_bpf_trampoline_size(). This size is saved in
bpf_tramp_image->size, and used for modmem charge/uncharge. The fallback
arch_alloc_bpf_trampoline() still allocates a whole page because we need to
use set_memory_* to protect the memory.

struct_ops trampoline still uses a whole page for multiple trampolines.

With this size check at caller (regular trampoline and struct_ops
trampoline), remove arch_bpf_trampoline_size() from
arch_prepare_bpf_trampoline() in archs.

Also, update bpf_image_ksym_add() to handle symbol of different sizes.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Björn Töpel <bjorn@rivosinc.com>
Tested-by: Björn Töpel <bjorn@rivosinc.com> # on riscv
Link: https://lore.kernel.org/r/20231206224054.492250-7-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2332ddeb396b..c1a06263a4f3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1141,6 +1141,7 @@ enum bpf_tramp_prog_type {
 
 struct bpf_tramp_image {
 	void *image;
+	int size;
 	struct bpf_ksym ksym;
 	struct percpu_ref pcref;
 	void *ip_after_call;
@@ -1325,7 +1326,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 				struct bpf_prog *to);
 /* Called only from JIT-enabled code, so there's no need for stubs. */
-void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
+void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym);
 void bpf_image_ksym_del(struct bpf_ksym *ksym);
 void bpf_ksym_add(struct bpf_ksym *ksym);
 void bpf_ksym_del(struct bpf_ksym *ksym);
-- 
cgit v1.2.3


From f922b16aa5fad7284e2b7fd7c22bab13c0e418b6 Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Wed, 29 Nov 2023 03:27:09 -0800
Subject: firmware: xilinx: Update firmware call interface to support
 additional args

System-level platform management layer (do_fw_call()) has support for
maximum of 5 arguments as of now (1 EEMI API ID + 4 command arguments).
In order to support new EEMI PM_IOCTL IDs (Secure Read/Write), this
support must be extended to support one additional argument, which
results in a configuration of - 1 EEMI API ID + 5 command arguments.

Update zynqmp_pm_invoke_fn() and do_fw_call() with this new definition
containing variable arguments. As a result, update all the references
to pm invoke function with the updated definition.

Co-developed-by: Izhar Ameer Shaikh <izhar.ameer.shaikh@amd.com>
Signed-off-by: Izhar Ameer Shaikh <izhar.ameer.shaikh@amd.com>
Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20231129112713.22718-2-jay.buddhabhatti@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index d1ea3898564c..41190bbed8ed 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -509,8 +509,7 @@ struct zynqmp_pm_query_data {
 	u32 arg3;
 };
 
-int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1,
-			u32 arg2, u32 arg3, u32 *ret_payload);
+int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...);
 
 #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE)
 int zynqmp_pm_get_api_version(u32 *version);
-- 
cgit v1.2.3


From f689a0ca45fcdf4139727a3a02a49efbb1902306 Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Wed, 29 Nov 2023 03:27:10 -0800
Subject: firmware: xilinx: Expand feature check to support all PLM modules

To support feature check for all modules, append the module id of the API
that is being checked to the feature check API so it could be routed to
the target module for processing.

There is no need to check compatible string because the board information
is taken via firmware interface.

Co-developed-by: Saeed Nowshadi <saeed.nowshadi@amd.com>
Signed-off-by: Saeed Nowshadi <saeed.nowshadi@amd.com>
Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20231129112713.22718-3-jay.buddhabhatti@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 41190bbed8ed..e9a7fece5efe 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -32,6 +32,7 @@
 #define PM_SIP_SVC			0xC2000000
 
 /* PM API versions */
+#define PM_API_VERSION_1	1
 #define PM_API_VERSION_2	2
 
 #define PM_PINCTRL_PARAM_SET_VERSION	2
@@ -47,6 +48,9 @@
 #define FAMILY_CODE_MASK	GENMASK(27, 21)
 #define SUB_FAMILY_CODE_MASK	GENMASK(20, 19)
 
+#define API_ID_MASK		GENMASK(7, 0)
+#define MODULE_ID_MASK		GENMASK(11, 8)
+
 /* ATF only commands */
 #define TF_A_PM_REGISTER_SGI		0xa04
 #define PM_GET_TRUSTZONE_VERSION	0xa03
@@ -112,6 +116,12 @@
 #define XPM_EVENT_ERROR_MASK_NOC_NCR		BIT(13)
 #define XPM_EVENT_ERROR_MASK_NOC_CR		BIT(12)
 
+enum pm_module_id {
+	PM_MODULE_ID = 0x0,
+	XSEM_MODULE_ID = 0x3,
+	TF_A_MODULE_ID = 0xa,
+};
+
 enum pm_api_cb_id {
 	PM_INIT_SUSPEND_CB = 30,
 	PM_ACKNOWLEDGE_CB = 31,
@@ -119,6 +129,7 @@ enum pm_api_cb_id {
 };
 
 enum pm_api_id {
+	PM_API_FEATURES = 0,
 	PM_GET_API_VERSION = 1,
 	PM_REGISTER_NOTIFIER = 5,
 	PM_FORCE_POWERDOWN = 8,
-- 
cgit v1.2.3


From 8c016c807a90535432543204dbbb032e4a709009 Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Wed, 29 Nov 2023 03:27:12 -0800
Subject: drivers: soc: xilinx: Fix error message on SGI registration failure

Failure to register SGI for firmware event notification is non-fatal error
when feature is not supported by other modules such as Xen and TF-A. Add
_info level log message for such special case.

Also add XST_PM_INVALID_VERSION error code and map it to -EOPNOSUPP Linux
kernel error code. If feature is not supported or EEMI API version is
mismatch, firmware can return XST_PM_INVALID_VERSION = 4 or
XST_PM_NO_FEATURE = 19 error code.

Co-developed-by: Tanmay Shah <tanmay.shah@amd.com>
Signed-off-by: Tanmay Shah <tanmay.shah@amd.com>
Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20231129112713.22718-5-jay.buddhabhatti@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index e9a7fece5efe..ec1800c86006 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -172,6 +172,7 @@ enum pm_api_id {
 /* PMU-FW return status codes */
 enum pm_ret_status {
 	XST_PM_SUCCESS = 0,
+	XST_PM_INVALID_VERSION = 4,
 	XST_PM_NO_FEATURE = 19,
 	XST_PM_INTERNAL = 2000,
 	XST_PM_CONFLICT = 2001,
-- 
cgit v1.2.3


From 5dac2a98f6542ae1ce78b702374ea4be3f5ee07d Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Wed, 29 Nov 2023 03:27:13 -0800
Subject: firmware: zynqmp: Add support to handle IPI CRC failure

Added new PM error code XST_PM_INVALID_CRC to handle CRC validation failure
during IPI communication.

Co-developed-by: Naman Trivedi Manojbhai <naman.trivedimanojbhai@amd.com>
Signed-off-by: Naman Trivedi Manojbhai <naman.trivedimanojbhai@amd.com>
Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20231129112713.22718-6-jay.buddhabhatti@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index ec1800c86006..6b48294f3c92 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -174,6 +174,7 @@ enum pm_ret_status {
 	XST_PM_SUCCESS = 0,
 	XST_PM_INVALID_VERSION = 4,
 	XST_PM_NO_FEATURE = 19,
+	XST_PM_INVALID_CRC = 301,
 	XST_PM_INTERNAL = 2000,
 	XST_PM_CONFLICT = 2001,
 	XST_PM_NO_ACCESS = 2002,
-- 
cgit v1.2.3


From 5ec42bf04d72fd6d0a6855810cc779e0ee31dfd7 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 4 Dec 2023 15:27:06 -0600
Subject: PCI: add INTEL_HDA_ARL to pci_ids.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PCI ID insertion follows the increasing order in the table, but
this hardware follows MTL (MeteorLake).

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Acked-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20231204212710.185976-2-pierre-louis.bossart@linux.intel.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 275799b5f535..97cc0baad0f4 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -3065,6 +3065,7 @@
 #define PCI_DEVICE_ID_INTEL_82443GX_0	0x71a0
 #define PCI_DEVICE_ID_INTEL_82443GX_2	0x71a2
 #define PCI_DEVICE_ID_INTEL_82372FB_1	0x7601
+#define PCI_DEVICE_ID_INTEL_HDA_ARL	0x7728
 #define PCI_DEVICE_ID_INTEL_HDA_RPL_S	0x7a50
 #define PCI_DEVICE_ID_INTEL_HDA_ADL_S	0x7ad0
 #define PCI_DEVICE_ID_INTEL_HDA_MTL	0x7e28
-- 
cgit v1.2.3


From fa422b353d212373fb2b2857a5ea5a6fa4876f9c Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Mon, 23 Oct 2023 15:20:46 +0800
Subject: mm, pmem, xfs: Introduce MF_MEM_PRE_REMOVE for unbind

Now, if we suddenly remove a PMEM device(by calling unbind) which
contains FSDAX while programs are still accessing data in this device,
e.g.:
```
 $FSSTRESS_PROG -d $SCRATCH_MNT -n 99999 -p 4 &
 # $FSX_PROG -N 1000000 -o 8192 -l 500000 $SCRATCH_MNT/t001 &
 echo "pfn1.1" > /sys/bus/nd/drivers/nd_pmem/unbind
```
it could come into an unacceptable state:
  1. device has gone but mount point still exists, and umount will fail
       with "target is busy"
  2. programs will hang and cannot be killed
  3. may crash with NULL pointer dereference

To fix this, we introduce a MF_MEM_PRE_REMOVE flag to let it know that we
are going to remove the whole device, and make sure all related processes
could be notified so that they could end up gracefully.

This patch is inspired by Dan's "mm, dax, pmem: Introduce
dev_pagemap_failure()"[1].  With the help of dax_holder and
->notify_failure() mechanism, the pmem driver is able to ask filesystem
on it to unmap all files in use, and notify processes who are using
those files.

Call trace:
trigger unbind
 -> unbind_store()
  -> ... (skip)
   -> devres_release_all()
    -> kill_dax()
     -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
      -> xfs_dax_notify_failure()
      `-> freeze_super()             // freeze (kernel call)
      `-> do xfs rmap
      ` -> mf_dax_kill_procs()
      `  -> collect_procs_fsdax()    // all associated processes
      `  -> unmap_and_kill()
      ` -> invalidate_inode_pages2_range() // drop file's cache
      `-> thaw_super()               // thaw (both kernel & user call)

Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
new dax mapping from being created.  Do not shutdown filesystem directly
if configuration is not supported, or if failure range includes metadata
area.  Make sure all files and processes(not only the current progress)
are handled correctly.  Also drop the cache of associated files before
pmem is removed.

[1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
[2]: https://lore.kernel.org/linux-xfs/169116275623.3187159.16862410128731457358.stg-ugh@frogsfrogsfrogs/

Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
---
 include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 418d26608ece..caf13e94260e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3904,6 +3904,7 @@ enum mf_flags {
 	MF_UNPOISON = 1 << 4,
 	MF_SW_SIMULATED = 1 << 5,
 	MF_NO_RETRY = 1 << 6,
+	MF_MEM_PRE_REMOVE = 1 << 7,
 };
 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 		      unsigned long count, int mf_flags);
-- 
cgit v1.2.3


From 3bc05faf37876f99e2a7baffa9c66fdcfb11d1f7 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 5 Dec 2023 17:42:30 +0100
Subject: net: dsa: microchip: properly support platform_data probing

The ksz driver has bits and pieces of platform_data probing support, but
it doesn't work.

The conventional thing to do is to have an encapsulating structure for
struct dsa_chip_data that gets put into dev->platform_data. This driver
expects a struct ksz_platform_data, but that doesn't contain a struct
dsa_chip_data as first element, which will obviously not work with
dsa_switch_probe() -> dsa_switch_parse().

Pointing dev->platform_data to a struct dsa_chip_data directly is in
principle possible, but that doesn't work either. The driver has
ksz_switch_detect() to read the device ID from hardware, followed by
ksz_check_device_id() to compare it against a predetermined expected
value. This protects against early errors in the SPI/I2C communication.
With platform_data, the mechanism in ksz_check_device_id() doesn't work
and even leads to NULL pointer dereferences, since of_device_get_match_data()
doesn't work in that probe path.

So obviously, the platform_data support is actually missing, and the
existing handling of struct ksz_platform_data is bogus. Complete the
support by adding a struct dsa_chip_data as first element, and fixing up
ksz_check_device_id() to pick up the platform_data instead of the
unavailable of_device_get_match_data().

The early dev->chip_id assignment from ksz_switch_register() is also
bogus, because ksz_switch_detect() sets it to an initial value. So
remove it.

Also, ksz_platform_data :: enabled_ports isn't used anywhere, delete it.

Link: https://lore.kernel.org/netdev/20231204154315.3906267-1-dd@embedd.com/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Danzberger <dd@embedd.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/platform_data/microchip-ksz.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h
index ea1cc6d829e9..6480bf4af0fb 100644
--- a/include/linux/platform_data/microchip-ksz.h
+++ b/include/linux/platform_data/microchip-ksz.h
@@ -20,10 +20,12 @@
 #define __MICROCHIP_KSZ_H
 
 #include <linux/types.h>
+#include <linux/platform_data/dsa.h>
 
 struct ksz_platform_data {
+	/* Must be first such that dsa_register_switch() can access it */
+	struct dsa_chip_data cd;
 	u32 chip_id;
-	u16 enabled_ports;
 };
 
 #endif
-- 
cgit v1.2.3


From d16f1096b320d42e41ad9dee4d4098afd140d3e1 Mon Sep 17 00:00:00 2001
From: Daniel Danzberger <dd@embedd.com>
Date: Tue, 5 Dec 2023 17:42:31 +0100
Subject: net: dsa: microchip: move ksz_chip_id enum to platform include

With the ksz_chip_id enums moved to the platform include file for ksz
switches, platform code that instantiates a device can now use these to
set ksz_platform_data::chip_id.

Signed-off-by: Daniel Danzberger <dd@embedd.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/platform_data/microchip-ksz.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h
index 6480bf4af0fb..f177416635a2 100644
--- a/include/linux/platform_data/microchip-ksz.h
+++ b/include/linux/platform_data/microchip-ksz.h
@@ -22,6 +22,25 @@
 #include <linux/types.h>
 #include <linux/platform_data/dsa.h>
 
+enum ksz_chip_id {
+	KSZ8563_CHIP_ID = 0x8563,
+	KSZ8795_CHIP_ID = 0x8795,
+	KSZ8794_CHIP_ID = 0x8794,
+	KSZ8765_CHIP_ID = 0x8765,
+	KSZ8830_CHIP_ID = 0x8830,
+	KSZ9477_CHIP_ID = 0x00947700,
+	KSZ9896_CHIP_ID = 0x00989600,
+	KSZ9897_CHIP_ID = 0x00989700,
+	KSZ9893_CHIP_ID = 0x00989300,
+	KSZ9563_CHIP_ID = 0x00956300,
+	KSZ9567_CHIP_ID = 0x00956700,
+	LAN9370_CHIP_ID = 0x00937000,
+	LAN9371_CHIP_ID = 0x00937100,
+	LAN9372_CHIP_ID = 0x00937200,
+	LAN9373_CHIP_ID = 0x00937300,
+	LAN9374_CHIP_ID = 0x00937400,
+};
+
 struct ksz_platform_data {
 	/* Must be first such that dsa_register_switch() can access it */
 	struct dsa_chip_data cd;
-- 
cgit v1.2.3


From 37c8ceb6d92c955f5dd8223c3f6c90b277322210 Mon Sep 17 00:00:00 2001
From: Avri Altman <avri.altman@wdc.com>
Date: Mon, 30 Oct 2023 08:22:26 +0200
Subject: mmc: core: Remove packed command leftovers

Packed commands support was removed long time ago, but some bits got
left behind. Remove them.

Signed-off-by: Avri Altman <avri.altman@wdc.com>
Link: https://lore.kernel.org/r/20231030062226.1895692-1-avri.altman@wdc.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/mmc/card.h |  3 ---
 include/linux/mmc/core.h |  1 -
 include/linux/mmc/mmc.h  | 10 ----------
 3 files changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 7b12eebc5586..47eeb122524c 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -52,9 +52,6 @@ struct mmc_ext_csd {
 	u8			part_config;
 	u8			cache_ctrl;
 	u8			rst_n_function;
-	u8			max_packed_writes;
-	u8			max_packed_reads;
-	u8			packed_event_en;
 	unsigned int		part_time;		/* Units: ms */
 	unsigned int		sa_timeout;		/* Units: 100ns */
 	unsigned int		generic_cmd6_time;	/* Units: 10ms */
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 6efec0b9820c..2c7928a50907 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -27,7 +27,6 @@ struct mmc_command {
 	u32			opcode;
 	u32			arg;
 #define MMC_CMD23_ARG_REL_WR	(1 << 31)
-#define MMC_CMD23_ARG_PACKED	((0 << 31) | (1 << 30))
 #define MMC_CMD23_ARG_TAG_REQ	(1 << 29)
 	u32			resp[4];
 	unsigned int		flags;		/* expected response type */
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 6f7993803ee7..cf2bcb5da30d 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -257,8 +257,6 @@ static inline bool mmc_ready_for_data(u32 status)
 #define EXT_CSD_FLUSH_CACHE		32      /* W */
 #define EXT_CSD_CACHE_CTRL		33      /* R/W */
 #define EXT_CSD_POWER_OFF_NOTIFICATION	34	/* R/W */
-#define EXT_CSD_PACKED_FAILURE_INDEX	35	/* RO */
-#define EXT_CSD_PACKED_CMD_STATUS	36	/* RO */
 #define EXT_CSD_EXP_EVENTS_STATUS	54	/* RO, 2 bytes */
 #define EXT_CSD_EXP_EVENTS_CTRL		56	/* R/W, 2 bytes */
 #define EXT_CSD_DATA_SECTOR_SIZE	61	/* R */
@@ -321,8 +319,6 @@ static inline bool mmc_ready_for_data(u32 status)
 #define EXT_CSD_SUPPORTED_MODE		493	/* RO */
 #define EXT_CSD_TAG_UNIT_SIZE		498	/* RO */
 #define EXT_CSD_DATA_TAG_SUPPORT	499	/* RO */
-#define EXT_CSD_MAX_PACKED_WRITES	500	/* RO */
-#define EXT_CSD_MAX_PACKED_READS	501	/* RO */
 #define EXT_CSD_BKOPS_SUPPORT		502	/* RO */
 #define EXT_CSD_HPI_FEATURES		503	/* RO */
 
@@ -402,18 +398,12 @@ static inline bool mmc_ready_for_data(u32 status)
 #define EXT_CSD_PWR_CL_8BIT_SHIFT	4
 #define EXT_CSD_PWR_CL_4BIT_SHIFT	0
 
-#define EXT_CSD_PACKED_EVENT_EN	BIT(3)
-
 /*
  * EXCEPTION_EVENT_STATUS field
  */
 #define EXT_CSD_URGENT_BKOPS		BIT(0)
 #define EXT_CSD_DYNCAP_NEEDED		BIT(1)
 #define EXT_CSD_SYSPOOL_EXHAUSTED	BIT(2)
-#define EXT_CSD_PACKED_FAILURE		BIT(3)
-
-#define EXT_CSD_PACKED_GENERIC_ERROR	BIT(0)
-#define EXT_CSD_PACKED_INDEXED_ERROR	BIT(1)
 
 /*
  * BKOPS status level
-- 
cgit v1.2.3


From 9c0a5b3f9e55cf9a3dc85843666cae28adfdf7e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Mon, 4 Dec 2023 23:05:21 +0100
Subject: w1: gpio: Don't use platform data for driver data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

struct device's .platform_data isn't for drivers to write to. For
driver-specific data there is .driver_data instead.

As there is no in-tree platform that provides w1_gpio_platform_data,
drop the include file and replace it by a local struct w1_gpio_ddata.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/8f7ebe03ddaa5a5c6e2b36fecdf59da7fc373527.1701727212.git.u.kleine-koenig@pengutronix.de
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 include/linux/w1-gpio.h | 22 ----------------------
 1 file changed, 22 deletions(-)
 delete mode 100644 include/linux/w1-gpio.h

(limited to 'include/linux')

diff --git a/include/linux/w1-gpio.h b/include/linux/w1-gpio.h
deleted file mode 100644
index 3495fd0dc790..000000000000
--- a/include/linux/w1-gpio.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * w1-gpio interface to platform code
- *
- * Copyright (C) 2007 Ville Syrjala <syrjala@sci.fi>
- */
-#ifndef _LINUX_W1_GPIO_H
-#define _LINUX_W1_GPIO_H
-
-struct gpio_desc;
-
-/**
- * struct w1_gpio_platform_data - Platform-dependent data for w1-gpio
- */
-struct w1_gpio_platform_data {
-	struct gpio_desc *gpiod;
-	struct gpio_desc *pullup_gpiod;
-	void (*enable_external_pullup)(int enable);
-	unsigned int pullup_duration;
-};
-
-#endif /* _LINUX_W1_GPIO_H */
-- 
cgit v1.2.3


From 1fe13d83e2873b0aedeb5b9a299ca763bd37d75f Mon Sep 17 00:00:00 2001
From: Kaihua Zhong <zhongkaihua@kylinos.cn>
Date: Wed, 29 Nov 2023 09:55:26 +0800
Subject: mfd: Fix a few spelling mistakes in PMIC header file comments

Fix four comment typos in MFD PMIC header files.

Reported-by: k2ci <kernel-bot@kylinos.cn>
Signed-off-by: Kaihua Zhong <zhongkaihua@kylinos.cn>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20231129015526.3302865-1-zhongkaihua@kylinos.cn
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/max77693-private.h | 2 +-
 include/linux/mfd/max77843-private.h | 2 +-
 include/linux/mfd/si476x-platform.h  | 2 +-
 include/linux/mfd/tps65910.h         | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h
index 311f7d3d2323..54444ff2a5de 100644
--- a/include/linux/mfd/max77693-private.h
+++ b/include/linux/mfd/max77693-private.h
@@ -405,7 +405,7 @@ enum max77693_haptic_reg {
 	MAX77693_HAPTIC_REG_END,
 };
 
-/* max77693-pmic LSCNFG configuraton register */
+/* max77693-pmic LSCNFG configuration register */
 #define MAX77693_PMIC_LOW_SYS_MASK      0x80
 #define MAX77693_PMIC_LOW_SYS_SHIFT     7
 
diff --git a/include/linux/mfd/max77843-private.h b/include/linux/mfd/max77843-private.h
index 0bc7454c4dbe..2fb4db67f110 100644
--- a/include/linux/mfd/max77843-private.h
+++ b/include/linux/mfd/max77843-private.h
@@ -198,7 +198,7 @@ enum max77843_irq_muic {
 #define MAX77843_MCONFIG_MEN_MASK		BIT(MCONFIG_MEN_SHIFT)
 #define MAX77843_MCONFIG_PDIV_MASK		(0x3 << MCONFIG_PDIV_SHIFT)
 
-/* Max77843 charger insterrupts */
+/* Max77843 charger interrupts */
 #define MAX77843_CHG_BYP_I			BIT(0)
 #define MAX77843_CHG_BATP_I			BIT(2)
 #define MAX77843_CHG_BAT_I			BIT(3)
diff --git a/include/linux/mfd/si476x-platform.h b/include/linux/mfd/si476x-platform.h
index 18363b773d07..cb99e16ca947 100644
--- a/include/linux/mfd/si476x-platform.h
+++ b/include/linux/mfd/si476x-platform.h
@@ -10,7 +10,7 @@
 #ifndef __SI476X_PLATFORM_H__
 #define __SI476X_PLATFORM_H__
 
-/* It is possible to select one of the four adresses using pins A0
+/* It is possible to select one of the four addresses using pins A0
  * and A1 on SI476x */
 #define SI476X_I2C_ADDR_1	0x60
 #define SI476X_I2C_ADDR_2	0x61
diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h
index 701925db75b3..f67ef0a4e041 100644
--- a/include/linux/mfd/tps65910.h
+++ b/include/linux/mfd/tps65910.h
@@ -749,7 +749,7 @@
 #define VDDCTRL_ST_SHIFT                                 0
 
 
-/*Register VDDCTRL_OP  (0x28) bit definitios */
+/*Register VDDCTRL_OP  (0x28) bit definitions */
 #define VDDCTRL_OP_CMD_MASK                              0x80
 #define VDDCTRL_OP_CMD_SHIFT                             7
 #define VDDCTRL_OP_SEL_MASK                              0x7F
-- 
cgit v1.2.3


From 4d8ff6b0991d5e86b17b235fc46ec62e9195cb9b Mon Sep 17 00:00:00 2001
From: Amit Kumar Mahapatra <amit.kumar-mahapatra@amd.com>
Date: Sat, 25 Nov 2023 14:51:30 +0530
Subject: spi: Add multi-cs memories support in SPI core

AMD-Xilinx GQSPI controller has two advanced mode that allows the
controller to consider two flashes as one single device.

One of these two mode is the parallel mode in which each byte of data is
stored in both devices, the even bits in the lower flash & the odd bits in
the upper flash. The byte split is automatically handled by the QSPI
controller.

The other mode is the stacked mode in which both the flashes share the
same SPI bus but each of the device contain half of the data. In this mode,
the controller does not follow CS requests but instead internally wires the
two CS levels with the value of the most significant address bit.

For supporting both these modes SPI core need to be updated for providing
multiple CS for a single SPI device.

For adding multi CS support the SPI device need to be aware of all the CS
values. So, the "chip_select" member in the spi_device structure is now an
array that holds all the CS values.

spi_device structure now has a "cs_index_mask" member. This acts as an
index to the chip_select array. If nth bit of spi->cs_index_mask is set
then the driver would assert spi->chip_select[n].

In parallel mode all the chip selects are asserted/de-asserted
simultaneously and each byte of data is stored in both devices, the even
bits in one, the odd bits in the other. The split is automatically handled
by the GQSPI controller. The GQSPI controller supports a maximum of two
flashes connected in parallel mode. A SPI_CONTROLLER_MULTI_CS flag bit is
added in the spi controller flags, through ctlr->flags the spi core
will make sure that the controller is capable of handling multiple chip
selects at once.

For supporting multiple CS via GPIO the cs_gpiod member of the spi_device
structure is now an array that holds the gpio descriptor for each
chipselect.

CS GPIO is not tested on our hardware, but it has been tested by @Stefan
https://lore.kernel.org/all/005001da1efc$619ad5a0$24d080e0$@opensource.cirrus.com/

Signed-off-by: Amit Kumar Mahapatra <amit.kumar-mahapatra@amd.com>
Tested-by: Stefan Binding <sbinding@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20231125092137.2948-4-amit.kumar-mahapatra@amd.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 51 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 255a0562aea5..50622054b6af 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -20,6 +20,9 @@
 
 #include <uapi/linux/spi/spi.h>
 
+/* Max no. of CS supported per spi device */
+#define SPI_CS_CNT_MAX 4
+
 struct dma_chan;
 struct software_node;
 struct ptp_system_timestamp;
@@ -132,7 +135,8 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg,
  * @max_speed_hz: Maximum clock rate to be used with this chip
  *	(on this board); may be changed by the device's driver.
  *	The spi_transfer.speed_hz can override this for each transfer.
- * @chip_select: Chipselect, distinguishing chips handled by @controller.
+ * @chip_select: Array of physical chipselect, spi->chipselect[i] gives
+ *	the corresponding physical CS for logical CS i.
  * @mode: The spi mode defines how data is clocked out and in.
  *	This may be changed by the device's driver.
  *	The "active low" default for chipselect mode can be overridden
@@ -157,8 +161,8 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg,
  *	the device will bind to the named driver and only the named driver.
  *	Do not set directly, because core frees it; use driver_set_override() to
  *	set or clear it.
- * @cs_gpiod: GPIO descriptor of the chipselect line (optional, NULL when
- *	not using a GPIO line)
+ * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines
+ *	(optional, NULL when not using a GPIO line)
  * @word_delay: delay to be inserted between consecutive
  *	words of a transfer
  * @cs_setup: delay to be introduced by the controller after CS is asserted
@@ -167,6 +171,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg,
  *	deasserted. If @cs_change_delay is used from @spi_transfer, then the
  *	two delays will be added up.
  * @pcpu_statistics: statistics for the spi_device
+ * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array
  *
  * A @spi_device is used to interchange data between an SPI slave
  * (usually a discrete chip) and CPU memory.
@@ -182,7 +187,7 @@ struct spi_device {
 	struct spi_controller	*controller;
 	struct spi_controller	*master;	/* Compatibility layer */
 	u32			max_speed_hz;
-	u8			chip_select;
+	u8			chip_select[SPI_CS_CNT_MAX];
 	u8			bits_per_word;
 	bool			rt;
 #define SPI_NO_TX		BIT(31)		/* No transmit wire */
@@ -213,7 +218,7 @@ struct spi_device {
 	void			*controller_data;
 	char			modalias[SPI_NAME_SIZE];
 	const char		*driver_override;
-	struct gpio_desc	*cs_gpiod;	/* Chip select GPIO descriptor */
+	struct gpio_desc	*cs_gpiod[SPI_CS_CNT_MAX];	/* Chip select gpio desc */
 	struct spi_delay	word_delay; /* Inter-word delay */
 	/* CS delays */
 	struct spi_delay	cs_setup;
@@ -223,6 +228,13 @@ struct spi_device {
 	/* The statistics */
 	struct spi_statistics __percpu	*pcpu_statistics;
 
+	/* Bit mask of the chipselect(s) that the driver need to use from
+	 * the chipselect array.When the controller is capable to handle
+	 * multiple chip selects & memories are connected in parallel
+	 * then more than one bit need to be set in cs_index_mask.
+	 */
+	u32			cs_index_mask : SPI_CS_CNT_MAX;
+
 	/*
 	 * Likely need more hooks for more protocol options affecting how
 	 * the controller talks to each chip, like:
@@ -279,22 +291,33 @@ static inline void *spi_get_drvdata(const struct spi_device *spi)
 
 static inline u8 spi_get_chipselect(const struct spi_device *spi, u8 idx)
 {
-	return spi->chip_select;
+	return spi->chip_select[idx];
 }
 
 static inline void spi_set_chipselect(struct spi_device *spi, u8 idx, u8 chipselect)
 {
-	spi->chip_select = chipselect;
+	spi->chip_select[idx] = chipselect;
 }
 
 static inline struct gpio_desc *spi_get_csgpiod(const struct spi_device *spi, u8 idx)
 {
-	return spi->cs_gpiod;
+	return spi->cs_gpiod[idx];
 }
 
 static inline void spi_set_csgpiod(struct spi_device *spi, u8 idx, struct gpio_desc *csgpiod)
 {
-	spi->cs_gpiod = csgpiod;
+	spi->cs_gpiod[idx] = csgpiod;
+}
+
+static inline bool spi_is_csgpiod(struct spi_device *spi)
+{
+	u8 idx;
+
+	for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) {
+		if (spi_get_csgpiod(spi, idx))
+			return true;
+	}
+	return false;
 }
 
 /**
@@ -399,6 +422,8 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  * @bus_lock_spinlock: spinlock for SPI bus locking
  * @bus_lock_mutex: mutex for exclusion of multiple callers
  * @bus_lock_flag: indicates that the SPI bus is locked for exclusive use
+ * @multi_cs_cap: indicates that the SPI Controller can assert/de-assert
+ *	more than one chip select at once.
  * @setup: updates the device mode and clocking records used by a
  *	device's SPI controller; protocol code may call this.  This
  *	must fail if an unrecognized or unsupported mode is requested.
@@ -567,6 +592,11 @@ struct spi_controller {
 #define SPI_CONTROLLER_MUST_TX		BIT(4)	/* Requires tx */
 #define SPI_CONTROLLER_GPIO_SS		BIT(5)	/* GPIO CS must select slave */
 #define SPI_CONTROLLER_SUSPENDED	BIT(6)	/* Currently suspended */
+	/*
+	 * The spi-controller has multi chip select capability and can
+	 * assert/de-assert more than one chip select at once.
+	 */
+#define SPI_CONTROLLER_MULTI_CS		BIT(7)
 
 	/* Flag indicating if the allocation of this struct is devres-managed */
 	bool			devm_allocated;
@@ -677,7 +707,8 @@ struct spi_controller {
 	bool				rt;
 	bool				auto_runtime_pm;
 	bool				cur_msg_mapped;
-	char				last_cs;
+	char				last_cs[SPI_CS_CNT_MAX];
+	char				last_cs_index_mask;
 	bool				last_cs_mode_high;
 	bool                            fallback;
 	struct completion               xfer_completion;
-- 
cgit v1.2.3


From a7fb0423c201ba12815877a0b5a68a6a1710b23a Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 7 Dec 2023 08:46:14 -0500
Subject: cgroup: Move rcu_head up near the top of cgroup_root
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit d23b5c577715 ("cgroup: Make operations on the cgroup root_list RCU
safe") adds a new rcu_head to the cgroup_root structure and kvfree_rcu()
for freeing the cgroup_root.

The current implementation of kvfree_rcu(), however, has the limitation
that the offset of the rcu_head structure within the larger data
structure must be less than 4096 or the compilation will fail. See the
macro definition of __is_kvfree_rcu_offset() in include/linux/rcupdate.h
for more information.

By putting rcu_head below the large cgroup structure, any change to the
cgroup structure that makes it larger run the risk of causing build
failure under certain configurations. Commit 77070eeb8821 ("cgroup:
Avoid false cacheline sharing of read mostly rstat_cpu") happens to be
the last straw that breaks it. Fix this problem by moving the rcu_head
structure up before the cgroup structure.

Fixes: d23b5c577715 ("cgroup: Make operations on the cgroup root_list RCU safe")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/lkml/20231207143806.114e0a74@canb.auug.org.au/
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 5a97ea95b564..ea48c861cd36 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -562,6 +562,10 @@ struct cgroup_root {
 	/* Unique id for this hierarchy. */
 	int hierarchy_id;
 
+	/* A list running through the active hierarchies */
+	struct list_head root_list;
+	struct rcu_head rcu;	/* Must be near the top */
+
 	/*
 	 * The root cgroup. The containing cgroup_root will be destroyed on its
 	 * release. cgrp->ancestors[0] will be used overflowing into the
@@ -575,10 +579,6 @@ struct cgroup_root {
 	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
 	atomic_t nr_cgrps;
 
-	/* A list running through the active hierarchies */
-	struct list_head root_list;
-	struct rcu_head rcu;
-
 	/* Hierarchy-specific flags */
 	unsigned int flags;
 
-- 
cgit v1.2.3


From f76f0d7f20672611974d3cc705996751fc403734 Mon Sep 17 00:00:00 2001
From: Wenkai Lin <linwenkai6@hisilicon.com>
Date: Sat, 2 Dec 2023 17:17:18 +0800
Subject: crypto: hisilicon/qm - add a function to set qm algs

Extract a public function to set qm algs and remove
the similar code for setting qm algs in each module.

Signed-off-by: Wenkai Lin <linwenkai6@hisilicon.com>
Signed-off-by: Hao Fang <fanghao11@huawei.com>
Signed-off-by: Zhiqi Song <songzhiqi1@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/hisi_acc_qm.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index e3c0a1297b2c..cdc979f66dba 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -156,6 +156,11 @@ enum qm_cap_bits {
 	QM_SUPPORT_RPM,
 };
 
+struct qm_dev_alg {
+	u64 alg_msk;
+	const char *alg;
+};
+
 struct dfx_diff_registers {
 	u32 *regs;
 	u32 reg_offset;
@@ -361,7 +366,6 @@ struct hisi_qm {
 	struct work_struct rst_work;
 	struct work_struct cmd_process;
 
-	const char *algs;
 	bool use_sva;
 
 	resource_size_t phys_base;
@@ -559,6 +563,8 @@ void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset);
 u32 hisi_qm_get_hw_info(struct hisi_qm *qm,
 			const struct hisi_qm_cap_info *info_table,
 			u32 index, bool is_read);
+int hisi_qm_set_algs(struct hisi_qm *qm, u64 alg_msk, const struct qm_dev_alg *dev_algs,
+		     u32 dev_algs_size);
 
 /* Used by VFIO ACC live migration driver */
 struct pci_driver *hisi_sec_get_pf_driver(void);
-- 
cgit v1.2.3


From cabe13d0bd2efb8dd50ed2310f57b33e1a69a0d4 Mon Sep 17 00:00:00 2001
From: Zhiqi Song <songzhiqi1@huawei.com>
Date: Sat, 2 Dec 2023 17:17:19 +0800
Subject: crypto: hisilicon/qm - save capability registers in qm init process

In previous capability register implementation, qm irq related values
were read from capability registers dynamically when needed. But in
abnormal scenario, e.g. the core is timeout and the device needs to
soft reset and reset failed after disabling the MSE, the device can
not be removed normally, causing the following call trace:

	| Call trace:
        |  pci_irq_vector+0xfc/0x140
        |  hisi_qm_uninit+0x278/0x3b0 [hisi_qm]
        |  hpre_remove+0x16c/0x1c0 [hisi_hpre]
        |  pci_device_remove+0x6c/0x264
        |  device_release_driver_internal+0x1ec/0x3e0
        |  device_release_driver+0x3c/0x60
        |  pci_stop_bus_device+0xfc/0x22c
        |  pci_stop_and_remove_bus_device+0x38/0x70
        |  pci_iov_remove_virtfn+0x108/0x1c0
        |  sriov_disable+0x7c/0x1e4
        |  pci_disable_sriov+0x4c/0x6c
        |  hisi_qm_sriov_disable+0x90/0x160 [hisi_qm]
        |  hpre_remove+0x1a8/0x1c0 [hisi_hpre]
        |  pci_device_remove+0x6c/0x264
        |  device_release_driver_internal+0x1ec/0x3e0
        |  driver_detach+0x168/0x2d0
        |  bus_remove_driver+0xc0/0x230
        |  driver_unregister+0x58/0xdc
        |  pci_unregister_driver+0x40/0x220
        |  hpre_exit+0x34/0x64 [hisi_hpre]
        |  __arm64_sys_delete_module+0x374/0x620
        [...]

        | Call trace:
        |  free_msi_irqs+0x25c/0x300
        |  pci_disable_msi+0x19c/0x264
        |  pci_free_irq_vectors+0x4c/0x70
        |  hisi_qm_pci_uninit+0x44/0x90 [hisi_qm]
        |  hisi_qm_uninit+0x28c/0x3b0 [hisi_qm]
        |  hpre_remove+0x16c/0x1c0 [hisi_hpre]
        |  pci_device_remove+0x6c/0x264
        [...]

The reason for this call trace is that when the MSE is disabled, the value
of capability registers in the BAR space become invalid. This will make the
subsequent unregister process get the wrong irq vector through capability
registers and get the wrong irq number by pci_irq_vector().

So add a capability table structure to pre-store the valid value of the irq
information capability register in qm init process, avoid obtaining invalid
capability register value after the MSE is disabled.

Fixes: 3536cc55cada ("crypto: hisilicon/qm - support get device irq information from hardware registers")
Signed-off-by: Zhiqi Song <songzhiqi1@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/hisi_acc_qm.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index cdc979f66dba..5f4c74facf6a 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -266,6 +266,16 @@ struct hisi_qm_cap_info {
 	u32 v3_val;
 };
 
+struct hisi_qm_cap_record {
+	u32 type;
+	u32 cap_val;
+};
+
+struct hisi_qm_cap_tables {
+	struct hisi_qm_cap_record *qm_cap_table;
+	struct hisi_qm_cap_record *dev_cap_table;
+};
+
 struct hisi_qm_list {
 	struct mutex lock;
 	struct list_head list;
@@ -376,6 +386,8 @@ struct hisi_qm {
 	u32 mb_qos;
 	u32 type_rate;
 	struct qm_err_isolate isolate_data;
+
+	struct hisi_qm_cap_tables cap_tables;
 };
 
 struct hisi_qp_status {
-- 
cgit v1.2.3


From ee25fba76acd8324f9de6628872c8c612a684209 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 4 Dec 2023 10:35:00 +0100
Subject: gpiolib: provide gpiochip_dup_line_label()

gpiochip_is_requested() not only has a misleading name but it returns
a pointer to a string that is freed when the descriptor is released.

Provide a new helper meant to replace it, which returns a copy of the
label string instead.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 0aed62f0c633..5ac6dc30c547 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -532,6 +532,7 @@ struct gpio_chip {
 };
 
 const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset);
+char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset);
 
 /**
  * for_each_requested_gpio_in_range - iterates over requested GPIOs in a given range
-- 
cgit v1.2.3


From 6fd9c9933475a3efd7eed2f80c7778908a560a1f Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 4 Dec 2023 10:35:08 +0100
Subject: gpiolib: use gpiochip_dup_line_label() in for_each helpers

Rework for_each_requested_gpio_in_range() to use the new helper to
retrieve a dynamically allocated copy of the descriptor label and free
it at the end of each iteration. We need to leverage the CLASS()'
destructor to make sure that the label is freed even when breaking out
of the loop.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 5ac6dc30c547..cae4cdaa87db 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -534,17 +534,38 @@ struct gpio_chip {
 const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset);
 char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset);
 
+
+struct _gpiochip_for_each_data {
+	const char **label;
+	unsigned int *i;
+};
+
+DEFINE_CLASS(_gpiochip_for_each_data,
+	     struct _gpiochip_for_each_data,
+	     if (*_T.label) kfree(*_T.label),
+	     ({
+		struct _gpiochip_for_each_data _data = { label, i };
+		*_data.i = 0;
+		_data;
+	     }),
+	     const char **label, int *i)
+
 /**
  * for_each_requested_gpio_in_range - iterates over requested GPIOs in a given range
- * @chip:	the chip to query
- * @i:		loop variable
- * @base:	first GPIO in the range
- * @size:	amount of GPIOs to check starting from @base
- * @label:	label of current GPIO
+ * @_chip:	the chip to query
+ * @_i:		loop variable
+ * @_base:	first GPIO in the range
+ * @_size:	amount of GPIOs to check starting from @base
+ * @_label:	label of current GPIO
  */
-#define for_each_requested_gpio_in_range(chip, i, base, size, label)			\
-	for (i = 0; i < size; i++)							\
-		if ((label = gpiochip_is_requested(chip, base + i)) == NULL) {} else
+#define for_each_requested_gpio_in_range(_chip, _i, _base, _size, _label)		\
+	for (CLASS(_gpiochip_for_each_data, _data)(&_label, &_i);			\
+	     *_data.i < _size;								\
+	     (*_data.i)++, kfree(*(_data.label)), *_data.label = NULL)			\
+		if ((*_data.label =							\
+			gpiochip_dup_line_label(_chip, _base + *_data.i)) == NULL) {}	\
+		else if (IS_ERR(*_data.label)) {}					\
+		else
 
 /* Iterates over all requested GPIO of the given @chip */
 #define for_each_requested_gpio(chip, i, label)						\
-- 
cgit v1.2.3


From f8d05e276b45e3097dfddd628fa991ce69c05c99 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 4 Dec 2023 10:35:09 +0100
Subject: gpiolib: remove gpiochip_is_requested()

We have no external users of gpiochip_is_requested(). Let's remove it
and replace its internal calls with direct testing of the REQUESTED flag.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index cae4cdaa87db..d1a3cb061927 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -531,7 +531,6 @@ struct gpio_chip {
 #endif /* CONFIG_OF_GPIO */
 };
 
-const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset);
 char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset);
 
 
-- 
cgit v1.2.3


From 2a48c635fd9a48699805bbfeee1e4b94b8fe819d Mon Sep 17 00:00:00 2001
From: "justinstitt@google.com" <justinstitt@google.com>
Date: Wed, 6 Dec 2023 23:16:10 +0000
Subject: ethtool: Implement ethtool_puts()

Use strscpy() to implement ethtool_puts().

Functionally the same as ethtool_sprintf() when it's used with two
arguments or with just "%s" format specifier.

Signed-off-by: Justin Stitt <justinstitt@google.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Madhuri Sripada <madhuri.sripada@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index c2bb74143eda..deb683d3360f 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -1061,6 +1061,19 @@ int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info
  */
 extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...);
 
+/**
+ * ethtool_puts - Write string to ethtool string data
+ * @data: Pointer to a pointer to the start of string to update
+ * @str: String to write
+ *
+ * Write string to *data without a trailing newline. Update *data
+ * to point at start of next string.
+ *
+ * Prefer this function to ethtool_sprintf() when given only
+ * two arguments or if @fmt is just "%s".
+ */
+extern void ethtool_puts(u8 **data, const char *str);
+
 /* Link mode to forced speed capabilities maps */
 struct ethtool_forced_speed_map {
 	u32		speed;
-- 
cgit v1.2.3


From 675bf8ef209cc8da28ffefd7d8a93c53735cc84a Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Thu, 30 Nov 2023 12:30:01 +0100
Subject: tty: virtio: drop virtio_cons_early_init()

The last user of virtio_cons_early_init() was dropped in commit
7fb2b2d51244 ("s390/virtio: remove the old KVM virtio transport").

So now, drop virtio_cons_early_init() and the logic and headers behind
too.

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Acked-by: Jason Wang <jasowang@redhat.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Amit Shah <amit@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: linux-alpha@vger.kernel.org
Cc: virtualization@lists.linux.dev
Link: https://lore.kernel.org/r/20231130113001.29154-1-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/virtio_console.h | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 include/linux/virtio_console.h

(limited to 'include/linux')

diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h
deleted file mode 100644
index d2e2785af602..000000000000
--- a/include/linux/virtio_console.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so
- * anyone can use the definitions to implement compatible drivers/servers:
- *
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of IBM nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * Copyright (C) Red Hat, Inc., 2009, 2010, 2011
- * Copyright (C) Amit Shah <amit.shah@redhat.com>, 2009, 2010, 2011
- */
-#ifndef _LINUX_VIRTIO_CONSOLE_H
-#define _LINUX_VIRTIO_CONSOLE_H
-
-#include <uapi/linux/virtio_console.h>
-
-int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int));
-#endif /* _LINUX_VIRTIO_CONSOLE_H */
-- 
cgit v1.2.3


From 093258a9963bfac043244995bff87dc2c931b9b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= <theo.lebrun@bootlin.com>
Date: Thu, 30 Nov 2023 15:07:13 +0100
Subject: tty: serial: amba: cleanup whitespace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix whitespace in include/linux/amba/serial.h to match current kernel
coding standards. Fixes about:

 - CHECK: spaces preferred around that '|' (ctx:VxV)
 - ERROR: code indent should use tabs where possible
 - WARNING: Unnecessary space before function pointer arguments
 - WARNING: please, no spaces at the start of a line

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://lore.kernel.org/r/20231130-mbly-uart-v5-1-6566703a04b5@bootlin.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/amba/serial.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
index a1307b58cc2c..27003ec52114 100644
--- a/include/linux/amba/serial.h
+++ b/include/linux/amba/serial.h
@@ -75,10 +75,10 @@
 #define UART011_DR_PE		(1 << 9)
 #define UART011_DR_FE		(1 << 8)
 
-#define UART01x_RSR_OE 		0x08
-#define UART01x_RSR_BE 		0x04
-#define UART01x_RSR_PE 		0x02
-#define UART01x_RSR_FE 		0x01
+#define UART01x_RSR_OE		0x08
+#define UART01x_RSR_BE		0x04
+#define UART01x_RSR_PE		0x02
+#define UART01x_RSR_FE		0x01
 
 #define UART011_FR_RI		0x100
 #define UART011_FR_TXFE		0x080
@@ -86,9 +86,9 @@
 #define UART01x_FR_TXFF		0x020
 #define UART01x_FR_RXFE		0x010
 #define UART01x_FR_BUSY		0x008
-#define UART01x_FR_DCD 		0x004
-#define UART01x_FR_DSR 		0x002
-#define UART01x_FR_CTS 		0x001
+#define UART01x_FR_DCD		0x004
+#define UART01x_FR_DSR		0x002
+#define UART01x_FR_CTS		0x001
 #define UART01x_FR_TMSK		(UART01x_FR_TXFF + UART01x_FR_BUSY)
 
 /*
@@ -110,14 +110,14 @@
 #define UART011_CR_TXE		0x0100	/* transmit enable */
 #define UART011_CR_LBE		0x0080	/* loopback enable */
 #define UART010_CR_RTIE		0x0040
-#define UART010_CR_TIE 		0x0020
-#define UART010_CR_RIE 		0x0010
+#define UART010_CR_TIE		0x0020
+#define UART010_CR_RIE		0x0010
 #define UART010_CR_MSIE		0x0008
 #define ST_UART011_CR_OVSFACT	0x0008	/* Oversampling factor */
 #define UART01x_CR_IIRLP	0x0004	/* SIR low power mode */
 #define UART01x_CR_SIREN	0x0002	/* SIR enable */
 #define UART01x_CR_UARTEN	0x0001	/* UART enable */
- 
+
 #define UART011_LCRH_SPS	0x80
 #define UART01x_LCRH_WLEN_8	0x60
 #define UART01x_LCRH_WLEN_7	0x40
@@ -203,8 +203,8 @@
 #define UART011_TXDMAE		(1 << 1)	/* enable transmit dma */
 #define UART011_RXDMAE		(1 << 0)	/* enable receive dma */
 
-#define UART01x_RSR_ANY		(UART01x_RSR_OE|UART01x_RSR_BE|UART01x_RSR_PE|UART01x_RSR_FE)
-#define UART01x_FR_MODEM_ANY	(UART01x_FR_DCD|UART01x_FR_DSR|UART01x_FR_CTS)
+#define UART01x_RSR_ANY		(UART01x_RSR_OE | UART01x_RSR_BE | UART01x_RSR_PE | UART01x_RSR_FE)
+#define UART01x_FR_MODEM_ANY	(UART01x_FR_DCD | UART01x_FR_DSR | UART01x_FR_CTS)
 
 #ifndef __ASSEMBLY__
 struct amba_device; /* in uncompress this is included but amba/bus.h is not */
@@ -220,8 +220,8 @@ struct amba_pl011_data {
 	bool dma_rx_poll_enable;
 	unsigned int dma_rx_poll_rate;
 	unsigned int dma_rx_poll_timeout;
-        void (*init) (void);
-	void (*exit) (void);
+	void (*init)(void);
+	void (*exit)(void);
 };
 #endif
 
-- 
cgit v1.2.3


From 49943393c9f0be61fd494a884851aa117cd72382 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 6 Dec 2023 08:36:48 +0100
Subject: tty: switch tty_port::xmit_* to u8

Both xmit_buf and xmit_fifo of struct tty_port should be u8. To conform
to characters in the rest of the tty layer.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20231206073712.17776-4-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_port.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h
index 3276311a7f38..1b861f2100b6 100644
--- a/include/linux/tty_port.h
+++ b/include/linux/tty_port.h
@@ -114,8 +114,8 @@ struct tty_port {
 	unsigned char		console:1;
 	struct mutex		mutex;
 	struct mutex		buf_mutex;
-	unsigned char		*xmit_buf;
-	DECLARE_KFIFO_PTR(xmit_fifo, unsigned char);
+	u8			*xmit_buf;
+	DECLARE_KFIFO_PTR(xmit_fifo, u8);
 	unsigned int		close_delay;
 	unsigned int		closing_wait;
 	int			drain_delay;
-- 
cgit v1.2.3


From 3a00da027946cd08db1c1be2de4620950bbdf074 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 6 Dec 2023 08:36:49 +0100
Subject: tty: make tty_operations::send_xchar accept u8 char

tty_operations::send_xchar is one of the last users of 'char' type for
characters in the tty layer. Convert it to u8 now.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Cc: Karsten Keil <isdn@linux-pingi.de>
Cc: Ulf Hansson <ulf.hansson@linaro.org>
Cc: Marcel Holtmann <marcel@holtmann.org>
Cc: Johan Hedberg <johan.hedberg@gmail.com>
Cc: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
Cc: netdev@vger.kernel.org
Cc: linux-mmc@vger.kernel.org
Cc: linux-bluetooth@vger.kernel.org
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20231206073712.17776-5-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h        | 2 +-
 include/linux/tty_driver.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index e96c85f4f91e..d3bedcc08738 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -410,7 +410,7 @@ void tty_wait_until_sent(struct tty_struct *tty, long timeout);
 void stop_tty(struct tty_struct *tty);
 void start_tty(struct tty_struct *tty);
 void tty_write_message(struct tty_struct *tty, char *msg);
-int tty_send_xchar(struct tty_struct *tty, char ch);
+int tty_send_xchar(struct tty_struct *tty, u8 ch);
 int tty_put_char(struct tty_struct *tty, unsigned char c);
 unsigned int tty_chars_in_buffer(struct tty_struct *tty);
 unsigned int tty_write_room(struct tty_struct *tty);
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index f428c1b784a2..7372124fbf90 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -242,7 +242,7 @@ struct serial_struct;
  *	Optional: If not provided, the device is assumed to have no FIFO.
  *	Usually correct to invoke via tty_wait_until_sent(). May sleep.
  *
- * @send_xchar: ``void ()(struct tty_struct *tty, char ch)``
+ * @send_xchar: ``void ()(struct tty_struct *tty, u8 ch)``
  *
  *	This routine is used to send a high-priority XON/XOFF character (@ch)
  *	to the @tty device.
@@ -374,7 +374,7 @@ struct tty_operations {
 	void (*flush_buffer)(struct tty_struct *tty);
 	void (*set_ldisc)(struct tty_struct *tty);
 	void (*wait_until_sent)(struct tty_struct *tty, int timeout);
-	void (*send_xchar)(struct tty_struct *tty, char ch);
+	void (*send_xchar)(struct tty_struct *tty, u8 ch);
 	int (*tiocmget)(struct tty_struct *tty);
 	int (*tiocmset)(struct tty_struct *tty,
 			unsigned int set, unsigned int clear);
-- 
cgit v1.2.3


From 4e8d8878145f1478886e1630c44113ad2c2eb99d Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 6 Dec 2023 08:36:50 +0100
Subject: tty: core: the rest to u8

There are still last minor users in the tty core that still reference
characters by the 'char' type. Switch them to u8.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20231206073712.17776-6-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index d3bedcc08738..cc08f7e1c122 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -242,7 +242,7 @@ struct tty_struct {
 	void *driver_data;
 	spinlock_t files_lock;
 	int write_cnt;
-	unsigned char *write_buf;
+	u8 *write_buf;
 
 	struct list_head tty_files;
 
@@ -411,7 +411,7 @@ void stop_tty(struct tty_struct *tty);
 void start_tty(struct tty_struct *tty);
 void tty_write_message(struct tty_struct *tty, char *msg);
 int tty_send_xchar(struct tty_struct *tty, u8 ch);
-int tty_put_char(struct tty_struct *tty, unsigned char c);
+int tty_put_char(struct tty_struct *tty, u8 c);
 unsigned int tty_chars_in_buffer(struct tty_struct *tty);
 unsigned int tty_write_room(struct tty_struct *tty);
 void tty_driver_flush_buffer(struct tty_struct *tty);
-- 
cgit v1.2.3


From f2470d2bc4327c2c1a604c6e247442dbb14c90c5 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 6 Dec 2023 08:37:07 +0100
Subject: tty: serdev: convert to u8 and size_t in serdev_controller_ops

Switch character types to u8 and sizes to size_t. To conform to
characters/sizes in the rest of the tty layer.

In this patch, only struct serdev_controller_ops hooks. The rest will
follow.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20231206073712.17776-23-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serdev.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index 0ebf53bb254f..8cdab2c3b6d5 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -82,7 +82,7 @@ enum serdev_parity {
  * serdev controller structures
  */
 struct serdev_controller_ops {
-	int (*write_buf)(struct serdev_controller *, const unsigned char *, size_t);
+	ssize_t (*write_buf)(struct serdev_controller *, const u8 *, size_t);
 	void (*write_flush)(struct serdev_controller *);
 	int (*write_room)(struct serdev_controller *);
 	int (*open)(struct serdev_controller *);
@@ -185,9 +185,9 @@ static inline void serdev_controller_write_wakeup(struct serdev_controller *ctrl
 	serdev->ops->write_wakeup(serdev);
 }
 
-static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl,
-					      const unsigned char *data,
-					      size_t count)
+static inline ssize_t serdev_controller_receive_buf(struct serdev_controller *ctrl,
+						    const u8 *data,
+						    size_t count)
 {
 	struct serdev_device *serdev = ctrl->serdev;
 
-- 
cgit v1.2.3


From 475fc6e2de6fec0ff3c9a74ddcfd2b52c90adc0d Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 6 Dec 2023 08:37:08 +0100
Subject: tty: serdev: convert to u8 and size_t

Switch character types to u8 and sizes to size_t. To conform to
characters/sizes in the rest of the tty layer.

This patch converts struct serdev_device_ops hooks and its
instantiations.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Acked-by: Johan Hovold <johan@kernel.org>
Link: https://lore.kernel.org/r/20231206073712.17776-24-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serdev.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index 8cdab2c3b6d5..3fab88ba265e 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -27,7 +27,7 @@ struct serdev_device;
  *			not sleep.
  */
 struct serdev_device_ops {
-	int (*receive_buf)(struct serdev_device *, const unsigned char *, size_t);
+	ssize_t (*receive_buf)(struct serdev_device *, const u8 *, size_t);
 	void (*write_wakeup)(struct serdev_device *);
 };
 
@@ -204,13 +204,13 @@ void serdev_device_close(struct serdev_device *);
 int devm_serdev_device_open(struct device *, struct serdev_device *);
 unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int);
 void serdev_device_set_flow_control(struct serdev_device *, bool);
-int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t);
+int serdev_device_write_buf(struct serdev_device *, const u8 *, size_t);
 void serdev_device_wait_until_sent(struct serdev_device *, long);
 int serdev_device_get_tiocm(struct serdev_device *);
 int serdev_device_set_tiocm(struct serdev_device *, int, int);
 int serdev_device_break_ctl(struct serdev_device *serdev, int break_state);
 void serdev_device_write_wakeup(struct serdev_device *);
-int serdev_device_write(struct serdev_device *, const unsigned char *, size_t, long);
+ssize_t serdev_device_write(struct serdev_device *, const u8 *, size_t, long);
 void serdev_device_write_flush(struct serdev_device *);
 int serdev_device_write_room(struct serdev_device *);
 
@@ -248,7 +248,7 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev
 }
 static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {}
 static inline int serdev_device_write_buf(struct serdev_device *serdev,
-					  const unsigned char *buf,
+					  const u8 *buf,
 					  size_t count)
 {
 	return -ENODEV;
@@ -266,8 +266,9 @@ static inline int serdev_device_break_ctl(struct serdev_device *serdev, int brea
 {
 	return -EOPNOTSUPP;
 }
-static inline int serdev_device_write(struct serdev_device *sdev, const unsigned char *buf,
-				      size_t count, unsigned long timeout)
+static inline ssize_t serdev_device_write(struct serdev_device *sdev,
+					  const u8 *buf, size_t count,
+					  unsigned long timeout)
 {
 	return -ENODEV;
 }
-- 
cgit v1.2.3


From 8132d887a7023b212f242a51ae89281c69fde996 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 18 Oct 2023 12:11:56 -0400
Subject: KVM: remove CONFIG_HAVE_KVM_EVENTFD

virt/kvm/eventfd.c is compiled unconditionally, meaning that the ioeventfds
member of struct kvm is accessed unconditionally.  CONFIG_HAVE_KVM_EVENTFD
therefore must be defined for KVM common code to compile successfully,
remove it.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ea1523a7b83a..3fe5a6be7768 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -782,7 +782,6 @@ struct kvm {
 	struct list_head vm_list;
 	struct mutex lock;
 	struct kvm_io_bus __rcu *buses[KVM_NR_BUSES];
-#ifdef CONFIG_HAVE_KVM_EVENTFD
 	struct {
 		spinlock_t        lock;
 		struct list_head  items;
@@ -791,7 +790,6 @@ struct kvm {
 		struct mutex      resampler_lock;
 	} irqfds;
 	struct list_head ioeventfds;
-#endif
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
 	refcount_t users_count;
@@ -2056,8 +2054,6 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 
 int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
 
-#ifdef CONFIG_HAVE_KVM_EVENTFD
-
 void kvm_eventfd_init(struct kvm *kvm);
 int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
 
@@ -2082,31 +2078,7 @@ static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm,
 {
 	return false;
 }
-#endif
-
-#else
-
-static inline void kvm_eventfd_init(struct kvm *kvm) {}
-
-static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
-{
-	return -EINVAL;
-}
-
-static inline void kvm_irqfd_release(struct kvm *kvm) {}
-
-#ifdef CONFIG_HAVE_KVM_IRQCHIP
-static inline void kvm_irq_routing_update(struct kvm *kvm)
-{
-}
-#endif
-
-static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
-{
-	return -ENOSYS;
-}
-
-#endif /* CONFIG_HAVE_KVM_EVENTFD */
+#endif /* CONFIG_HAVE_KVM_IRQFD */
 
 void kvm_arch_irq_routing_update(struct kvm *kvm);
 
-- 
cgit v1.2.3


From c5b31cc2371728ddefe9baf1d036aeb630a25d96 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 18 Oct 2023 12:07:32 -0400
Subject: KVM: remove CONFIG_HAVE_KVM_IRQFD

All platforms with a kernel irqchip have support for irqfd.  Unify the
two configuration items so that userspace can expect to use irqfd to
inject interrupts into the irqchip.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3fe5a6be7768..1bba24a13ec9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -805,8 +805,7 @@ struct kvm {
 	 * Update side is protected by irq_lock.
 	 */
 	struct kvm_irq_routing_table __rcu *irq_routing;
-#endif
-#ifdef CONFIG_HAVE_KVM_IRQFD
+
 	struct hlist_head irq_ack_notifier_list;
 #endif
 
@@ -996,7 +995,7 @@ static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm)
 }
 #endif
 
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 int kvm_irqfd_init(void);
 void kvm_irqfd_exit(void);
 #else
@@ -2057,7 +2056,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
 void kvm_eventfd_init(struct kvm *kvm);
 int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
 
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
 void kvm_irqfd_release(struct kvm *kvm);
 bool kvm_notify_irqfd_resampler(struct kvm *kvm,
@@ -2078,7 +2077,7 @@ static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm,
 {
 	return false;
 }
-#endif /* CONFIG_HAVE_KVM_IRQFD */
+#endif /* CONFIG_HAVE_KVM_IRQCHIP */
 
 void kvm_arch_irq_routing_update(struct kvm *kvm);
 
-- 
cgit v1.2.3


From 8ed26ab8d59111c2f7b86d200d1eb97d2a458fd1 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 18 Oct 2023 12:18:00 -0400
Subject: KVM: clean up directives to compile out irqfds

Keep all #ifdef CONFIG_HAVE_KVM_IRQCHIP parts of eventfd.c together, and
compile out the irqfds field of struct kvm if the symbol is not defined.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1bba24a13ec9..7e7fd25b09b3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -782,6 +782,7 @@ struct kvm {
 	struct list_head vm_list;
 	struct mutex lock;
 	struct kvm_io_bus __rcu *buses[KVM_NR_BUSES];
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 	struct {
 		spinlock_t        lock;
 		struct list_head  items;
@@ -789,6 +790,7 @@ struct kvm {
 		struct list_head  resampler_list;
 		struct mutex      resampler_lock;
 	} irqfds;
+#endif
 	struct list_head ioeventfds;
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
-- 
cgit v1.2.3


From 92e1567ee3e3f6f160e320890ac77eec50bf8e7d Mon Sep 17 00:00:00 2001
From: Andrei Matei <andreimatei1@gmail.com>
Date: Thu, 7 Dec 2023 22:25:17 -0500
Subject: bpf: Add some comments to stack representation

Add comments to the datastructure tracking the stack state, as the
mapping between each stack slot and where its state is stored is not
entirely obvious.

Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20231208032519.260451-2-andreimatei1@gmail.com
---
 include/linux/bpf_verifier.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index bada59812e00..314b679fb494 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -321,7 +321,17 @@ struct bpf_func_state {
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
 	struct bpf_reference_state *refs;
+	/* The state of the stack. Each element of the array describes BPF_REG_SIZE
+	 * (i.e. 8) bytes worth of stack memory.
+	 * stack[0] represents bytes [*(r10-8)..*(r10-1)]
+	 * stack[1] represents bytes [*(r10-16)..*(r10-9)]
+	 * ...
+	 * stack[allocated_stack/8 - 1] represents [*(r10-allocated_stack)..*(r10-allocated_stack+7)]
+	 */
 	struct bpf_stack_state *stack;
+	/* Size of the current stack, in bytes. The stack state is tracked below, in
+	 * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE.
+	 */
 	int allocated_stack;
 };
 
@@ -658,6 +668,10 @@ struct bpf_verifier_env {
 	int exception_callback_subprog;
 	bool explore_alu_limits;
 	bool allow_ptr_leaks;
+	/* Allow access to uninitialized stack memory. Writes with fixed offset are
+	 * always allowed, so this refers to reads (with fixed or variable offset),
+	 * to writes with variable offset and to indirect (helper) accesses.
+	 */
 	bool allow_uninit_stack;
 	bool bpf_capable;
 	bool bypass_spec_v1;
-- 
cgit v1.2.3


From 5b20755b7780464fea3e54af0af744258dcc2841 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sun, 26 Nov 2023 16:19:14 +0900
Subject: init: move THIS_MODULE from <linux/export.h> to <linux/init.h>

Commit f50169324df4 ("module.h: split out the EXPORT_SYMBOL into
export.h") appropriately separated EXPORT_SYMBOL into <linux/export.h>
because modules and EXPORT_SYMBOL are orthogonal; modules are symbol
consumers, while EXPORT_SYMBOL are used by symbol providers, which
may not be necessarily a module.

However, that commit also relocated THIS_MODULE. As explained in the
commit description, the intention was to define THIS_MODULE in a
lightweight header, but I do not believe <linux/export.h> was the
best location because EXPORT_SYMBOL and THIS_MODULE are unrelated.

Move it to another lightweight header, <linux/init.h>. The reason for
choosing <linux/init.h> is to make <linux/moduleparam.h> self-contained
without relying on <linux/linkage.h> incorrectly including
<linux/export.h>.

With this adjustment, the role of <linux/export.h> becomes clearer as
it only defines EXPORT_SYMBOL.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/export.h | 18 ------------------
 include/linux/init.h   |  7 +++++++
 2 files changed, 7 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/export.h b/include/linux/export.h
index 9911508a9604..0bbd02fd351d 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -6,15 +6,6 @@
 #include <linux/linkage.h>
 #include <linux/stringify.h>
 
-/*
- * Export symbols from the kernel to modules.  Forked from module.h
- * to reduce the amount of pointless cruft we feed to gcc when only
- * exporting a simple symbol or two.
- *
- * Try not to add #includes here.  It slows compilation and makes kernel
- * hackers place grumpy comments in header files.
- */
-
 /*
  * This comment block is used by fixdep. Please do not remove.
  *
@@ -23,15 +14,6 @@
  * side effect of the *.o build rule.
  */
 
-#ifndef __ASSEMBLY__
-#ifdef MODULE
-extern struct module __this_module;
-#define THIS_MODULE (&__this_module)
-#else
-#define THIS_MODULE ((struct module *)0)
-#endif
-#endif /* __ASSEMBLY__ */
-
 #ifdef CONFIG_64BIT
 #define __EXPORT_SYMBOL_REF(sym)			\
 	.balign 8				ASM_NL	\
diff --git a/include/linux/init.h b/include/linux/init.h
index 01b52c9c7526..3fa3f6241350 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -179,6 +179,13 @@ extern void (*late_time_init)(void);
 
 extern bool initcall_debug;
 
+#ifdef MODULE
+extern struct module __this_module;
+#define THIS_MODULE (&__this_module)
+#else
+#define THIS_MODULE ((struct module *)0)
+#endif
+
 #endif
   
 #ifndef MODULE
-- 
cgit v1.2.3


From b73f08bb7fe5a0901646ca5ceaa1e7a2d5ee6293 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Thu, 7 Dec 2023 13:39:27 +0100
Subject: iio: adc: ad9467: fix scale setting

When reading in_voltage_scale we can get something like:

root@analog:/sys/bus/iio/devices/iio:device2# cat in_voltage_scale
0.038146

However, when reading the available options:

root@analog:/sys/bus/iio/devices/iio:device2# cat
in_voltage_scale_available
2000.000000 2100.000006 2200.000007 2300.000008 2400.000009 2500.000010

which does not make sense. Moreover, when trying to set a new scale we
get an error because there's no call to __ad9467_get_scale() to give us
values as given when reading in_voltage_scale. Fix it by computing the
available scales during probe and properly pass the list when
.read_available() is called.

While at it, change to use .read_available() from iio_info. Also note
that to properly fix this, adi-axi-adc.c has to be changed accordingly.

Fixes: ad6797120238 ("iio: adc: ad9467: add support AD9467 ADC")
Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Reviewed-by: David Lechner <dlechner@baylibre.com>
Link: https://lore.kernel.org/r/20231207-iio-backend-prep-v2-4-a4a33bc4d70e@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/adc/adi-axi-adc.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/adc/adi-axi-adc.h b/include/linux/iio/adc/adi-axi-adc.h
index 52620e5b8052..b7904992d561 100644
--- a/include/linux/iio/adc/adi-axi-adc.h
+++ b/include/linux/iio/adc/adi-axi-adc.h
@@ -41,6 +41,7 @@ struct adi_axi_adc_chip_info {
  * @reg_access		IIO debugfs_reg_access hook for the client ADC
  * @read_raw		IIO read_raw hook for the client ADC
  * @write_raw		IIO write_raw hook for the client ADC
+ * @read_avail		IIO read_avail hook for the client ADC
  */
 struct adi_axi_adc_conv {
 	const struct adi_axi_adc_chip_info		*chip_info;
@@ -54,6 +55,9 @@ struct adi_axi_adc_conv {
 	int (*write_raw)(struct adi_axi_adc_conv *conv,
 			 struct iio_chan_spec const *chan,
 			 int val, int val2, long mask);
+	int (*read_avail)(struct adi_axi_adc_conv *conv,
+			  struct iio_chan_spec const *chan,
+			  const int **val, int *type, int *length, long mask);
 };
 
 struct adi_axi_adc_conv *devm_adi_axi_adc_conv_register(struct device *dev,
-- 
cgit v1.2.3


From 23e9f0138963ceef2a252d887534923a0502b2da Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Fri, 3 Nov 2023 11:14:50 +0800
Subject: mm/vmstat: move pgdemote_* to per-node stats

Demotion will migrate pages across nodes.  Previously, only the global
demotion statistics were accounted for.  Changed them to per-node
statistics, making it easier to observe where demotion occurs on each
node.

This will help to identify which nodes are under pressure.

This patch also make pgdemote_* behind CONFIG_NUMA_BALANCING, since
demotion is not available for !CONFIG_NUMA_BALANCING

With this patch, here is a sample where node0 node1 are DRAM,
node3 is PMEM:
Global stats:
$ grep demote /proc/vmstat
pgdemote_kswapd 254288
pgdemote_direct 113497
pgdemote_khugepaged 0

Per-node stats:
$ grep demote /sys/devices/system/node/node0/vmstat # demotion source
pgdemote_kswapd 68454
pgdemote_direct 83431
pgdemote_khugepaged 0
$ grep demote /sys/devices/system/node/node1/vmstat # demotion source
pgdemote_kswapd 185834
pgdemote_direct 30066
pgdemote_khugepaged 0
$ grep demote /sys/devices/system/node/node3/vmstat # demotion target
pgdemote_kswapd 0
pgdemote_direct 0
pgdemote_khugepaged 0

Link: https://lkml.kernel.org/r/20231103031450.1456523-1-lizhijian@fujitsu.com
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Acked-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h        | 4 ++++
 include/linux/vm_event_item.h | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3c25226beeed..14faffa4354f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -206,6 +206,10 @@ enum node_stat_item {
 #ifdef CONFIG_NUMA_BALANCING
 	PGPROMOTE_SUCCESS,	/* promote successfully */
 	PGPROMOTE_CANDIDATE,	/* candidate pages to promote */
+	/* PGDEMOTE_*: pages demoted */
+	PGDEMOTE_KSWAPD,
+	PGDEMOTE_DIRECT,
+	PGDEMOTE_KHUGEPAGED,
 #endif
 	NR_VM_NODE_STAT_ITEMS
 };
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 8abfa1240040..d1b847502f09 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -41,9 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PGSTEAL_KSWAPD,
 		PGSTEAL_DIRECT,
 		PGSTEAL_KHUGEPAGED,
-		PGDEMOTE_KSWAPD,
-		PGDEMOTE_DIRECT,
-		PGDEMOTE_KHUGEPAGED,
 		PGSCAN_KSWAPD,
 		PGSCAN_DIRECT,
 		PGSCAN_KHUGEPAGED,
-- 
cgit v1.2.3


From b2472efe4316b2687c153919c1513a098bd82c17 Mon Sep 17 00:00:00 2001
From: Peng Zhang <zhangpeng.00@bytedance.com>
Date: Fri, 27 Oct 2023 11:38:37 +0800
Subject: maple_tree: introduce {mtree,mas}_lock_nested()

In some cases, nested locks may be needed, so {mtree,mas}_lock_nested is
introduced.  For example, when duplicating maple tree, we need to hold the
locks of two trees, in which case nested locks are needed.

At the same time, add the definition of spin_lock_nested() in tools for
testing.

Link: https://lkml.kernel.org/r/20231027033845.90608-3-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Mike Christie <michael.christie@oracle.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index d01e850b570f..f91dbc7fe091 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -256,6 +256,8 @@ struct maple_tree {
 	struct maple_tree name = MTREE_INIT(name, 0)
 
 #define mtree_lock(mt)		spin_lock((&(mt)->ma_lock))
+#define mtree_lock_nested(mas, subclass) \
+		spin_lock_nested((&(mt)->ma_lock), subclass)
 #define mtree_unlock(mt)	spin_unlock((&(mt)->ma_lock))
 
 /*
@@ -406,6 +408,8 @@ struct ma_wr_state {
 };
 
 #define mas_lock(mas)           spin_lock(&((mas)->tree->ma_lock))
+#define mas_lock_nested(mas, subclass) \
+		spin_lock_nested(&((mas)->tree->ma_lock), subclass)
 #define mas_unlock(mas)         spin_unlock(&((mas)->tree->ma_lock))
 
 
-- 
cgit v1.2.3


From fd32e4e9b7646510ee9010e0d5f8b8857d48a6f7 Mon Sep 17 00:00:00 2001
From: Peng Zhang <zhangpeng.00@bytedance.com>
Date: Fri, 27 Oct 2023 11:38:38 +0800
Subject: maple_tree: introduce interfaces __mt_dup() and mtree_dup()

Introduce interfaces __mt_dup() and mtree_dup(), which are used to
duplicate a maple tree.  They duplicate a maple tree in Depth-First Search
(DFS) pre-order traversal.  It uses memcopy() to copy nodes in the source
tree and allocate new child nodes in non-leaf nodes.  The new node is
exactly the same as the source node except for all the addresses stored in
it.  It will be faster than traversing all elements in the source tree and
inserting them one by one into the new tree.  The time complexity of these
two functions is O(n).

The difference between __mt_dup() and mtree_dup() is that mtree_dup()
handles locks internally.

Analysis of the average time complexity of this algorithm:

For simplicity, let's assume that the maximum branching factor of all
non-leaf nodes is 16 (in allocation mode, it is 10), and the tree is a
full tree.

Under the given conditions, if there is a maple tree with n elements, the
number of its leaves is n/16.  From bottom to top, the number of nodes in
each level is 1/16 of the number of nodes in the level below.  So the
total number of nodes in the entire tree is given by the sum of n/16 +
n/16^2 + n/16^3 + ...  + 1.  This is a geometric series, and it has log(n)
terms with base 16.  According to the formula for the sum of a geometric
series, the sum of this series can be calculated as (n-1)/15.  Each node
has only one parent node pointer, which can be considered as an edge.  In
total, there are (n-1)/15-1 edges.

This algorithm consists of two operations:

1. Traversing all nodes in DFS order.
2. For each node, making a copy and performing necessary modifications
   to create a new node.

For the first part, DFS traversal will visit each edge twice.  Let
T(ascend) represent the cost of taking one step downwards, and T(descend)
represent the cost of taking one step upwards.  And both of them are
constants (although mas_ascend() may not be, as it contains a loop, but
here we ignore it and treat it as a constant).  So the time spent on the
first part can be represented as ((n-1)/15-1) * (T(ascend) + T(descend)).

For the second part, each node will be copied, and the cost of copying a
node is denoted as T(copy_node).  For each non-leaf node, it is necessary
to reallocate all child nodes, and the cost of this operation is denoted
as T(dup_alloc).  The behavior behind memory allocation is complex and not
specific to the maple tree operation.  Here, we assume that the time
required for a single allocation is constant.  Since the size of a node is
fixed, both of these symbols are also constants.  We can calculate that
the time spent on the second part is ((n-1)/15) * T(copy_node) + ((n-1)/15
- n/16) * T(dup_alloc).

Adding both parts together, the total time spent by the algorithm can be
represented as:

((n-1)/15) * (T(ascend) + T(descend) + T(copy_node) + T(dup_alloc)) -
n/16 * T(dup_alloc) - (T(ascend) + T(descend))

Let C1 = T(ascend) + T(descend) + T(copy_node) + T(dup_alloc)
Let C2 = T(dup_alloc)
Let C3 = T(ascend) + T(descend)

Finally, the expression can be simplified as:
((16 * C1 - 15 * C2) / (15 * 16)) * n - (C1 / 15 + C3).

This is a linear function, so the average time complexity is O(n).

Link: https://lkml.kernel.org/r/20231027033845.90608-4-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
Suggested-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Mike Christie <michael.christie@oracle.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index f91dbc7fe091..a452dd8a1e5c 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -329,6 +329,9 @@ int mtree_store(struct maple_tree *mt, unsigned long index,
 		void *entry, gfp_t gfp);
 void *mtree_erase(struct maple_tree *mt, unsigned long index);
 
+int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
+int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
+
 void mtree_destroy(struct maple_tree *mt);
 void __mt_destroy(struct maple_tree *mt);
 
-- 
cgit v1.2.3


From d2406291483775ecddaee929231a39c70c08fda2 Mon Sep 17 00:00:00 2001
From: Peng Zhang <zhangpeng.00@bytedance.com>
Date: Fri, 27 Oct 2023 11:38:45 +0800
Subject: fork: use __mt_dup() to duplicate maple tree in dup_mmap()

In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then
directly replacing the entries of VMAs in the new maple tree can result in
better performance.  __mt_dup() uses DFS pre-order to duplicate the maple
tree, so it is efficient.

The average time complexity of __mt_dup() is O(n), where n is the number
of VMAs.  The proof of the time complexity is provided in the commit log
that introduces __mt_dup().  After duplicating the maple tree, each
element is traversed and replaced (ignoring the cases of deletion, which
are rare).  Since it is only a replacement operation for each element,
this process is also O(n).

Analyzing the exact time complexity of the previous algorithm is
challenging because each insertion can involve appending to a node,
pushing data to adjacent nodes, or even splitting nodes.  The frequency of
each action is difficult to calculate.  The worst-case scenario for a
single insertion is when the tree undergoes splitting at every level.  If
we consider each insertion as the worst-case scenario, we can determine
that the upper bound of the time complexity is O(n*log(n)), although this
is a loose upper bound.  However, based on the test data, it appears that
the actual time complexity is likely to be O(n).

As the entire maple tree is duplicated using __mt_dup(), if dup_mmap()
fails, there will be a portion of VMAs that have not been duplicated in
the maple tree.  To handle this, we mark the failure point with
XA_ZERO_ENTRY.  In exit_mmap(), if this marker is encountered, stop
releasing VMAs that have not been duplicated after this point.

There is a "spawn" in byte-unixbench[1], which can be used to test the
performance of fork().  I modified it slightly to make it work with
different number of VMAs.

Below are the test results.  The first row shows the number of VMAs.  The
second and third rows show the number of fork() calls per ten seconds,
corresponding to next-20231006 and the this patchset, respectively.  The
test results were obtained with CPU binding to avoid scheduler load
balancing that could cause unstable results.  There are still some
fluctuations in the test results, but at least they are better than the
original performance.

21     121   221    421    821    1621   3221   6421   12821  25621  51221
112100 76261 54227  34035  20195  11112  6017   3161   1606   802    393
114558 83067 65008  45824  28751  16072  8922   4747   2436   1233   599
2.19%  8.92% 19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42%

[1] https://github.com/kdlucas/byte-unixbench/tree/master

Link: https://lkml.kernel.org/r/20231027033845.90608-11-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
Suggested-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Mike Christie <michael.christie@oracle.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 418d26608ece..64cd1ee4aacc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -994,6 +994,17 @@ static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
 	return mas_expected_entries(&vmi->mas, count);
 }
 
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+			unsigned long start, unsigned long end, gfp_t gfp)
+{
+	__mas_set_range(&vmi->mas, start, end - 1);
+	mas_store_gfp(&vmi->mas, NULL, gfp);
+	if (unlikely(mas_is_err(&vmi->mas)))
+		return -ENOMEM;
+
+	return 0;
+}
+
 /* Free any unused preallocations */
 static inline void vma_iter_free(struct vma_iterator *vmi)
 {
-- 
cgit v1.2.3


From ff6c3d81f2e86b63a3a530683f89ef393882782a Mon Sep 17 00:00:00 2001
From: Liam Ni <zhiguangni01@gmail.com>
Date: Thu, 26 Oct 2023 10:03:29 +0800
Subject: NUMA: optimize detection of memory with no node id assigned by
 firmware

Sanity check that makes sure the nodes cover all memory loops over
numa_meminfo to count the pages that have node id assigned by the
firmware, then loops again over memblock.memory to find the total amount
of memory and in the end checks that the difference between the total
memory and memory that covered by nodes is less than some threshold.
Worse, the loop over numa_meminfo calls __absent_pages_in_range() that
also partially traverses memblock.memory.

It's much simpler and more efficient to have a single traversal of
memblock.memory that verifies that amount of memory not covered by nodes
is less than a threshold.

Introduce memblock_validate_numa_coverage() that does exactly that and use
it instead of numa_meminfo_cover_memory().

Link: https://lkml.kernel.org/r/20231026020329.327329-1-zhiguangni01@gmail.com
Signed-off-by: Liam Ni <zhiguangni01@gmail.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Bibo Mao <maobibo@loongson.cn>
Cc: Binbin Zhou <zhoubinbin@loongson.cn>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Feiyang Chen <chenfeiyang@loongson.cn>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: WANG Xuerui <kernel@xen0n.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memblock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ae3bde302f70..b695f9e946da 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -123,6 +123,7 @@ int memblock_physmem_add(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
 bool memblock_overlaps_region(struct memblock_type *type,
 			      phys_addr_t base, phys_addr_t size);
+bool memblock_validate_numa_coverage(unsigned long threshold_bytes);
 int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
-- 
cgit v1.2.3


From a4fc4a0c45f2617c3aa8b693739de264e0c09909 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 7 Nov 2023 21:26:40 +0000
Subject: mm: add folio_zero_tail() and use it in ext4

Patch series "Add folio_zero_tail() and folio_fill_tail()".

I'm trying to make it easier for filesystems with tailpacking / stuffing /
inline data to use folios.  The primary function here is
folio_fill_tail().  You give it a pointer to memory where the data
currently is, and it takes care of copying it into the folio at that
offset.  That works for gfs2 & iomap.  Then There's Ext4.  Rather than gin
up some kind of specialist "Here's a two pointers to two blocks of memory"
routine, just let it do its current thing, and let it call
folio_zero_tail(), which is also called by folio_fill_tail().

Other filesystems can be converted later; these ones seemed like good
examples as they're already partly or completely converted to folios.


This patch (of 3):

Instead of unmapping the folio after copying the data to it, then mapping
it again to zero the tail, provide folio_zero_tail() to zero the tail of
an already-mapped folio.

[akpm@linux-foundation.org: fix kerneldoc argument ordering]
Link: https://lkml.kernel.org/r/20231107212643.3490372-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231107212643.3490372-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index be20cff4ba73..5ebd5e4dfbf8 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -483,6 +483,44 @@ static inline void memcpy_to_folio(struct folio *folio, size_t offset,
 	flush_dcache_folio(folio);
 }
 
+/**
+ * folio_zero_tail - Zero the tail of a folio.
+ * @folio: The folio to zero.
+ * @offset: The byte offset in the folio to start zeroing at.
+ * @kaddr: The address the folio is currently mapped to.
+ *
+ * If you have already used kmap_local_folio() to map a folio, written
+ * some data to it and now need to zero the end of the folio (and flush
+ * the dcache), you can use this function.  If you do not have the
+ * folio kmapped (eg the folio has been partially populated by DMA),
+ * use folio_zero_range() or folio_zero_segment() instead.
+ *
+ * Return: An address which can be passed to kunmap_local().
+ */
+static inline __must_check void *folio_zero_tail(struct folio *folio,
+		size_t offset, void *kaddr)
+{
+	size_t len = folio_size(folio) - offset;
+
+	if (folio_test_highmem(folio)) {
+		size_t max = PAGE_SIZE - offset_in_page(offset);
+
+		while (len > max) {
+			memset(kaddr, 0, max);
+			kunmap_local(kaddr);
+			len -= max;
+			offset += max;
+			max = PAGE_SIZE;
+			kaddr = kmap_local_folio(folio, offset);
+		}
+	}
+
+	memset(kaddr, 0, len);
+	flush_dcache_folio(folio);
+
+	return kaddr;
+}
+
 /**
  * memcpy_from_file_folio - Copy some bytes from a file folio.
  * @to: The destination buffer.
-- 
cgit v1.2.3


From 6eaa266b54660f6b3654ad8902b4f7027054f55a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 7 Nov 2023 21:26:41 +0000
Subject: mm: add folio_fill_tail() and use it in iomap

The iomap code was limited to PAGE_SIZE bytes; generalise it to cover
an arbitrary-sized folio, and move it to be a common helper.

[akpm@linux-foundation.org: fix folio_fill_tail(), per Andreas Gruenbacher]
Link: https://lkml.kernel.org/r/20231107212643.3490372-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 5ebd5e4dfbf8..451c1dff0e87 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -521,6 +521,44 @@ static inline __must_check void *folio_zero_tail(struct folio *folio,
 	return kaddr;
 }
 
+/**
+ * folio_fill_tail - Copy some data to a folio and pad with zeroes.
+ * @folio: The destination folio.
+ * @offset: The offset into @folio at which to start copying.
+ * @from: The data to copy.
+ * @len: How many bytes of data to copy.
+ *
+ * This function is most useful for filesystems which support inline data.
+ * When they want to copy data from the inode into the page cache, this
+ * function does everything for them.  It supports large folios even on
+ * HIGHMEM configurations.
+ */
+static inline void folio_fill_tail(struct folio *folio, size_t offset,
+		const char *from, size_t len)
+{
+	char *to = kmap_local_folio(folio, offset);
+
+	VM_BUG_ON(offset + len > folio_size(folio));
+
+	if (folio_test_highmem(folio)) {
+		size_t max = PAGE_SIZE - offset_in_page(offset);
+
+		while (len > max) {
+			memcpy(to, from, max);
+			kunmap_local(to);
+			len -= max;
+			from += max;
+			offset += max;
+			max = PAGE_SIZE;
+			to = kmap_local_folio(folio, offset);
+		}
+	}
+
+	memcpy(to, from, len);
+	to = folio_zero_tail(folio, offset + len, to + len);
+	kunmap_local(to);
+}
+
 /**
  * memcpy_from_file_folio - Copy some bytes from a file folio.
  * @to: The destination buffer.
-- 
cgit v1.2.3


From c36f9d3d2c3e17f9eef1d2f47a63c91d51d55e87 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 8 Nov 2023 20:46:02 +0000
Subject: mm: remove test_set_page_writeback()

Patch series "Make folio_start_writeback return void".

Most of the folio flag-setting functions return void.
folio_start_writeback is gratuitously different; the only two filesystems
that do anything with the return value emit debug messages if it's already
set, and we can (and should) do that internally without bothering the
filesystem to do it.


This patch (of 4):

There are no more callers of this wrapper.

Link: https://lkml.kernel.org/r/20231108204605.745109-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231108204605.745109-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Steve French <sfrench@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a88e64acebfe..a440062e9386 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -780,11 +780,6 @@ bool set_page_writeback(struct page *page);
 #define folio_start_writeback_keepwrite(folio)	\
 	__folio_start_writeback(folio, true)
 
-static inline bool test_set_page_writeback(struct page *page)
-{
-	return set_page_writeback(page);
-}
-
 static __always_inline bool folio_test_head(struct folio *folio)
 {
 	return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY));
-- 
cgit v1.2.3


From b5612c368648a7be52411b288d09593e5945d1aa Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 8 Nov 2023 20:46:05 +0000
Subject: mm: return void from folio_start_writeback() and related functions

Nobody now checks the return value from any of these functions, so
add an assertion at the beginning of the function and return void.

Link: https://lkml.kernel.org/r/20231108204605.745109-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Steve French <sfrench@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a440062e9386..735cddc13d20 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -772,8 +772,8 @@ static __always_inline void SetPageUptodate(struct page *page)
 
 CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
 
-bool __folio_start_writeback(struct folio *folio, bool keep_write);
-bool set_page_writeback(struct page *page);
+void __folio_start_writeback(struct folio *folio, bool keep_write);
+void set_page_writeback(struct page *page);
 
 #define folio_start_writeback(folio)			\
 	__folio_start_writeback(folio, false)
-- 
cgit v1.2.3


From 16f5dfbc851b55b87101a20e181d4a14be3007d6 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 9 Nov 2023 21:15:07 +0000
Subject: gfp: include __GFP_NOWARN in GFP_NOWAIT

GFP_NOWAIT callers are always prepared for their allocations to fail
because they fail so frequently.  Forcing the callers to remember to add
__GFP_NOWARN is just annoying and leads to an endless stream of patches
for the places where we forgot to add it.

We can now remove __GFP_NOWARN from all the callers which specify
GFP_NOWAIT, but I'd rather wait a cycle and send patches to each
maintainer instead of creating a big pile of merge conflicts.

Link: https://lkml.kernel.org/r/20231109211507.2262419-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp_types.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 6583a58670c5..ae994534a12a 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -274,7 +274,8 @@ typedef unsigned int __bitwise gfp_t;
  * accounted to kmemcg.
  *
  * %GFP_NOWAIT is for kernel allocations that should not stall for direct
- * reclaim, start physical IO or use any filesystem callback.
+ * reclaim, start physical IO or use any filesystem callback.  It is very
+ * likely to fail to allocate memory, even for very small allocations.
  *
  * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
  * that do not require the starting of any physical IO.
@@ -325,7 +326,7 @@ typedef unsigned int __bitwise gfp_t;
 #define GFP_ATOMIC	(__GFP_HIGH|__GFP_KSWAPD_RECLAIM)
 #define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
 #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
-#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)
+#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM | __GFP_NOWARN)
 #define GFP_NOIO	(__GFP_RECLAIM)
 #define GFP_NOFS	(__GFP_RECLAIM | __GFP_IO)
 #define GFP_USER	(__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
-- 
cgit v1.2.3


From af7628d6ec196999175ecb3fdb38336489b0f88a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Nov 2023 16:14:47 +0000
Subject: fs: convert error_remove_page to error_remove_folio

There were already assertions that we were not passing a tail page to
error_remove_page(), so make the compiler enforce that by converting
everything to pass and use a folio.

Link: https://lkml.kernel.org/r/20231117161447.2461643-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h | 2 +-
 include/linux/mm.h | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..31b2cf963db9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -434,7 +434,7 @@ struct address_space_operations {
 	bool (*is_partially_uptodate) (struct folio *, size_t from,
 			size_t count);
 	void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb);
-	int (*error_remove_page)(struct address_space *, struct page *);
+	int (*error_remove_folio)(struct address_space *, struct folio *);
 
 	/* swapfile support */
 	int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 64cd1ee4aacc..13a090271716 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2384,7 +2384,8 @@ extern void truncate_pagecache(struct inode *inode, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
-int generic_error_remove_page(struct address_space *mapping, struct page *page);
+int generic_error_remove_folio(struct address_space *mapping,
+		struct folio *folio);
 
 struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
 		unsigned long address, struct pt_regs *regs);
-- 
cgit v1.2.3


From 022012dcf44209074af97b6ae531a10c08736b31 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Mon, 20 Nov 2023 18:47:13 +0100
Subject: lib/stackdepot, kasan: add flags to __stack_depot_save and rename

Change the bool can_alloc argument of __stack_depot_save to a u32
  argument that accepts a set of flags.

The following patch will add another flag to stack_depot_save_flags
  besides the existing STACK_DEPOT_FLAG_CAN_ALLOC.

Also rename the function to stack_depot_save_flags, as
  __stack_depot_save is a cryptic name,

Link: https://lkml.kernel.org/r/645fa15239621eebbd3a10331e5864b718839512.1700502145.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/stackdepot.h | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index e58306783d8e..0b262e14144e 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -32,6 +32,17 @@ typedef u32 depot_stack_handle_t;
  */
 #define STACK_DEPOT_EXTRA_BITS 5
 
+typedef u32 depot_flags_t;
+
+/*
+ * Flags that can be passed to stack_depot_save_flags(); see the comment next
+ * to its declaration for more details.
+ */
+#define STACK_DEPOT_FLAG_CAN_ALLOC	((depot_flags_t)0x0001)
+
+#define STACK_DEPOT_FLAGS_NUM	1
+#define STACK_DEPOT_FLAGS_MASK	((depot_flags_t)((1 << STACK_DEPOT_FLAGS_NUM) - 1))
+
 /*
  * Using stack depot requires its initialization, which can be done in 3 ways:
  *
@@ -69,31 +80,34 @@ static inline int stack_depot_early_init(void)	{ return 0; }
 #endif
 
 /**
- * __stack_depot_save - Save a stack trace to stack depot
+ * stack_depot_save_flags - Save a stack trace to stack depot
  *
  * @entries:		Pointer to the stack trace
  * @nr_entries:		Number of frames in the stack
  * @alloc_flags:	Allocation GFP flags
- * @can_alloc:		Allocate stack pools (increased chance of failure if false)
+ * @depot_flags:	Stack depot flags
+ *
+ * Saves a stack trace from @entries array of size @nr_entries.
  *
- * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is
- * %true, stack depot can replenish the stack pools in case no space is left
- * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids
- * any allocations and fails if no space is left to store the stack trace.
+ * If STACK_DEPOT_FLAG_CAN_ALLOC is set in @depot_flags, stack depot can
+ * replenish the stack pools in case no space is left (allocates using GFP
+ * flags of @alloc_flags). Otherwise, stack depot avoids any allocations and
+ * fails if no space is left to store the stack trace.
  *
  * If the provided stack trace comes from the interrupt context, only the part
  * up to the interrupt entry is saved.
  *
- * Context: Any context, but setting @can_alloc to %false is required if
+ * Context: Any context, but setting STACK_DEPOT_FLAG_CAN_ALLOC is required if
  *          alloc_pages() cannot be used from the current context. Currently
  *          this is the case for contexts where neither %GFP_ATOMIC nor
  *          %GFP_NOWAIT can be used (NMI, raw_spin_lock).
  *
  * Return: Handle of the stack struct stored in depot, 0 on failure
  */
-depot_stack_handle_t __stack_depot_save(unsigned long *entries,
-					unsigned int nr_entries,
-					gfp_t gfp_flags, bool can_alloc);
+depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
+					    unsigned int nr_entries,
+					    gfp_t gfp_flags,
+					    depot_flags_t depot_flags);
 
 /**
  * stack_depot_save - Save a stack trace to stack depot
@@ -103,7 +117,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries,
  * @alloc_flags:	Allocation GFP flags
  *
  * Context: Contexts where allocations via alloc_pages() are allowed.
- *          See __stack_depot_save() for more details.
+ *          See stack_depot_save_flags() for more details.
  *
  * Return: Handle of the stack trace stored in depot, 0 on failure
  */
-- 
cgit v1.2.3


From 410b764f89f59cce858d94fc781b68c1f27a0ca9 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Mon, 20 Nov 2023 18:47:14 +0100
Subject: lib/stackdepot: add refcount for records

Add a reference counter for how many times a stack records has been
  added to stack depot.

Add a new STACK_DEPOT_FLAG_GET flag to stack_depot_save_flags that
  instructs the stack depot to increment the refcount.

Do not yet decrement the refcount; this is implemented in one of the
  following patches.

Do not yet enable any users to use the flag to avoid overflowing the
  refcount.

This is preparatory patch for implementing the eviction of stack records
  from the stack depot.

Link: https://lkml.kernel.org/r/a3fc14a2359d019d2a008d4ff8b46a665371ffee.1700502145.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/stackdepot.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 0b262e14144e..611716702d73 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -39,8 +39,9 @@ typedef u32 depot_flags_t;
  * to its declaration for more details.
  */
 #define STACK_DEPOT_FLAG_CAN_ALLOC	((depot_flags_t)0x0001)
+#define STACK_DEPOT_FLAG_GET		((depot_flags_t)0x0002)
 
-#define STACK_DEPOT_FLAGS_NUM	1
+#define STACK_DEPOT_FLAGS_NUM	2
 #define STACK_DEPOT_FLAGS_MASK	((depot_flags_t)((1 << STACK_DEPOT_FLAGS_NUM) - 1))
 
 /*
@@ -94,6 +95,9 @@ static inline int stack_depot_early_init(void)	{ return 0; }
  * flags of @alloc_flags). Otherwise, stack depot avoids any allocations and
  * fails if no space is left to store the stack trace.
  *
+ * If STACK_DEPOT_FLAG_GET is set in @depot_flags, stack depot will increment
+ * the refcount on the saved stack trace if it already exists in stack depot.
+ *
  * If the provided stack trace comes from the interrupt context, only the part
  * up to the interrupt entry is saved.
  *
@@ -116,8 +120,11 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
  * @nr_entries:		Number of frames in the stack
  * @alloc_flags:	Allocation GFP flags
  *
- * Context: Contexts where allocations via alloc_pages() are allowed.
- *          See stack_depot_save_flags() for more details.
+ * Does not increment the refcount on the saved stack trace; see
+ * stack_depot_save_flags() for more details.
+ *
+ * Context: Contexts where allocations via alloc_pages() are allowed;
+ *          see stack_depot_save_flags() for more details.
  *
  * Return: Handle of the stack trace stored in depot, 0 on failure
  */
-- 
cgit v1.2.3


From 108be8def46e9422f5a5abc96b0ab8fb6b3fb344 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Mon, 20 Nov 2023 18:47:15 +0100
Subject: lib/stackdepot: allow users to evict stack traces

Add stack_depot_put, a function that decrements the reference counter on a
stack record and removes it from the stack depot once the counter reaches
0.

Internally, when removing a stack record, the function unlinks it from the
hash table bucket and returns to the freelist.

With this change, the users of stack depot can call stack_depot_put when
keeping a stack trace in the stack depot is not needed anymore.  This
allows avoiding polluting the stack depot with irrelevant stack traces and
thus have more space to store the relevant ones before the stack depot
reaches its capacity.

Link: https://lkml.kernel.org/r/1d1ad5692ee43d4fc2b3fd9d221331d30b36123f.1700502145.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/stackdepot.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 611716702d73..a6796f178913 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -97,6 +97,8 @@ static inline int stack_depot_early_init(void)	{ return 0; }
  *
  * If STACK_DEPOT_FLAG_GET is set in @depot_flags, stack depot will increment
  * the refcount on the saved stack trace if it already exists in stack depot.
+ * Users of this flag must also call stack_depot_put() when keeping the stack
+ * trace is no longer required to avoid overflowing the refcount.
  *
  * If the provided stack trace comes from the interrupt context, only the part
  * up to the interrupt entry is saved.
@@ -162,6 +164,18 @@ void stack_depot_print(depot_stack_handle_t stack);
 int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
 		       int spaces);
 
+/**
+ * stack_depot_put - Drop a reference to a stack trace from stack depot
+ *
+ * @handle:	Stack depot handle returned from stack_depot_save()
+ *
+ * The stack trace is evicted from stack depot once all references to it have
+ * been dropped (once the number of stack_depot_evict() calls matches the
+ * number of stack_depot_save_flags() calls with STACK_DEPOT_FLAG_GET set for
+ * this stack trace).
+ */
+void stack_depot_put(depot_stack_handle_t handle);
+
 /**
  * stack_depot_set_extra_bits - Set extra bits in a stack depot handle
  *
-- 
cgit v1.2.3


From 95a2ac937013cc3aaaea02abcdd167b96874548d Mon Sep 17 00:00:00 2001
From: Sumanth Korikkar <sumanthk@linux.ibm.com>
Date: Mon, 20 Nov 2023 15:53:54 +0100
Subject: mm: use vmem_altmap code without CONFIG_ZONE_DEVICE

vmem_altmap_free() and vmem_altmap_offset() could be utlized without
CONFIG_ZONE_DEVICE enabled.  For example,
mm/memory_hotplug.c:__add_pages() relies on that.  The altmap is no longer
restricted to ZONE_DEVICE handling, but instead depends on
CONFIG_SPARSEMEM_VMEMMAP.

When CONFIG_SPARSEMEM_VMEMMAP is disabled, these functions are defined as
inline stubs, ensuring compatibility with configurations that do not use
sparsemem vmemmap.  Without it, lkp reported the following:

ld: arch/x86/mm/init_64.o: in function `remove_pagetable':
init_64.c:(.meminit.text+0xfc7): undefined reference to
`vmem_altmap_free'

Link: https://lkml.kernel.org/r/20231120145354.308999-4-sumanthk@linux.ibm.com
Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202311180545.VeyRXEDq-lkp@intel.com/
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memremap.h | 12 ------------
 include/linux/mm.h       | 26 ++++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 1314d9c5f05b..744c830f4b13 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -196,8 +196,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 		struct dev_pagemap *pgmap);
 bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
 
-unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
-void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
 unsigned long memremap_compat_align(void);
 #else
 static inline void *devm_memremap_pages(struct device *dev,
@@ -228,16 +226,6 @@ static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
 	return false;
 }
 
-static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
-{
-	return 0;
-}
-
-static inline void vmem_altmap_free(struct vmem_altmap *altmap,
-		unsigned long nr_pfns)
-{
-}
-
 /* when memremap_pages() is disabled all archs can remap a single page */
 static inline unsigned long memremap_compat_align(void)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 13a090271716..a422cc123a2d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3871,6 +3871,32 @@ void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap);
 #endif
 
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+	/* number of pfns from base where pfn_to_page() is valid */
+	if (altmap)
+		return altmap->reserve + altmap->free;
+	return 0;
+}
+
+static inline void vmem_altmap_free(struct vmem_altmap *altmap,
+				    unsigned long nr_pfns)
+{
+	altmap->alloc -= nr_pfns;
+}
+#else
+static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+	return 0;
+}
+
+static inline void vmem_altmap_free(struct vmem_altmap *altmap,
+				    unsigned long nr_pfns)
+{
+}
+#endif
+
 #define VMEMMAP_RESERVE_NR	2
 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
 static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
-- 
cgit v1.2.3


From 38ca8a185389716e9f7566bce4bb0085f71da61d Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@linux.ibm.com>
Date: Tue, 21 Nov 2023 20:43:49 +0100
Subject: pgtable: fix s390 ptdesc field comments

Patch series "minor ptdesc updates", v3.


This patch (of 2):

Since commit d08d4e7cd6bf ("s390/mm: use full 4KB page for 2KB PTE") there
is no fragmented page tracking on s390.  Fix the corresponding comments.

Link: https://lkml.kernel.org/r/cover.1700594815.git.agordeev@linux.ibm.com
Link: https://lkml.kernel.org/r/2eead241f3a45bed26c7911cf66bded1e35670b8.1700594815.git.agordeev@linux.ibm.com
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Suggested-by: Heiko Carstens <hca@linux.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 957ce38768b2..fbec64036baa 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -401,11 +401,11 @@ FOLIO_MATCH(compound_head, _head_2a);
  * @pmd_huge_pte:     Protected by ptdesc->ptl, used for THPs.
  * @__page_mapping:   Aliases with page->mapping. Unused for page tables.
  * @pt_mm:            Used for x86 pgds.
- * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only.
+ * @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
  * @_pt_pad_2:        Padding to ensure proper alignment.
  * @ptl:              Lock for the page table.
  * @__page_type:      Same as page->page_type. Unused for page tables.
- * @_refcount:        Same as page refcount. Used for s390 page tables.
+ * @_refcount:        Same as page refcount.
  * @pt_memcg_data:    Memcg data. Tracked for page tables here.
  *
  * This struct overlays struct page for now. Do not modify without a good
-- 
cgit v1.2.3


From f7dd74ac239aad5ef7575ea03c45fd7956e00285 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@linux.ibm.com>
Date: Tue, 21 Nov 2023 20:43:50 +0100
Subject: pgtable: rename ptdesc _refcount field to __page_refcount

Rename ptdesc _refcount field to __page_refcount similar to the other
unused page fields.

Link: https://lkml.kernel.org/r/982bdc652ba79a606c3d01c905766e7e076b3315.1700594815.git.agordeev@linux.ibm.com
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Suggested-by: Vishal Moola <vishal.moola@gmail.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fbec64036baa..ef18d2b25378 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -405,7 +405,7 @@ FOLIO_MATCH(compound_head, _head_2a);
  * @_pt_pad_2:        Padding to ensure proper alignment.
  * @ptl:              Lock for the page table.
  * @__page_type:      Same as page->page_type. Unused for page tables.
- * @_refcount:        Same as page refcount.
+ * @__page_refcount:  Same as page refcount.
  * @pt_memcg_data:    Memcg data. Tracked for page tables here.
  *
  * This struct overlays struct page for now. Do not modify without a good
@@ -438,7 +438,7 @@ struct ptdesc {
 #endif
 	};
 	unsigned int __page_type;
-	atomic_t _refcount;
+	atomic_t __page_refcount;
 #ifdef CONFIG_MEMCG
 	unsigned long pt_memcg_data;
 #endif
@@ -452,7 +452,7 @@ TABLE_MATCH(compound_head, _pt_pad_1);
 TABLE_MATCH(mapping, __page_mapping);
 TABLE_MATCH(rcu_head, pt_rcu_head);
 TABLE_MATCH(page_type, __page_type);
-TABLE_MATCH(_refcount, _refcount);
+TABLE_MATCH(_refcount, __page_refcount);
 #ifdef CONFIG_MEMCG
 TABLE_MATCH(memcg_data, pt_memcg_data);
 #endif
-- 
cgit v1.2.3


From 7679e14098c9c3c8118a7130d6e1e9cfe2565c04 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 23 Nov 2023 19:23:17 +0200
Subject: mm: list_lru: Update kernel documentation to follow the requirements

kernel-doc is not happy about documentation in list_lru.h:

list_lru.h:90: warning: Function parameter or member 'lru' not described in 'list_lru_add'
list_lru.h:90: warning: Excess function parameter 'list_lru' description in 'list_lru_add'
list_lru.h:90: warning: No description found for return value of 'list_lru_add'
list_lru.h:103: warning: Function parameter or member 'lru' not described in 'list_lru_del'
list_lru.h:103: warning: Excess function parameter 'list_lru' description in 'list_lru_del'
list_lru.h:103: warning: No description found for return value of 'list_lru_del'
list_lru.h:116: warning: No description found for return value of 'list_lru_count_one'
list_lru.h:168: warning: No description found for return value of 'list_lru_walk_one'
list_lru.h:185: warning: No description found for return value of 'list_lru_walk_one_irq'

Fix the documentation accordingly.

While at it, fix the references to the parameters in functions
inside the long descriptions, on which the above script is not
complaining (yet?).

Link: https://lkml.kernel.org/r/20231123172320.2434780-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/list_lru.h | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index b35968ee9fb5..db86ad78d428 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -73,7 +73,7 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
 
 /**
  * list_lru_add: add an element to the lru list's tail
- * @list_lru: the lru pointer
+ * @lru: the lru pointer
  * @item: the item to be added.
  *
  * If the element is already part of a list, this function returns doing
@@ -83,22 +83,22 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
  * the caller organize itself in a way that elements can be in more than
  * one type of list, it is up to the caller to fully remove the item from
  * the previous list (with list_lru_del() for instance) before moving it
- * to @list_lru
+ * to @lru.
  *
- * Return value: true if the list was updated, false otherwise
+ * Return: true if the list was updated, false otherwise
  */
 bool list_lru_add(struct list_lru *lru, struct list_head *item);
 
 /**
  * list_lru_del: delete an element to the lru list
- * @list_lru: the lru pointer
+ * @lru: the lru pointer
  * @item: the item to be deleted.
  *
- * This function works analogously as list_lru_add in terms of list
+ * This function works analogously as list_lru_add() in terms of list
  * manipulation. The comments about an element already pertaining to
- * a list are also valid for list_lru_del.
+ * a list are also valid for list_lru_del().
  *
- * Return value: true if the list was updated, false otherwise
+ * Return: true if the list was updated, false otherwise
  */
 bool list_lru_del(struct list_lru *lru, struct list_head *item);
 
@@ -108,9 +108,11 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item);
  * @nid: the node id to count from.
  * @memcg: the cgroup to count from.
  *
- * Always return a non-negative number, 0 for empty lists. There is no
- * guarantee that the list is not updated while the count is being computed.
- * Callers that want such a guarantee need to provide an outer lock.
+ * There is no guarantee that the list is not updated while the count is being
+ * computed. Callers that want such a guarantee need to provide an outer lock.
+ *
+ * Return: 0 for empty lists, otherwise the number of objects
+ * currently held by @lru.
  */
 unsigned long list_lru_count_one(struct list_lru *lru,
 				 int nid, struct mem_cgroup *memcg);
@@ -141,7 +143,7 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
 		struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
 
 /**
- * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items.
+ * list_lru_walk_one: walk a @lru, isolating and disposing freeable items.
  * @lru: the lru pointer.
  * @nid: the node id to scan from.
  * @memcg: the cgroup to scan from.
@@ -150,24 +152,24 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
  * @cb_arg: opaque type that will be passed to @isolate
  * @nr_to_walk: how many items to scan.
  *
- * This function will scan all elements in a particular list_lru, calling the
+ * This function will scan all elements in a particular @lru, calling the
  * @isolate callback for each of those items, along with the current list
  * spinlock and a caller-provided opaque. The @isolate callback can choose to
  * drop the lock internally, but *must* return with the lock held. The callback
- * will return an enum lru_status telling the list_lru infrastructure what to
+ * will return an enum lru_status telling the @lru infrastructure what to
  * do with the object being scanned.
  *
- * Please note that nr_to_walk does not mean how many objects will be freed,
+ * Please note that @nr_to_walk does not mean how many objects will be freed,
  * just how many objects will be scanned.
  *
- * Return value: the number of objects effectively removed from the LRU.
+ * Return: the number of objects effectively removed from the LRU.
  */
 unsigned long list_lru_walk_one(struct list_lru *lru,
 				int nid, struct mem_cgroup *memcg,
 				list_lru_walk_cb isolate, void *cb_arg,
 				unsigned long *nr_to_walk);
 /**
- * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items.
+ * list_lru_walk_one_irq: walk a @lru, isolating and disposing freeable items.
  * @lru: the lru pointer.
  * @nid: the node id to scan from.
  * @memcg: the cgroup to scan from.
@@ -176,7 +178,7 @@ unsigned long list_lru_walk_one(struct list_lru *lru,
  * @cb_arg: opaque type that will be passed to @isolate
  * @nr_to_walk: how many items to scan.
  *
- * Same as @list_lru_walk_one except that the spinlock is acquired with
+ * Same as list_lru_walk_one() except that the spinlock is acquired with
  * spin_lock_irq().
  */
 unsigned long list_lru_walk_one_irq(struct list_lru *lru,
-- 
cgit v1.2.3


From 61a7a5e25fe79b6c43f1c49705a0294be113c4a5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 30 Oct 2023 16:57:10 +0100
Subject: introduce for_other_threads(p, t)

Cosmetic, but imho it makes the usage look more clear and simple, the new
helper doesn't require to initialize "t".

After this change while_each_thread() has only 3 users, and it is only
used in the do/while loops.

Link: https://lkml.kernel.org/r/20231030155710.GA9095@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/signal.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3499c1a8b929..41d6759d6a4a 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -646,6 +646,9 @@ extern bool current_is_single_threaded(void);
 #define while_each_thread(g, t) \
 	while ((t = next_thread(t)) != g)
 
+#define for_other_threads(p, t)	\
+	for (t = p; (t = next_thread(t)) != p; )
+
 #define __for_each_thread(signal, t)	\
 	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
 		lockdep_is_held(&tasklist_lock))
-- 
cgit v1.2.3


From 0eb5085c38749f2a91e5bd8cbebb1ebf3398343c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Thu, 16 Nov 2023 14:36:38 +0100
Subject: arch: remove ARCH_TASK_STRUCT_ON_STACK

IA-64 was the only architecture which selected ARCH_TASK_STRUCT_ON_STACK.
IA-64 was removed with commit cf8e8658100d ("arch: Remove Itanium (IA-64)
architecture"). Therefore remove support for ARCH_TASK_STRUCT_ON_STACK
as well.

Note: this also reveals a potential bug in powerpc code, which makes use of
__init_task_data without selecting ARCH_TASK_STRUCT_ON_STACK which makes
__init_task_data a no-op. This is broken since commit d11ed3ab3166 ("Expand
INIT_TASK() in init/init_task.c and remove") from 2018 and needs to be
addressed separately.

Link: https://lkml.kernel.org/r/20231116133638.1636277-4-hca@linux.ibm.com
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/init_task.h | 7 -------
 include/linux/sched.h     | 2 --
 2 files changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 40fc5813cf93..bccb3f1f6262 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -37,13 +37,6 @@ extern struct cred init_cred;
 
 #define INIT_TASK_COMM "swapper"
 
-/* Attach to the init_task data structure for proper alignment */
-#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
-#define __init_task_data __section(".data..init_task")
-#else
-#define __init_task_data /**/
-#endif
-
 /* Attach to the thread_info data structure for proper alignment */
 #define __init_thread_info __section(".data..init_thread_info")
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..c2ecb2e06046 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1955,9 +1955,7 @@ extern void ia64_set_curr_task(int cpu, struct task_struct *p);
 void yield(void);
 
 union thread_union {
-#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
 	struct task_struct task;
-#endif
 #ifndef CONFIG_THREAD_INFO_IN_TASK
 	struct thread_info thread_info;
 #endif
-- 
cgit v1.2.3


From b454ec29225cda9ae85ed0a154f4228f1922c872 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 20 Nov 2023 16:16:49 +0100
Subject: kernel/signal.c: simplify force_sig_info_to_task(), kill
 recalc_sigpending_and_wake()

The purpose of recalc_sigpending_and_wake() is not clear, it looks
"obviously unneeded" because we are going to send the signal which can't
be blocked or ignored.

Add the comment to explain why we can't rely on send_signal_locked() and
make this logic more simple/explicit.  recalc_sigpending_and_wake() has no
other users, it can die.

In fact I think we don't even need signal_wake_up(), the target task must
be either current or a TASK_TRACED child, otherwise the usage of siglock
is not safe.  But this needs another change.

Link: https://lkml.kernel.org/r/20231120151649.GA15995@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/signal.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 41d6759d6a4a..015c0e3a3e1d 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -432,7 +432,6 @@ static inline bool fault_signal_pending(vm_fault_t fault_flags,
  * This is required every time the blocked sigset_t changes.
  * callers must hold sighand->siglock.
  */
-extern void recalc_sigpending_and_wake(struct task_struct *t);
 extern void recalc_sigpending(void);
 extern void calculate_sigpending(void);
 
-- 
cgit v1.2.3


From 7acf164b259d9007264d9d8501da1023f140a3b4 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 15 Nov 2023 21:00:27 +0800
Subject: resource: add walk_system_ram_res_rev()

This function, being a variant of walk_system_ram_res() introduced in
commit 8c86e70acead ("resource: provide new functions to walk through
resources"), walks through a list of all the resources of System RAM in
reversed order, i.e., from higher to lower.

It will be used in kexec_file code to load kernel, initrd etc when
preparing kexec reboot.

Link: https://lkml.kernel.org/r/ZVTA6z/06cLnWKUz@MiWiFi-R3L-srv
Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ioport.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 14f5cfabbbc8..db7fe25f3370 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -331,6 +331,9 @@ extern int
 walk_system_ram_res(u64 start, u64 end, void *arg,
 		    int (*func)(struct resource *, void *));
 extern int
+walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+			int (*func)(struct resource *, void *));
+extern int
 walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
 		    void *arg, int (*func)(struct resource *, void *));
 
-- 
cgit v1.2.3


From 7973be94724464222ae0b1860a25be04ab7b0132 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 8 Dec 2023 18:52:38 +0200
Subject: clk: x86: lpss-atom: Drop unneeded 'extern' in the header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

'extern' for the functions is not needed, drop it.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20231208165238.3309058-1-andriy.shevchenko@linux.intel.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/x86/clk-lpss.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/clk-lpss.h b/include/linux/platform_data/x86/clk-lpss.h
index 41df326583f9..7f132029316a 100644
--- a/include/linux/platform_data/x86/clk-lpss.h
+++ b/include/linux/platform_data/x86/clk-lpss.h
@@ -15,6 +15,6 @@ struct lpss_clk_data {
 	struct clk *clk;
 };
 
-extern int lpss_atom_clk_init(void);
+int lpss_atom_clk_init(void);
 
 #endif /* __CLK_LPSS_H */
-- 
cgit v1.2.3


From 72dd14d241e1c6e241fc5b265746c59f306c6aa3 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 4 Dec 2023 14:17:37 -0800
Subject: platform/x86/intel/tpmi: Modify external interface to get read/write
 state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Modify the external interface tpmi_get_feature_status() to get read
and write blocked instead of locked and disabled. Since auxiliary device
is not created when disabled, no use of returning disabled state. Also
locked state is not useful as feature driver can't use locked state
in a meaningful way.

Using read and write state, feature driver can decide which operations
to restrict for that feature.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20231204221740.3645130-3-srinivas.pandruvada@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/intel_tpmi.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h
index ee07393445f9..4f89c5bd8663 100644
--- a/include/linux/intel_tpmi.h
+++ b/include/linux/intel_tpmi.h
@@ -32,7 +32,6 @@ struct intel_tpmi_plat_info {
 struct intel_tpmi_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev);
 struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int index);
 int tpmi_get_resource_count(struct auxiliary_device *auxdev);
-
-int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, int *locked,
-			    int *disabled);
+int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, bool *read_blocked,
+			    bool *write_blocked);
 #endif
-- 
cgit v1.2.3


From 046d7be6210e7f870e53eb38fd410237e9d1d88f Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 4 Dec 2023 14:17:38 -0800
Subject: platform/x86/intel/tpmi: Move TPMI ID definition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move TPMI ID definitions to common include file. In this way other
feature drivers don't have to redefine.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20231204221740.3645130-4-srinivas.pandruvada@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/intel_tpmi.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h
index 4f89c5bd8663..a3529b962be6 100644
--- a/include/linux/intel_tpmi.h
+++ b/include/linux/intel_tpmi.h
@@ -12,6 +12,19 @@
 #define TPMI_MINOR_VERSION(val)	FIELD_GET(GENMASK(4, 0), val)
 #define TPMI_MAJOR_VERSION(val)	FIELD_GET(GENMASK(7, 5), val)
 
+/*
+ * List of supported TMPI IDs.
+ * Some TMPI IDs are not used by Linux, so the numbers are not consecutive.
+ */
+enum intel_tpmi_id {
+	TPMI_ID_RAPL = 0,	/* Running Average Power Limit */
+	TPMI_ID_PEM = 1,	/* Power and Perf excursion Monitor */
+	TPMI_ID_UNCORE = 2,	/* Uncore Frequency Scaling */
+	TPMI_ID_SST = 5,	/* Speed Select Technology */
+	TPMI_CONTROL_ID = 0x80,	/* Special ID for getting feature status */
+	TPMI_INFO_ID = 0x81,	/* Special ID for PCI BDF and Package ID information */
+};
+
 /**
  * struct intel_tpmi_plat_info - Platform information for a TPMI device instance
  * @package_id:	CPU Package id
-- 
cgit v1.2.3


From 6bb3703aa52c9b5bb9716cbeae7350247b675209 Mon Sep 17 00:00:00 2001
From: Masahisa Kojima <masahisa.kojima@linaro.org>
Date: Tue, 7 Nov 2023 14:40:52 +0900
Subject: efi: expose efivar generic ops register function

This is a preparation for supporting efivar operations provided by other
than efi subsystem.  Both register and unregister functions are exposed
so that non-efi subsystem can revert the efi generic operation.

Acked-by: Sumit Garg <sumit.garg@linaro.org>
Co-developed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Masahisa Kojima <masahisa.kojima@linaro.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/efi.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 9cc5bf32f6f2..1b2f50efb98c 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1348,4 +1348,7 @@ bool efi_config_table_is_usable(const efi_guid_t *guid, unsigned long table)
 
 umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n);
 
+void efivars_generic_ops_register(void);
+void efivars_generic_ops_unregister(void);
+
 #endif /* _LINUX_EFI_H */
-- 
cgit v1.2.3


From 1f71f37fbbd065b3326d9b7d8bb5ae688cd653d0 Mon Sep 17 00:00:00 2001
From: Masahisa Kojima <masahisa.kojima@linaro.org>
Date: Tue, 7 Nov 2023 14:40:53 +0900
Subject: efi: Add EFI_ACCESS_DENIED status code

This commit adds the EFI_ACCESS_DENIED status code.

Acked-by: Sumit Garg <sumit.garg@linaro.org>
Co-developed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Masahisa Kojima <masahisa.kojima@linaro.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/efi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 1b2f50efb98c..3668aa204c47 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -40,6 +40,7 @@ struct screen_info;
 #define EFI_WRITE_PROTECTED	( 8 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_OUT_OF_RESOURCES	( 9 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_NOT_FOUND		(14 | (1UL << (BITS_PER_LONG-1)))
+#define EFI_ACCESS_DENIED	(15 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_TIMEOUT		(18 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_ABORTED		(21 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_SECURITY_VIOLATION	(26 | (1UL << (BITS_PER_LONG-1)))
-- 
cgit v1.2.3


From 94f7f6182c72ba642c1f20111681f9cc8621c95f Mon Sep 17 00:00:00 2001
From: Masahisa Kojima <masahisa.kojima@linaro.org>
Date: Tue, 7 Nov 2023 14:40:55 +0900
Subject: efivarfs: automatically update super block flag

efivar operation is updated when the tee_stmm_efi module is probed.
tee_stmm_efi module supports SetVariable runtime service, but user needs
to manually remount the efivarfs as RW to enable the write access if the
previous efivar operation does not support SetVariable and efivarfs is
mounted as read-only.

This commit notifies the update of efivar operation to efivarfs
subsystem, then drops SB_RDONLY flag if the efivar operation supports
SetVariable.

Signed-off-by: Masahisa Kojima <masahisa.kojima@linaro.org>
[ardb: use per-superblock instance of the notifier block]
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/efi.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 3668aa204c47..c74f47711f0b 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1349,6 +1349,14 @@ bool efi_config_table_is_usable(const efi_guid_t *guid, unsigned long table)
 
 umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n);
 
+/*
+ * efivar ops event type
+ */
+#define EFIVAR_OPS_RDONLY 0
+#define EFIVAR_OPS_RDWR 1
+
+extern struct blocking_notifier_head efivar_ops_nh;
+
 void efivars_generic_ops_register(void);
 void efivars_generic_ops_unregister(void);
 
-- 
cgit v1.2.3


From 704af3a40747e395b67892127943e6ffd5e2b642 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Sun, 10 Dec 2023 21:24:43 +0100
Subject: platform/x86: wmi: Remove chardev interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The design of the WMI chardev interface is broken:
- it assumes that WMI drivers are not instantiated twice
- it offers next to no abstractions, the WMI driver gets
a raw byte buffer
- it is only used by a single driver, something which is
unlikely to change

Since the only user (dell-smbios-wmi) has been migrated
to his own ioctl interface, remove it.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20231210202443.646427-6-W_Armin@gmx.de
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/wmi.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 8a643c39fcce..50f7f1e4fd4f 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -11,7 +11,6 @@
 #include <linux/device.h>
 #include <linux/acpi.h>
 #include <linux/mod_devicetable.h>
-#include <uapi/linux/wmi.h>
 
 /**
  * struct wmi_device - WMI device structure
@@ -47,8 +46,6 @@ acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct
 
 u8 wmidev_instance_count(struct wmi_device *wdev);
 
-extern int set_required_buffer_size(struct wmi_device *wdev, u64 length);
-
 /**
  * struct wmi_driver - WMI driver structure
  * @driver: Driver model structure
@@ -57,11 +54,8 @@ extern int set_required_buffer_size(struct wmi_device *wdev, u64 length);
  * @probe: Callback for device binding
  * @remove: Callback for device unbinding
  * @notify: Callback for receiving WMI events
- * @filter_callback: Callback for filtering device IOCTLs
  *
  * This represents WMI drivers which handle WMI devices.
- * @filter_callback is only necessary for drivers which
- * want to set up a WMI IOCTL interface.
  */
 struct wmi_driver {
 	struct device_driver driver;
@@ -71,8 +65,6 @@ struct wmi_driver {
 	int (*probe)(struct wmi_device *wdev, const void *context);
 	void (*remove)(struct wmi_device *wdev);
 	void (*notify)(struct wmi_device *device, union acpi_object *data);
-	long (*filter_callback)(struct wmi_device *wdev, unsigned int cmd,
-				struct wmi_ioctl_buffer *arg);
 };
 
 extern int __must_check __wmi_driver_register(struct wmi_driver *driver,
-- 
cgit v1.2.3


From 58e82a62669da52e688f4a8b89922c1839bf1001 Mon Sep 17 00:00:00 2001
From: Ma Jun <Jun.Ma2@amd.com>
Date: Mon, 11 Dec 2023 18:06:23 +0800
Subject: platform/x86/amd: Add support for AMD ACPI based Wifi band RFI
 mitigation feature

Due to electrical and mechanical constraints in certain platform designs
there may be likely interference of relatively high-powered harmonics of
the (G-)DDR memory clocks with local radio module frequency bands used
by Wifi 6/6e/7.

To mitigate this, AMD has introduced a mechanism that devices can use to
notify active use of particular frequencies so that other devices can make
relative internal adjustments as necessary to avoid this resonance.

Co-developed-by: Evan Quan <quanliangl@hotmail.com>
Signed-off-by: Evan Quan <quanliangl@hotmail.com>
Signed-off-by: Ma Jun <Jun.Ma2@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/acpi_amd_wbrf.h | 91 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 include/linux/acpi_amd_wbrf.h

(limited to 'include/linux')

diff --git a/include/linux/acpi_amd_wbrf.h b/include/linux/acpi_amd_wbrf.h
new file mode 100644
index 000000000000..898f31d536d4
--- /dev/null
+++ b/include/linux/acpi_amd_wbrf.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Wifi Band Exclusion Interface (AMD ACPI Implementation)
+ * Copyright (C) 2023 Advanced Micro Devices
+ */
+
+#ifndef _ACPI_AMD_WBRF_H
+#define _ACPI_AMD_WBRF_H
+
+#include <linux/device.h>
+#include <linux/notifier.h>
+
+/* The maximum number of frequency band ranges */
+#define MAX_NUM_OF_WBRF_RANGES		11
+
+/* Record actions */
+#define WBRF_RECORD_ADD		0x0
+#define WBRF_RECORD_REMOVE	0x1
+
+/**
+ * struct freq_band_range - Wifi frequency band range definition
+ * @start: start frequency point (in Hz)
+ * @end: end frequency point (in Hz)
+ */
+struct freq_band_range {
+	u64		start;
+	u64		end;
+};
+
+/**
+ * struct wbrf_ranges_in_out - wbrf ranges info
+ * @num_of_ranges: total number of band ranges in this struct
+ * @band_list: array of Wifi band ranges
+ */
+struct wbrf_ranges_in_out {
+	u64			num_of_ranges;
+	struct freq_band_range	band_list[MAX_NUM_OF_WBRF_RANGES];
+};
+
+/**
+ * enum wbrf_notifier_actions - wbrf notifier actions index
+ * @WBRF_CHANGED: there was some frequency band updates. The consumers
+ *               should retrieve the latest active frequency bands.
+ */
+enum wbrf_notifier_actions {
+	WBRF_CHANGED,
+};
+
+#if IS_ENABLED(CONFIG_AMD_WBRF)
+bool acpi_amd_wbrf_supported_producer(struct device *dev);
+int acpi_amd_wbrf_add_remove(struct device *dev, uint8_t action, struct wbrf_ranges_in_out *in);
+bool acpi_amd_wbrf_supported_consumer(struct device *dev);
+int amd_wbrf_retrieve_freq_band(struct device *dev, struct wbrf_ranges_in_out *out);
+int amd_wbrf_register_notifier(struct notifier_block *nb);
+int amd_wbrf_unregister_notifier(struct notifier_block *nb);
+#else
+static inline
+bool acpi_amd_wbrf_supported_consumer(struct device *dev)
+{
+	return false;
+}
+
+static inline
+int acpi_amd_wbrf_add_remove(struct device *dev, uint8_t action, struct wbrf_ranges_in_out *in)
+{
+	return -ENODEV;
+}
+
+static inline
+bool acpi_amd_wbrf_supported_producer(struct device *dev)
+{
+	return false;
+}
+static inline
+int amd_wbrf_retrieve_freq_band(struct device *dev, struct wbrf_ranges_in_out *out)
+{
+	return -ENODEV;
+}
+static inline
+int amd_wbrf_register_notifier(struct notifier_block *nb)
+{
+	return -ENODEV;
+}
+static inline
+int amd_wbrf_unregister_notifier(struct notifier_block *nb)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_AMD_WBRF */
+
+#endif /* _ACPI_AMD_WBRF_H */
-- 
cgit v1.2.3


From 2ffdd4773d98b1f7488f8e37bd881bbecec24d85 Mon Sep 17 00:00:00 2001
From: Hsiao Chien Sung <shawn.sung@mediatek.com>
Date: Tue, 24 Oct 2023 21:00:33 +0800
Subject: soc: mediatek: Support MT8188 VDOSYS1 Padding in mtk-mmsys

- Add Padding components
- Add Mutex module definitions for Padding

Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Hsiao Chien Sung <shawn.sung@mediatek.com>
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 include/linux/soc/mediatek/mtk-mmsys.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h
index 2475ef914746..4885b065b849 100644
--- a/include/linux/soc/mediatek/mtk-mmsys.h
+++ b/include/linux/soc/mediatek/mtk-mmsys.h
@@ -62,6 +62,14 @@ enum mtk_ddp_comp_id {
 	DDP_COMPONENT_OVL_2L1,
 	DDP_COMPONENT_OVL_2L2,
 	DDP_COMPONENT_OVL1,
+	DDP_COMPONENT_PADDING0,
+	DDP_COMPONENT_PADDING1,
+	DDP_COMPONENT_PADDING2,
+	DDP_COMPONENT_PADDING3,
+	DDP_COMPONENT_PADDING4,
+	DDP_COMPONENT_PADDING5,
+	DDP_COMPONENT_PADDING6,
+	DDP_COMPONENT_PADDING7,
 	DDP_COMPONENT_POSTMASK0,
 	DDP_COMPONENT_PWM0,
 	DDP_COMPONENT_PWM1,
-- 
cgit v1.2.3


From d1c371035c8204112d84266e6bde7537f25448f7 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Sun, 10 Dec 2023 10:50:28 +0800
Subject: quota: convert dquot_claim_space_nodirty() to return void

dquot_claim_space_nodirty() always return zero, let's convert it
to return void, then, its caller can get rid of handling failure
case.

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20231210025028.3262900-1-chao@kernel.org>
---
 include/linux/quotaops.h | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 4fa4ef0a173a..06cc8888199e 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -74,7 +74,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags);
 
 int dquot_alloc_inode(struct inode *inode);
 
-int dquot_claim_space_nodirty(struct inode *inode, qsize_t number);
+void dquot_claim_space_nodirty(struct inode *inode, qsize_t number);
 void dquot_free_inode(struct inode *inode);
 void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number);
 
@@ -257,10 +257,9 @@ static inline void __dquot_free_space(struct inode *inode, qsize_t number,
 		inode_sub_bytes(inode, number);
 }
 
-static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
+static inline void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 {
 	inode_add_bytes(inode, number);
-	return 0;
 }
 
 static inline int dquot_reclaim_space_nodirty(struct inode *inode,
@@ -358,14 +357,10 @@ static inline int dquot_reserve_block(struct inode *inode, qsize_t nr)
 				DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE);
 }
 
-static inline int dquot_claim_block(struct inode *inode, qsize_t nr)
+static inline void dquot_claim_block(struct inode *inode, qsize_t nr)
 {
-	int ret;
-
-	ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits);
-	if (!ret)
-		mark_inode_dirty_sync(inode);
-	return ret;
+	dquot_claim_space_nodirty(inode, nr << inode->i_blkbits);
+	mark_inode_dirty_sync(inode);
 }
 
 static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr)
-- 
cgit v1.2.3


From 2ebe81c814355d000fe49d9c4213983844dcb32b Mon Sep 17 00:00:00 2001
From: Aleksander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 6 Dec 2023 21:59:19 +0100
Subject: net, xdp: Allow metadata > 32

32 bytes may be not enough for some custom metadata. Relax the restriction,
allow metadata larger than 32 bytes and make __skb_metadata_differs() work
with bigger lengths.

Now size of metadata is only limited by the fact it is stored as u8 in
skb_shared_info, so maximum possible value is 255. Size still has to be
aligned to 4, so the actual upper limit becomes 252. Most driver
implementations will offer less, none can offer more.

Other important conditions, such as having enough space for xdp_frame
building, are already checked in bpf_xdp_adjust_meta().

Signed-off-by: Aleksander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/eb87653c-8ff8-447d-a7a1-25961f60518a@kernel.org
Link: https://lore.kernel.org/bpf/20231206205919.404415-3-larysa.zaremba@intel.com
---
 include/linux/skbuff.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b370eb8d70f7..df6ef42639d8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4247,10 +4247,13 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
 {
 	const void *a = skb_metadata_end(skb_a);
 	const void *b = skb_metadata_end(skb_b);
-	/* Using more efficient varaiant than plain call to memcmp(). */
-#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
 	u64 diffs = 0;
 
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+	    BITS_PER_LONG != 64)
+		goto slow;
+
+	/* Using more efficient variant than plain call to memcmp(). */
 	switch (meta_len) {
 #define __it(x, op) (x -= sizeof(u##op))
 #define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
@@ -4270,11 +4273,11 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
 		fallthrough;
 	case  4: diffs |= __it_diff(a, b, 32);
 		break;
+	default:
+slow:
+		return memcmp(a - meta_len, b - meta_len, meta_len);
 	}
 	return diffs;
-#else
-	return memcmp(a - meta_len, b - meta_len, meta_len);
-#endif
 }
 
 static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
-- 
cgit v1.2.3


From 718ab8226636a1a3a7d281f5d6a7ad7c925efe5a Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Tue, 28 Nov 2023 09:15:07 +0100
Subject: PCI/ASPM: Add pci_enable_link_state_locked()

Add pci_enable_link_state_locked() for enabling link states that can be
used in contexts where a pci_bus_sem read lock is already held (e.g. from
pci_walk_bus()).

This helper will be used to fix a couple of potential deadlocks where
the current helper is called with the lock already held, hence the CC
stable tag.

Fixes: f492edb40b54 ("PCI: vmd: Add quirk to configure PCIe ASPM and LTR")
Link: https://lore.kernel.org/r/20231128081512.19387-2-johan+linaro@kernel.org
Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
[bhelgaas: include helper name in subject, commit log]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Cc: <stable@vger.kernel.org>	# 6.3
Cc: Michael Bottini <michael.a.bottini@linux.intel.com>
Cc: David E. Box <david.e.box@linux.intel.com>
---
 include/linux/pci.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60ca768bc867..dea043bc1e38 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1829,6 +1829,7 @@ extern bool pcie_ports_native;
 int pci_disable_link_state(struct pci_dev *pdev, int state);
 int pci_disable_link_state_locked(struct pci_dev *pdev, int state);
 int pci_enable_link_state(struct pci_dev *pdev, int state);
+int pci_enable_link_state_locked(struct pci_dev *pdev, int state);
 void pcie_no_aspm(void);
 bool pcie_aspm_support_enabled(void);
 bool pcie_aspm_enabled(struct pci_dev *pdev);
@@ -1839,6 +1840,8 @@ static inline int pci_disable_link_state_locked(struct pci_dev *pdev, int state)
 { return 0; }
 static inline int pci_enable_link_state(struct pci_dev *pdev, int state)
 { return 0; }
+static inline int pci_enable_link_state_locked(struct pci_dev *pdev, int state)
+{ return 0; }
 static inline void pcie_no_aspm(void) { }
 static inline bool pcie_aspm_support_enabled(void) { return false; }
 static inline bool pcie_aspm_enabled(struct pci_dev *pdev) { return false; }
-- 
cgit v1.2.3


From 46eae99ef73302f9fb3dddcd67c374b3dffe8fd6 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 25 Oct 2023 16:02:02 +0200
Subject: add statmount(2) syscall

Add a way to query attributes of a single mount instead of having to parse
the complete /proc/$PID/mountinfo, which might be huge.

Lookup the mount the new 64bit mount ID.  If a mount needs to be queried
based on path, then statx(2) can be used to first query the mount ID
belonging to the path.

Design is based on a suggestion by Linus:

  "So I'd suggest something that is very much like "statfsat()", which gets
   a buffer and a length, and returns an extended "struct statfs" *AND*
   just a string description at the end."

The interface closely mimics that of statx.

Handle ASCII attributes by appending after the end of the structure (as per
above suggestion).  Pointers to strings are stored in u64 members to make
the structure the same regardless of pointer size.  Strings are nul
terminated.

Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20231025140205.3586473-5-mszeredi@redhat.com
Reviewed-by: Ian Kent <raven@themaw.net>
[Christian Brauner <brauner@kernel.org>: various minor changes]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/syscalls.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index fd9d12de7e92..530ca9adf5f1 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -74,6 +74,8 @@ struct landlock_ruleset_attr;
 enum landlock_rule_type;
 struct cachestat_range;
 struct cachestat;
+struct statmount;
+struct mnt_id_req;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -407,6 +409,9 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz,
 asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
 asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
 				struct statfs64 __user *buf);
+asmlinkage long sys_statmount(const struct mnt_id_req __user *req,
+			      struct statmount __user *buf, size_t bufsize,
+			      unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
 #if BITS_PER_LONG == 32
-- 
cgit v1.2.3


From 5bc2ea60897e0f899fb93930dd867dae7c8eb11f Mon Sep 17 00:00:00 2001
From: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Date: Mon, 11 Dec 2023 20:27:47 +0800
Subject: iio: core: introduce trough info element for minimum values

The IIO_CHAN_INFO_PEAK info element is used for maximum values and
currently there is no equivalent for minimum values. Instead of
overloading the existing peak info element, a new info element can
be added.

In principle there is no need to add a _TROUGH_SCALE element as the
scale will be the same as the one required for INFO_PEAK, which in
turn is sometimes omitted if a single scale for peaks and raw values
is required.

Add an IIO_CHAN_INFO_TROUGH info element for minimum values.

Signed-off-by: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Link: https://lore.kernel.org/r/20231211122747.9723-1-579lpy@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/types.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index 117bde7d6ad7..d89982c98368 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -68,6 +68,7 @@ enum iio_chan_info_enum {
 	IIO_CHAN_INFO_THERMOCOUPLE_TYPE,
 	IIO_CHAN_INFO_CALIBAMBIENT,
 	IIO_CHAN_INFO_ZEROPOINT,
+	IIO_CHAN_INFO_TROUGH,
 };
 
 #endif /* _IIO_TYPES_H_ */
-- 
cgit v1.2.3


From 4649620d9404d3aceb25891c24bab77143e3f21c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 8 Dec 2023 20:13:44 +0100
Subject: thermal: core: Make thermal_zone_device_unregister() return after
 freeing the zone

Make thermal_zone_device_unregister() wait until all of the references
to the given thermal zone object have been dropped and free it before
returning.

This guarantees that when thermal_zone_device_unregister() returns,
there is no leftover activity regarding the thermal zone in question
which is required by some of its callers (for instance, modular driver
code that wants to know when it is safe to let the module go away).

Subsequently, this will allow some confusing device_is_registered()
checks to be dropped from the thermal sysfs and core code.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-and-tested-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 include/linux/thermal.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 0ea99f50d57c..bedbaec9a42e 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -117,6 +117,7 @@ struct thermal_cooling_device {
  * @id:		unique id number for each thermal zone
  * @type:	the thermal zone device type
  * @device:	&struct device for this thermal zone
+ * @removal:	removal completion
  * @trip_temp_attrs:	attributes for trip points for sysfs: trip temperature
  * @trip_type_attrs:	attributes for trip points for sysfs: trip type
  * @trip_hyst_attrs:	attributes for trip points for sysfs: trip hysteresis
@@ -156,6 +157,7 @@ struct thermal_zone_device {
 	int id;
 	char type[THERMAL_NAME_LENGTH];
 	struct device device;
+	struct completion removal;
 	struct attribute_group trips_attribute_group;
 	struct thermal_attr *trip_temp_attrs;
 	struct thermal_attr *trip_type_attrs;
-- 
cgit v1.2.3


From 4e58aaeebb3c27993c734c99eae6881b196b1ddb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 1 Nov 2023 18:28:38 -0700
Subject: rcu: Restrict access to RCU CPU stall notifiers

Although the RCU CPU stall notifiers can be useful for dumping state when
tracking down delicate forward-progress bugs where NUMA effects cause
cache lines to be delivered to a given CPU regularly, but always in a
state that prevents that CPU from making forward progress.  These bugs can
be detected by the RCU CPU stall-warning mechanism, but in some cases,
the stall-warnings printk()s disrupt the forward-progress bug before
any useful state can be obtained.

Unfortunately, the notifier mechanism added by commit 5b404fdabacf ("rcu:
Add RCU CPU stall notifier") can make matters worse if used at all
carelessly. For example, if the stall warning was caused by a lock not
being released, then any attempt to acquire that lock in the notifier
will hang. This will prevent not only the notifier from producing any
useful output, but it will also prevent the stall-warning message from
ever appearing.

This commit therefore hides this new RCU CPU stall notifier
mechanism under a new RCU_CPU_STALL_NOTIFIER Kconfig option that
depends on both DEBUG_KERNEL and RCU_EXPERT.  In addition, the
rcupdate.rcu_cpu_stall_notifiers=1 kernel boot parameter must also
be specified.  The RCU_CPU_STALL_NOTIFIER Kconfig option's help text
contains a warning and explains the dangers of careless use, recommending
lockless notifier code.  In addition, a WARN() is triggered each time
that an attempt is made to register a stall-warning notifier in kernels
built with CONFIG_RCU_CPU_STALL_NOTIFIER=y.

This combination of measures will keep use of this mechanism confined to
debug kernels and away from routine deployments.

[ paulmck: Apply Dan Carpenter feedback. ]

Fixes: 5b404fdabacf ("rcu: Add RCU CPU stall notifier")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 include/linux/rcu_notifier.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcu_notifier.h b/include/linux/rcu_notifier.h
index ebf371364581..5640f024773b 100644
--- a/include/linux/rcu_notifier.h
+++ b/include/linux/rcu_notifier.h
@@ -13,7 +13,7 @@
 #define RCU_STALL_NOTIFY_NORM	1
 #define RCU_STALL_NOTIFY_EXP	2
 
-#ifdef CONFIG_RCU_STALL_COMMON
+#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 
 #include <linux/notifier.h>
 #include <linux/types.h>
@@ -21,12 +21,12 @@
 int rcu_stall_chain_notifier_register(struct notifier_block *n);
 int rcu_stall_chain_notifier_unregister(struct notifier_block *n);
 
-#else // #ifdef CONFIG_RCU_STALL_COMMON
+#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 
 // No RCU CPU stall warnings in Tiny RCU.
 static inline int rcu_stall_chain_notifier_register(struct notifier_block *n) { return -EEXIST; }
 static inline int rcu_stall_chain_notifier_unregister(struct notifier_block *n) { return -ENOENT; }
 
-#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON
+#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 
 #endif /* __LINUX_RCU_NOTIFIER_H */
-- 
cgit v1.2.3


From 23d90b2404050c00c15058710d56bb46e1c5ab36 Mon Sep 17 00:00:00 2001
From: Pedro Falcato <pedro.falcato@gmail.com>
Date: Fri, 20 Oct 2023 18:30:15 +0100
Subject: rcu: Remove unused macros from rcupdate.h

ulong2long, USHORT_CMP_GE and USHORT_CMP_LT are redundant and have been
unused for quite a few releases.

Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 include/linux/rcupdate.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f7206b2623c9..aa87c82236dd 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -34,9 +34,6 @@
 
 #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
 #define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
-#define ulong2long(a)		(*(long *)(&(a)))
-#define USHORT_CMP_GE(a, b)	(USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
-#define USHORT_CMP_LT(a, b)	(USHRT_MAX / 2 < (unsigned short)((a) - (b)))
 
 /* Exported common interfaces */
 void call_rcu(struct rcu_head *head, rcu_callback_t func);
-- 
cgit v1.2.3


From c5e2a973448d958feb7881e4d875eac59fdeff3d Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Fri, 8 Dec 2023 16:28:41 -0300
Subject: rtnl: add helper to check if rtnl group has listeners

As of today, rtnl code creates a new skb and unconditionally fills and
broadcasts it to the relevant group. For most operations this is okay
and doesn't waste resources in general.

When operations are done without the rtnl_lock, as in tc-flower, such
skb allocation, message fill and no-op broadcasting can happen in all
cores of the system, which contributes to system pressure and wastes
precious cpu cycles when no one will receive the built message.

Introduce this helper so rtnetlink operations can simply check if someone
is listening and then proceed if necessary.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Link: https://lore.kernel.org/r/20231208192847.714940-2-pctammela@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/rtnetlink.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 3d6cf306cd55..a7d757e96c55 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -130,4 +130,11 @@ extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 
 extern void rtnl_offload_xstats_notify(struct net_device *dev);
 
+static inline int rtnl_has_listeners(const struct net *net, u32 group)
+{
+	struct sock *rtnl = net->rtnl;
+
+	return netlink_has_listeners(rtnl, group);
+}
+
 #endif	/* __LINUX_RTNETLINK_H */
-- 
cgit v1.2.3


From 8439109b76a3c405808383bf9dd532fc4b9c2dbd Mon Sep 17 00:00:00 2001
From: Victor Nogueira <victor@mojatatu.com>
Date: Fri, 8 Dec 2023 16:28:42 -0300
Subject: rtnl: add helper to check if a notification is needed

Building on the rtnl_has_listeners helper, add the rtnl_notify_needed
helper to check if we can bail out early in the notification routines.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Link: https://lore.kernel.org/r/20231208192847.714940-3-pctammela@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/rtnetlink.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index a7d757e96c55..0cbbbded0331 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -137,4 +137,19 @@ static inline int rtnl_has_listeners(const struct net *net, u32 group)
 	return netlink_has_listeners(rtnl, group);
 }
 
+/**
+ * rtnl_notify_needed - check if notification is needed
+ * @net: Pointer to the net namespace
+ * @nlflags: netlink ingress message flags
+ * @group: rtnl group
+ *
+ * Based on the ingress message flags and rtnl group, returns true
+ * if a notification is needed, false otherwise.
+ */
+static inline bool
+rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group)
+{
+	return (nlflags & NLM_F_ECHO) || rtnl_has_listeners(net, group);
+}
+
 #endif	/* __LINUX_RTNETLINK_H */
-- 
cgit v1.2.3


From ddb6b284bdc32b6e218b3d90b5a745ea26620812 Mon Sep 17 00:00:00 2001
From: Pedro Tammela <pctammela@mojatatu.com>
Date: Fri, 8 Dec 2023 16:28:43 -0300
Subject: rtnl: add helper to send if skb is not null

This is a convenience helper for routines handling conditional rtnl
events, that is code that might send a notification depending on
rtnl_has_listeners/rtnl_notify_needed.

Instead of:
   if (skb)
      rtnetlink_send(...)

Use:
      rtnetlink_maybe_send(...)

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Link: https://lore.kernel.org/r/20231208192847.714940-4-pctammela@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/rtnetlink.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 0cbbbded0331..6a8543b34e2c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -10,6 +10,13 @@
 #include <uapi/linux/rtnetlink.h>
 
 extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
+
+static inline int rtnetlink_maybe_send(struct sk_buff *skb, struct net *net,
+				       u32 pid, u32 group, int echo)
+{
+	return !skb ? 0 : rtnetlink_send(skb, net, pid, group, echo);
+}
+
 extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
 extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
 			u32 group, const struct nlmsghdr *nlh, gfp_t flags);
-- 
cgit v1.2.3


From 1a1ad782dcbbacd9e8d4e2e7ff1bf14d1db80727 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 4 Dec 2023 15:39:21 -0800
Subject: bpf: tidy up exception callback management a bit

Use the fact that we are passing subprog index around and have
a corresponding struct bpf_subprog_info in bpf_verifier_env for each
subprogram. We don't need to separately pass around a flag whether
subprog is exception callback or not, each relevant verifier function
can determine this using provided subprog index if we maintain
bpf_subprog_info properly.

Also move out exception callback-specific logic from
btf_prepare_func_args(), keeping it generic. We can enforce all these
restriction right before exception callback verification pass. We add
out parameter, arg_cnt, for now, but this will be unnecessary with
subsequent refactoring and will be removed.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231204233931.49758-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c1a06263a4f3..0bd4889e917a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2494,7 +2494,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *reg, bool is_ex_cb);
+			  struct bpf_reg_state *reg, u32 *nargs);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
 const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
-- 
cgit v1.2.3


From 406a6fa44bfbc8563f0612b08d43df2fa65e8bc5 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 4 Dec 2023 15:39:22 -0800
Subject: bpf: use bitfields for simple per-subprog bool flags

We have a bunch of bool flags for each subprog. Instead of wasting bytes
for them, use bitfields instead.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231204233931.49758-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 314b679fb494..c2819a6579a5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -611,12 +611,12 @@ struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
 	u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
 	u16 stack_depth; /* max. stack depth used by this function */
-	bool has_tail_call;
-	bool tail_call_reachable;
-	bool has_ld_abs;
-	bool is_cb;
-	bool is_async_cb;
-	bool is_exception_cb;
+	bool has_tail_call: 1;
+	bool tail_call_reachable: 1;
+	bool has_ld_abs: 1;
+	bool is_cb: 1;
+	bool is_async_cb: 1;
+	bool is_exception_cb: 1;
 };
 
 struct bpf_verifier_env;
-- 
cgit v1.2.3


From 1ca51628e7303718fdabe29c7d36f582500d5cf2 Mon Sep 17 00:00:00 2001
From: Shun Hao <shunh@nvidia.com>
Date: Wed, 6 Dec 2023 16:01:34 +0200
Subject: net/mlx5: Introduce indirect-sw-encap ICM properties

Add new fields for device memory capabilities, in order to support
creation of new ICM memory type of SW encap.

Signed-off-by: Shun Hao <shunh@nvidia.com>
Link: https://lore.kernel.org/r/107cca7dd6a932a1704abf6ebd1b801105546a8e.1701871118.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6f3631425f38..02b25dc36143 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1193,7 +1193,8 @@ struct mlx5_ifc_device_mem_cap_bits {
 	u8         log_sw_icm_alloc_granularity[0x6];
 	u8         log_steering_sw_icm_size[0x8];
 
-	u8         reserved_at_120[0x18];
+	u8         log_indirect_encap_sw_icm_size[0x8];
+	u8         reserved_at_128[0x10];
 	u8         log_header_modify_pattern_sw_icm_size[0x8];
 
 	u8         header_modify_sw_icm_start_address[0x40];
@@ -1204,7 +1205,11 @@ struct mlx5_ifc_device_mem_cap_bits {
 
 	u8         memic_operations[0x20];
 
-	u8         reserved_at_220[0x5e0];
+	u8         reserved_at_220[0x20];
+
+	u8         indirect_encap_sw_icm_start_address[0x40];
+
+	u8         reserved_at_280[0x580];
 };
 
 struct mlx5_ifc_device_event_cap_bits {
-- 
cgit v1.2.3


From a429ec96c07f3020af12029acefc46f42ff5c91c Mon Sep 17 00:00:00 2001
From: Shun Hao <shunh@nvidia.com>
Date: Wed, 6 Dec 2023 16:01:35 +0200
Subject: RDMA/mlx5: Support handling of SW encap ICM area

New type for this ICM area, now the user can allocate/deallocate
the new type of SW encap ICM memory, to store the encap header data
which are managed by SW.

Signed-off-by: Shun Hao <shunh@nvidia.com>
Link: https://lore.kernel.org/r/546fe43fc700240709e30acf7713ec6834d652bd.1701871118.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d2b8d4a74a30..96cb8845682d 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -688,6 +688,7 @@ enum mlx5_sw_icm_type {
 	MLX5_SW_ICM_TYPE_STEERING,
 	MLX5_SW_ICM_TYPE_HEADER_MODIFY,
 	MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN,
+	MLX5_SW_ICM_TYPE_SW_ENCAP,
 };
 
 #define MLX5_MAX_RESERVED_GIDS 8
-- 
cgit v1.2.3


From eb524d0fd46249b0b9e5d52372dc65d8b32430c3 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Wed, 6 Dec 2023 16:01:37 +0200
Subject: net/mlx5: E-Switch, expose eswitch manager vport

Expose the ability the query the eswitch manager vport number.
Next patch will utilize this capability to reveal the correct
register C0 value to the users.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://lore.kernel.org/r/614fb0e216250e2ce3340471ec141b83ec45c7f4.1701871118.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/eswitch.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 950d2431a53c..df73a2ccc9af 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -7,6 +7,7 @@
 #define _MLX5_ESWITCH_
 
 #include <linux/mlx5/driver.h>
+#include <linux/mlx5/vport.h>
 #include <net/devlink.h>
 
 #define MLX5_ESWITCH_MANAGER(mdev) MLX5_CAP_GEN(mdev, eswitch_manager)
@@ -210,4 +211,11 @@ static inline bool is_mdev_switchdev_mode(struct mlx5_core_dev *dev)
 	return mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS;
 }
 
+/* The returned number is valid only when the dev is eswitch manager. */
+static inline u16 mlx5_eswitch_manager_vport(struct mlx5_core_dev *dev)
+{
+	return mlx5_core_is_ecpf_esw_manager(dev) ?
+		MLX5_VPORT_ECPF : MLX5_VPORT_PF;
+}
+
 #endif
-- 
cgit v1.2.3


From 98fb9b9680c9f3895ced02d6a73e27f5d7b5892b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 6 Dec 2023 22:37:57 +0100
Subject: wifi: ieee80211: don't require protected vendor action frames

For vendor action frames, whether a protected one should be
used or not is clearly up to the individual vendor and frame,
so even though a protected dual is defined, it may not get
used. Thus, don't require protection for vendor action frames
when they're used in a connection.

Since we obviously don't process frames unknown to the kernel
in the kernel, it may makes sense to invert this list to have
all the ones the kernel processes and knows to be requiring
protection, but that'd be a different change.

Fixes: 91535613b609 ("wifi: mac80211: don't drop all unprotected public action frames")
Reported-by: Jouni Malinen <j@w1.fi>
Link: https://msgid.link/20231206223801.f6a2cf4e67ec.Ifa6acc774bd67801d3dafb405278f297683187aa@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index c2ac9e9e7ee9..2b5e500bf093 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4447,7 +4447,8 @@ ieee80211_is_protected_dual_of_public_action(struct sk_buff *skb)
 		action != WLAN_PUB_ACTION_LOC_TRACK_NOTI &&
 		action != WLAN_PUB_ACTION_FTM_REQUEST &&
 		action != WLAN_PUB_ACTION_FTM_RESPONSE &&
-		action != WLAN_PUB_ACTION_FILS_DISCOVERY;
+		action != WLAN_PUB_ACTION_FILS_DISCOVERY &&
+		action != WLAN_PUB_ACTION_VENDOR_SPECIFIC;
 }
 
 /**
-- 
cgit v1.2.3


From 8f23f5dba6b4693448144bde4dd6f537543442c2 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Fri, 27 Oct 2023 08:05:20 +0800
Subject: iommu: Change kconfig around IOMMU_SVA

Linus suggested that the kconfig here is confusing:

https://lore.kernel.org/all/CAHk-=wgUiAtiszwseM1p2fCJ+sC4XWQ+YN4TanFhUgvUqjr9Xw@mail.gmail.com/

Let's break it into three kconfigs controlling distinct things:

 - CONFIG_IOMMU_MM_DATA controls if the mm_struct has the additional
   fields for the IOMMU. Currently only PASID, but later patches store
   a struct iommu_mm_data *

 - CONFIG_ARCH_HAS_CPU_PASID controls if the arch needs the scheduling bit
   for keeping track of the ENQCMD instruction. x86 will select this if
   IOMMU_SVA is enabled

 - IOMMU_SVA controls if the IOMMU core compiles in the SVA support code
   for iommu driver use and the IOMMU exported API

This way ARM will not enable CONFIG_ARCH_HAS_CPU_PASID

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231027000525.1278806-2-tina.zhang@intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h    | 2 +-
 include/linux/mm_types.h | 2 +-
 include/linux/sched.h    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c7394b39599c..cd3f398095bf 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1337,7 +1337,7 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream
 	return false;
 }
 
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_IOMMU_MM_DATA
 static inline void mm_pasid_init(struct mm_struct *mm)
 {
 	mm->pasid = IOMMU_PASID_INVALID;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 957ce38768b2..41f248608dd9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -938,7 +938,7 @@ struct mm_struct {
 #endif
 		struct work_struct async_put_work;
 
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_IOMMU_MM_DATA
 		u32 pasid;
 #endif
 #ifdef CONFIG_KSM
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..70888a36677b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -954,7 +954,7 @@ struct task_struct {
 	/* Recursion prevention for eventfd_signal() */
 	unsigned			in_eventfd:1;
 #endif
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
 	unsigned			pasid_activated:1;
 #endif
 #ifdef	CONFIG_CPU_SUP_INTEL
-- 
cgit v1.2.3


From 2396046d75d3c0b2cfead852a77efd023f8539dc Mon Sep 17 00:00:00 2001
From: Tina Zhang <tina.zhang@intel.com>
Date: Fri, 27 Oct 2023 08:05:22 +0800
Subject: iommu: Add mm_get_enqcmd_pasid() helper function

mm_get_enqcmd_pasid() should be used by architecture code and closely
related to learn the PASID value that the x86 ENQCMD operation should
use for the mm.

For the moment SMMUv3 uses this without any connection to ENQCMD, it
will be cleaned up similar to how the prior patch made VT-d use the
PASID argument of set_dev_pasid().

The motivation is to replace mm->pasid with an iommu private data
structure that is introduced in a later patch.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Tina Zhang <tina.zhang@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231027000525.1278806-4-tina.zhang@intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index cd3f398095bf..4fb239c6ca8d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1346,6 +1346,12 @@ static inline bool mm_valid_pasid(struct mm_struct *mm)
 {
 	return mm->pasid != IOMMU_PASID_INVALID;
 }
+
+static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
+{
+	return mm->pasid;
+}
+
 void mm_pasid_drop(struct mm_struct *mm);
 struct iommu_sva *iommu_sva_bind_device(struct device *dev,
 					struct mm_struct *mm);
@@ -1368,6 +1374,12 @@ static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle)
 }
 static inline void mm_pasid_init(struct mm_struct *mm) {}
 static inline bool mm_valid_pasid(struct mm_struct *mm) { return false; }
+
+static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
+{
+	return IOMMU_PASID_INVALID;
+}
+
 static inline void mm_pasid_drop(struct mm_struct *mm) {}
 #endif /* CONFIG_IOMMU_SVA */
 
-- 
cgit v1.2.3


From 541a3e257d48c16b77d19f39ed939ef5832046df Mon Sep 17 00:00:00 2001
From: Tina Zhang <tina.zhang@intel.com>
Date: Fri, 27 Oct 2023 08:05:23 +0800
Subject: mm: Add structure to keep sva information

Introduce iommu_mm_data structure to keep sva information (pasid and the
related sva domains). Add iommu_mm pointer, pointing to an instance of
iommu_mm_data structure, to mm.

Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Tina Zhang <tina.zhang@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231027000525.1278806-5-tina.zhang@intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h    | 5 +++++
 include/linux/mm_types.h | 2 ++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 4fb239c6ca8d..f7b1b469e98d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -812,6 +812,11 @@ struct iommu_sva {
 	struct iommu_domain		*domain;
 };
 
+struct iommu_mm_data {
+	u32			pasid;
+	struct list_head	sva_domains;
+};
+
 int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
 		      const struct iommu_ops *ops);
 void iommu_fwspec_free(struct device *dev);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 41f248608dd9..0b4314fab478 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -727,6 +727,7 @@ struct mm_cid {
 #endif
 
 struct kioctx_table;
+struct iommu_mm_data;
 struct mm_struct {
 	struct {
 		/*
@@ -940,6 +941,7 @@ struct mm_struct {
 
 #ifdef CONFIG_IOMMU_MM_DATA
 		u32 pasid;
+		struct iommu_mm_data *iommu_mm;
 #endif
 #ifdef CONFIG_KSM
 		/*
-- 
cgit v1.2.3


From 092edaddb660376648acb97678570ed5d8299768 Mon Sep 17 00:00:00 2001
From: Tina Zhang <tina.zhang@intel.com>
Date: Fri, 27 Oct 2023 08:05:24 +0800
Subject: iommu: Support mm PASID 1:n with sva domains

Each mm bound to devices gets a PASID and corresponding sva domains
allocated in iommu_sva_bind_device(), which are referenced by iommu_mm
field of the mm. The PASID is released in __mmdrop(), while a sva domain
is released when no one is using it (the reference count is decremented
in iommu_sva_unbind_device()). However, although sva domains and their
PASID are separate objects such that their own life cycles could be
handled independently, an enqcmd use case may require releasing the
PASID in releasing the mm (i.e., once a PASID is allocated for a mm, it
will be permanently used by the mm and won't be released until the end
of mm) and only allows to drop the PASID after the sva domains are
released. To this end, mmgrab() is called in iommu_sva_domain_alloc() to
increment the mm reference count and mmdrop() is invoked in
iommu_domain_free() to decrement the mm reference count.

Since the required info of PASID and sva domains is kept in struct
iommu_mm_data of a mm, use mm->iommu_mm field instead of the old pasid
field in mm struct. The sva domain list is protected by iommu_sva_lock.

Besides, this patch removes mm_pasid_init(), as with the introduced
iommu_mm structure, initializing mm pasid in mm_init() is unnecessary.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Tina Zhang <tina.zhang@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231027000525.1278806-6-tina.zhang@intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index f7b1b469e98d..c6bbbe0901d0 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -121,6 +121,11 @@ struct iommu_domain {
 		struct {	/* IOMMU_DOMAIN_SVA */
 			struct mm_struct *mm;
 			int users;
+			/*
+			 * Next iommu_domain in mm->iommu_mm->sva-domains list
+			 * protected by iommu_sva_lock.
+			 */
+			struct list_head next;
 		};
 	};
 };
@@ -1345,16 +1350,28 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream
 #ifdef CONFIG_IOMMU_MM_DATA
 static inline void mm_pasid_init(struct mm_struct *mm)
 {
-	mm->pasid = IOMMU_PASID_INVALID;
+	/*
+	 * During dup_mm(), a new mm will be memcpy'd from an old one and that makes
+	 * the new mm and the old one point to a same iommu_mm instance. When either
+	 * one of the two mms gets released, the iommu_mm instance is freed, leaving
+	 * the other mm running into a use-after-free/double-free problem. To avoid
+	 * the problem, zeroing the iommu_mm pointer of a new mm is needed here.
+	 */
+	mm->iommu_mm = NULL;
 }
+
 static inline bool mm_valid_pasid(struct mm_struct *mm)
 {
-	return mm->pasid != IOMMU_PASID_INVALID;
+	return READ_ONCE(mm->iommu_mm);
 }
 
 static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
 {
-	return mm->pasid;
+	struct iommu_mm_data *iommu_mm = READ_ONCE(mm->iommu_mm);
+
+	if (!iommu_mm)
+		return IOMMU_PASID_INVALID;
+	return iommu_mm->pasid;
 }
 
 void mm_pasid_drop(struct mm_struct *mm);
-- 
cgit v1.2.3


From 1fa05c932dc71c474da38e4fd0456131128f8486 Mon Sep 17 00:00:00 2001
From: Tina Zhang <tina.zhang@intel.com>
Date: Fri, 27 Oct 2023 08:05:25 +0800
Subject: mm: Deprecate pasid field

Drop the pasid field, as all the information needed for sva domain
management has been moved to the newly added iommu_mm field.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Tina Zhang <tina.zhang@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231027000525.1278806-7-tina.zhang@intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/mm_types.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0b4314fab478..ec71c91e210b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -940,7 +940,6 @@ struct mm_struct {
 		struct work_struct async_put_work;
 
 #ifdef CONFIG_IOMMU_MM_DATA
-		u32 pasid;
 		struct iommu_mm_data *iommu_mm;
 #endif
 #ifdef CONFIG_KSM
-- 
cgit v1.2.3


From 4720287c7bf76e59d19d4dfbdc3f54eeea6fd46b Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 7 Dec 2023 14:03:08 -0400
Subject: iommu: Remove struct iommu_ops *iommu from arch_setup_dma_ops()

This is not being used to pass ops, it is just a way to tell if an
iommu driver was probed. These days this can be detected directly via
device_iommu_mapped(). Call device_iommu_mapped() in the two places that
need to check it and remove the iommu parameter everywhere.

Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Moritz Fischer <mdf@kernel.org>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Rob Herring <robh@kernel.org>
Tested-by: Hector Martin <marcan@marcan.st>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/1-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/dma-map-ops.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index a52e508d1869..e9cc317e9d7d 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -427,10 +427,10 @@ bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg,
 
 #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-		const struct iommu_ops *iommu, bool coherent);
+		bool coherent);
 #else
 static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base,
-		u64 size, const struct iommu_ops *iommu, bool coherent)
+		u64 size, bool coherent)
 {
 }
 #endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */
-- 
cgit v1.2.3


From 6ff6e184f1f4d4993d45ca3f934c8288890965fe Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 7 Dec 2023 14:03:09 -0400
Subject: iommmu/of: Do not return struct iommu_ops from of_iommu_configure()

Nothing needs this pointer. Return a normal error code with the usual
IOMMU semantic that ENODEV means 'there is no IOMMU driver'.

Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Acked-by: Rob Herring <robh@kernel.org>
Tested-by: Hector Martin <marcan@marcan.st>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/2-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/of_iommu.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h
index 9a5e6b410dd2..e61cbbe12dac 100644
--- a/include/linux/of_iommu.h
+++ b/include/linux/of_iommu.h
@@ -8,20 +8,19 @@ struct iommu_ops;
 
 #ifdef CONFIG_OF_IOMMU
 
-extern const struct iommu_ops *of_iommu_configure(struct device *dev,
-					struct device_node *master_np,
-					const u32 *id);
+extern int of_iommu_configure(struct device *dev, struct device_node *master_np,
+			      const u32 *id);
 
 extern void of_iommu_get_resv_regions(struct device *dev,
 				      struct list_head *list);
 
 #else
 
-static inline const struct iommu_ops *of_iommu_configure(struct device *dev,
-					 struct device_node *master_np,
-					 const u32 *id)
+static inline int of_iommu_configure(struct device *dev,
+				     struct device_node *master_np,
+				     const u32 *id)
 {
-	return NULL;
+	return -ENODEV;
 }
 
 static inline void of_iommu_get_resv_regions(struct device *dev,
-- 
cgit v1.2.3


From eda1a94caf6b05482bbf57dc244e7a31a9dba77c Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 7 Dec 2023 14:03:12 -0400
Subject: iommu: Mark dev_iommu_priv_set() with a lockdep

A perfect driver would only call dev_iommu_priv_set() from its probe
callback. We've made it functionally correct to call it from the of_xlate
by adding a lock around that call.

lockdep assert that iommu_probe_device_lock is held to discourage misuse.

Exclude PPC kernels with CONFIG_FSL_PAMU turned on because FSL_PAMU uses a
global static for its priv and abuses priv for its domain.

Remove the pointless stores of NULL, all these are on paths where the core
code will free dev->iommu after the op returns.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Tested-by: Hector Martin <marcan@marcan.st>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/5-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c6bbbe0901d0..3a556996fea7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -850,10 +850,7 @@ static inline void *dev_iommu_priv_get(struct device *dev)
 		return NULL;
 }
 
-static inline void dev_iommu_priv_set(struct device *dev, void *priv)
-{
-	dev->iommu->priv = priv;
-}
+void dev_iommu_priv_set(struct device *dev, void *priv);
 
 extern struct mutex iommu_probe_device_lock;
 int iommu_probe_device(struct device *dev);
-- 
cgit v1.2.3


From 62e1f212e5fe7624249212813ee96202e0c31430 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 11 Dec 2023 16:13:14 +0000
Subject: arm: perf/kvm: Use GENMASK for ARMV8_PMU_PMCR_N

This is so that FIELD_GET and FIELD_PREP can be used and that the fields
are in a consistent format to arm64/tools/sysreg

Signed-off-by: James Clark <james.clark@arm.com>
Link: https://lore.kernel.org/r/20231211161331.1277825-3-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmuv3.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 9c226adf938a..ed62bd75cec7 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -215,8 +215,7 @@
 #define ARMV8_PMU_PMCR_DP	(1 << 5) /* Disable CCNT if non-invasive debug*/
 #define ARMV8_PMU_PMCR_LC	(1 << 6) /* Overflow on 64 bit cycle counter */
 #define ARMV8_PMU_PMCR_LP	(1 << 7) /* Long event counter enable */
-#define ARMV8_PMU_PMCR_N_SHIFT	11  /* Number of counters supported */
-#define ARMV8_PMU_PMCR_N_MASK	0x1f
+#define ARMV8_PMU_PMCR_N	GENMASK(15, 11) /* Number of counters supported */
 #define ARMV8_PMU_PMCR_MASK	0xff    /* Mask for writable bits */
 
 /*
-- 
cgit v1.2.3


From 2f6a00f30600417ee2737f2b1229c75663f1e3c9 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 11 Dec 2023 16:13:15 +0000
Subject: arm: perf: Use GENMASK for PMMIR fields

This is so that FIELD_GET and FIELD_PREP can be used and that the fields
are in a consistent format to arm64/tools/sysreg

Signed-off-by: James Clark <james.clark@arm.com>
Link: https://lore.kernel.org/r/20231211161331.1277825-4-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmuv3.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index ed62bd75cec7..1bc7678c10d4 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -250,12 +250,9 @@
 #define ARMV8_PMU_USERENR_ER	(1 << 3) /* Event counter can be read at EL0 */
 
 /* PMMIR_EL1.SLOTS mask */
-#define ARMV8_PMU_SLOTS_MASK	0xff
-
-#define ARMV8_PMU_BUS_SLOTS_SHIFT 8
-#define ARMV8_PMU_BUS_SLOTS_MASK 0xff
-#define ARMV8_PMU_BUS_WIDTH_SHIFT 16
-#define ARMV8_PMU_BUS_WIDTH_MASK 0xf
+#define ARMV8_PMU_SLOTS		GENMASK(7, 0)
+#define ARMV8_PMU_BUS_SLOTS	GENMASK(15, 8)
+#define ARMV8_PMU_BUS_WIDTH	GENMASK(19, 16)
 
 /*
  * This code is really good
-- 
cgit v1.2.3


From d30f09b6d7de5d159dbb537f9d67dceb67409420 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 11 Dec 2023 16:13:16 +0000
Subject: arm: perf: Convert remaining fields to use GENMASK

Convert the remaining fields to use either GENMASK or be built from
other fields. These all already started at bit 0 so don't need a code
change for the lack of _SHIFT.

Signed-off-by: James Clark <james.clark@arm.com>
Link: https://lore.kernel.org/r/20231211161331.1277825-5-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmuv3.h | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 1bc7678c10d4..daa63542242d 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -216,19 +216,25 @@
 #define ARMV8_PMU_PMCR_LC	(1 << 6) /* Overflow on 64 bit cycle counter */
 #define ARMV8_PMU_PMCR_LP	(1 << 7) /* Long event counter enable */
 #define ARMV8_PMU_PMCR_N	GENMASK(15, 11) /* Number of counters supported */
-#define ARMV8_PMU_PMCR_MASK	0xff    /* Mask for writable bits */
+/* Mask for writable bits */
+#define ARMV8_PMU_PMCR_MASK	(ARMV8_PMU_PMCR_E | ARMV8_PMU_PMCR_P | \
+				 ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_D | \
+				 ARMV8_PMU_PMCR_X | ARMV8_PMU_PMCR_DP | \
+				 ARMV8_PMU_PMCR_LC | ARMV8_PMU_PMCR_LP)
 
 /*
  * PMOVSR: counters overflow flag status reg
  */
-#define ARMV8_PMU_OVSR_MASK		0xffffffff	/* Mask for writable bits */
-#define ARMV8_PMU_OVERFLOWED_MASK	ARMV8_PMU_OVSR_MASK
+#define ARMV8_PMU_OVSR_P		GENMASK(30, 0)
+#define ARMV8_PMU_OVSR_C		BIT(31)
+/* Mask for writable bits is both P and C fields */
+#define ARMV8_PMU_OVERFLOWED_MASK	(ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C)
 
 /*
  * PMXEVTYPER: Event selection reg
  */
 #define ARMV8_PMU_EVTYPE_MASK	0xc800ffff	/* Mask for writable bits */
-#define ARMV8_PMU_EVTYPE_EVENT	0xffff		/* Mask for EVENT bits */
+#define ARMV8_PMU_EVTYPE_EVENT	GENMASK(15, 0)	/* Mask for EVENT bits */
 
 /*
  * Event filters for PMUv3
@@ -243,11 +249,13 @@
 /*
  * PMUSERENR: user enable reg
  */
-#define ARMV8_PMU_USERENR_MASK	0xf		/* Mask for writable bits */
 #define ARMV8_PMU_USERENR_EN	(1 << 0) /* PMU regs can be accessed at EL0 */
 #define ARMV8_PMU_USERENR_SW	(1 << 1) /* PMSWINC can be written at EL0 */
 #define ARMV8_PMU_USERENR_CR	(1 << 2) /* Cycle counter can be read at EL0 */
 #define ARMV8_PMU_USERENR_ER	(1 << 3) /* Event counter can be read at EL0 */
+/* Mask for writable bits */
+#define ARMV8_PMU_USERENR_MASK	(ARMV8_PMU_USERENR_EN | ARMV8_PMU_USERENR_SW | \
+				 ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_ER)
 
 /* PMMIR_EL1.SLOTS mask */
 #define ARMV8_PMU_SLOTS		GENMASK(7, 0)
-- 
cgit v1.2.3


From 3115ee021bfb04efde2e96507bfcc1330261a6a1 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 11 Dec 2023 16:13:17 +0000
Subject: arm64: perf: Include threshold control fields in PMEVTYPER mask

FEAT_PMUv3_TH (Armv8.8) adds two new fields to PMEVTYPER, so include
them in the mask. These aren't writable on 32 bit kernels as they are in
the high part of the register, so only include them for arm64.

It would be difficult to do this statically in the asm header files for
each platform without resulting in circular includes or #ifdefs inline
in the code. For that reason the ARMV8_PMU_EVTYPE_MASK definition has
been removed and the mask is constructed programmatically.

Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: James Clark <james.clark@arm.com>
Link: https://lore.kernel.org/r/20231211161331.1277825-6-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmuv3.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index daa63542242d..91957b3468e9 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -233,8 +233,9 @@
 /*
  * PMXEVTYPER: Event selection reg
  */
-#define ARMV8_PMU_EVTYPE_MASK	0xc800ffff	/* Mask for writable bits */
 #define ARMV8_PMU_EVTYPE_EVENT	GENMASK(15, 0)	/* Mask for EVENT bits */
+#define ARMV8_PMU_EVTYPE_TH	GENMASK(43, 32)
+#define ARMV8_PMU_EVTYPE_TC	GENMASK(63, 61)
 
 /*
  * Event filters for PMUv3
-- 
cgit v1.2.3


From f6da86969a3c284466ab6080764b2ed91689f262 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 11 Dec 2023 16:13:18 +0000
Subject: arm: pmu: Share user ABI format mechanism with SPE

This mechanism makes it much easier to define and read new attributes
so move it to the arm_pmu.h header so that it can be shared. At the same
time update the existing format attributes to use it.

GENMASK has to be changed to GENMASK_ULL because the config fields are
64 bits even on arm32 where this will also be used now.

Signed-off-by: James Clark <james.clark@arm.com>
Link: https://lore.kernel.org/r/20231211161331.1277825-7-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmu.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index e2503d48ddee..b3b34f6670cf 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -183,4 +183,26 @@ void armpmu_free_irq(int irq, int cpu);
 #define ARMV8_SPE_PDEV_NAME "arm,spe-v1"
 #define ARMV8_TRBE_PDEV_NAME "arm,trbe"
 
+/* Why does everything I do descend into this? */
+#define __GEN_PMU_FORMAT_ATTR(cfg, lo, hi)				\
+	(lo) == (hi) ? #cfg ":" #lo "\n" : #cfg ":" #lo "-" #hi
+
+#define _GEN_PMU_FORMAT_ATTR(cfg, lo, hi)				\
+	__GEN_PMU_FORMAT_ATTR(cfg, lo, hi)
+
+#define GEN_PMU_FORMAT_ATTR(name)					\
+	PMU_FORMAT_ATTR(name,						\
+	_GEN_PMU_FORMAT_ATTR(ATTR_CFG_FLD_##name##_CFG,			\
+			     ATTR_CFG_FLD_##name##_LO,			\
+			     ATTR_CFG_FLD_##name##_HI))
+
+#define _ATTR_CFG_GET_FLD(attr, cfg, lo, hi)				\
+	((((attr)->cfg) >> lo) & GENMASK_ULL(hi - lo, 0))
+
+#define ATTR_CFG_GET_FLD(attr, name)					\
+	_ATTR_CFG_GET_FLD(attr,						\
+			  ATTR_CFG_FLD_##name##_CFG,			\
+			  ATTR_CFG_FLD_##name##_LO,			\
+			  ATTR_CFG_FLD_##name##_HI)
+
 #endif /* __ARM_PMU_H__ */
-- 
cgit v1.2.3


From 816c26754447e8b28d6c604e1f5b1d205b2586ee Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 11 Dec 2023 16:13:22 +0000
Subject: arm64: perf: Add support for event counting threshold

FEAT_PMUv3_TH (Armv8.8) permits a PMU counter to increment only on
events whose count meets a specified threshold condition. For example if
PMEVTYPERn.TC (Threshold Control) is set to 0b101 (Greater than or
equal, count), and the threshold is set to 2, then the PMU counter will
now only increment by 1 when an event would have previously incremented
the PMU counter by 2 or more on a single processor cycle.

Three new Perf event config fields, 'threshold', 'threshold_compare' and
'threshold_count' have been added to control the feature.
threshold_compare maps to the upper two bits of PMEVTYPERn.TC and
threshold_count maps to the first bit of TC. These separate attributes
have been picked rather than enumerating all the possible combinations
of the TC field as in the Arm ARM. The attributes would be used on a
Perf command line like this:

  $ perf stat -e stall_slot/threshold=2,threshold_compare=2/

A new capability for reading out the maximum supported threshold value
has also been added:

  $ cat /sys/bus/event_source/devices/armv8_pmuv3/caps/threshold_max

  0x000000ff

If a threshold higher than threshold_max is provided, then an error is
generated. If FEAT_PMUv3_TH isn't implemented or a 32 bit kernel is
running, then threshold_max reads zero, and attempting to set a
threshold value will also result in an error.

The threshold is per PMU counter, and there are potentially different
threshold_max values per PMU type on heterogeneous systems.

Bits higher than 32 now need to be written into PMEVTYPER, so
armv8pmu_write_evtype() has to be updated to take an unsigned long value
rather than u32 which gives the correct behavior on both aarch32 and 64.

Signed-off-by: James Clark <james.clark@arm.com>
Link: https://lore.kernel.org/r/20231211161331.1277825-11-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmuv3.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 91957b3468e9..0f4d62ef3a9a 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -262,6 +262,7 @@
 #define ARMV8_PMU_SLOTS		GENMASK(7, 0)
 #define ARMV8_PMU_BUS_SLOTS	GENMASK(15, 8)
 #define ARMV8_PMU_BUS_WIDTH	GENMASK(19, 16)
+#define ARMV8_PMU_THWIDTH	GENMASK(23, 20)
 
 /*
  * This code is really good
-- 
cgit v1.2.3


From 253ca8678d30bcf94410b54476fc1e0f1627a137 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 26 Nov 2023 12:24:38 -0800
Subject: Improve __fget_files_rcu() code generation (and thus __fget_light())

Commit 0ede61d8589c ("file: convert to SLAB_TYPESAFE_BY_RCU") caused a
performance regression as reported by the kernel test robot.

The __fget_light() function is one of those critical ones for some
loads, and the code generation was unnecessarily impacted.  Let's just
write that function to better.

Reported-by: kernel test robot <oliver.sang@intel.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Closes: https://lore.kernel.org/oe-lkp/202311201406.2022ca3f-oliver.sang@intel.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/CAHk-=wiCJtLbFWNURB34b9a_R_unaH3CiMRXfkR0-iihB_z68A@mail.gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fdtable.h | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index bc4c3287a65e..80bd7789bab1 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -83,12 +83,17 @@ struct dentry;
 static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
 {
 	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
-
-	if (fd < fdt->max_fds) {
-		fd = array_index_nospec(fd, fdt->max_fds);
-		return rcu_dereference_raw(fdt->fd[fd]);
-	}
-	return NULL;
+	unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds);
+	struct file *needs_masking;
+
+	/*
+	 * 'mask' is zero for an out-of-bounds fd, all ones for ok.
+	 * 'fd&mask' is 'fd' for ok, or 0 for out of bounds.
+	 *
+	 * Accessing fdt->fd[0] is ok, but needs masking of the result.
+	 */
+	needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]);
+	return (struct file *)(mask & (unsigned long)needs_masking);
 }
 
 static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd)
-- 
cgit v1.2.3


From a88c955fcfb49727d0ed86b47410f6555a8e69e4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 30 Nov 2023 13:49:07 +0100
Subject: file: s/close_fd_get_file()/file_close_fd()/g

That really shouldn't have "get" in there as that implies we're bumping
the reference count which we don't do at all. We used to but not anmore.
Now we're just closing the fd and pick that file from the fdtable
without bumping the reference count. Update the wrong documentation
while at it.

Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-1-e73ca6f4ea83@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fdtable.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 80bd7789bab1..78c8326d74ae 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -119,7 +119,7 @@ int iterate_fd(struct files_struct *, unsigned,
 
 extern int close_fd(unsigned int fd);
 extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
-extern struct file *close_fd_get_file(unsigned int fd);
+extern struct file *file_close_fd(unsigned int fd);
 extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
 		      struct files_struct **new_fdp);
 
-- 
cgit v1.2.3


From 372a34e66fb7f95124fadae9c600b231c35696a7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 30 Nov 2023 13:49:09 +0100
Subject: fs: replace f_rcuhead with f_task_work

The naming is actively misleading since we switched to
SLAB_TYPESAFE_BY_RCU. rcu_head is #define callback_head. Use
callback_head directly and rename f_rcuhead to f_task_work.

Add comments in there to explain what it's used for.

Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-3-e73ca6f4ea83@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..354fd02e0e11 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -991,8 +991,10 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
  */
 struct file {
 	union {
+		/* fput() uses task work when closing and freeing file (default). */
+		struct callback_head 	f_task_work;
+		/* fput() must use workqueue (most kernel threads). */
 		struct llist_node	f_llist;
-		struct rcu_head 	f_rcuhead;
 		unsigned int 		f_iocb_flags;
 	};
 
-- 
cgit v1.2.3


From eac9189c96196574a83a553ca5a7543dd9f5fe3e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 30 Nov 2023 13:49:10 +0100
Subject: file: stop exposing receive_fd_user()

Not every subsystem needs to have their own specialized helper.
Just us the __receive_fd() helper.

Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-4-e73ca6f4ea83@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/file.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/file.h b/include/linux/file.h
index 6e9099d29343..c0d5219c2852 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -101,13 +101,6 @@ extern int __receive_fd(struct file *file, int __user *ufd,
 
 extern int receive_fd(struct file *file, unsigned int o_flags);
 
-static inline int receive_fd_user(struct file *file, int __user *ufd,
-				  unsigned int o_flags)
-{
-	if (ufd == NULL)
-		return -EFAULT;
-	return __receive_fd(file, ufd, o_flags);
-}
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);
 
 extern void flush_delayed_fput(void);
-- 
cgit v1.2.3


From 4e94ddfe2aab72139acb8d5372fac9e6c3f3e383 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 30 Nov 2023 13:49:11 +0100
Subject: file: remove __receive_fd()

Honestly, there's little value in having a helper with and without that
int __user *ufd argument. It's just messy and doesn't really give us
anything. Just expose receive_fd() with that argument and get rid of
that helper.

Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-5-e73ca6f4ea83@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/file.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/file.h b/include/linux/file.h
index c0d5219c2852..6834a29338c4 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -96,10 +96,7 @@ DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
 
 extern void fd_install(unsigned int fd, struct file *file);
 
-extern int __receive_fd(struct file *file, int __user *ufd,
-			unsigned int o_flags);
-
-extern int receive_fd(struct file *file, unsigned int o_flags);
+int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags);
 
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);
 
-- 
cgit v1.2.3


From b66509b8497f2b002a2654e386a440f1274ddcc7 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 1 Dec 2023 00:57:35 +0000
Subject: io_uring: split out cmd api into a separate header

linux/io_uring.h is slowly becoming a rubbish bin where we put
anything exposed to other subsystems. For instance, the task exit
hooks and io_uring cmd infra are completely orthogonal and don't need
each other's definitions. Start cleaning it up by splitting out all
command bits into a new header file.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/7ec50bae6e21f371d3850796e716917fc141225a.1701391955.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h       | 89 +-----------------------------------------
 include/linux/io_uring/cmd.h   | 81 ++++++++++++++++++++++++++++++++++++++
 include/linux/io_uring_types.h | 20 ++++++++++
 3 files changed, 102 insertions(+), 88 deletions(-)
 create mode 100644 include/linux/io_uring/cmd.h

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index aefb73eeeebf..d8fc93492dc5 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -6,71 +6,13 @@
 #include <linux/xarray.h>
 #include <uapi/linux/io_uring.h>
 
-enum io_uring_cmd_flags {
-	IO_URING_F_COMPLETE_DEFER	= 1,
-	IO_URING_F_UNLOCKED		= 2,
-	/* the request is executed from poll, it should not be freed */
-	IO_URING_F_MULTISHOT		= 4,
-	/* executed by io-wq */
-	IO_URING_F_IOWQ			= 8,
-	/* int's last bit, sign checks are usually faster than a bit test */
-	IO_URING_F_NONBLOCK		= INT_MIN,
-
-	/* ctx state flags, for URING_CMD */
-	IO_URING_F_SQE128		= (1 << 8),
-	IO_URING_F_CQE32		= (1 << 9),
-	IO_URING_F_IOPOLL		= (1 << 10),
-
-	/* set when uring wants to cancel a previously issued command */
-	IO_URING_F_CANCEL		= (1 << 11),
-	IO_URING_F_COMPAT		= (1 << 12),
-};
-
-/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
-#define IORING_URING_CMD_CANCELABLE	(1U << 30)
-#define IORING_URING_CMD_POLLED		(1U << 31)
-
-struct io_uring_cmd {
-	struct file	*file;
-	const struct io_uring_sqe *sqe;
-	union {
-		/* callback to defer completions to task context */
-		void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
-		/* used for polled completion */
-		void *cookie;
-	};
-	u32		cmd_op;
-	u32		flags;
-	u8		pdu[32]; /* available inline for free use */
-};
-
-static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
-{
-	return sqe->cmd;
-}
-
 #if defined(CONFIG_IO_URING)
-int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
-			      struct iov_iter *iter, void *ioucmd);
-void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
-			unsigned issue_flags);
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
 void io_uring_unreg_ringfd(void);
 const char *io_uring_get_opcode(u8 opcode);
-void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
-			    void (*task_work_cb)(struct io_uring_cmd *, unsigned),
-			    unsigned flags);
-/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */
-void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned));
-
-static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
-}
+int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
 
 static inline void io_uring_files_cancel(void)
 {
@@ -89,28 +31,7 @@ static inline void io_uring_free(struct task_struct *tsk)
 	if (tsk->io_uring)
 		__io_uring_free(tsk);
 }
-int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
-void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
-		unsigned int issue_flags);
-struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
 #else
-static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
-			      struct iov_iter *iter, void *ioucmd)
-{
-	return -EOPNOTSUPP;
-}
-static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
-		ssize_t ret2, unsigned issue_flags)
-{
-}
-static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-}
-static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-}
 static inline struct sock *io_uring_get_socket(struct file *file)
 {
 	return NULL;
@@ -133,14 +54,6 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
 {
 	return -EOPNOTSUPP;
 }
-static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
-		unsigned int issue_flags)
-{
-}
-static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
-{
-	return NULL;
-}
 #endif
 
 #endif
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
new file mode 100644
index 000000000000..62fcfaf6fcc9
--- /dev/null
+++ b/include/linux/io_uring/cmd.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_IO_URING_CMD_H
+#define _LINUX_IO_URING_CMD_H
+
+#include <uapi/linux/io_uring.h>
+#include <linux/io_uring_types.h>
+
+/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
+#define IORING_URING_CMD_CANCELABLE	(1U << 30)
+#define IORING_URING_CMD_POLLED		(1U << 31)
+
+struct io_uring_cmd {
+	struct file	*file;
+	const struct io_uring_sqe *sqe;
+	union {
+		/* callback to defer completions to task context */
+		void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
+		/* used for polled completion */
+		void *cookie;
+	};
+	u32		cmd_op;
+	u32		flags;
+	u8		pdu[32]; /* available inline for free use */
+};
+
+static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
+{
+	return sqe->cmd;
+}
+
+#if defined(CONFIG_IO_URING)
+int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
+			      struct iov_iter *iter, void *ioucmd);
+void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
+			unsigned issue_flags);
+void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
+			    void (*task_work_cb)(struct io_uring_cmd *, unsigned),
+			    unsigned flags);
+/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */
+void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
+			void (*task_work_cb)(struct io_uring_cmd *, unsigned));
+
+static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
+}
+
+void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags);
+struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
+
+#else
+static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
+			      struct iov_iter *iter, void *ioucmd)
+{
+	return -EOPNOTSUPP;
+}
+static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
+		ssize_t ret2, unsigned issue_flags)
+{
+}
+static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+}
+static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
+			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+}
+static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+}
+static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+	return NULL;
+}
+#endif
+
+#endif /* _LINUX_IO_URING_CMD_H */
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 805bb635cdf5..8c807bcc8b2b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -7,6 +7,26 @@
 #include <linux/llist.h>
 #include <uapi/linux/io_uring.h>
 
+enum io_uring_cmd_flags {
+	IO_URING_F_COMPLETE_DEFER	= 1,
+	IO_URING_F_UNLOCKED		= 2,
+	/* the request is executed from poll, it should not be freed */
+	IO_URING_F_MULTISHOT		= 4,
+	/* executed by io-wq */
+	IO_URING_F_IOWQ			= 8,
+	/* int's last bit, sign checks are usually faster than a bit test */
+	IO_URING_F_NONBLOCK		= INT_MIN,
+
+	/* ctx state flags, for URING_CMD */
+	IO_URING_F_SQE128		= (1 << 8),
+	IO_URING_F_CQE32		= (1 << 9),
+	IO_URING_F_IOPOLL		= (1 << 10),
+
+	/* set when uring wants to cancel a previously issued command */
+	IO_URING_F_CANCEL		= (1 << 11),
+	IO_URING_F_COMPAT		= (1 << 12),
+};
+
 struct io_wq_work_node {
 	struct io_wq_work_node *next;
 };
-- 
cgit v1.2.3


From 6b04a3737057ddfed396c954f9e4be4fe6d53c62 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 1 Dec 2023 00:57:36 +0000
Subject: io_uring/cmd: inline io_uring_cmd_do_in_task_lazy

Now as we can easily include io_uring_types.h, move IOU_F_TWQ_LAZY_WAKE
and inline io_uring_cmd_do_in_task_lazy().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/2ec9fb31dd192d1c5cf26d0a2dec5657d88a8e48.1701391955.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring/cmd.h   | 31 ++++++++++++++++---------------
 include/linux/io_uring_types.h | 11 +++++++++++
 2 files changed, 27 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 62fcfaf6fcc9..ee9b3bc3a4af 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -36,15 +36,6 @@ void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
 			    void (*task_work_cb)(struct io_uring_cmd *, unsigned),
 			    unsigned flags);
-/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */
-void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned));
-
-static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
-}
 
 void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 		unsigned int issue_flags);
@@ -60,12 +51,9 @@ static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
 		ssize_t ret2, unsigned issue_flags)
 {
 }
-static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-}
-static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
+			    void (*task_work_cb)(struct io_uring_cmd *, unsigned),
+			    unsigned flags)
 {
 }
 static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
@@ -78,4 +66,17 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd
 }
 #endif
 
+/* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
+static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
+			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
+}
+
+static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
+}
+
 #endif /* _LINUX_IO_URING_CMD_H */
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 8c807bcc8b2b..bebab36abce8 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -7,6 +7,17 @@
 #include <linux/llist.h>
 #include <uapi/linux/io_uring.h>
 
+enum {
+	/*
+	 * A hint to not wake right away but delay until there are enough of
+	 * tw's queued to match the number of CQEs the task is waiting for.
+	 *
+	 * Must not be used wirh requests generating more than one CQE.
+	 * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
+	 */
+	IOU_F_TWQ_LAZY_WAKE			= 1,
+};
+
 enum io_uring_cmd_flags {
 	IO_URING_F_COMPLETE_DEFER	= 1,
 	IO_URING_F_UNLOCKED		= 2,
-- 
cgit v1.2.3


From 055c15626a45b1ebc9f2f34981e705e1af171236 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 1 Dec 2023 00:57:37 +0000
Subject: io_uring/cmd: inline io_uring_cmd_get_task

With io_uring_types.h we see all required definitions to inline
io_uring_cmd_get_task().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/aa8e317f09e651a5f3e72f8c0ad3902084c1f930.1701391955.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring/cmd.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index ee9b3bc3a4af..d69b4038aa3e 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -39,7 +39,6 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
 
 void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 		unsigned int issue_flags);
-struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
 
 #else
 static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -60,10 +59,6 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 		unsigned int issue_flags)
 {
 }
-static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
-{
-	return NULL;
-}
 #endif
 
 /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
@@ -79,4 +74,9 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
 	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
 }
 
+static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+	return cmd_to_io_kiocb(cmd)->task;
+}
+
 #endif /* _LINUX_IO_URING_CMD_H */
-- 
cgit v1.2.3


From 0f292086c22b43202daffc14b585d3b54b9a1206 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 12 Dec 2023 11:44:36 +0200
Subject: splice: return type ssize_t from all helpers

Not sure why some splice helpers return long, maybe historic reasons.
Change them all to return ssize_t to conform to the splice methods and
to the rest of the helpers.

Suggested-by: Christian Brauner <brauner@kernel.org>
Link: https://lore.kernel.org/r/20231208-horchen-helium-d3ec1535ede5@brauner/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231212094440.250945-2-amir73il@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/splice.h | 43 +++++++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/splice.h b/include/linux/splice.h
index 49532d5dda52..068a8e8ffd73 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -68,31 +68,30 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
 typedef int (splice_direct_actor)(struct pipe_inode_info *,
 				  struct splice_desc *);
 
-extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *,
-				loff_t *, size_t, unsigned int,
-				splice_actor *);
-extern ssize_t __splice_from_pipe(struct pipe_inode_info *,
-				  struct splice_desc *, splice_actor *);
-extern ssize_t splice_to_pipe(struct pipe_inode_info *,
-			      struct splice_pipe_desc *);
-extern ssize_t add_to_pipe(struct pipe_inode_info *,
-			      struct pipe_buffer *);
-long vfs_splice_read(struct file *in, loff_t *ppos,
-		     struct pipe_inode_info *pipe, size_t len,
-		     unsigned int flags);
+ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
+			 loff_t *ppos, size_t len, unsigned int flags,
+			 splice_actor *actor);
+ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
+			   struct splice_desc *sd, splice_actor *actor);
+ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
+			      struct splice_pipe_desc *spd);
+ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf);
+ssize_t vfs_splice_read(struct file *in, loff_t *ppos,
+			struct pipe_inode_info *pipe, size_t len,
+			unsigned int flags);
 ssize_t splice_direct_to_actor(struct file *file, struct splice_desc *sd,
 			       splice_direct_actor *actor);
-long do_splice(struct file *in, loff_t *off_in, struct file *out,
-	       loff_t *off_out, size_t len, unsigned int flags);
-long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
-		      loff_t *opos, size_t len, unsigned int flags);
-long splice_file_range(struct file *in, loff_t *ppos, struct file *out,
-		       loff_t *opos, size_t len);
+ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out,
+		  loff_t *off_out, size_t len, unsigned int flags);
+ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+			 loff_t *opos, size_t len, unsigned int flags);
+ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
+			  loff_t *opos, size_t len);
 
-extern long do_tee(struct file *in, struct file *out, size_t len,
-		   unsigned int flags);
-extern ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
-				loff_t *ppos, size_t len, unsigned int flags);
+ssize_t do_tee(struct file *in, struct file *out, size_t len,
+	       unsigned int flags);
+ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
+			 loff_t *ppos, size_t len, unsigned int flags);
 
 /*
  * for dynamic pipe sizing
-- 
cgit v1.2.3


From 705bcfcbde38b9dd4db00cd3deb0b98bddb0dd4a Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 12 Dec 2023 11:44:37 +0200
Subject: fs: use splice_copy_file_range() inline helper

generic_copy_file_range() is just a wrapper around splice_file_range(),
which caps the maximum copy length.

The only caller of splice_file_range(), namely __ceph_copy_file_range()
is already ready to cope with short copy.

Move the length capping into splice_file_range() and replace the exported
symbol generic_copy_file_range() with a simple inline helper.

Suggested-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/linux-fsdevel/20231204083849.GC32438@lst.de/
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231212094440.250945-3-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h     | 3 ---
 include/linux/splice.h | 7 +++++++
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 04422a0eccdd..900d0cd55b50 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2090,9 +2090,6 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
 				   loff_t, size_t, unsigned int);
-extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
-				       struct file *file_out, loff_t pos_out,
-				       size_t len, unsigned int flags);
 int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 				    struct file *file_out, loff_t pos_out,
 				    loff_t *len, unsigned int remap_flags,
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 068a8e8ffd73..9dec4861d09f 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -88,6 +88,13 @@ ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
 			  loff_t *opos, size_t len);
 
+static inline long splice_copy_file_range(struct file *in, loff_t pos_in,
+					  struct file *out, loff_t pos_out,
+					  size_t len)
+{
+	return splice_file_range(in, &pos_in, out, &pos_out, len);
+}
+
 ssize_t do_tee(struct file *in, struct file *out, size_t len,
 	       unsigned int flags);
 ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
-- 
cgit v1.2.3


From 36e28c42187c95eb148873ffb059bfdcb8cdb75b Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 12 Dec 2023 11:44:38 +0200
Subject: fsnotify: split fsnotify_perm() into two hooks

We would like to make changes to the fsnotify access permission hook -
add file range arguments and add the pre modify event.

In preparation for these changes, split the fsnotify_perm() hook into
fsnotify_open_perm() and fsnotify_file_perm().

This is needed for fanotify "pre content" events.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231212094440.250945-4-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fsnotify.h | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index bcb6609b54b3..926bb4461b9e 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -100,29 +100,33 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
 	return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
 }
 
-/* Simple call site for access decisions */
-static inline int fsnotify_perm(struct file *file, int mask)
+/*
+ * fsnotify_file_perm - permission hook before file access
+ */
+static inline int fsnotify_file_perm(struct file *file, int perm_mask)
 {
-	int ret;
-	__u32 fsnotify_mask = 0;
+	__u32 fsnotify_mask = FS_ACCESS_PERM;
 
-	if (!(mask & (MAY_READ | MAY_OPEN)))
+	if (!(perm_mask & MAY_READ))
 		return 0;
 
-	if (mask & MAY_OPEN) {
-		fsnotify_mask = FS_OPEN_PERM;
+	return fsnotify_file(file, fsnotify_mask);
+}
 
-		if (file->f_flags & __FMODE_EXEC) {
-			ret = fsnotify_file(file, FS_OPEN_EXEC_PERM);
+/*
+ * fsnotify_open_perm - permission hook before file open
+ */
+static inline int fsnotify_open_perm(struct file *file)
+{
+	int ret;
 
-			if (ret)
-				return ret;
-		}
-	} else if (mask & MAY_READ) {
-		fsnotify_mask = FS_ACCESS_PERM;
+	if (file->f_flags & __FMODE_EXEC) {
+		ret = fsnotify_file(file, FS_OPEN_EXEC_PERM);
+		if (ret)
+			return ret;
 	}
 
-	return fsnotify_file(file, fsnotify_mask);
+	return fsnotify_file(file, FS_OPEN_PERM);
 }
 
 /*
-- 
cgit v1.2.3


From cb383f06686734ef04daf63a4369566800717b7b Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 12 Dec 2023 11:44:39 +0200
Subject: fsnotify: assert that file_start_write() is not held in permission
 hooks

filesystem may be modified in the context of fanotify permission events
(e.g. by HSM service), so assert that sb freeze protection is not held.

If the assertion fails, then the following deadlock would be possible:

CPU0				CPU1			CPU2
-------------------------------------------------------------------------
file_start_write()#0
...
  fsnotify_perm()
    fanotify_get_response() =>	(read event and fill file)
				...
				...			freeze_super()
				...			  sb_wait_write()
				...
				vfs_write()
				  file_start_write()#1

This example demonstrates a use case of an hierarchical storage management
(HSM) service that uses fanotify permission events to fill the content of
a file before access, while a 3rd process starts fsfreeze.

This creates a circular dependeny:
  file_start_write()#0 => fanotify_get_response =>
    file_start_write()#1 =>
      sb_wait_write() =>
        file_end_write()#0

Where file_end_write()#0 can never be called and none of the threads can
make progress.

The assertion is checked for both MAY_READ and MAY_WRITE permission
hooks in preparation for a pre-modify permission event.

The assertion is not checked for an open permission event, because
do_open() takes mnt_want_write() in O_TRUNC case, meaning that it is not
safe to write to filesystem in the content of an open permission event.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231212094440.250945-5-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fsnotify.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 926bb4461b9e..0a9d6a8a747a 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -107,6 +107,13 @@ static inline int fsnotify_file_perm(struct file *file, int perm_mask)
 {
 	__u32 fsnotify_mask = FS_ACCESS_PERM;
 
+	/*
+	 * filesystem may be modified in the context of permission events
+	 * (e.g. by HSM filling a file on access), so sb freeze protection
+	 * must not be held.
+	 */
+	lockdep_assert_once(file_write_not_started(file));
+
 	if (!(perm_mask & MAY_READ))
 		return 0;
 
-- 
cgit v1.2.3


From d9e5d31084b024734e64307521414ef0ae1d5333 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 12 Dec 2023 11:44:40 +0200
Subject: fsnotify: optionally pass access range in file permission hooks

In preparation for pre-content permission events with file access range,
move fsnotify_file_perm() hook out of security_file_permission() and into
the callers.

Callers that have the access range information call the new hook
fsnotify_file_area_perm() with the access range.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231212094440.250945-6-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fsnotify.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 0a9d6a8a747a..11e6434b8e71 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -101,9 +101,10 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
 }
 
 /*
- * fsnotify_file_perm - permission hook before file access
+ * fsnotify_file_area_perm - permission hook before access to file range
  */
-static inline int fsnotify_file_perm(struct file *file, int perm_mask)
+static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
+					  const loff_t *ppos, size_t count)
 {
 	__u32 fsnotify_mask = FS_ACCESS_PERM;
 
@@ -120,6 +121,14 @@ static inline int fsnotify_file_perm(struct file *file, int perm_mask)
 	return fsnotify_file(file, fsnotify_mask);
 }
 
+/*
+ * fsnotify_file_perm - permission hook before file access
+ */
+static inline int fsnotify_file_perm(struct file *file, int perm_mask)
+{
+	return fsnotify_file_area_perm(file, perm_mask, NULL, 0);
+}
+
 /*
  * fsnotify_open_perm - permission hook before file open
  */
-- 
cgit v1.2.3


From bf857ddd21d0bffc1edafc317e8e2ce0d6d5950c Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Wed, 1 Nov 2023 13:16:20 -0400
Subject: maple_tree: move debug check to __mas_set_range()

__mas_set_range() was created to shortcut resetting the maple state and a
debug check was added to the caller (the vma iterator) to ensure the
internal maple state remains safe to use.  Move the debug check from the
vma iterator into the maple tree itself so other users do not incorrectly
use the advanced maple state modification.

Fallout from this change include a large amount of debug setup needed to
be moved to earlier in the header, and the maple_tree.h radix-tree test
code needed to move the inclusion of the header to after the atomic
define.  None of those changes have functional changes.

Link: https://lkml.kernel.org/r/20231101171629.3612299-4-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Peng Zhang <zhangpeng.00@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 255 +++++++++++++++++++++++----------------------
 1 file changed, 129 insertions(+), 126 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index a452dd8a1e5c..b5d5992578c9 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -557,6 +557,131 @@ static inline void mas_reset(struct ma_state *mas)
  */
 #define mas_for_each(__mas, __entry, __max) \
 	while (((__entry) = mas_find((__mas), (__max))) != NULL)
+
+#ifdef CONFIG_DEBUG_MAPLE_TREE
+enum mt_dump_format {
+	mt_dump_dec,
+	mt_dump_hex,
+};
+
+extern atomic_t maple_tree_tests_run;
+extern atomic_t maple_tree_tests_passed;
+
+void mt_dump(const struct maple_tree *mt, enum mt_dump_format format);
+void mas_dump(const struct ma_state *mas);
+void mas_wr_dump(const struct ma_wr_state *wr_mas);
+void mt_validate(struct maple_tree *mt);
+void mt_cache_shrink(void);
+#define MT_BUG_ON(__tree, __x) do {					\
+	atomic_inc(&maple_tree_tests_run);				\
+	if (__x) {							\
+		pr_info("BUG at %s:%d (%u)\n",				\
+		__func__, __LINE__, __x);				\
+		mt_dump(__tree, mt_dump_hex);				\
+		pr_info("Pass: %u Run:%u\n",				\
+			atomic_read(&maple_tree_tests_passed),		\
+			atomic_read(&maple_tree_tests_run));		\
+		dump_stack();						\
+	} else {							\
+		atomic_inc(&maple_tree_tests_passed);			\
+	}								\
+} while (0)
+
+#define MAS_BUG_ON(__mas, __x) do {					\
+	atomic_inc(&maple_tree_tests_run);				\
+	if (__x) {							\
+		pr_info("BUG at %s:%d (%u)\n",				\
+		__func__, __LINE__, __x);				\
+		mas_dump(__mas);					\
+		mt_dump((__mas)->tree, mt_dump_hex);			\
+		pr_info("Pass: %u Run:%u\n",				\
+			atomic_read(&maple_tree_tests_passed),		\
+			atomic_read(&maple_tree_tests_run));		\
+		dump_stack();						\
+	} else {							\
+		atomic_inc(&maple_tree_tests_passed);			\
+	}								\
+} while (0)
+
+#define MAS_WR_BUG_ON(__wrmas, __x) do {				\
+	atomic_inc(&maple_tree_tests_run);				\
+	if (__x) {							\
+		pr_info("BUG at %s:%d (%u)\n",				\
+		__func__, __LINE__, __x);				\
+		mas_wr_dump(__wrmas);					\
+		mas_dump((__wrmas)->mas);				\
+		mt_dump((__wrmas)->mas->tree, mt_dump_hex);		\
+		pr_info("Pass: %u Run:%u\n",				\
+			atomic_read(&maple_tree_tests_passed),		\
+			atomic_read(&maple_tree_tests_run));		\
+		dump_stack();						\
+	} else {							\
+		atomic_inc(&maple_tree_tests_passed);			\
+	}								\
+} while (0)
+
+#define MT_WARN_ON(__tree, __x)  ({					\
+	int ret = !!(__x);						\
+	atomic_inc(&maple_tree_tests_run);				\
+	if (ret) {							\
+		pr_info("WARN at %s:%d (%u)\n",				\
+		__func__, __LINE__, __x);				\
+		mt_dump(__tree, mt_dump_hex);				\
+		pr_info("Pass: %u Run:%u\n",				\
+			atomic_read(&maple_tree_tests_passed),		\
+			atomic_read(&maple_tree_tests_run));		\
+		dump_stack();						\
+	} else {							\
+		atomic_inc(&maple_tree_tests_passed);			\
+	}								\
+	unlikely(ret);							\
+})
+
+#define MAS_WARN_ON(__mas, __x) ({					\
+	int ret = !!(__x);						\
+	atomic_inc(&maple_tree_tests_run);				\
+	if (ret) {							\
+		pr_info("WARN at %s:%d (%u)\n",				\
+		__func__, __LINE__, __x);				\
+		mas_dump(__mas);					\
+		mt_dump((__mas)->tree, mt_dump_hex);			\
+		pr_info("Pass: %u Run:%u\n",				\
+			atomic_read(&maple_tree_tests_passed),		\
+			atomic_read(&maple_tree_tests_run));		\
+		dump_stack();						\
+	} else {							\
+		atomic_inc(&maple_tree_tests_passed);			\
+	}								\
+	unlikely(ret);							\
+})
+
+#define MAS_WR_WARN_ON(__wrmas, __x) ({					\
+	int ret = !!(__x);						\
+	atomic_inc(&maple_tree_tests_run);				\
+	if (ret) {							\
+		pr_info("WARN at %s:%d (%u)\n",				\
+		__func__, __LINE__, __x);				\
+		mas_wr_dump(__wrmas);					\
+		mas_dump((__wrmas)->mas);				\
+		mt_dump((__wrmas)->mas->tree, mt_dump_hex);		\
+		pr_info("Pass: %u Run:%u\n",				\
+			atomic_read(&maple_tree_tests_passed),		\
+			atomic_read(&maple_tree_tests_run));		\
+		dump_stack();						\
+	} else {							\
+		atomic_inc(&maple_tree_tests_passed);			\
+	}								\
+	unlikely(ret);							\
+})
+#else
+#define MT_BUG_ON(__tree, __x)		BUG_ON(__x)
+#define MAS_BUG_ON(__mas, __x)		BUG_ON(__x)
+#define MAS_WR_BUG_ON(__mas, __x)	BUG_ON(__x)
+#define MT_WARN_ON(__tree, __x)		WARN_ON(__x)
+#define MAS_WARN_ON(__mas, __x)		WARN_ON(__x)
+#define MAS_WR_WARN_ON(__mas, __x)	WARN_ON(__x)
+#endif /* CONFIG_DEBUG_MAPLE_TREE */
+
 /**
  * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the
  * current location.
@@ -570,6 +695,9 @@ static inline void mas_reset(struct ma_state *mas)
 static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
 		unsigned long last)
 {
+	/* Ensure the range starts within the current slot */
+	MAS_WARN_ON(mas, mas_is_active(mas) &&
+		   (mas->index > start || mas->last < start));
 	mas->index = start;
 	mas->last = last;
 }
@@ -587,8 +715,8 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
 static inline
 void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last)
 {
-	__mas_set_range(mas, start, last);
 	mas->node = MAS_START;
+	__mas_set_range(mas, start, last);
 }
 
 /**
@@ -713,129 +841,4 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);
 	for (__entry = mt_find(__tree, &(__index), __max); \
 		__entry; __entry = mt_find_after(__tree, &(__index), __max))
 
-
-#ifdef CONFIG_DEBUG_MAPLE_TREE
-enum mt_dump_format {
-	mt_dump_dec,
-	mt_dump_hex,
-};
-
-extern atomic_t maple_tree_tests_run;
-extern atomic_t maple_tree_tests_passed;
-
-void mt_dump(const struct maple_tree *mt, enum mt_dump_format format);
-void mas_dump(const struct ma_state *mas);
-void mas_wr_dump(const struct ma_wr_state *wr_mas);
-void mt_validate(struct maple_tree *mt);
-void mt_cache_shrink(void);
-#define MT_BUG_ON(__tree, __x) do {					\
-	atomic_inc(&maple_tree_tests_run);				\
-	if (__x) {							\
-		pr_info("BUG at %s:%d (%u)\n",				\
-		__func__, __LINE__, __x);				\
-		mt_dump(__tree, mt_dump_hex);				\
-		pr_info("Pass: %u Run:%u\n",				\
-			atomic_read(&maple_tree_tests_passed),		\
-			atomic_read(&maple_tree_tests_run));		\
-		dump_stack();						\
-	} else {							\
-		atomic_inc(&maple_tree_tests_passed);			\
-	}								\
-} while (0)
-
-#define MAS_BUG_ON(__mas, __x) do {					\
-	atomic_inc(&maple_tree_tests_run);				\
-	if (__x) {							\
-		pr_info("BUG at %s:%d (%u)\n",				\
-		__func__, __LINE__, __x);				\
-		mas_dump(__mas);					\
-		mt_dump((__mas)->tree, mt_dump_hex);			\
-		pr_info("Pass: %u Run:%u\n",				\
-			atomic_read(&maple_tree_tests_passed),		\
-			atomic_read(&maple_tree_tests_run));		\
-		dump_stack();						\
-	} else {							\
-		atomic_inc(&maple_tree_tests_passed);			\
-	}								\
-} while (0)
-
-#define MAS_WR_BUG_ON(__wrmas, __x) do {				\
-	atomic_inc(&maple_tree_tests_run);				\
-	if (__x) {							\
-		pr_info("BUG at %s:%d (%u)\n",				\
-		__func__, __LINE__, __x);				\
-		mas_wr_dump(__wrmas);					\
-		mas_dump((__wrmas)->mas);				\
-		mt_dump((__wrmas)->mas->tree, mt_dump_hex);		\
-		pr_info("Pass: %u Run:%u\n",				\
-			atomic_read(&maple_tree_tests_passed),		\
-			atomic_read(&maple_tree_tests_run));		\
-		dump_stack();						\
-	} else {							\
-		atomic_inc(&maple_tree_tests_passed);			\
-	}								\
-} while (0)
-
-#define MT_WARN_ON(__tree, __x)  ({					\
-	int ret = !!(__x);						\
-	atomic_inc(&maple_tree_tests_run);				\
-	if (ret) {							\
-		pr_info("WARN at %s:%d (%u)\n",				\
-		__func__, __LINE__, __x);				\
-		mt_dump(__tree, mt_dump_hex);				\
-		pr_info("Pass: %u Run:%u\n",				\
-			atomic_read(&maple_tree_tests_passed),		\
-			atomic_read(&maple_tree_tests_run));		\
-		dump_stack();						\
-	} else {							\
-		atomic_inc(&maple_tree_tests_passed);			\
-	}								\
-	unlikely(ret);							\
-})
-
-#define MAS_WARN_ON(__mas, __x) ({					\
-	int ret = !!(__x);						\
-	atomic_inc(&maple_tree_tests_run);				\
-	if (ret) {							\
-		pr_info("WARN at %s:%d (%u)\n",				\
-		__func__, __LINE__, __x);				\
-		mas_dump(__mas);					\
-		mt_dump((__mas)->tree, mt_dump_hex);			\
-		pr_info("Pass: %u Run:%u\n",				\
-			atomic_read(&maple_tree_tests_passed),		\
-			atomic_read(&maple_tree_tests_run));		\
-		dump_stack();						\
-	} else {							\
-		atomic_inc(&maple_tree_tests_passed);			\
-	}								\
-	unlikely(ret);							\
-})
-
-#define MAS_WR_WARN_ON(__wrmas, __x) ({					\
-	int ret = !!(__x);						\
-	atomic_inc(&maple_tree_tests_run);				\
-	if (ret) {							\
-		pr_info("WARN at %s:%d (%u)\n",				\
-		__func__, __LINE__, __x);				\
-		mas_wr_dump(__wrmas);					\
-		mas_dump((__wrmas)->mas);				\
-		mt_dump((__wrmas)->mas->tree, mt_dump_hex);		\
-		pr_info("Pass: %u Run:%u\n",				\
-			atomic_read(&maple_tree_tests_passed),		\
-			atomic_read(&maple_tree_tests_run));		\
-		dump_stack();						\
-	} else {							\
-		atomic_inc(&maple_tree_tests_passed);			\
-	}								\
-	unlikely(ret);							\
-})
-#else
-#define MT_BUG_ON(__tree, __x)		BUG_ON(__x)
-#define MAS_BUG_ON(__mas, __x)		BUG_ON(__x)
-#define MAS_WR_BUG_ON(__mas, __x)	BUG_ON(__x)
-#define MT_WARN_ON(__tree, __x)		WARN_ON(__x)
-#define MAS_WARN_ON(__mas, __x)		WARN_ON(__x)
-#define MAS_WR_WARN_ON(__mas, __x)	WARN_ON(__x)
-#endif /* CONFIG_DEBUG_MAPLE_TREE */
-
 #endif /*_LINUX_MAPLE_TREE_H */
-- 
cgit v1.2.3


From 31c532a8af57513228c2b12d281104198ff412b8 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Wed, 1 Nov 2023 13:16:21 -0400
Subject: maple_tree: add end of node tracking to the maple state

Analysis of the mas_for_each() iteration showed that there is a
significant time spent finding the end of a node.  This time can be
greatly reduced if the end of the node is cached in the maple state.  Care
must be taken to update & invalidate as necessary.

Link: https://lkml.kernel.org/r/20231101171629.3612299-5-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Peng Zhang <zhangpeng.00@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index b5d5992578c9..0b82efe0cf1e 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -393,6 +393,7 @@ struct ma_state {
 	unsigned char depth;		/* depth of tree descent during write */
 	unsigned char offset;
 	unsigned char mas_flags;
+	unsigned char end;		/* The end of the node */
 };
 
 struct ma_wr_state {
-- 
cgit v1.2.3


From 067311d33e650adfe7ae23765959ddcc1ba18510 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Wed, 1 Nov 2023 13:16:25 -0400
Subject: maple_tree: separate ma_state node from status

The maple tree node is overloaded to keep status as well as the active
node.  This, unfortunately, results in a re-walk on underflow or overflow.
Since the maple state has room, the status can be placed in its own enum
in the structure.  Once an underflow/overflow is detected, certain modes
can restore the status to active and others may need to re-walk just that
one node to see the entry.

The status being an enum has the benefit of detecting unhandled status in
switch statements.

[Liam.Howlett@oracle.com: fix comments about MAS_*]
  Link: https://lkml.kernel.org/r/20231106154124.614247-1-Liam.Howlett@oracle.com
[Liam.Howlett@oracle.com: update forking to separate maple state and node]
  Link: https://lkml.kernel.org/r/20231106154551.615042-1-Liam.Howlett@oracle.com
[Liam.Howlett@oracle.com: fix mas_prev() state separation code]
  Link: https://lkml.kernel.org/r/20231207193319.4025462-1-Liam.Howlett@oracle.com
Link: https://lkml.kernel.org/r/20231101171629.3612299-9-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Peng Zhang <zhangpeng.00@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 87 ++++++++++++++++++++++++++--------------------
 include/linux/mm_types.h   |  3 +-
 2 files changed, 52 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 0b82efe0cf1e..4dd668f7b111 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -349,6 +349,36 @@ static inline bool mtree_empty(const struct maple_tree *mt)
 
 /* Advanced API */
 
+/*
+ * Maple State Status
+ * ma_active means the maple state is pointing to a node and offset and can
+ * continue operating on the tree.
+ * ma_start means we have not searched the tree.
+ * ma_root means we have searched the tree and the entry we found lives in
+ * the root of the tree (ie it has index 0, length 1 and is the only entry in
+ * the tree).
+ * ma_none means we have searched the tree and there is no node in the
+ * tree for this entry.  For example, we searched for index 1 in an empty
+ * tree.  Or we have a tree which points to a full leaf node and we
+ * searched for an entry which is larger than can be contained in that
+ * leaf node.
+ * ma_pause means the data within the maple state may be stale, restart the
+ * operation
+ * ma_overflow means the search has reached the upper limit of the search
+ * ma_underflow means the search has reached the lower limit of the search
+ * ma_error means there was an error, check the node for the error number.
+ */
+enum maple_status {
+	ma_active,
+	ma_start,
+	ma_root,
+	ma_none,
+	ma_pause,
+	ma_overflow,
+	ma_underflow,
+	ma_error,
+};
+
 /*
  * The maple state is defined in the struct ma_state and is used to keep track
  * of information during operations, and even between operations when using the
@@ -381,6 +411,13 @@ static inline bool mtree_empty(const struct maple_tree *mt)
  * When returning a value the maple state index and last respectively contain
  * the start and end of the range for the entry.  Ranges are inclusive in the
  * Maple Tree.
+ *
+ * The status of the state is used to determine how the next action should treat
+ * the state.  For instance, if the status is ma_start then the next action
+ * should start at the root of the tree and walk down.  If the status is
+ * ma_pause then the node may be stale data and should be discarded.  If the
+ * status is ma_overflow, then the last action hit the upper limit.
+ *
  */
 struct ma_state {
 	struct maple_tree *tree;	/* The tree we're operating in */
@@ -390,6 +427,7 @@ struct ma_state {
 	unsigned long min;		/* The minimum index of this node - implied pivot min */
 	unsigned long max;		/* The maximum index of this node - implied pivot max */
 	struct maple_alloc *alloc;	/* Allocated nodes for this operation */
+	enum maple_status status;	/* The status of the state (active, start, none, etc) */
 	unsigned char depth;		/* depth of tree descent during write */
 	unsigned char offset;
 	unsigned char mas_flags;
@@ -416,28 +454,12 @@ struct ma_wr_state {
 		spin_lock_nested(&((mas)->tree->ma_lock), subclass)
 #define mas_unlock(mas)         spin_unlock(&((mas)->tree->ma_lock))
 
-
 /*
  * Special values for ma_state.node.
- * MAS_START means we have not searched the tree.
- * MAS_ROOT means we have searched the tree and the entry we found lives in
- * the root of the tree (ie it has index 0, length 1 and is the only entry in
- * the tree).
- * MAS_NONE means we have searched the tree and there is no node in the
- * tree for this entry.  For example, we searched for index 1 in an empty
- * tree.  Or we have a tree which points to a full leaf node and we
- * searched for an entry which is larger than can be contained in that
- * leaf node.
  * MA_ERROR represents an errno.  After dropping the lock and attempting
  * to resolve the error, the walk would have to be restarted from the
  * top of the tree as the tree may have been modified.
  */
-#define MAS_START	((struct maple_enode *)1UL)
-#define MAS_ROOT	((struct maple_enode *)5UL)
-#define MAS_NONE	((struct maple_enode *)9UL)
-#define MAS_PAUSE	((struct maple_enode *)17UL)
-#define MAS_OVERFLOW	((struct maple_enode *)33UL)
-#define MAS_UNDERFLOW	((struct maple_enode *)65UL)
 #define MA_ERROR(err) \
 		((struct maple_enode *)(((unsigned long)err << 2) | 2UL))
 
@@ -446,7 +468,8 @@ struct ma_wr_state {
 		.tree = mt,						\
 		.index = first,						\
 		.last = end,						\
-		.node = MAS_START,					\
+		.node = NULL,						\
+		.status = ma_start,					\
 		.min = 0,						\
 		.max = ULONG_MAX,					\
 		.alloc = NULL,						\
@@ -477,7 +500,6 @@ void *mas_find_range(struct ma_state *mas, unsigned long max);
 void *mas_find_rev(struct ma_state *mas, unsigned long min);
 void *mas_find_range_rev(struct ma_state *mas, unsigned long max);
 int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
-bool mas_is_err(struct ma_state *mas);
 
 bool mas_nomem(struct ma_state *mas, gfp_t gfp);
 void mas_pause(struct ma_state *mas);
@@ -506,28 +528,18 @@ static inline void mas_init(struct ma_state *mas, struct maple_tree *tree,
 	mas->tree = tree;
 	mas->index = mas->last = addr;
 	mas->max = ULONG_MAX;
-	mas->node = MAS_START;
+	mas->status = ma_start;
+	mas->node = NULL;
 }
 
-/* Checks if a mas has not found anything */
-static inline bool mas_is_none(const struct ma_state *mas)
-{
-	return mas->node == MAS_NONE;
-}
-
-/* Checks if a mas has been paused */
-static inline bool mas_is_paused(const struct ma_state *mas)
+static inline bool mas_is_active(struct ma_state *mas)
 {
-	return mas->node == MAS_PAUSE;
+	return mas->status == ma_active;
 }
 
-/* Check if the mas is pointing to a node or not */
-static inline bool mas_is_active(struct ma_state *mas)
+static inline bool mas_is_err(struct ma_state *mas)
 {
-	if ((unsigned long)mas->node >= MAPLE_RESERVED_RANGE)
-		return true;
-
-	return false;
+	return mas->status == ma_error;
 }
 
 /**
@@ -540,9 +552,10 @@ static inline bool mas_is_active(struct ma_state *mas)
  *
  * Context: Any context.
  */
-static inline void mas_reset(struct ma_state *mas)
+static __always_inline void mas_reset(struct ma_state *mas)
 {
-	mas->node = MAS_START;
+	mas->status = ma_start;
+	mas->node = NULL;
 }
 
 /**
@@ -716,7 +729,7 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
 static inline
 void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last)
 {
-	mas->node = MAS_START;
+	mas_reset(mas);
 	__mas_set_range(mas, start, last);
 }
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ef18d2b25378..a66534c78c4d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1071,7 +1071,8 @@ struct vma_iterator {
 		.mas = {						\
 			.tree = &(__mm)->mm_mt,				\
 			.index = __addr,				\
-			.node = MAS_START,				\
+			.node = NULL,					\
+			.status = ma_start,				\
 		},							\
 	}
 
-- 
cgit v1.2.3


From 0de56e38b307b0cb2ac825e8e7cb371a28daf844 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Wed, 1 Nov 2023 13:16:27 -0400
Subject: maple_tree: use maple state end for write operations

ma_wr_state was previously tracking the end of the node for writing.
Since the implementation of the ma_state end tracking, this is duplicated
work.  This patch removes the maple write state tracking of the end of the
node and uses the maple state end instead.

Link: https://lkml.kernel.org/r/20231101171629.3612299-11-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Peng Zhang <zhangpeng.00@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 4dd668f7b111..b3d63123b945 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -441,7 +441,6 @@ struct ma_wr_state {
 	unsigned long r_max;		/* range max */
 	enum maple_type type;		/* mas->node type */
 	unsigned char offset_end;	/* The offset where the write ends */
-	unsigned char node_end;		/* mas->node end */
 	unsigned long *pivots;		/* mas->node->pivots pointer */
 	unsigned long end_piv;		/* The pivot at the offset end */
 	void __rcu **slots;		/* mas->node->slots pointer */
-- 
cgit v1.2.3


From 0a97c01cd20bb96359d8c9dedad92a061ed34e0b Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Thu, 30 Nov 2023 11:40:18 -0800
Subject: list_lru: allow explicit memcg and NUMA node selection

Patch series "workload-specific and memory pressure-driven zswap
writeback", v8.

There are currently several issues with zswap writeback:

1. There is only a single global LRU for zswap, making it impossible to
   perform worload-specific shrinking - an memcg under memory pressure
   cannot determine which pages in the pool it owns, and often ends up
   writing pages from other memcgs. This issue has been previously
   observed in practice and mitigated by simply disabling
   memcg-initiated shrinking:

   https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u

   But this solution leaves a lot to be desired, as we still do not
   have an avenue for an memcg to free up its own memory locked up in
   the zswap pool.

2. We only shrink the zswap pool when the user-defined limit is hit.
   This means that if we set the limit too high, cold data that are
   unlikely to be used again will reside in the pool, wasting precious
   memory. It is hard to predict how much zswap space will be needed
   ahead of time, as this depends on the workload (specifically, on
   factors such as memory access patterns and compressibility of the
   memory pages).

This patch series solves these issues by separating the global zswap LRU
into per-memcg and per-NUMA LRUs, and performs workload-specific (i.e
memcg- and NUMA-aware) zswap writeback under memory pressure.  The new
shrinker does not have any parameter that must be tuned by the user, and
can be opted in or out on a per-memcg basis.

As a proof of concept, we ran the following synthetic benchmark: build the
linux kernel in a memory-limited cgroup, and allocate some cold data in
tmpfs to see if the shrinker could write them out and improved the overall
performance.  Depending on the amount of cold data generated, we observe
from 14% to 35% reduction in kernel CPU time used in the kernel builds.


This patch (of 6):

The interface of list_lru is based on the assumption that the list node
and the data it represents belong to the same allocated on the correct
node/memcg.  While this assumption is valid for existing slab objects LRU
such as dentries and inodes, it is undocumented, and rather inflexible for
certain potential list_lru users (such as the upcoming zswap shrinker and
the THP shrinker).  It has caused us a lot of issues during our
development.

This patch changes list_lru interface so that the caller must explicitly
specify numa node and memcg when adding and removing objects.  The old
list_lru_add() and list_lru_del() are renamed to list_lru_add_obj() and
list_lru_del_obj(), respectively.

It also extends the list_lru API with a new function, list_lru_putback,
which undoes a previous list_lru_isolate call.  Unlike list_lru_add, it
does not increment the LRU node count (as list_lru_isolate does not
decrement the node count).  list_lru_putback also allows for explicit
memcg and NUMA node selection.

Link: https://lkml.kernel.org/r/20231130194023.4102148-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231130194023.4102148-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/list_lru.h | 54 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index db86ad78d428..7675a48a0701 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -75,6 +75,8 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
  * list_lru_add: add an element to the lru list's tail
  * @lru: the lru pointer
  * @item: the item to be added.
+ * @nid: the node id of the sublist to add the item to.
+ * @memcg: the cgroup of the sublist to add the item to.
  *
  * If the element is already part of a list, this function returns doing
  * nothing. Therefore the caller does not need to keep state about whether or
@@ -87,12 +89,28 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
  *
  * Return: true if the list was updated, false otherwise
  */
-bool list_lru_add(struct list_lru *lru, struct list_head *item);
+bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
+		    struct mem_cgroup *memcg);
 
 /**
- * list_lru_del: delete an element to the lru list
+ * list_lru_add_obj: add an element to the lru list's tail
+ * @lru: the lru pointer
+ * @item: the item to be added.
+ *
+ * This function is similar to list_lru_add(), but the NUMA node and the
+ * memcg of the sublist is determined by @item list_head. This assumption is
+ * valid for slab objects LRU such as dentries, inodes, etc.
+ *
+ * Return value: true if the list was updated, false otherwise
+ */
+bool list_lru_add_obj(struct list_lru *lru, struct list_head *item);
+
+/**
+ * list_lru_del: delete an element from the lru list
  * @lru: the lru pointer
  * @item: the item to be deleted.
+ * @nid: the node id of the sublist to delete the item from.
+ * @memcg: the cgroup of the sublist to delete the item from.
  *
  * This function works analogously as list_lru_add() in terms of list
  * manipulation. The comments about an element already pertaining to
@@ -100,7 +118,21 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item);
  *
  * Return: true if the list was updated, false otherwise
  */
-bool list_lru_del(struct list_lru *lru, struct list_head *item);
+bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
+		    struct mem_cgroup *memcg);
+
+/**
+ * list_lru_del_obj: delete an element from the lru list
+ * @lru: the lru pointer
+ * @item: the item to be deleted.
+ *
+ * This function is similar to list_lru_del(), but the NUMA node and the
+ * memcg of the sublist is determined by @item list_head. This assumption is
+ * valid for slab objects LRU such as dentries, inodes, etc.
+ *
+ * Return value: true if the list was updated, false otherwise.
+ */
+bool list_lru_del_obj(struct list_lru *lru, struct list_head *item);
 
 /**
  * list_lru_count_one: return the number of objects currently held by @lru
@@ -138,6 +170,22 @@ static inline unsigned long list_lru_count(struct list_lru *lru)
 void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
 void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
 			   struct list_head *head);
+/**
+ * list_lru_putback: undo list_lru_isolate
+ * @lru: the lru pointer.
+ * @item: the item to put back.
+ * @nid: the node id of the sublist to put the item back to.
+ * @memcg: the cgroup of the sublist to put the item back to.
+ *
+ * Put back an isolated item into its original LRU. Note that unlike
+ * list_lru_add, this does not increment the node LRU count (as
+ * list_lru_isolate does not originally decrement this count).
+ *
+ * Since we might have dropped the LRU lock in between, recompute list_lru_one
+ * from the node's id and memcg.
+ */
+void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid,
+		      struct mem_cgroup *memcg);
 
 typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
 		struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
-- 
cgit v1.2.3


From fdc4161ff6a5e96222e159c1f1b28d31a985130d Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Thu, 30 Nov 2023 11:40:19 -0800
Subject: memcontrol: implement mem_cgroup_tryget_online()

This patch implements a helper function that try to get a reference to an
memcg's css, as well as checking if it is online.  This new function is
almost exactly the same as the existing mem_cgroup_tryget(), except for
the onlineness check.  In the !CONFIG_MEMCG case, it always returns true,
analogous to mem_cgroup_tryget().  This is useful for e.g to the new zswap
writeback scheme, where we need to select the next online memcg as a
candidate for the global limit reclaim.

Link: https://lkml.kernel.org/r/20231130194023.4102148-3-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7bdcf3020d7a..2bd7d14ace78 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -821,6 +821,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
 	return !memcg || css_tryget(&memcg->css);
 }
 
+static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
+{
+	return !memcg || css_tryget_online(&memcg->css);
+}
+
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 	if (memcg)
@@ -1349,6 +1354,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
 	return true;
 }
 
+static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
+{
+	return true;
+}
+
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
-- 
cgit v1.2.3


From a65b0e7607ccb5e5184591f73e48512f25c76061 Mon Sep 17 00:00:00 2001
From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Date: Thu, 30 Nov 2023 11:40:20 -0800
Subject: zswap: make shrinking memcg-aware

Currently, we only have a single global LRU for zswap.  This makes it
impossible to perform worload-specific shrinking - an memcg cannot
determine which pages in the pool it owns, and often ends up writing pages
from other memcgs.  This issue has been previously observed in practice
and mitigated by simply disabling memcg-initiated shrinking:

https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u

This patch fully resolves the issue by replacing the global zswap LRU
with memcg- and NUMA-specific LRUs, and modify the reclaim logic:

a) When a store attempt hits an memcg limit, it now triggers a
   synchronous reclaim attempt that, if successful, allows the new
   hotter page to be accepted by zswap.
b) If the store attempt instead hits the global zswap limit, it will
   trigger an asynchronous reclaim attempt, in which an memcg is
   selected for reclaim in a round-robin-like fashion.

[nphamcs@gmail.com: use correct function for the onlineness check, use mem_cgroup_iter_break()]
  Link: https://lkml.kernel.org/r/20231205195419.2563217-1-nphamcs@gmail.com
[nphamcs@gmail.com: drop the pool's reference at the end of the writeback step]
  Link: https://lkml.kernel.org/r/20231206030627.4155634-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231130194023.4102148-4-nphamcs@gmail.com
Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Co-developed-by: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 5 +++++
 include/linux/zswap.h      | 2 ++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2bd7d14ace78..a308c8eacf20 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1192,6 +1192,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 	return NULL;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
+{
+	return NULL;
+}
+
 static inline bool folio_memcg_kmem(struct folio *folio)
 {
 	return false;
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 2a60ce39cfde..e571e393669b 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -15,6 +15,7 @@ bool zswap_load(struct folio *folio);
 void zswap_invalidate(int type, pgoff_t offset);
 void zswap_swapon(int type);
 void zswap_swapoff(int type);
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
 
 #else
 
@@ -31,6 +32,7 @@ static inline bool zswap_load(struct folio *folio)
 static inline void zswap_invalidate(int type, pgoff_t offset) {}
 static inline void zswap_swapon(int type) {}
 static inline void zswap_swapoff(int type) {}
+static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
 
 #endif
 
-- 
cgit v1.2.3


From 7108cc3f765cafd48a6a35f8add140beaecfa75b Mon Sep 17 00:00:00 2001
From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Date: Thu, 30 Nov 2023 11:40:21 -0800
Subject: mm: memcg: add per-memcg zswap writeback stat

Since zswap now writes back pages from memcg-specific LRUs, we now need a
new stat to show writebacks count for each memcg.

[nphamcs@gmail.com: rename ZSWP_WB to ZSWPWB]
  Link: https://lkml.kernel.org/r/20231205193307.2432803-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231130194023.4102148-5-nphamcs@gmail.com
Suggested-by: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vm_event_item.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index d1b847502f09..747943bc8cc2 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -142,6 +142,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_ZSWAP
 		ZSWPIN,
 		ZSWPOUT,
+		ZSWPWB,
 #endif
 #ifdef CONFIG_X86
 		DIRECT_MAP_LEVEL2_SPLIT,
-- 
cgit v1.2.3


From b5ba474f3f518701249598b35c581b92a3c95b48 Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Thu, 30 Nov 2023 11:40:23 -0800
Subject: zswap: shrink zswap pool based on memory pressure

Currently, we only shrink the zswap pool when the user-defined limit is
hit.  This means that if we set the limit too high, cold data that are
unlikely to be used again will reside in the pool, wasting precious
memory.  It is hard to predict how much zswap space will be needed ahead
of time, as this depends on the workload (specifically, on factors such as
memory access patterns and compressibility of the memory pages).

This patch implements a memcg- and NUMA-aware shrinker for zswap, that is
initiated when there is memory pressure.  The shrinker does not have any
parameter that must be tuned by the user, and can be opted in or out on a
per-memcg basis.

Furthermore, to make it more robust for many workloads and prevent
overshrinking (i.e evicting warm pages that might be refaulted into
memory), we build in the following heuristics:

* Estimate the number of warm pages residing in zswap, and attempt to
  protect this region of the zswap LRU.
* Scale the number of freeable objects by an estimate of the memory
  saving factor. The better zswap compresses the data, the fewer pages
  we will evict to swap (as we will otherwise incur IO for relatively
  small memory saving).
* During reclaim, if the shrinker encounters a page that is also being
  brought into memory, the shrinker will cautiously terminate its
  shrinking action, as this is a sign that it is touching the warmer
  region of the zswap LRU.

As a proof of concept, we ran the following synthetic benchmark: build the
linux kernel in a memory-limited cgroup, and allocate some cold data in
tmpfs to see if the shrinker could write them out and improved the overall
performance.  Depending on the amount of cold data generated, we observe
from 14% to 35% reduction in kernel CPU time used in the kernel builds.

[nphamcs@gmail.com: check shrinker enablement early, use less costly stat flushing]
  Link: https://lkml.kernel.org/r/20231206194456.3234203-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231130194023.4102148-7-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |  2 ++
 include/linux/zswap.h  | 25 +++++++++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 14faffa4354f..9ef9d010bff0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -22,6 +22,7 @@
 #include <linux/mm_types.h>
 #include <linux/page-flags.h>
 #include <linux/local_lock.h>
+#include <linux/zswap.h>
 #include <asm/page.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -641,6 +642,7 @@ struct lruvec {
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
 #endif
+	struct zswap_lruvec_state zswap_lruvec_state;
 };
 
 /* Isolate for asynchronous migration */
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index e571e393669b..08c240e16a01 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -5,20 +5,40 @@
 #include <linux/types.h>
 #include <linux/mm_types.h>
 
+struct lruvec;
+
 extern u64 zswap_pool_total_size;
 extern atomic_t zswap_stored_pages;
 
 #ifdef CONFIG_ZSWAP
 
+struct zswap_lruvec_state {
+	/*
+	 * Number of pages in zswap that should be protected from the shrinker.
+	 * This number is an estimate of the following counts:
+	 *
+	 * a) Recent page faults.
+	 * b) Recent insertion to the zswap LRU. This includes new zswap stores,
+	 *    as well as recent zswap LRU rotations.
+	 *
+	 * These pages are likely to be warm, and might incur IO if the are written
+	 * to swap.
+	 */
+	atomic_long_t nr_zswap_protected;
+};
+
 bool zswap_store(struct folio *folio);
 bool zswap_load(struct folio *folio);
 void zswap_invalidate(int type, pgoff_t offset);
 void zswap_swapon(int type);
 void zswap_swapoff(int type);
 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
-
+void zswap_lruvec_state_init(struct lruvec *lruvec);
+void zswap_page_swapin(struct page *page);
 #else
 
+struct zswap_lruvec_state {};
+
 static inline bool zswap_store(struct folio *folio)
 {
 	return false;
@@ -33,7 +53,8 @@ static inline void zswap_invalidate(int type, pgoff_t offset) {}
 static inline void zswap_swapon(int type) {}
 static inline void zswap_swapoff(int type) {}
 static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
-
+static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
+static inline void zswap_page_swapin(struct page *page) {}
 #endif
 
 #endif /* _LINUX_ZSWAP_H */
-- 
cgit v1.2.3


From 9294a037c01564786abb15436529fae3863268a2 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 30 Nov 2023 02:36:44 +0000
Subject: mm/damon/core: implement goal-oriented feedback-driven quota
 auto-tuning

Patch series "mm/damon: let users feed and tame/auto-tune DAMOS".

Introduce Aim-oriented Feedback-driven DAMOS Aggressiveness Auto-tuning.
It makes DAMOS self-tuned with periodic simple user feedback.

Background: DAMOS Control Difficulty
====================================

DAMOS helps users easily implement access pattern aware system operations.
However, controlling DAMOS in the wild is not that easy.

The basic way for DAMOS control is specifying the target access pattern.
In this approach, the user is assumed to well understand the access
pattern and the characteristics of the system and the workloads.  Though
there are useful tools for that, it takes time and effort depending on the
complexity and the dynamicity of the system and the workloads.  After all,
the access pattern consists of three ranges, namely the size, the access
rate, and the age of the regions.  It means users need to tune six
parameters, which is anyway not a simple task.

One of the worst cases would be DAMOS being too aggressive like a
berserker, and therefore consuming too much system resource and making
unwanted radical system operations.  To let users avoid such cases, DAMOS
allows users to set the upper-limit of the schemes' aggressiveness, namely
DAMOS quota.  DAMOS further provides its best-effort under the limit by
prioritizing regions based on the access pattern of the regions.  For
example, users can ask DAMOS to page out up to 100 MiB of memory regions
per second.  Then DAMOS pages out regions that are not accessed for a
longer time (colder) first under the limit.  This allows users to set the
target access pattern a bit naive with wider ranges, and focus on tuning
only one parameter, the quota.  In other words, the number of parameters
to tune can be reduced from six to one.

Still, however, the optimum value for the quota depends on the system and
the workloads' characteristics, so not that simple.  The number of
parameters to tune can also increase again if the user needs to run
multiple schemes.

Aim-oriented Feedback-driven DAMOS Aggressiveness Auto Tuning
=============================================================

Users would use DAMOS since they want to achieve something with it.  They
will likely have measurable metrics representing the achievement and the
target number of the metric like SLO, and continuously measure that
anyway.  While the additional cost of getting the information is nearly
zero, it could be useful for DAMOS to understand how appropriate its
current aggressiveness is set, and adjust it on its own to make the metric
value more close to the target.

Based on this idea, we introduce a new way of tuning DAMOS with nearly
zero additional effort, namely Aim-oriented Feedback-driven DAMOS
Aggressiveness Auto Tuning.  It asks users to provide feedback
representing how well DAMOS is doing relative to the users' aim.  Then
DAMOS adjusts its aggressiveness, specifically the quota that provides
the best effort result under the limit, based on the current level of
the aggressiveness and the users' feedback.

Implementation
==============

The implementation asks users to represent the feedback with score
numbers.  The scores could be anything including user-space specific
metrics including latency and throughput of special user-space workloads,
and system metrics including free memory ratio, memory pressure stall time
(PSI), and active to inactive LRU lists size ratio.  The feedback scores
and the aggressiveness of the given DAMOS scheme are assumed to be
positively proportional, though.  Selecting metrics of the assumption is
the users' responsibility.

The core logic uses the below simple feedback loop algorithm to calculate
the next aggressiveness level of the scheme from the current
aggressiveness level and the current feedback (target_score and
current_score).  It calculates the compensation for next aggressiveness as
a proportion of current aggressiveness and distance to the target score.
As a result, it arrives at the near-goal state in a short time using big
steps when it's far from the goal, but avoids making unnecessarily radical
changes that could turn out to be a bad decision using small steps when
its near to the goal.

    f(n) = max(1, f(n - 1) * ((target_score - current_score) / target_score + 1))

Note that the compensation value becomes negative when it's over
achieving the goal.  That's why the feedback metric and the
aggressiveness of the scheme should be positively proportional.  The
distance-adaptive speed manipulation is simply applied.

Example Use Cases
=================

If users want to reduce the memory footprint of the system as much as
possible as long as the time spent for handling the resulting memory
pressure is within a threshold, they could use DAMOS scheme that reclaims
cold memory regions aiming for a little level of memory pressure stall
time.

If users want the active/inactive LRU lists well balanced to reduce the
performance impact due to possible future memory pressure, they could use
two schemes.  The first one would be set to locate hot pages in the active
LRU list, aiming for a specific active-to-inactive LRU list size ratio,
say, 70%.  The second one would be to locate cold pages in the inactive
LRU list, aiming for a specific inactive-to-active LRU list size ratio,
say, 30%.  Then, DAMOS will balance the two schemes based on the goal and
feedback.

This aim-oriented auto tuning could also be useful for general
balancing-required access aware system operations such as system memory
auto scaling[3] and tiered memory management[4].  These two example usages
are not what current DAMOS implementation is already supporting, but
require additional DAMOS action developments, though.

Evaluation: subtle memory pressure aiming proactive reclamation
===============================================================

To show if the implementation works as expected, we prepare four different
system configurations on AWS i3.metal instances.  The first setup
(original) runs the workload without any DAMOS scheme.  The second setup
(not-tuned) runs the workload with a virtual address space-based proactive
reclamation scheme that pages out memory regions that are not accessed for
five seconds or more.  The third setup (offline-tuned) runs the same
proactive reclamation DAMOS scheme, but after making it tuned for each
workload offline, using our previous user-space driven automatic tuning
approach, namely DAMOOS[1].  The fourth and final setup (AFDAA) runs the
scheme that is the same as that of 'not-tuned' setup, but aims to keep
0.5% of 'some' memory pressure stall time (PSI) for the last 10 seconds
using the aiming-oriented auto tuning.

For each setup, we run realistic workloads from PARSEC3 and SPLASH-2X
benchmark suites.  For each run, we measure RSS and runtime of the
workload, and 'some' memory pressure stall time (PSI) of the system.  We
repeat the runs five times and use averaged measurements.

For simple comparison of the results, we normalize the measurements to
those of 'original'.  In the case of the PSI, though, the measurement for
'original' was zero, so we normalize the value to that of 'not-tuned'
scheme's result.  The normalized results are shown below.

            Not-tuned         Offline-tuned     AFDAA
    RSS     0.622688178226118 0.787950678944904 0.740093483278979
    runtime 1.11767826657912  1.0564674983585   1.0910833880499
    PSI     1                 0.727521443794069 0.308498846350299

The 'not-tuned' scheme achieves about 38.7% memory saving but incur about
11.7% runtime slowdown.  The 'offline-tuned' scheme achieves about 22.2%
memory saving with about 5.5% runtime slowdown.  It also achieves about
28.2% memory pressure stall time saving.  AFDAA achieves about 26% memory
saving with about 9.1% runtime slowdown.  It also achieves about 69.1%
memory pressure stall time saving.  We repeat this test multiple times,
and get consistent results.  AFDAA is now integrated in our daily DAMON
performance test setup.

Apparently the aggressiveness of 'AFDAA' setup is somewhere between those
of 'not-tuned' and 'offline-tuned' setup, since its memory saving and
runtime overhead are between those of the other two setups.  Actually we
set the memory pressure stall time goal aiming for this middle
aggressiveness.  The difference in the two metrics are not significant,
though.  However, it shows significant saving of the memory pressure stall
time, which was the goal of the auto-tuning, over the two variants.
Hence, we conclude the automatic tuning is working as expected.

Please note that the AFDAA setup is only for the evaluation, and
therefore intentionally set a bit aggressive.  It might not be
appropriate for production environments.

The test code is also available[2], so you could reproduce it on your
system and workloads.

Patches Sequence
================

The first four patches implement the core logic and user interfaces for
the auto tuning.  The first patch implements the core logic for the auto
tuning, and the API for DAMOS users in the kernel space.  The second
patch implements basic file operations of DAMON sysfs directories and
files that will be used for setting the goals and providing the
feedback.  The third patch connects the quota goals files inputs to the
DAMOS core logic.  Finally the fourth patch implements a dedicated DAMOS
sysfs command for efficiently committing the quota goals feedback.

Two patches for simple tests of the logic and interfaces follow.  The
fifth patch implements the core logic unit test.  The sixth patch
implements a selftest for the DAMON Sysfs interface for the goals.

Finally, three patches for documentation follows.  The seventh patch
documents the design of the feature.  The eighth patch updates the API
doc for the new sysfs files.  The final eighth patch updates the usage
document for the features.

References
==========

[1] DAOS paper:
    https://www.amazon.science/publications/daos-data-access-aware-operating-system
[2] Evaluation code:
    https://github.com/damonitor/damon-tests/commit/3f884e61193f0166b8724554b6d06b0c449a712d
[3] Memory auto scaling RFC idea:
    https://lore.kernel.org/damon/20231112195114.61474-1-sj@kernel.org/
[4] DAMON-based tiered memory management RFC idea:
    https://lore.kernel.org/damon/20231112195602.61525-1-sj@kernel.org/


This patch (of 9)

Users can effectively control the upper-limit aggressiveness of DAMOS
schemes using the quota feature.  The quota provides best result under the
limit by prioritizing regions based on the access pattern.  That said,
finding the best value, which could depend on dynamic characteristics of
the system and the workloads, is still challenging.

Implement a simple feedback-driven tuning mechanism and use it for
automatic tuning of DAMOS quota.  The implementation allows users to
provide the feedback by setting a feedback score returning callback
function.  Then DAMOS periodically calls the function back and adjusts the
quota based on the return value of the callback and current quota value.

Note that the absolute-value based time/size quotas still work as the
maximum hard limits of the scheme's aggressiveness.  The feedback-driven
auto-tuned quota is applied only if it is not exceeding the manually set
maximum limits.  Same for the scheme-target access pattern and filters
like other features.

[sj@kernel.org: document get_score_arg field of struct damos_quota]
  Link: https://lkml.kernel.org/r/20231204170106.60992-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20231130023652.50284-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20231130023652.50284-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: David Gow <davidgow@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index ab2f17d9926b..aa34ab433bc5 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -136,6 +136,9 @@ enum damos_action {
  * @weight_nr_accesses:	Weight of the region's nr_accesses for prioritization.
  * @weight_age:		Weight of the region's age for prioritization.
  *
+ * @get_score:		Feedback function for self-tuning quota.
+ * @get_score_arg:	Parameter for @get_score
+ *
  * To avoid consuming too much CPU time or IO resources for applying the
  * &struct damos->action to large memory, DAMON allows users to set time and/or
  * size quotas.  The quotas can be set by writing non-zero values to &ms and
@@ -153,6 +156,17 @@ enum damos_action {
  * You could customize the prioritization logic by setting &weight_sz,
  * &weight_nr_accesses, and &weight_age, because monitoring operations are
  * encouraged to respect those.
+ *
+ * If @get_score function pointer is set, DAMON calls it back with
+ * @get_score_arg and get the return value of it for every @reset_interval.
+ * Then, DAMON adjusts the effective quota using the return value as a feedback
+ * score to the current quota, using its internal feedback loop algorithm.
+ *
+ * The feedback loop algorithem assumes the quota input and the feedback score
+ * output are in a positive proportional relationship, and the goal of the
+ * tuning is getting the feedback screo value of 10,000.  If @ms and/or @sz are
+ * set together, those work as a hard limit quota.  If neither @ms nor @sz are
+ * set, the mechanism starts from the quota of one byte.
  */
 struct damos_quota {
 	unsigned long ms;
@@ -163,6 +177,9 @@ struct damos_quota {
 	unsigned int weight_nr_accesses;
 	unsigned int weight_age;
 
+	unsigned long (*get_score)(void *arg);
+	void *get_score_arg;
+
 /* private: */
 	/* For throughput estimation */
 	unsigned long total_charged_sz;
@@ -179,6 +196,9 @@ struct damos_quota {
 	/* For prioritization */
 	unsigned long histogram[DAMOS_MAX_SCORE + 1];
 	unsigned int min_score;
+
+	/* For feedback loop */
+	unsigned long esz_bp;
 };
 
 /**
-- 
cgit v1.2.3


From 1486fb50136f4799946f5ecfe050094574647153 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Sat, 18 Nov 2023 10:32:28 +0800
Subject: mm: ksm: use more folio api in ksm_might_need_to_copy()

Patch series "mm: cleanup and use more folio in page fault", v3.

Rename page_copy_prealloc() to folio_prealloc(), which is used by more
functions, also do more folio conversion in page fault.


This patch (of 5):

Since ksm only support normal page, no swapout/in for ksm large folio too,
add large folio check in ksm_might_need_to_copy(), also convert
page->index to folio->index as page->index is going away.

Then convert ksm_might_need_to_copy() to use more folio api to save nine
compound_head() calls, short 'address' to reduce max-line-length.

Link: https://lkml.kernel.org/r/20231118023232.1409103-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20231118023232.1409103-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ksm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index c2dd786a30e1..4643d5244e77 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -77,7 +77,7 @@ static inline void ksm_exit(struct mm_struct *mm)
  * but what if the vma was unmerged while the page was swapped out?
  */
 struct page *ksm_might_need_to_copy(struct page *page,
-			struct vm_area_struct *vma, unsigned long address);
+			struct vm_area_struct *vma, unsigned long addr);
 
 void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
 void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
@@ -130,7 +130,7 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 }
 
 static inline struct page *ksm_might_need_to_copy(struct page *page,
-			struct vm_area_struct *vma, unsigned long address)
+			struct vm_area_struct *vma, unsigned long addr)
 {
 	return page;
 }
-- 
cgit v1.2.3


From f67f8d4a8c1e1ebc85a6cbdb9a7266f14863461c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 1 Dec 2023 14:59:36 -0500
Subject: mm/rmap: fix misplaced parenthesis of a likely()

Running my yearly branch profiler to see where likely/unlikely annotation
may be added or removed, I discovered this:

correct incorrect  %        Function                  File              Line
 ------- ---------  -        --------                  ----              ----
       0   457918 100 page_try_dup_anon_rmap         rmap.h               264
[..]
  458021        0   0 page_try_dup_anon_rmap         rmap.h               265

I thought it was interesting that line 264 of rmap.h had a 100% incorrect
annotation, but the line directly below it was 100% correct. Looking at the
code:

	if (likely(!is_device_private_page(page) &&
	    unlikely(page_needs_cow_for_dma(vma, page))))

It didn't make sense. The "likely()" was around the entire if statement
(not just the "!is_device_private_page(page)"), which also included the
"unlikely()" portion of that if condition.

If the unlikely portion is unlikely to be true, that would make the entire
if condition unlikely to be true, so it made no sense at all to say the
entire if condition is true.

What is more likely to be likely is just the first part of the if statement
before the && operation. It's likely to be a misplaced parenthesis. And
after making the if condition broken into a likely() && unlikely(), both
now appear to be correct!

Link: https://lkml.kernel.org/r/20231201145936.5ddfdb50@gandalf.local.home
Fixes:fb3d824d1a46c ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b26fe858fd44..3c2fc291b071 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -261,8 +261,8 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
 	 * guarantee the pinned page won't be randomly replaced in the
 	 * future on write faults.
 	 */
-	if (likely(!is_device_private_page(page) &&
-	    unlikely(page_needs_cow_for_dma(vma, page))))
+	if (likely(!is_device_private_page(page)) &&
+	    unlikely(page_needs_cow_for_dma(vma, page)))
 		return -EBUSY;
 
 	ClearPageAnonExclusive(page);
-- 
cgit v1.2.3


From 82c944d05b1a24c76948ee9d6bb1d7de1ebb8b3a Mon Sep 17 00:00:00 2001
From: Herve Codina <herve.codina@bootlin.com>
Date: Tue, 28 Nov 2023 14:25:30 +0100
Subject: net: wan: Add framer framework support

A framer is a component in charge of an E1/T1 line interface.
Connected usually to a TDM bus, it converts TDM frames to/from E1/T1
frames. It also provides information related to the E1/T1 line.

The framer framework provides a set of APIs for the framer drivers
(framer provider) to create/destroy a framer and APIs for the framer
users (framer consumer) to obtain a reference to the framer, and
use the framer.

This basic implementation provides a framer abstraction for:
 - power on/off the framer
 - get the framer status (line state)
 - be notified on framer status changes
 - get/set the framer configuration

Signed-off-by: Herve Codina <herve.codina@bootlin.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20231128132534.258459-2-herve.codina@bootlin.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/framer/framer-provider.h | 194 +++++++++++++++++++++++++++++++
 include/linux/framer/framer.h          | 205 +++++++++++++++++++++++++++++++++
 2 files changed, 399 insertions(+)
 create mode 100644 include/linux/framer/framer-provider.h
 create mode 100644 include/linux/framer/framer.h

(limited to 'include/linux')

diff --git a/include/linux/framer/framer-provider.h b/include/linux/framer/framer-provider.h
new file mode 100644
index 000000000000..782cd5fc83d5
--- /dev/null
+++ b/include/linux/framer/framer-provider.h
@@ -0,0 +1,194 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Generic framer profider header file
+ *
+ * Copyright 2023 CS GROUP France
+ *
+ * Author: Herve Codina <herve.codina@bootlin.com>
+ */
+
+#ifndef __DRIVERS_PROVIDER_FRAMER_H
+#define __DRIVERS_PROVIDER_FRAMER_H
+
+#include <linux/export.h>
+#include <linux/framer/framer.h>
+#include <linux/types.h>
+
+#define FRAMER_FLAG_POLL_STATUS  BIT(0)
+
+/**
+ * struct framer_ops - set of function pointers for performing framer operations
+ * @init: operation to be performed for initializing the framer
+ * @exit: operation to be performed while exiting
+ * @power_on: powering on the framer
+ * @power_off: powering off the framer
+ * @flags: OR-ed flags (FRAMER_FLAG_*) to ask for core functionality
+ *          - @FRAMER_FLAG_POLL_STATUS:
+ *            Ask the core to perform a polling to get the framer status and
+ *            notify consumers on change.
+ *            The framer should call @framer_notify_status_change() when it
+ *            detects a status change. This is usually done using interrupts.
+ *            If the framer cannot detect this change, it can ask the core for
+ *            a status polling. The core will call @get_status() periodically
+ *            and, on change detected, it will notify the consumer.
+ *            the @get_status()
+ * @owner: the module owner containing the ops
+ */
+struct framer_ops {
+	int	(*init)(struct framer *framer);
+	void	(*exit)(struct framer *framer);
+	int	(*power_on)(struct framer *framer);
+	int	(*power_off)(struct framer *framer);
+
+	/**
+	 * @get_status:
+	 *
+	 * Optional.
+	 *
+	 * Used to get the framer status. framer_init() must have
+	 * been called on the framer.
+	 *
+	 * Returns: 0 if successful, an negative error code otherwise
+	 */
+	int	(*get_status)(struct framer *framer, struct framer_status *status);
+
+	/**
+	 * @set_config:
+	 *
+	 * Optional.
+	 *
+	 * Used to set the framer configuration. framer_init() must have
+	 * been called on the framer.
+	 *
+	 * Returns: 0 if successful, an negative error code otherwise
+	 */
+	int	(*set_config)(struct framer *framer, const struct framer_config *config);
+
+	/**
+	 * @get_config:
+	 *
+	 * Optional.
+	 *
+	 * Used to get the framer configuration. framer_init() must have
+	 * been called on the framer.
+	 *
+	 * Returns: 0 if successful, an negative error code otherwise
+	 */
+	int	(*get_config)(struct framer *framer, struct framer_config *config);
+
+	u32 flags;
+	struct module *owner;
+};
+
+/**
+ * struct framer_provider - represents the framer provider
+ * @dev: framer provider device
+ * @children: can be used to override the default (dev->of_node) child node
+ * @owner: the module owner having of_xlate
+ * @list: to maintain a linked list of framer providers
+ * @of_xlate: function pointer to obtain framer instance from framer pointer
+ */
+struct framer_provider {
+	struct device		*dev;
+	struct module		*owner;
+	struct list_head	list;
+	struct framer * (*of_xlate)(struct device *dev,
+				    struct of_phandle_args *args);
+};
+
+static inline void framer_set_drvdata(struct framer *framer, void *data)
+{
+	dev_set_drvdata(&framer->dev, data);
+}
+
+static inline void *framer_get_drvdata(struct framer *framer)
+{
+	return dev_get_drvdata(&framer->dev);
+}
+
+#if IS_ENABLED(CONFIG_GENERIC_FRAMER)
+
+/* Create and destroy a framer */
+struct framer *framer_create(struct device *dev, struct device_node *node,
+			     const struct framer_ops *ops);
+void framer_destroy(struct framer *framer);
+
+/* devm version */
+struct framer *devm_framer_create(struct device *dev, struct device_node *node,
+				  const struct framer_ops *ops);
+
+struct framer *framer_provider_simple_of_xlate(struct device *dev,
+					       struct of_phandle_args *args);
+
+struct framer_provider *
+__framer_provider_of_register(struct device *dev, struct module *owner,
+			      struct framer *(*of_xlate)(struct device *dev,
+							 struct of_phandle_args *args));
+
+void framer_provider_of_unregister(struct framer_provider *framer_provider);
+
+struct framer_provider *
+__devm_framer_provider_of_register(struct device *dev, struct module *owner,
+				   struct framer *(*of_xlate)(struct device *dev,
+							      struct of_phandle_args *args));
+
+void framer_notify_status_change(struct framer *framer);
+
+#else /* IS_ENABLED(CONFIG_GENERIC_FRAMER) */
+
+static inline struct framer *framer_create(struct device *dev, struct device_node *node,
+					   const struct framer_ops *ops)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline void framer_destroy(struct framer *framer)
+{
+}
+
+/* devm version */
+static inline struct framer *devm_framer_create(struct device *dev, struct device_node *node,
+						const struct framer_ops *ops)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline struct framer *framer_provider_simple_of_xlate(struct device *dev,
+							     struct of_phandle_args *args)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline struct framer_provider *
+__framer_provider_of_register(struct device *dev, struct module *owner,
+			      struct framer *(*of_xlate)(struct device *dev,
+							 struct of_phandle_args *args))
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+void framer_provider_of_unregister(struct framer_provider *framer_provider)
+{
+}
+
+static inline struct framer_provider *
+__devm_framer_provider_of_register(struct device *dev, struct module *owner,
+				   struct framer *(*of_xlate)(struct device *dev,
+							      struct of_phandle_args *args))
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+void framer_notify_status_change(struct framer *framer)
+{
+}
+
+#endif /* IS_ENABLED(CONFIG_GENERIC_FRAMER) */
+
+#define framer_provider_of_register(dev, xlate)		\
+	__framer_provider_of_register((dev), THIS_MODULE, (xlate))
+
+#define devm_framer_provider_of_register(dev, xlate)	\
+	__devm_framer_provider_of_register((dev), THIS_MODULE, (xlate))
+
+#endif /* __DRIVERS_PROVIDER_FRAMER_H */
diff --git a/include/linux/framer/framer.h b/include/linux/framer/framer.h
new file mode 100644
index 000000000000..9a9b88962c29
--- /dev/null
+++ b/include/linux/framer/framer.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Generic framer header file
+ *
+ * Copyright 2023 CS GROUP France
+ *
+ * Author: Herve Codina <herve.codina@bootlin.com>
+ */
+
+#ifndef __DRIVERS_FRAMER_H
+#define __DRIVERS_FRAMER_H
+
+#include <linux/err.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/of.h>
+#include <linux/device.h>
+#include <linux/workqueue.h>
+
+/**
+ * enum framer_iface - Framer interface
+ * @FRAMER_IFACE_E1: E1 interface
+ * @FRAMER_IFACE_T1: T1 interface
+ */
+enum framer_iface {
+	FRAMER_IFACE_E1,
+	FRAMER_IFACE_T1,
+};
+
+/**
+ * enum framer_clock_type - Framer clock type
+ * @FRAMER_CLOCK_EXT: External clock
+ * @FRAMER_CLOCK_INT: Internal clock
+ */
+enum framer_clock_type {
+	FRAMER_CLOCK_EXT,
+	FRAMER_CLOCK_INT,
+};
+
+/**
+ * struct framer_config - Framer configuration
+ * @iface: Framer line interface
+ * @clock_type: Framer clock type
+ * @line_clock_rate: Framer line clock rate
+ */
+struct framer_config {
+	enum framer_iface iface;
+	enum framer_clock_type clock_type;
+	unsigned long line_clock_rate;
+};
+
+/**
+ * struct framer_status - Framer status
+ * @link_is_on: Framer link state. true, the link is on, false, the link is off.
+ */
+struct framer_status {
+	bool link_is_on;
+};
+
+/**
+ * enum framer_event - Event available for notification
+ * @FRAMER_EVENT_STATUS: Event notified on framer_status changes
+ */
+enum framer_event {
+	FRAMER_EVENT_STATUS,
+};
+
+/**
+ * struct framer - represents the framer device
+ * @dev: framer device
+ * @id: id of the framer device
+ * @ops: function pointers for performing framer operations
+ * @mutex: mutex to protect framer_ops
+ * @init_count: used to protect when the framer is used by multiple consumers
+ * @power_count: used to protect when the framer is used by multiple consumers
+ * @pwr: power regulator associated with the framer
+ * @notify_status_work: work structure used for status notifications
+ * @notifier_list: notifier list used for notifications
+ * @polling_work: delayed work structure used for the polling task
+ * @prev_status: previous read status used by the polling task to detect changes
+ */
+struct framer {
+	struct device			dev;
+	int				id;
+	const struct framer_ops		*ops;
+	struct mutex			mutex;	/* Protect framer */
+	int				init_count;
+	int				power_count;
+	struct regulator		*pwr;
+	struct work_struct		notify_status_work;
+	struct blocking_notifier_head	notifier_list;
+	struct delayed_work		polling_work;
+	struct framer_status		prev_status;
+};
+
+#if IS_ENABLED(CONFIG_GENERIC_FRAMER)
+int framer_pm_runtime_get(struct framer *framer);
+int framer_pm_runtime_get_sync(struct framer *framer);
+int framer_pm_runtime_put(struct framer *framer);
+int framer_pm_runtime_put_sync(struct framer *framer);
+int framer_init(struct framer *framer);
+int framer_exit(struct framer *framer);
+int framer_power_on(struct framer *framer);
+int framer_power_off(struct framer *framer);
+int framer_get_status(struct framer *framer, struct framer_status *status);
+int framer_get_config(struct framer *framer, struct framer_config *config);
+int framer_set_config(struct framer *framer, const struct framer_config *config);
+int framer_notifier_register(struct framer *framer, struct notifier_block *nb);
+int framer_notifier_unregister(struct framer *framer, struct notifier_block *nb);
+
+struct framer *framer_get(struct device *dev, const char *con_id);
+void framer_put(struct device *dev, struct framer *framer);
+
+struct framer *devm_framer_get(struct device *dev, const char *con_id);
+struct framer *devm_framer_optional_get(struct device *dev, const char *con_id);
+#else
+static inline int framer_pm_runtime_get(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_pm_runtime_get_sync(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_pm_runtime_put(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_pm_runtime_put_sync(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_init(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_exit(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_power_on(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_power_off(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_get_status(struct framer *framer, struct framer_status *status)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_get_config(struct framer *framer, struct framer_config *config)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_set_config(struct framer *framer, const struct framer_config *config)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_notifier_register(struct framer *framer,
+					   struct notifier_block *nb)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_notifier_unregister(struct framer *framer,
+					     struct notifier_block *nb)
+{
+	return -ENOSYS;
+}
+
+struct framer *framer_get(struct device *dev, const char *con_id)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+void framer_put(struct device *dev, struct framer *framer)
+{
+}
+
+static inline struct framer *devm_framer_get(struct device *dev, const char *con_id)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline struct framer *devm_framer_optional_get(struct device *dev, const char *con_id)
+{
+	return NULL;
+}
+
+#endif
+
+#endif /* __DRIVERS_FRAMER_H */
-- 
cgit v1.2.3


From c96e976d9a05d559f4ac4f617ea0f798c75a1799 Mon Sep 17 00:00:00 2001
From: Herve Codina <herve.codina@bootlin.com>
Date: Tue, 28 Nov 2023 14:25:32 +0100
Subject: net: wan: framer: Add support for the Lantiq PEF2256 framer

The Lantiq PEF2256 is a framer and line interface component designed to
fulfill all required interfacing between an analog E1/T1/J1 line and the
digital PCM system highway/H.100 bus.

Signed-off-by: Herve Codina <herve.codina@bootlin.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20231128132534.258459-4-herve.codina@bootlin.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/framer/pef2256.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 include/linux/framer/pef2256.h

(limited to 'include/linux')

diff --git a/include/linux/framer/pef2256.h b/include/linux/framer/pef2256.h
new file mode 100644
index 000000000000..71d80af58c40
--- /dev/null
+++ b/include/linux/framer/pef2256.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * PEF2256 consumer API
+ *
+ * Copyright 2023 CS GROUP France
+ *
+ * Author: Herve Codina <herve.codina@bootlin.com>
+ */
+#ifndef __PEF2256_H__
+#define __PEF2256_H__
+
+#include <linux/types.h>
+
+struct pef2256;
+struct regmap;
+
+/* Retrieve the PEF2256 regmap */
+struct regmap *pef2256_get_regmap(struct pef2256 *pef2256);
+
+/* PEF2256 hardware versions */
+enum pef2256_version {
+	PEF2256_VERSION_UNKNOWN,
+	PEF2256_VERSION_1_2,
+	PEF2256_VERSION_2_1,
+	PEF2256_VERSION_2_2,
+};
+
+/* Get the PEF2256 hardware version */
+enum pef2256_version pef2256_get_version(struct pef2256 *pef2256);
+
+#endif /* __PEF2256_H__ */
-- 
cgit v1.2.3


From d3bb89ea9c13e5a98d2b7a0ba8e50a77893132cb Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 7 Dec 2023 23:25:25 +0800
Subject: mm: fix VMA heap bounds checking

After converting selinux to VMA heap check helper, the gcl triggers an
execheap SELinux denial, which is caused by a changed logic check.

Previously selinux only checked that the VMA range was within the VMA heap
range, and the implementation checks the intersection between the two
ranges, but the corner case (vm_end=start_brk, brk=vm_start) isn't handled
correctly.

Since commit 11250fd12eb8 ("mm: factor out VMA stack and heap checks") was
only a function extraction, it seems that the issue was introduced by
commit 0db0c01b53a1 ("procfs: fix /proc/<pid>/maps heap check").  Let's
fix above corner cases, meanwhile, correct the wrong indentation of the
stack and heap check helpers.

Fixes: 11250fd12eb8 ("mm: factor out VMA stack and heap checks")
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reported-by: Ondrej Mosnacek <omosnace@redhat.com>
Closes: https://lore.kernel.org/selinux/CAFqZXNv0SVT0fkOK6neP9AXbj3nxJ61JAY4+zJzvxqJaeuhbFw@mail.gmail.com/
Tested-by: Ondrej Mosnacek <omosnace@redhat.com>
Link: https://lkml.kernel.org/r/20231207152525.2607420-1-wangkefeng.wang@huawei.com
Cc: David Hildenbrand <david@redhat.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <stephen.smalley.work@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 418d26608ece..da5219b48d52 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -886,8 +886,8 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
  */
 static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
 {
-       return vma->vm_start <= vma->vm_mm->brk &&
-		vma->vm_end >= vma->vm_mm->start_brk;
+	return vma->vm_start < vma->vm_mm->brk &&
+		vma->vm_end > vma->vm_mm->start_brk;
 }
 
 /*
@@ -901,8 +901,8 @@ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
 	 * its "stack".  It's not even well-defined for programs written
 	 * languages like Go.
 	 */
-       return vma->vm_start <= vma->vm_mm->start_stack &&
-	       vma->vm_end >= vma->vm_mm->start_stack;
+	return vma->vm_start <= vma->vm_mm->start_stack &&
+		vma->vm_end >= vma->vm_mm->start_stack;
 }
 
 static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
-- 
cgit v1.2.3


From 6376a824595607e99d032a39ba3394988b4fce96 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 8 Dec 2023 17:50:18 +0000
Subject: mm/damon/core: make damon_start() waits until kdamond_fn() starts

The cleanup tasks of kdamond threads including reset of corresponding
DAMON context's ->kdamond field and decrease of global nr_running_ctxs
counter is supposed to be executed by kdamond_fn().  However, commit
0f91d13366a4 ("mm/damon: simplify stop mechanism") made neither
damon_start() nor damon_stop() ensure the corresponding kdamond has
started the execution of kdamond_fn().

As a result, the cleanup can be skipped if damon_stop() is called fast
enough after the previous damon_start().  Especially the skipped reset
of ->kdamond could cause a use-after-free.

Fix it by waiting for start of kdamond_fn() execution from
damon_start().

Link: https://lkml.kernel.org/r/20231208175018.63880-1-sj@kernel.org
Fixes: 0f91d13366a4 ("mm/damon: simplify stop mechanism")
Signed-off-by: SeongJae Park <sj@kernel.org>
Reported-by: Jakub Acs <acsjakub@amazon.de>
Cc: Changbin Du <changbin.du@intel.com>
Cc: Jakub Acs <acsjakub@amazon.de>
Cc: <stable@vger.kernel.org> # 5.15.x
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index ab2f17d9926b..e00ddf1ed39c 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -559,6 +559,8 @@ struct damon_ctx {
 	 * update
 	 */
 	unsigned long next_ops_update_sis;
+	/* for waiting until the execution of the kdamond_fn is started */
+	struct completion kdamond_started;
 
 /* public: */
 	struct task_struct *kdamond;
-- 
cgit v1.2.3


From 081488051d28d32569ebb7c7a23572778b2e7d57 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 7 Dec 2023 23:14:04 -0700
Subject: mm/mglru: fix underprotected page cache

Unmapped folios accessed through file descriptors can be underprotected.
Those folios are added to the oldest generation based on:

1. The fact that they are less costly to reclaim (no need to walk the
   rmap and flush the TLB) and have less impact on performance (don't
   cause major PFs and can be non-blocking if needed again).
2. The observation that they are likely to be single-use. E.g., for
   client use cases like Android, its apps parse configuration files
   and store the data in heap (anon); for server use cases like MySQL,
   it reads from InnoDB files and holds the cached data for tables in
   buffer pools (anon).

However, the oldest generation can be very short lived, and if so, it
doesn't provide the PID controller with enough time to respond to a surge
of refaults.  (Note that the PID controller uses weighted refaults and
those from evicted generations only take a half of the whole weight.) In
other words, for a short lived generation, the moving average smooths out
the spike quickly.

To fix the problem:
1. For folios that are already on LRU, if they can be beyond the
   tracking range of tiers, i.e., five accesses through file
   descriptors, move them to the second oldest generation to give them
   more time to age. (Note that tiers are used by the PID controller
   to statistically determine whether folios accessed multiple times
   through file descriptors are worth protecting.)
2. When adding unmapped folios to LRU, adjust the placement of them so
   that they are not too close to the tail. The effect of this is
   similar to the above.

On Android, launching 55 apps sequentially:
                           Before     After      Change
  workingset_refault_anon  25641024   25598972   0%
  workingset_refault_file  115016834  106178438  -8%

Link: https://lkml.kernel.org/r/20231208061407.2125867-1-yuzhao@google.com
Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: Charan Teja Kalla <quic_charante@quicinc.com>
Tested-by: Kalesh Singh <kaleshsingh@google.com>
Cc: T.J. Mercier <tjmercier@google.com>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_inline.h | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 9ae7def16cb2..f4fe593c1400 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -232,22 +232,27 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
 	if (folio_test_unevictable(folio) || !lrugen->enabled)
 		return false;
 	/*
-	 * There are three common cases for this page:
-	 * 1. If it's hot, e.g., freshly faulted in or previously hot and
-	 *    migrated, add it to the youngest generation.
-	 * 2. If it's cold but can't be evicted immediately, i.e., an anon page
-	 *    not in swapcache or a dirty page pending writeback, add it to the
-	 *    second oldest generation.
-	 * 3. Everything else (clean, cold) is added to the oldest generation.
+	 * There are four common cases for this page:
+	 * 1. If it's hot, i.e., freshly faulted in, add it to the youngest
+	 *    generation, and it's protected over the rest below.
+	 * 2. If it can't be evicted immediately, i.e., a dirty page pending
+	 *    writeback, add it to the second youngest generation.
+	 * 3. If it should be evicted first, e.g., cold and clean from
+	 *    folio_rotate_reclaimable(), add it to the oldest generation.
+	 * 4. Everything else falls between 2 & 3 above and is added to the
+	 *    second oldest generation if it's considered inactive, or the
+	 *    oldest generation otherwise. See lru_gen_is_active().
 	 */
 	if (folio_test_active(folio))
 		seq = lrugen->max_seq;
 	else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
 		 (folio_test_reclaim(folio) &&
 		  (folio_test_dirty(folio) || folio_test_writeback(folio))))
-		seq = lrugen->min_seq[type] + 1;
-	else
+		seq = lrugen->max_seq - 1;
+	else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq)
 		seq = lrugen->min_seq[type];
+	else
+		seq = lrugen->min_seq[type] + 1;
 
 	gen = lru_gen_from_seq(seq);
 	flags = (gen + 1UL) << LRU_GEN_PGOFF;
-- 
cgit v1.2.3


From 8aa420617918d12d1f5d55030a503c9418e73c2c Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 7 Dec 2023 23:14:06 -0700
Subject: mm/mglru: respect min_ttl_ms with memcgs

While investigating kswapd "consuming 100% CPU" [1] (also see "mm/mglru:
try to stop at high watermarks"), it was discovered that the memcg LRU can
breach the thrashing protection imposed by min_ttl_ms.

Before the memcg LRU:
  kswapd()
    shrink_node_memcgs()
      mem_cgroup_iter()
        inc_max_seq()  // always hit a different memcg
    lru_gen_age_node()
      mem_cgroup_iter()
        check the timestamp of the oldest generation

After the memcg LRU:
  kswapd()
    shrink_many()
      restart:
        iterate the memcg LRU:
          inc_max_seq()  // occasionally hit the same memcg
          if raced with lru_gen_rotate_memcg():
            goto restart
    lru_gen_age_node()
      mem_cgroup_iter()
        check the timestamp of the oldest generation

Specifically, when the restart happens in shrink_many(), it needs to stick
with the (memcg LRU) generation it began with.  In other words, it should
neither re-read memcg_lru->seq nor age an lruvec of a different
generation.  Otherwise it can hit the same memcg multiple times without
giving lru_gen_age_node() a chance to check the timestamp of that memcg's
oldest generation (against min_ttl_ms).

[1] https://lore.kernel.org/CAK8fFZ4DY+GtBA40Pm7Nn5xCHy+51w3sfxPqkqpqakSXYyX+Wg@mail.gmail.com/

Link: https://lkml.kernel.org/r/20231208061407.2125867-3-yuzhao@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: T.J. Mercier <tjmercier@google.com>
Cc: Charan Teja Kalla <quic_charante@quicinc.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3c25226beeed..23533b12bee2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -505,33 +505,37 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
  * the old generation, is incremented when all its bins become empty.
  *
  * There are four operations:
- * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
+ * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its
  *    current generation (old or young) and updates its "seg" to "head";
- * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
+ * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its
  *    current generation (old or young) and updates its "seg" to "tail";
- * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
+ * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old
  *    generation, updates its "gen" to "old" and resets its "seg" to "default";
- * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
+ * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the
  *    young generation, updates its "gen" to "young" and resets its "seg" to
  *    "default".
  *
  * The events that trigger the above operations are:
  * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
- * 2. The first attempt to reclaim an memcg below low, which triggers
+ * 2. The first attempt to reclaim a memcg below low, which triggers
  *    MEMCG_LRU_TAIL;
- * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
+ * 3. The first attempt to reclaim a memcg below reclaimable size threshold,
  *    which triggers MEMCG_LRU_TAIL;
- * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
+ * 4. The second attempt to reclaim a memcg below reclaimable size threshold,
  *    which triggers MEMCG_LRU_YOUNG;
- * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
+ * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG;
  * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
- * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
+ * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
  *
- * Note that memcg LRU only applies to global reclaim, and the round-robin
- * incrementing of their max_seq counters ensures the eventual fairness to all
- * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
+ * Notes:
+ * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing
+ *    of their max_seq counters ensures the eventual fairness to all eligible
+ *    memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
+ * 2. There are only two valid generations: old (seq) and young (seq+1).
+ *    MEMCG_NR_GENS is set to three so that when reading the generation counter
+ *    locklessly, a stale value (seq-1) does not wraparound to young.
  */
-#define MEMCG_NR_GENS	2
+#define MEMCG_NR_GENS	3
 #define MEMCG_NR_BINS	8
 
 struct lru_gen_memcg {
-- 
cgit v1.2.3


From 4376807bf2d5371c3e00080c972be568c3f8a7d1 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 7 Dec 2023 23:14:07 -0700
Subject: mm/mglru: reclaim offlined memcgs harder

In the effort to reduce zombie memcgs [1], it was discovered that the
memcg LRU doesn't apply enough pressure on offlined memcgs.  Specifically,
instead of rotating them to the tail of the current generation
(MEMCG_LRU_TAIL) for a second attempt, it moves them to the next
generation (MEMCG_LRU_YOUNG) after the first attempt.

Not applying enough pressure on offlined memcgs can cause them to build
up, and this can be particularly harmful to memory-constrained systems.

On Pixel 8 Pro, launching apps for 50 cycles:
                 Before  After  Change
  Zombie memcgs  45      35     -22%

[1] https://lore.kernel.org/CABdmKX2M6koq4Q0Cmp_-=wbP0Qa190HdEGGaHfxNS05gAkUtPA@mail.gmail.com/

Link: https://lkml.kernel.org/r/20231208061407.2125867-4-yuzhao@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: T.J. Mercier <tjmercier@google.com>
Tested-by: T.J. Mercier <tjmercier@google.com>
Cc: Charan Teja Kalla <quic_charante@quicinc.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 23533b12bee2..9db36e197712 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -519,10 +519,10 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
  * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
  * 2. The first attempt to reclaim a memcg below low, which triggers
  *    MEMCG_LRU_TAIL;
- * 3. The first attempt to reclaim a memcg below reclaimable size threshold,
- *    which triggers MEMCG_LRU_TAIL;
- * 4. The second attempt to reclaim a memcg below reclaimable size threshold,
- *    which triggers MEMCG_LRU_YOUNG;
+ * 3. The first attempt to reclaim a memcg offlined or below reclaimable size
+ *    threshold, which triggers MEMCG_LRU_TAIL;
+ * 4. The second attempt to reclaim a memcg offlined or below reclaimable size
+ *    threshold, which triggers MEMCG_LRU_YOUNG;
  * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG;
  * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
  * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
-- 
cgit v1.2.3


From 05ce71929efc79f5978589e0456a54eb0fe6485e Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 27 Nov 2023 16:19:31 +0100
Subject: PM: domains: Drop the unused pm_genpd_opp_to_performance_state()

Since commit 7c41cdcd3bbe ("OPP: Simplify the over-designed pstate <->
level dance"), there is no longer any users of the
pm_genpd_opp_to_performance_state() API. Let's therefore drop it and its
corresponding ->opp_to_performance_state() callback, which also no longer
has any users.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231127151931.47055-1-ulf.hansson@linaro.org
---
 include/linux/pm_domain.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 34663d0d5c55..b97c5e9820f9 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -118,7 +118,6 @@ struct genpd_power_state {
 };
 
 struct genpd_lock_ops;
-struct dev_pm_opp;
 struct opp_table;
 
 struct generic_pm_domain {
@@ -146,8 +145,6 @@ struct generic_pm_domain {
 	int (*power_on)(struct generic_pm_domain *domain);
 	struct raw_notifier_head power_notifiers; /* Power on/off notifiers */
 	struct opp_table *opp_table;	/* OPP table of the genpd */
-	unsigned int (*opp_to_performance_state)(struct generic_pm_domain *genpd,
-						 struct dev_pm_opp *opp);
 	int (*set_performance_state)(struct generic_pm_domain *genpd,
 				     unsigned int state);
 	struct gpd_dev_ops dev_ops;
@@ -348,8 +345,6 @@ int of_genpd_remove_subdomain(struct of_phandle_args *parent_spec,
 struct generic_pm_domain *of_genpd_remove_last(struct device_node *np);
 int of_genpd_parse_idle_states(struct device_node *dn,
 			       struct genpd_power_state **states, int *n);
-unsigned int pm_genpd_opp_to_performance_state(struct device *genpd_dev,
-					       struct dev_pm_opp *opp);
 
 int genpd_dev_pm_attach(struct device *dev);
 struct device *genpd_dev_pm_attach_by_id(struct device *dev,
@@ -395,13 +390,6 @@ static inline int of_genpd_parse_idle_states(struct device_node *dn,
 	return -ENODEV;
 }
 
-static inline unsigned int
-pm_genpd_opp_to_performance_state(struct device *genpd_dev,
-				  struct dev_pm_opp *opp)
-{
-	return 0;
-}
-
 static inline int genpd_dev_pm_attach(struct device *dev)
 {
 	return 0;
-- 
cgit v1.2.3


From 4f7aa122bc9219baca0bfface5917062d6c45ee8 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Thu, 7 Dec 2023 16:12:04 +0100
Subject: dpll: remove leftover mode_supported() op and use mode_get() instead

Mode supported is currently reported to the user exactly the same, as
the current mode. That's because mode changing is not implemented.
Remove the leftover mode_supported() op and use mode_get() to fill up
the supported mode exposed to user.

One, if even, mode changing is going to be introduced, this could be
very easily taken back. In the meantime, prevent drivers form
implementing this in wrong way (as for example recent netdevsim
implementation attempt intended to do).

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dpll.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 578fc5fa3750..b1a5f9ca8ee5 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -17,9 +17,6 @@ struct dpll_pin;
 struct dpll_device_ops {
 	int (*mode_get)(const struct dpll_device *dpll, void *dpll_priv,
 			enum dpll_mode *mode, struct netlink_ext_ack *extack);
-	bool (*mode_supported)(const struct dpll_device *dpll, void *dpll_priv,
-			       const enum dpll_mode mode,
-			       struct netlink_ext_ack *extack);
 	int (*lock_status_get)(const struct dpll_device *dpll, void *dpll_priv,
 			       enum dpll_lock_status *status,
 			       struct netlink_ext_ack *extack);
-- 
cgit v1.2.3


From 59b3e31e73322ec195e45e0a1da712c752ee1b0c Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Tue, 28 Nov 2023 04:00:10 +0000
Subject: leds: trigger: netdev: Extend speeds up to 10G

Add 2.5G, 5G and 10G as available speeds to the netdev LED trigger.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/99e7d3304c6bba7f4863a4a80764a869855f2085.1701143925.git.daniel@makrotopia.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/leds.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index aa16dc2a8230..1bdf7f5a0d7c 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -588,6 +588,9 @@ enum led_trigger_netdev_modes {
 	TRIGGER_NETDEV_LINK_10,
 	TRIGGER_NETDEV_LINK_100,
 	TRIGGER_NETDEV_LINK_1000,
+	TRIGGER_NETDEV_LINK_2500,
+	TRIGGER_NETDEV_LINK_5000,
+	TRIGGER_NETDEV_LINK_10000,
 	TRIGGER_NETDEV_HALF_DUPLEX,
 	TRIGGER_NETDEV_FULL_DUPLEX,
 	TRIGGER_NETDEV_TX,
-- 
cgit v1.2.3


From 4ff4379ce6eefe81695bcc2e021ce1dac3d707d2 Mon Sep 17 00:00:00 2001
From: Florian Eckert <fe@dev.tdt.de>
Date: Mon, 27 Nov 2023 12:03:08 +0100
Subject: tty: add new helper function tty_get_tiocm

There is no in-kernel function to get the status register of a tty device
like the TIOCMGET ioctl returns to userspace. Create a new function,
tty_get_tiocm(), to obtain the status register that other portions of the
kernel can call if they need this information, and move the existing
internal tty_tiocmget() function to use this interface.

Signed-off-by: Florian Eckert <fe@dev.tdt.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20231127110311.3583957-2-fe@dev.tdt.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/tty.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 4b6340ac2af2..d219a11e3fe0 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -419,6 +419,7 @@ bool tty_unthrottle_safe(struct tty_struct *tty);
 int tty_do_resize(struct tty_struct *tty, struct winsize *ws);
 int tty_get_icount(struct tty_struct *tty,
 		struct serial_icounter_struct *icount);
+int tty_get_tiocm(struct tty_struct *tty);
 int is_current_pgrp_orphaned(void);
 void tty_hangup(struct tty_struct *tty);
 void tty_vhangup(struct tty_struct *tty);
-- 
cgit v1.2.3


From bdc22c8d52d70fc5655ab4dbf72fa79b034bb7b5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 5 Dec 2023 20:18:39 +0100
Subject: thermal: trip: Send trip change notifications on all trip updates

The _store callbacks of the trip point temperature and hysteresis sysfs
attributes invoke thermal_notify_tz_trip_change() to send a notification
regarding the trip point change, but when trip points are updated by the
platform firmware, trip point change notifications are not sent.

To make the behavior after a trip point change more consistent,
modify all of the 3 places where trip point temperature is updated
to use a new function called thermal_zone_set_trip_temp() for this
purpose and make that function call thermal_notify_tz_trip_change().

Note that trip point hysteresis can only be updated via sysfs and
trip_point_hyst_store() calls thermal_notify_tz_trip_change() already,
so this code path need not be changed.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 include/linux/thermal.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index bedbaec9a42e..09f6eb82c191 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -291,6 +291,8 @@ int thermal_zone_for_each_trip(struct thermal_zone_device *tz,
 			       int (*cb)(struct thermal_trip *, void *),
 			       void *data);
 int thermal_zone_get_num_trips(struct thermal_zone_device *tz);
+void thermal_zone_set_trip_temp(struct thermal_zone_device *tz,
+				struct thermal_trip *trip, int temp);
 
 int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp);
 
-- 
cgit v1.2.3


From ad6534c626fedd818718d76c36d69c7d8e7b61cc Mon Sep 17 00:00:00 2001
From: Shuai Xue <xueshuai@linux.alibaba.com>
Date: Fri, 8 Dec 2023 10:56:49 +0800
Subject: PCI: Add Alibaba Vendor ID to linux/pci_ids.h

The Alibaba Vendor ID (0x1ded) is now used by Alibaba elasticRDMA ("erdma")
and will be shared with the upcoming PCIe PMU ("dwc_pcie_pmu"). Move the
Vendor ID to linux/pci_ids.h so that it can shared by several drivers
later.

Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>	# pci_ids.h
Tested-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Link: https://lore.kernel.org/r/20231208025652.87192-3-xueshuai@linux.alibaba.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/pci_ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 275799b5f535..844ffdac8d7d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2605,6 +2605,8 @@
 #define PCI_VENDOR_ID_TEKRAM		0x1de1
 #define PCI_DEVICE_ID_TEKRAM_DC290	0xdc29
 
+#define PCI_VENDOR_ID_ALIBABA		0x1ded
+
 #define PCI_VENDOR_ID_TEHUTI		0x1fc9
 #define PCI_DEVICE_ID_TEHUTI_3009	0x3009
 #define PCI_DEVICE_ID_TEHUTI_3010	0x3010
-- 
cgit v1.2.3


From ac16087134b837d42b75bb1c741070b6c142f258 Mon Sep 17 00:00:00 2001
From: Shuai Xue <xueshuai@linux.alibaba.com>
Date: Fri, 8 Dec 2023 10:56:50 +0800
Subject: PCI: Move pci_clear_and_set_dword() helper to PCI header

The clear and set pattern is commonly used for accessing PCI config,
move the helper pci_clear_and_set_dword() from aspm.c into PCI header.
In addition, rename to pci_clear_and_set_config_dword() to retain the
"config" information and match the other accessors.

No functional change intended.

Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Link: https://lore.kernel.org/r/20231208025652.87192-4-xueshuai@linux.alibaba.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/pci.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60ca768bc867..268c4bd98ef3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1239,6 +1239,8 @@ int pci_read_config_dword(const struct pci_dev *dev, int where, u32 *val);
 int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val);
 int pci_write_config_word(const struct pci_dev *dev, int where, u16 val);
 int pci_write_config_dword(const struct pci_dev *dev, int where, u32 val);
+void pci_clear_and_set_config_dword(const struct pci_dev *dev, int pos,
+				    u32 clear, u32 set);
 
 int pcie_capability_read_word(struct pci_dev *dev, int pos, u16 *val);
 int pcie_capability_read_dword(struct pci_dev *dev, int pos, u32 *val);
-- 
cgit v1.2.3


From c82a1662d4548c454de5343b88f69b9fc82266b3 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 8 Dec 2023 23:56:41 +0100
Subject: leds: trigger: Remove unused function led_trigger_rename_static()

This function was added with a8df7b1ab70b ("leds: add led_trigger_rename
function") 11 yrs ago, but it has no users. So remove it.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://lore.kernel.org/r/d90f30be-f661-4db7-b0b5-d09d07a78a68@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/leds.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index 1bdf7f5a0d7c..4754b02d3a2c 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -527,23 +527,6 @@ static inline void *led_get_trigger_data(struct led_classdev *led_cdev)
 	return led_cdev->trigger_data;
 }
 
-/**
- * led_trigger_rename_static - rename a trigger
- * @name: the new trigger name
- * @trig: the LED trigger to rename
- *
- * Change a LED trigger name by copying the string passed in
- * name into current trigger name, which MUST be large
- * enough for the new string.
- *
- * Note that name must NOT point to the same string used
- * during LED registration, as that could lead to races.
- *
- * This is meant to be used on triggers with statically
- * allocated name.
- */
-void led_trigger_rename_static(const char *name, struct led_trigger *trig);
-
 #define module_led_trigger(__led_trigger) \
 	module_driver(__led_trigger, led_trigger_register, \
 		      led_trigger_unregister)
-- 
cgit v1.2.3


From 595e52284d24adc376890d3fc93bdca4707d9aca Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 13 Dec 2023 08:58:15 -0700
Subject: io_uring/poll: don't enable lazy wake for POLLEXCLUSIVE

There are a few quirks around using lazy wake for poll unconditionally,
and one of them is related the EPOLLEXCLUSIVE. Those may trigger
exclusive wakeups, which wake a limited number of entries in the wait
queue. If that wake number is less than the number of entries someone is
waiting for (and that someone is also using DEFER_TASKRUN), then we can
get stuck waiting for more entries while we should be processing the ones
we already got.

If we're doing exclusive poll waits, flag the request as not being
compatible with lazy wakeups.

Reported-by: Pavel Begunkov <asml.silence@gmail.com>
Fixes: 6ce4a93dbb5b ("io_uring/poll: use IOU_F_TWQ_LAZY_WAKE for wakeups")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 805bb635cdf5..239a4f68801b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -434,6 +434,7 @@ enum {
 	/* keep async read/write and isreg together and in order */
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
+	REQ_F_POLL_NO_LAZY_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -501,6 +502,8 @@ enum {
 	REQ_F_CLEAR_POLLIN	= BIT(REQ_F_CLEAR_POLLIN_BIT),
 	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
 	REQ_F_HASH_LOCKED	= BIT(REQ_F_HASH_LOCKED_BIT),
+	/* don't use lazy poll wake for this request */
+	REQ_F_POLL_NO_LAZY	= BIT(REQ_F_POLL_NO_LAZY_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
-- 
cgit v1.2.3


From 3c6b0c1c28184038d90dffe8eb542bedcb8ccf98 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 30 Nov 2023 14:27:29 +0100
Subject: srcu: Use try-lock lockdep annotation for NMI-safe access.

It is claimed that srcu_read_lock_nmisafe() NMI-safe. However it
triggers a lockdep if used from NMI because lockdep expects a deadlock
since nothing disables NMIs while the lock is acquired.

This is because commit f0f44752f5f61 ("rcu: Annotate SRCU's update-side
lockdep dependencies") annotates synchronize_srcu() as a write lock
usage. This helps to detect a deadlocks such as
	srcu_read_lock();
	synchronize_srcu();
	srcu_read_unlock();

The side effect is that the lock srcu_struct now has a USED usage in normal
contexts, so it conflicts with a USED_READ usage in NMI. But this shouldn't
cause a real deadlock because the write lock usage from synchronize_srcu()
is a fake one and only used for read/write deadlock detection.

Use a try-lock annotation for srcu_read_lock_nmisafe() to avoid lockdep
complains if used from NMI.

Fixes: f0f44752f5f6 ("rcu: Annotate SRCU's update-side lockdep dependencies")
Link: https://lore.kernel.org/r/20230927160231.XRCDDSK4@linutronix.de
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 include/linux/rcupdate.h | 6 ++++++
 include/linux/srcu.h     | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f7206b2623c9..31d523c4e089 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -301,6 +301,11 @@ static inline void rcu_lock_acquire(struct lockdep_map *map)
 	lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
 }
 
+static inline void rcu_try_lock_acquire(struct lockdep_map *map)
+{
+	lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_);
+}
+
 static inline void rcu_lock_release(struct lockdep_map *map)
 {
 	lock_release(map, _THIS_IP_);
@@ -315,6 +320,7 @@ int rcu_read_lock_any_held(void);
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 # define rcu_lock_acquire(a)		do { } while (0)
+# define rcu_try_lock_acquire(a)	do { } while (0)
 # define rcu_lock_release(a)		do { } while (0)
 
 static inline int rcu_read_lock_held(void)
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 127ef3b2e607..236610e4a8fa 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -229,7 +229,7 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp
 
 	srcu_check_nmi_safety(ssp, true);
 	retval = __srcu_read_lock_nmisafe(ssp);
-	rcu_lock_acquire(&ssp->dep_map);
+	rcu_try_lock_acquire(&ssp->dep_map);
 	return retval;
 }
 
-- 
cgit v1.2.3


From 493dffa3ab07b5d2c0b7bd5de5bff6e85f01f52a Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Wed, 20 Sep 2023 11:22:12 +0200
Subject: rculist.h: docs: Fix wrong function summary

The brief summary in the docstring for function list_next_or_null_rcu()
states that the function is supposed to provide the "first" member of a
list, whereas in truth it returns the next member.

Change the docstring so it describes what the function actually does.

Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 include/linux/rculist.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index d29740be4833..3dc1e58865f7 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -355,7 +355,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
 })
 
 /**
- * list_next_or_null_rcu - get the first element from a list
+ * list_next_or_null_rcu - get the next element from a list
  * @head:	the head for the list.
  * @ptr:        the list head to take the next element from.
  * @type:       the type of the struct this is embedded in.
-- 
cgit v1.2.3


From 750e785796bb72423b97cac21ecd0fa3b3b65610 Mon Sep 17 00:00:00 2001
From: Jie Jiang <jiejiang@chromium.org>
Date: Tue, 12 Dec 2023 09:39:23 +0000
Subject: bpf: Support uid and gid when mounting bpffs

Parse uid and gid in bpf_parse_param() so that they can be passed in as
the `data` parameter when mount() bpffs. This will be useful when we
want to control which user/group has the control to the mounted bpffs,
otherwise a separate chown() call will be needed.

Signed-off-by: Jie Jiang <jiejiang@chromium.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Mike Frysinger <vapier@chromium.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231212093923.497838-1-jiejiang@chromium.org
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0bd4889e917a..c87c608a3689 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1595,6 +1595,8 @@ struct bpf_link_primer {
 };
 
 struct bpf_mount_opts {
+	kuid_t uid;
+	kgid_t gid;
 	umode_t mode;
 
 	/* BPF token-related delegation options */
-- 
cgit v1.2.3


From 537fec0733c4a72e2a2b69fee365459c5b75d92e Mon Sep 17 00:00:00 2001
From: Larysa Zaremba <larysa.zaremba@intel.com>
Date: Tue, 5 Dec 2023 22:08:42 +0100
Subject: net: make vlan_get_tag() return -ENODATA instead of -EINVAL

__vlan_hwaccel_get_tag() is used in veth XDP hints implementation,
its return value (-EINVAL if skb is not VLAN tagged) is passed to bpf code,
but XDP hints specification requires drivers to return -ENODATA, if a hint
cannot be provided for a particular packet.

Solve this inconsistency by changing error return value of
__vlan_hwaccel_get_tag() from -EINVAL to -ENODATA, do the same thing to
__vlan_get_tag(), because this function is supposed to follow the same
convention. This, in turn, makes -ENODATA the only non-zero value
vlan_get_tag() can return. We can do this with no side effects, because
none of the users of the 3 above-mentioned functions rely on the exact
value.

Suggested-by: Jesper Dangaard Brouer <jbrouer@redhat.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Link: https://lore.kernel.org/r/20231205210847.28460-14-larysa.zaremba@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/if_vlan.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 3028af87716e..c1645c86eed9 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -540,7 +540,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
 	struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb);
 
 	if (!eth_type_vlan(veth->h_vlan_proto))
-		return -EINVAL;
+		return -ENODATA;
 
 	*vlan_tci = ntohs(veth->h_vlan_TCI);
 	return 0;
@@ -561,7 +561,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb,
 		return 0;
 	} else {
 		*vlan_tci = 0;
-		return -EINVAL;
+		return -ENODATA;
 	}
 }
 
-- 
cgit v1.2.3


From 7978bad4b6b9265a1e808a5f679ee428d1dd6523 Mon Sep 17 00:00:00 2001
From: Larysa Zaremba <larysa.zaremba@intel.com>
Date: Tue, 5 Dec 2023 22:08:43 +0100
Subject: mlx5: implement VLAN tag XDP hint

Implement the newly added .xmo_rx_vlan_tag() hint function.

Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Link: https://lore.kernel.org/r/20231205210847.28460-15-larysa.zaremba@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/mlx5/device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 820bca965fb6..01275c6e8468 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -918,7 +918,7 @@ static inline u8 get_cqe_tls_offload(struct mlx5_cqe64 *cqe)
 	return (cqe->tls_outer_l3_tunneled >> 3) & 0x3;
 }
 
-static inline bool cqe_has_vlan(struct mlx5_cqe64 *cqe)
+static inline bool cqe_has_vlan(const struct mlx5_cqe64 *cqe)
 {
 	return cqe->l4_l3_hdr_type & 0x1;
 }
-- 
cgit v1.2.3


From 0c476157085fe2ad13b9bec70ea672e86647fa1a Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Tue, 12 Dec 2023 06:41:43 +0100
Subject: net: phy: c45: add genphy_c45_pma_read_ext_abilities() function

Move part of the genphy_c45_pma_read_abilities() code to a separate
function.

Some PHYs do not implement PMA/PMD status 2 register (Register 1.8) but
do implement PMA/PMD extended ability register (Register 1.11). To make
use of it, we need to be able to access this part of code separately.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/20231212054144.87527-2-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 6e7ebcc50b85..dbb5e13e3e1b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1866,6 +1866,7 @@ int genphy_c45_an_config_aneg(struct phy_device *phydev);
 int genphy_c45_an_disable_aneg(struct phy_device *phydev);
 int genphy_c45_read_mdix(struct phy_device *phydev);
 int genphy_c45_pma_read_abilities(struct phy_device *phydev);
+int genphy_c45_pma_read_ext_abilities(struct phy_device *phydev);
 int genphy_c45_pma_baset1_read_abilities(struct phy_device *phydev);
 int genphy_c45_read_eee_abilities(struct phy_device *phydev);
 int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev);
-- 
cgit v1.2.3


From 13049408a4bd29c92227ca2d6befab80dbb96663 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Sun, 7 May 2023 16:47:42 +0300
Subject: net/mlx5: Add mlx5_ifc bits used for supporting single netdev
 Socket-Direct

Multiple device caps and features are required to support
single netdev Socket-Direct.
Add them here in preparation for the feature implementation.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index ce2e71cd6d2a..405d141b4a08 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -435,7 +435,7 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         flow_table_modify[0x1];
 	u8         reformat[0x1];
 	u8         decap[0x1];
-	u8         reserved_at_9[0x1];
+	u8         reset_root_to_default[0x1];
 	u8         pop_vlan[0x1];
 	u8         push_vlan[0x1];
 	u8         reserved_at_c[0x1];
@@ -1801,7 +1801,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         disable_local_lb_uc[0x1];
 	u8         disable_local_lb_mc[0x1];
 	u8         log_min_hairpin_wq_data_sz[0x5];
-	u8         reserved_at_3e8[0x2];
+	u8         reserved_at_3e8[0x1];
+	u8         silent_mode[0x1];
 	u8         vhca_state[0x1];
 	u8         log_max_vlan_list[0x5];
 	u8         reserved_at_3f0[0x3];
@@ -1818,7 +1819,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         reserved_at_460[0x1];
 	u8         ats[0x1];
-	u8         reserved_at_462[0x1];
+	u8         cross_vhca_rqt[0x1];
 	u8         log_max_uctx[0x5];
 	u8         reserved_at_468[0x1];
 	u8         crypto[0x1];
@@ -1943,6 +1944,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 enum {
 	MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_TO_REMOTE_FLOW_TABLE_MISS  = 0x80000,
+	MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_ROOT_TO_REMOTE_FLOW_TABLE  = (1ULL << 20),
 };
 
 enum {
@@ -1992,7 +1994,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_260[0x120];
 	u8	   reserved_at_380[0x10];
 	u8	   ec_vf_vport_base[0x10];
-	u8	   reserved_at_3a0[0x460];
+
+	u8	   reserved_at_3a0[0x10];
+	u8	   max_rqt_vhca_id[0x10];
+
+	u8	   reserved_at_3c0[0x440];
 };
 
 enum mlx5_ifc_flow_destination_type {
@@ -2151,6 +2157,13 @@ struct mlx5_ifc_rq_num_bits {
 	u8         rq_num[0x18];
 };
 
+struct mlx5_ifc_rq_vhca_bits {
+	u8         reserved_at_0[0x8];
+	u8         rq_num[0x18];
+	u8         reserved_at_20[0x10];
+	u8         rq_vhca_id[0x10];
+};
+
 struct mlx5_ifc_mac_address_layout_bits {
 	u8         reserved_at_0[0x10];
 	u8         mac_addr_47_32[0x10];
@@ -3901,7 +3914,10 @@ struct mlx5_ifc_rqtc_bits {
 
 	u8    reserved_at_e0[0x6a0];
 
-	struct mlx5_ifc_rq_num_bits rq_num[];
+	union {
+		DECLARE_FLEX_ARRAY(struct mlx5_ifc_rq_num_bits, rq_num);
+		DECLARE_FLEX_ARRAY(struct mlx5_ifc_rq_vhca_bits, rq_vhca);
+	};
 };
 
 enum {
@@ -4744,7 +4760,10 @@ struct mlx5_ifc_set_l2_table_entry_in_bits {
 
 	u8         reserved_at_c0[0x20];
 
-	u8         reserved_at_e0[0x13];
+	u8         reserved_at_e0[0x10];
+	u8         silent_mode_valid[0x1];
+	u8         silent_mode[0x1];
+	u8         reserved_at_f2[0x1];
 	u8         vlan_valid[0x1];
 	u8         vlan[0xc];
 
-- 
cgit v1.2.3


From f5e956329960903d908668d7a20bbc08e0a8b92b Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Mon, 7 Aug 2023 09:05:34 +0300
Subject: net/mlx5: Expose Management PCIe Index Register (MPIR)

MPIR register allows to query the PCIe indexes
and Socket-Direct related parameters.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d2b8d4a74a30..2f67cec1a898 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -150,6 +150,7 @@ enum {
 	MLX5_REG_MTPPSE		 = 0x9054,
 	MLX5_REG_MTUTC		 = 0x9055,
 	MLX5_REG_MPEGC		 = 0x9056,
+	MLX5_REG_MPIR		 = 0x9059,
 	MLX5_REG_MCQS		 = 0x9060,
 	MLX5_REG_MCQI		 = 0x9061,
 	MLX5_REG_MCC		 = 0x9062,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 405d141b4a08..828938368fb7 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10108,6 +10108,20 @@ struct mlx5_ifc_mpegc_reg_bits {
 	u8         reserved_at_60[0x100];
 };
 
+struct mlx5_ifc_mpir_reg_bits {
+	u8         sdm[0x1];
+	u8         reserved_at_1[0x1b];
+	u8         host_buses[0x4];
+
+	u8         reserved_at_20[0x20];
+
+	u8         local_port[0x8];
+	u8         reserved_at_28[0x15];
+	u8         sd_group[0x3];
+
+	u8         reserved_at_60[0x20];
+};
+
 enum {
 	MLX5_MTUTC_FREQ_ADJ_UNITS_PPB          = 0x0,
 	MLX5_MTUTC_FREQ_ADJ_UNITS_SCALED_PPM   = 0x1,
-- 
cgit v1.2.3


From b25bd37c859f32e50a436ab9d2078b76e433008e Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Sun, 6 Aug 2023 14:01:10 +0300
Subject: net/mlx5: Move TISes from priv to mdev HW resources

The transport interface send (TIS) object is responsible for performing
all transport related operations of the transmit side. Messages from
Send Queues get segmented and transmitted by the TIS including all
transport required implications, e.g. in the case of large send offload,
the TIS is responsible for the segmentation.

These are stateless objects and can be used by multiple netdevs (e.g.
representors) who share the same core device.

Providing the TISes as a service from the core layer to the netdev layer
reduces the number of replecated TIS objects (in case of multiple
netdevs), and will ease the transition to netdev with multiple mdevs.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2f67cec1a898..7ee5b79ff3d6 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -679,6 +679,8 @@ struct mlx5e_resources {
 		struct mlx5_td             td;
 		u32			   mkey;
 		struct mlx5_sq_bfreg       bfreg;
+#define MLX5_MAX_NUM_TC 8
+		u32                        tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC];
 	} hw_objs;
 	struct net_device *uplink_netdev;
 	struct mutex uplink_netdev_lock;
-- 
cgit v1.2.3


From 50d73710715de7d1a2c88194562f520816af9c2a Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Tue, 12 Dec 2023 15:27:51 +0100
Subject: ethtool: add SET for TCP_DATA_SPLIT ringparam

Follow up commit 9690ae604290 ("ethtool: add header/data split
indication") and add the set part of Ethtool's header split, i.e.
ability to enable/disable header split via the Ethtool Netlink
interface. This might be helpful to optimize the setup for particular
workloads, for example, to avoid XDP frags, and so on.
A driver should advertise ``ETHTOOL_RING_USE_TCP_DATA_SPLIT`` in its
ops->supported_ring_params to allow doing that. "Unknown" passed from
the userspace when the header split is supported means the driver is
free to choose the preferred state.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20231212142752.935000-2-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index deb683d3360f..67b30940234b 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -95,6 +95,7 @@ struct kernel_ethtool_ringparam {
  * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push
  * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push
  * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len
+ * @ETHTOOL_RING_USE_TCP_DATA_SPLIT: capture for setting tcp_data_split
  */
 enum ethtool_supported_ring_param {
 	ETHTOOL_RING_USE_RX_BUF_LEN		= BIT(0),
@@ -102,6 +103,7 @@ enum ethtool_supported_ring_param {
 	ETHTOOL_RING_USE_TX_PUSH		= BIT(2),
 	ETHTOOL_RING_USE_RX_PUSH		= BIT(3),
 	ETHTOOL_RING_USE_TX_PUSH_BUF_LEN	= BIT(4),
+	ETHTOOL_RING_USE_TCP_DATA_SPLIT		= BIT(5),
 };
 
 #define __ETH_RSS_HASH_BIT(bit)	((u32)1 << (bit))
-- 
cgit v1.2.3


From 0a149ab78ee220c75eef797abea7a29f4490e226 Mon Sep 17 00:00:00 2001
From: Liang Chen <liangchen.linux@gmail.com>
Date: Tue, 12 Dec 2023 12:46:11 +0800
Subject: page_pool: transition to reference count management after page
 draining

To support multiple users referencing the same fragment,
'pp_frag_count' is renamed to 'pp_ref_count', transitioning pp pages
from fragment management to reference count management after draining
based on the suggestion from [1].

The idea is that the concept of fragmenting exists before the page is
drained, and all related functions retain their current names.
However, once the page is drained, its management shifts to being
governed by 'pp_ref_count'. Therefore, all functions associated with
that lifecycle stage of a pp page are renamed.

[1]
http://lore.kernel.org/netdev/f71d9448-70c8-8793-dc9a-0eb48a570300@huawei.com

Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
Reviewed-by: Yunsheng Lin <linyunsheng@huawei.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Link: https://lore.kernel.org/r/20231212044614.42733-2-liangchen.linux@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mm_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 957ce38768b2..64e4572ef06d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -125,7 +125,7 @@ struct page {
 			struct page_pool *pp;
 			unsigned long _pp_mapping_pad;
 			unsigned long dma_addr;
-			atomic_long_t pp_frag_count;
+			atomic_long_t pp_ref_count;
 		};
 		struct {	/* Tail pages of compound page */
 			unsigned long compound_head;	/* Bit zero is set */
-- 
cgit v1.2.3


From 6caa290684255991ffeebf228b2fd9e7e4da8f34 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 12 Dec 2023 23:02:56 -0800
Subject: Input: navpoint - convert to use GPIO descriptor

The Navpoint driver uses a GPIO line, convert this to use
a GPIO descriptor. There are no in-kernel users but out of tree
users can easily be added or converted using a GPIO descriptor
table as with numerous other drivers.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-1-9433162914a3@linaro.org
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/input/navpoint.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/input/navpoint.h b/include/linux/input/navpoint.h
index d464ffb4db52..5192ae3f5ec1 100644
--- a/include/linux/input/navpoint.h
+++ b/include/linux/input/navpoint.h
@@ -5,5 +5,4 @@
 
 struct navpoint_platform_data {
 	int		port;		/* PXA SSP port for pxa_ssp_request() */
-	int		gpio;		/* GPIO for power on/off */
 };
-- 
cgit v1.2.3


From e53c18da99c75f080bd99436c57824f2ab657f03 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 12 Dec 2023 23:06:55 -0800
Subject: Input: omap-keypad - drop optional GPIO support

The driver supports passing some GPIO lines for rows and columns
through the driver data, but there is no in-kernel user of this.

Further the use seems convoluted because the GPIO lines are unused
in the driver, then explicitly free:ed when removing it without
being requested when probing it, which is assymetric and just
a recepie for disaster.

Remove the support for these unused GPIOs, if need be support can
be reestablished in an organized fashion using GPIO descriptors.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-3-9433162914a3@linaro.org
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/platform_data/keypad-omap.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/keypad-omap.h b/include/linux/platform_data/keypad-omap.h
index 3e7c64c854f4..f3f1311cdf3a 100644
--- a/include/linux/platform_data/keypad-omap.h
+++ b/include/linux/platform_data/keypad-omap.h
@@ -19,9 +19,6 @@ struct omap_kp_platform_data {
 	bool rep;
 	unsigned long delay;
 	bool dbounce;
-	/* specific to OMAP242x*/
-	unsigned int *row_gpios;
-	unsigned int *col_gpios;
 };
 
 /* Group (0..3) -- when multiple keys are pressed, only the
-- 
cgit v1.2.3


From 7395de647e87476f5b5d2f9a9fe80cee86b4e7cc Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 12 Dec 2023 23:04:47 -0800
Subject: Input: as5011 - convert to GPIO descriptor

This driver does not have any in-tree users but is passing a
legacy GPIO number through platform data.

Convert it to use a GPIO descriptor, new users or outoftree
users can easily be implemented using GPIO descriptor tables
or software nodes.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-4-9433162914a3@linaro.org
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/input/as5011.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/input/as5011.h b/include/linux/input/as5011.h
index 5fba52a56cd6..5705d5de3aea 100644
--- a/include/linux/input/as5011.h
+++ b/include/linux/input/as5011.h
@@ -7,7 +7,6 @@
  */
 
 struct as5011_platform_data {
-	unsigned int button_gpio;
 	unsigned int axis_irq; /* irq number */
 	unsigned long axis_irqflags;
 	char xp, xn; /* threshold for x axis */
-- 
cgit v1.2.3


From 6ab3d50b106c9aea123a80551a6c9deace83b914 Mon Sep 17 00:00:00 2001
From: Qiang Yu <quic_qianyu@quicinc.com>
Date: Tue, 7 Nov 2023 16:14:49 +0800
Subject: bus: mhi: host: Add a separate timeout parameter for waiting ready

Some devices(eg. SDX75) take longer than expected (default, 8 seconds) to
set ready after reboot. Hence add optional ready timeout parameter and pass
the appropriate timeout value to mhi_poll_reg_field() to wait enough for
device ready as part of power up sequence.

Signed-off-by: Qiang Yu <quic_qianyu@quicinc.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/1699344890-87076-2-git-send-email-quic_qianyu@quicinc.com
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 039943ec4d4e..d0f9b522f328 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -266,6 +266,7 @@ struct mhi_event_config {
  * struct mhi_controller_config - Root MHI controller configuration
  * @max_channels: Maximum number of channels supported
  * @timeout_ms: Timeout value for operations. 0 means use default
+ * @ready_timeout_ms: Timeout value for waiting device to be ready (optional)
  * @buf_len: Size of automatically allocated buffers. 0 means use default
  * @num_channels: Number of channels defined in @ch_cfg
  * @ch_cfg: Array of defined channels
@@ -277,6 +278,7 @@ struct mhi_event_config {
 struct mhi_controller_config {
 	u32 max_channels;
 	u32 timeout_ms;
+	u32 ready_timeout_ms;
 	u32 buf_len;
 	u32 num_channels;
 	const struct mhi_channel_config *ch_cfg;
@@ -330,6 +332,7 @@ struct mhi_controller_config {
  * @pm_mutex: Mutex for suspend/resume operation
  * @pm_lock: Lock for protecting MHI power management state
  * @timeout_ms: Timeout in ms for state transitions
+ * @ready_timeout_ms: Timeout in ms for waiting device to be ready (optional)
  * @pm_state: MHI power management state
  * @db_access: DB access states
  * @ee: MHI device execution environment
@@ -419,6 +422,7 @@ struct mhi_controller {
 	struct mutex pm_mutex;
 	rwlock_t pm_lock;
 	u32 timeout_ms;
+	u32 ready_timeout_ms;
 	u32 pm_state;
 	u32 db_access;
 	enum mhi_ee_type ee;
-- 
cgit v1.2.3


From fb6e30a72539ce28c1323aef4190d35aac106f6f Mon Sep 17 00:00:00 2001
From: Ahmed Zaki <ahmed.zaki@intel.com>
Date: Tue, 12 Dec 2023 17:33:14 -0700
Subject: net: ethtool: pass a pointer to parameters to get/set_rxfh ethtool
 ops

The get/set_rxfh ethtool ops currently takes the rxfh (RSS) parameters
as direct function arguments. This will force us to change the API (and
all drivers' functions) every time some new parameters are added.

This is part 1/2 of the fix, as suggested in [1]:

- First simplify the code by always providing a pointer to all params
   (indir, key and func); the fact that some of them may be NULL seems
   like a weird historic thing or a premature optimization.
   It will simplify the drivers if all pointers are always present.

 - Then make the functions take a dev pointer, and a pointer to a
   single struct wrapping all arguments. The set_* should also take
   an extack.

Link: https://lore.kernel.org/netdev/20231121152906.2dd5f487@kernel.org/ [1]
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Suggested-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Link: https://lore.kernel.org/r/20231213003321.605376-2-ahmed.zaki@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 67b30940234b..3ab2b6a90419 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -596,6 +596,28 @@ struct ethtool_mm_stats {
 	u64 MACMergeHoldCount;
 };
 
+/**
+ * struct ethtool_rxfh_param - RXFH (RSS) parameters
+ * @hfunc: Defines the current RSS hash function used by HW (or to be set to).
+ *	Valid values are one of the %ETH_RSS_HASH_*.
+ * @indir_size: On SET, the array size of the user buffer for the
+ *	indirection table, which may be zero, or
+ *	%ETH_RXFH_INDIR_NO_CHANGE.  On GET (read from the driver),
+ *	the array size of the hardware indirection table.
+ * @indir: The indirection table of size @indir_size entries.
+ * @key_size: On SET, the array size of the user buffer for the hash key,
+ *	which may be zero.  On GET (read from the driver), the size of the
+ *	hardware hash key.
+ * @key: The hash key of size @key_size bytes.
+ */
+struct ethtool_rxfh_param {
+	u8	hfunc;
+	u32	indir_size;
+	u32	*indir;
+	u32	key_size;
+	u8	*key;
+};
+
 /**
  * struct ethtool_ops - optional netdev operations
  * @cap_link_lanes_supported: indicates if the driver supports lanes
@@ -846,14 +868,14 @@ struct ethtool_ops {
 	int	(*reset)(struct net_device *, u32 *);
 	u32	(*get_rxfh_key_size)(struct net_device *);
 	u32	(*get_rxfh_indir_size)(struct net_device *);
-	int	(*get_rxfh)(struct net_device *, u32 *indir, u8 *key,
-			    u8 *hfunc);
-	int	(*set_rxfh)(struct net_device *, const u32 *indir,
-			    const u8 *key, const u8 hfunc);
-	int	(*get_rxfh_context)(struct net_device *, u32 *indir, u8 *key,
-				    u8 *hfunc, u32 rss_context);
-	int	(*set_rxfh_context)(struct net_device *, const u32 *indir,
-				    const u8 *key, const u8 hfunc,
+	int	(*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *);
+	int	(*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *,
+			    struct netlink_ext_ack *extack);
+	int	(*get_rxfh_context)(struct net_device *,
+				    struct ethtool_rxfh_param *,
+				    u32 rss_context);
+	int	(*set_rxfh_context)(struct net_device *,
+				    struct ethtool_rxfh_param *,
 				    u32 *rss_context, bool delete);
 	void	(*get_channels)(struct net_device *, struct ethtool_channels *);
 	int	(*set_channels)(struct net_device *, struct ethtool_channels *);
-- 
cgit v1.2.3


From dcd8dbf9e734eb334113ea43186c1c26e9f497bb Mon Sep 17 00:00:00 2001
From: Ahmed Zaki <ahmed.zaki@intel.com>
Date: Tue, 12 Dec 2023 17:33:15 -0700
Subject: net: ethtool: get rid of get/set_rxfh_context functions

Add the RSS context parameters to struct ethtool_rxfh_param and use the
get/set_rxfh to handle the RSS contexts as well.

This is part 2/2 of the fix suggested in [1]:

 - Add a rss_context member to the argument struct and a capability
   like cap_link_lanes_supported to indicate whether driver supports
   rss contexts, then you can remove *et_rxfh_context functions,
   and instead call *et_rxfh() with a non-zero rss_context.

Link: https://lore.kernel.org/netdev/20231121152906.2dd5f487@kernel.org/ [1]
CC: Jesse Brandeburg <jesse.brandeburg@intel.com>
CC: Tony Nguyen <anthony.l.nguyen@intel.com>
CC: Marcin Wojtas <mw@semihalf.com>
CC: Russell King <linux@armlinux.org.uk>
CC: Sunil Goutham <sgoutham@marvell.com>
CC: Geetha sowjanya <gakula@marvell.com>
CC: Subbaraya Sundeep <sbhatta@marvell.com>
CC: hariprasad <hkelam@marvell.com>
CC: Saeed Mahameed <saeedm@nvidia.com>
CC: Leon Romanovsky <leon@kernel.org>
CC: Edward Cree <ecree.xilinx@gmail.com>
CC: Martin Habets <habetsm.xilinx@gmail.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Link: https://lore.kernel.org/r/20231213003321.605376-3-ahmed.zaki@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 3ab2b6a90419..66fe254c3e51 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -609,6 +609,12 @@ struct ethtool_mm_stats {
  *	which may be zero.  On GET (read from the driver), the size of the
  *	hardware hash key.
  * @key: The hash key of size @key_size bytes.
+ * @rss_context: RSS context identifier.  Context 0 is the default for normal
+ *	traffic; other contexts can be referenced as the destination for RX flow
+ *	classification rules.  On SET, %ETH_RXFH_CONTEXT_ALLOC is used
+ *	to allocate a new RSS context; on return this field will
+ *	contain the ID of the newly allocated context.
+ * @rss_delete: Set to non-ZERO to remove the @rss_context context.
  */
 struct ethtool_rxfh_param {
 	u8	hfunc;
@@ -616,12 +622,16 @@ struct ethtool_rxfh_param {
 	u32	*indir;
 	u32	key_size;
 	u8	*key;
+	u32	rss_context;
+	u8	rss_delete;
 };
 
 /**
  * struct ethtool_ops - optional netdev operations
  * @cap_link_lanes_supported: indicates if the driver supports lanes
  *	parameter.
+ * @cap_rss_ctx_supported: indicates if the driver supports RSS
+ *	contexts.
  * @supported_coalesce_params: supported types of interrupt coalescing.
  * @supported_ring_params: supported ring params.
  * @get_drvinfo: Report driver/device information. Modern drivers no
@@ -718,15 +728,6 @@ struct ethtool_rxfh_param {
  *	will remain unchanged.
  *	Returns a negative error code or zero. An error code must be returned
  *	if at least one unsupported change was requested.
- * @get_rxfh_context: Get the contents of the RX flow hash indirection table,
- *	hash key, and/or hash function assiciated to the given rss context.
- *	Returns a negative error code or zero.
- * @set_rxfh_context: Create, remove and configure RSS contexts. Allows setting
- *	the contents of the RX flow hash indirection table, hash key, and/or
- *	hash function associated to the given context. Arguments which are set
- *	to %NULL or zero will remain unchanged.
- *	Returns a negative error code or zero. An error code must be returned
- *	if at least one unsupported change was requested.
  * @get_channels: Get number of channels.
  * @set_channels: Set number of channels.  Returns a negative error code or
  *	zero.
@@ -809,6 +810,7 @@ struct ethtool_rxfh_param {
  */
 struct ethtool_ops {
 	u32     cap_link_lanes_supported:1;
+	u32     cap_rss_ctx_supported:1;
 	u32	supported_coalesce_params;
 	u32	supported_ring_params;
 	void	(*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *);
@@ -871,12 +873,6 @@ struct ethtool_ops {
 	int	(*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *);
 	int	(*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *,
 			    struct netlink_ext_ack *extack);
-	int	(*get_rxfh_context)(struct net_device *,
-				    struct ethtool_rxfh_param *,
-				    u32 rss_context);
-	int	(*set_rxfh_context)(struct net_device *,
-				    struct ethtool_rxfh_param *,
-				    u32 *rss_context, bool delete);
 	void	(*get_channels)(struct net_device *, struct ethtool_channels *);
 	int	(*set_channels)(struct net_device *, struct ethtool_channels *);
 	int	(*get_dump_flag)(struct net_device *, struct ethtool_dump *);
-- 
cgit v1.2.3


From 13e59344fb9d3c9d3acd138ae320b5b67b658694 Mon Sep 17 00:00:00 2001
From: Ahmed Zaki <ahmed.zaki@intel.com>
Date: Tue, 12 Dec 2023 17:33:16 -0700
Subject: net: ethtool: add support for symmetric-xor RSS hash

Symmetric RSS hash functions are beneficial in applications that monitor
both Tx and Rx packets of the same flow (IDS, software firewalls, ..etc).
Getting all traffic of the same flow on the same RX queue results in
higher CPU cache efficiency.

A NIC that supports "symmetric-xor" can achieve this RSS hash symmetry
by XORing the source and destination fields and pass the values to the
RSS hash algorithm.

The user may request RSS hash symmetry for a specific algorithm, via:

    # ethtool -X eth0 hfunc <hash_alg> symmetric-xor

or turn symmetry off (asymmetric) by:

    # ethtool -X eth0 hfunc <hash_alg>

The specific fields for each flow type should then be specified as usual
via:
    # ethtool -N|-U eth0 rx-flow-hash <flow_type> s|d|f|n

Reviewed-by: Wojciech Drewek <wojciech.drewek@intel.com>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Link: https://lore.kernel.org/r/20231213003321.605376-4-ahmed.zaki@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 66fe254c3e51..cfcd952a1d4f 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -615,6 +615,8 @@ struct ethtool_mm_stats {
  *	to allocate a new RSS context; on return this field will
  *	contain the ID of the newly allocated context.
  * @rss_delete: Set to non-ZERO to remove the @rss_context context.
+ * @input_xfrm: Defines how the input data is transformed. Valid values are one
+ *	of %RXH_XFRM_*.
  */
 struct ethtool_rxfh_param {
 	u8	hfunc;
@@ -624,6 +626,7 @@ struct ethtool_rxfh_param {
 	u8	*key;
 	u32	rss_context;
 	u8	rss_delete;
+	u8	input_xfrm;
 };
 
 /**
@@ -632,6 +635,8 @@ struct ethtool_rxfh_param {
  *	parameter.
  * @cap_rss_ctx_supported: indicates if the driver supports RSS
  *	contexts.
+ * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor
+ *	RSS.
  * @supported_coalesce_params: supported types of interrupt coalescing.
  * @supported_ring_params: supported ring params.
  * @get_drvinfo: Report driver/device information. Modern drivers no
@@ -811,6 +816,7 @@ struct ethtool_rxfh_param {
 struct ethtool_ops {
 	u32     cap_link_lanes_supported:1;
 	u32     cap_rss_ctx_supported:1;
+	u32	cap_rss_sym_xor_supported:1;
 	u32	supported_coalesce_params;
 	u32	supported_ring_params;
 	void	(*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *);
-- 
cgit v1.2.3


From dc6e44c9d6d68e8aa5de78d15f43f93145719b72 Mon Sep 17 00:00:00 2001
From: Qi Zhang <qi.z.zhang@intel.com>
Date: Tue, 12 Dec 2023 17:33:18 -0700
Subject: ice: refactor RSS configuration

Refactor the driver to use a communication data structure for RSS
config. To do so we introduce the new ice_rss_hash_cfg struct, and then
pass it as an argument to several functions.

Also introduce enum ice_rss_cfg_hdr_type to specify a more granular and
flexible RSS configuration:

ICE_RSS_OUTER_HEADERS - take outer layer as RSS input set
ICE_RSS_INNER_HEADERS - take inner layer as RSS input set
ICE_RSS_INNER_HEADERS_W_OUTER_IPV4 - take inner layer as RSS input set for
                                     packet with outer IPV4
ICE_RSS_INNER_HEADERS_W_OUTER_IPV6 - take inner layer as RSS input set for
                                     packet with outer IPV6
ICE_RSS_ANY_HEADERS - try with outer first then inner (same as the
		      behaviour without this change)

Finally, move the virtchnl_rss_algorithm enum to be with the other RSS
related structures in the virtchnl.h file.

There should be no functional change due to this patch.

Reviewed-by: Wojciech Drewek <wojciech.drewek@intel.com>
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Co-developed-by: Ahmed Zaki <ahmed.zaki@intel.com>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Link: https://lore.kernel.org/r/20231213003321.605376-6-ahmed.zaki@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/avf/virtchnl.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 6b3acf15be5c..b0e060cc79ac 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -911,6 +911,14 @@ struct virtchnl_rss_hena {
 
 VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hena);
 
+/* Type of RSS algorithm */
+enum virtchnl_rss_algorithm {
+	VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC	= 0,
+	VIRTCHNL_RSS_ALG_R_ASYMMETRIC		= 1,
+	VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC	= 2,
+	VIRTCHNL_RSS_ALG_XOR_SYMMETRIC		= 3,
+};
+
 /* VIRTCHNL_OP_ENABLE_CHANNELS
  * VIRTCHNL_OP_DISABLE_CHANNELS
  * VF sends these messages to enable or disable channels based on
@@ -1095,14 +1103,6 @@ enum virtchnl_vfr_states {
 	VIRTCHNL_VFR_VFACTIVE,
 };
 
-/* Type of RSS algorithm */
-enum virtchnl_rss_algorithm {
-	VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC	= 0,
-	VIRTCHNL_RSS_ALG_R_ASYMMETRIC		= 1,
-	VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC	= 2,
-	VIRTCHNL_RSS_ALG_XOR_SYMMETRIC		= 3,
-};
-
 #define VIRTCHNL_MAX_NUM_PROTO_HDRS	32
 #define PROTO_HDR_SHIFT			5
 #define PROTO_HDR_FIELD_START(proto_hdr_type) ((proto_hdr_type) << PROTO_HDR_SHIFT)
-- 
cgit v1.2.3


From 4a3de3fb0eb6897488dd510006abd9673f1fb34c Mon Sep 17 00:00:00 2001
From: Ahmed Zaki <ahmed.zaki@intel.com>
Date: Tue, 12 Dec 2023 17:33:21 -0700
Subject: iavf: enable symmetric-xor RSS for Toeplitz hash function

Allow the user to set the symmetric Toeplitz hash function via:

    # ethtool -X eth0 hfunc toeplitz symmetric-xor

The driver will reject any new RSS configuration if a field other than
(IP src/dst and L4 src/dst ports) is requested for hashing.

The symmetric RSS will not be supported on PFs not advertising the ADV RSS
Offload flag (ADV_RSS_SUPPORT()), for example the E700 series (i40e).

Reviewed-by: Madhu Chittim <madhu.chittim@intel.com>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Link: https://lore.kernel.org/r/20231213003321.605376-9-ahmed.zaki@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/avf/virtchnl.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index b0e060cc79ac..a44d9dc7e3eb 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -118,6 +118,7 @@ enum virtchnl_ops {
 	VIRTCHNL_OP_GET_STATS = 15,
 	VIRTCHNL_OP_RSVD = 16,
 	VIRTCHNL_OP_EVENT = 17, /* must ALWAYS be 17 */
+	VIRTCHNL_OP_CONFIG_RSS_HFUNC = 18,
 	/* opcode 19 is reserved */
 	VIRTCHNL_OP_IWARP = 20, /* advanced opcode */
 	VIRTCHNL_OP_RDMA = VIRTCHNL_OP_IWARP,
@@ -919,6 +920,21 @@ enum virtchnl_rss_algorithm {
 	VIRTCHNL_RSS_ALG_XOR_SYMMETRIC		= 3,
 };
 
+/* VIRTCHNL_OP_CONFIG_RSS_HFUNC
+ * VF sends this message to configure the RSS hash function. Only supported
+ * if both PF and VF drivers set the VIRTCHNL_VF_OFFLOAD_RSS_PF bit during
+ * configuration negotiation.
+ * The hash function is initialized to VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC
+ * by the PF.
+ */
+struct virtchnl_rss_hfunc {
+	u16 vsi_id;
+	u16 rss_algorithm; /* enum virtchnl_rss_algorithm */
+	u32 reserved;
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hfunc);
+
 /* VIRTCHNL_OP_ENABLE_CHANNELS
  * VIRTCHNL_OP_DISABLE_CHANNELS
  * VF sends these messages to enable or disable channels based on
@@ -1542,6 +1558,9 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode,
 							 vrl->lut_entries);
 		}
 		break;
+	case VIRTCHNL_OP_CONFIG_RSS_HFUNC:
+		valid_len = sizeof(struct virtchnl_rss_hfunc);
+		break;
 	case VIRTCHNL_OP_GET_RSS_HENA_CAPS:
 		break;
 	case VIRTCHNL_OP_SET_RSS_HENA:
-- 
cgit v1.2.3


From 62210a26cd4f8ad52683a71c0226dfe85de1144d Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Wed, 18 Oct 2023 17:58:12 +0530
Subject: bus: mhi: ep: Use slab allocator where applicable

Use slab allocator for allocating the memory for objects used frequently
and are of fixed size. This reduces the overheard associated with
kmalloc().

Suggested-by: Alex Elder <elder@linaro.org>
Link: https://lore.kernel.org/r/20231018122812.47261-1-manivannan.sadhasivam@linaro.org
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi_ep.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h
index f198a8ac7ee7..ce85d42b685d 100644
--- a/include/linux/mhi_ep.h
+++ b/include/linux/mhi_ep.h
@@ -128,6 +128,9 @@ struct mhi_ep_cntrl {
 	struct work_struct reset_work;
 	struct work_struct cmd_ring_work;
 	struct work_struct ch_ring_work;
+	struct kmem_cache *ring_item_cache;
+	struct kmem_cache *ev_ring_el_cache;
+	struct kmem_cache *tre_buf_cache;
 
 	void (*raise_irq)(struct mhi_ep_cntrl *mhi_cntrl, u32 vector);
 	int (*alloc_map)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t *phys_ptr,
-- 
cgit v1.2.3


From b08ded2ef2e98768d5ee5f71da8fe768b1f7774b Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Thu, 17 Aug 2023 23:24:52 +0530
Subject: bus: mhi: ep: Pass mhi_ep_buf_info struct to read/write APIs

In the preparation of DMA async support, let's pass the parameters to
read_from_host() and write_to_host() APIs using mhi_ep_buf_info structure.

No functional change.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi_ep.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h
index ce85d42b685d..96f3a133540d 100644
--- a/include/linux/mhi_ep.h
+++ b/include/linux/mhi_ep.h
@@ -49,6 +49,18 @@ struct mhi_ep_db_info {
 	u32 status;
 };
 
+/**
+ * struct mhi_ep_buf_info - MHI Endpoint transfer buffer info
+ * @dev_addr: Address of the buffer in endpoint
+ * @host_addr: Address of the bufffer in host
+ * @size: Size of the buffer
+ */
+struct mhi_ep_buf_info {
+	void *dev_addr;
+	u64 host_addr;
+	size_t size;
+};
+
 /**
  * struct mhi_ep_cntrl - MHI Endpoint controller structure
  * @cntrl_dev: Pointer to the struct device of physical bus acting as the MHI
@@ -137,8 +149,8 @@ struct mhi_ep_cntrl {
 			 void __iomem **virt, size_t size);
 	void (*unmap_free)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t phys,
 			   void __iomem *virt, size_t size);
-	int (*read_from_host)(struct mhi_ep_cntrl *mhi_cntrl, u64 from, void *to, size_t size);
-	int (*write_to_host)(struct mhi_ep_cntrl *mhi_cntrl, void *from, u64 to, size_t size);
+	int (*read_from_host)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
+	int (*write_to_host)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
 
 	enum mhi_state mhi_state;
 
-- 
cgit v1.2.3


From 927105244f8bc48e6841826a5644c6a961e03b5d Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Mon, 27 Nov 2023 13:57:37 +0530
Subject: bus: mhi: ep: Rename read_from_host() and write_to_host() APIs

In the preparation for adding async API support, let's rename the existing
APIs to read_sync() and write_sync() to make it explicit that these APIs
are used for synchronous read/write.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi_ep.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h
index 96f3a133540d..b96b543bf2f6 100644
--- a/include/linux/mhi_ep.h
+++ b/include/linux/mhi_ep.h
@@ -94,8 +94,8 @@ struct mhi_ep_buf_info {
  * @raise_irq: CB function for raising IRQ to the host
  * @alloc_map: CB function for allocating memory in endpoint for storing host context and mapping it
  * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context
- * @read_from_host: CB function for reading from host memory from endpoint
- * @write_to_host: CB function for writing to host memory from endpoint
+ * @read_sync: CB function for reading from host memory synchronously
+ * @write_sync: CB function for writing to host memory synchronously
  * @mhi_state: MHI Endpoint state
  * @max_chan: Maximum channels supported by the endpoint controller
  * @mru: MRU (Maximum Receive Unit) value of the endpoint controller
@@ -149,8 +149,8 @@ struct mhi_ep_cntrl {
 			 void __iomem **virt, size_t size);
 	void (*unmap_free)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t phys,
 			   void __iomem *virt, size_t size);
-	int (*read_from_host)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
-	int (*write_to_host)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
+	int (*read_sync)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
+	int (*write_sync)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
 
 	enum mhi_state mhi_state;
 
-- 
cgit v1.2.3


From 8b786ed8fb089e347af21d13ba5677325fcd4cd8 Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Mon, 27 Nov 2023 15:35:50 +0530
Subject: bus: mhi: ep: Introduce async read/write callbacks

These callbacks can be implemented by the controller drivers to perform
async read/write operation that increases the throughput.

For aiding the async operation, a completion callback is also introduced.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi_ep.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h
index b96b543bf2f6..14c6e8d3f573 100644
--- a/include/linux/mhi_ep.h
+++ b/include/linux/mhi_ep.h
@@ -54,11 +54,16 @@ struct mhi_ep_db_info {
  * @dev_addr: Address of the buffer in endpoint
  * @host_addr: Address of the bufffer in host
  * @size: Size of the buffer
+ * @cb: Callback to be executed by controller drivers after transfer completion (async)
+ * @cb_buf: Opaque buffer to be passed to the callback
  */
 struct mhi_ep_buf_info {
 	void *dev_addr;
 	u64 host_addr;
 	size_t size;
+
+	void (*cb)(struct mhi_ep_buf_info *buf_info);
+	void *cb_buf;
 };
 
 /**
@@ -96,6 +101,8 @@ struct mhi_ep_buf_info {
  * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context
  * @read_sync: CB function for reading from host memory synchronously
  * @write_sync: CB function for writing to host memory synchronously
+ * @read_async: CB function for reading from host memory asynchronously
+ * @write_async: CB function for writing to host memory asynchronously
  * @mhi_state: MHI Endpoint state
  * @max_chan: Maximum channels supported by the endpoint controller
  * @mru: MRU (Maximum Receive Unit) value of the endpoint controller
@@ -151,6 +158,8 @@ struct mhi_ep_cntrl {
 			   void __iomem *virt, size_t size);
 	int (*read_sync)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
 	int (*write_sync)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
+	int (*read_async)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
+	int (*write_async)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
 
 	enum mhi_state mhi_state;
 
-- 
cgit v1.2.3


From b4c2bea8ceaa50cd42a8f73667389d801a3ecf2d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 25 Oct 2023 16:02:03 +0200
Subject: add listmount(2) syscall

Add way to query the children of a particular mount.  This is a more
flexible way to iterate the mount tree than having to parse
/proc/self/mountinfo.

Lookup the mount by the new 64bit mount ID. If a mount needs to be
queried based on path, then statx(2) can be used to first query the
mount ID belonging to the path.

Return an array of new (64bit) mount ID's. Without privileges only
mounts are listed which are reachable from the task's root.

Folded into this patch are several later improvements. Keeping them
separate would make the history pointlessly confusing:

* Recursive listing of mounts is the default now (cf. [1]).
* Remove explicit LISTMOUNT_UNREACHABLE flag (cf. [1]) and fail if mount
  is unreachable from current root. This also makes permission checking
  consistent with statmount() (cf. [3]).
* Start listing mounts in unique mount ID order (cf. [2]) to allow
  continuing listmount() from a midpoint.
* Allow to continue listmount(). The @request_mask parameter is renamed
  and to @param to be usable by both statmount() and listmount().
  If @param is set to a mount id then listmount() will continue listing
  mounts from that id on. This allows listing mounts in multiple
  listmount invocations without having to resize the buffer. If @param
  is zero then the listing starts from the beginning (cf. [4]).
* Don't return EOVERFLOW, instead return the buffer size which allows to
  detect a full buffer as well (cf. [4]).

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20231025140205.3586473-6-mszeredi@redhat.com
Reviewed-by: Ian Kent <raven@themaw.net>
Link: https://lore.kernel.org/r/20231128160337.29094-2-mszeredi@redhat.com [1] (folded)
Link: https://lore.kernel.org/r/20231128160337.29094-3-mszeredi@redhat.com [2] (folded)
Link: https://lore.kernel.org/r/20231128160337.29094-4-mszeredi@redhat.com [3] (folded)
Link: https://lore.kernel.org/r/20231128160337.29094-5-mszeredi@redhat.com [4] (folded)
[Christian Brauner <brauner@kernel.org>: various smaller fixes]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/syscalls.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 530ca9adf5f1..2d6d3e76e3f7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -412,6 +412,9 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
 asmlinkage long sys_statmount(const struct mnt_id_req __user *req,
 			      struct statmount __user *buf, size_t bufsize,
 			      unsigned int flags);
+asmlinkage long sys_listmount(const struct mnt_id_req __user *req,
+			      u64 __user *buf, size_t bufsize,
+			      unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
 #if BITS_PER_LONG == 32
-- 
cgit v1.2.3


From bf873a800ac3234eba991603a450eaa517d27022 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 12 Dec 2023 20:35:11 -0800
Subject: net: skbuff: fix spelling errors

Correct spelling as reported by codespell.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20231213043511.10357-1-rdunlap@infradead.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b370eb8d70f7..7ce38874dbd1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1069,7 +1069,7 @@ struct sk_buff {
 	refcount_t		users;
 
 #ifdef CONFIG_SKB_EXTENSIONS
-	/* only useable after checking ->active_extensions != 0 */
+	/* only usable after checking ->active_extensions != 0 */
 	struct skb_ext		*extensions;
 #endif
 };
@@ -3311,7 +3311,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
 					     unsigned int order)
 {
 	/* This piece of code contains several assumptions.
-	 * 1.  This is for device Rx, therefor a cold page is preferred.
+	 * 1.  This is for device Rx, therefore a cold page is preferred.
 	 * 2.  The expectation is the user wants a compound page.
 	 * 3.  If requesting a order 0 page it will not be compound
 	 *     due to the check to see if order has a value in prep_new_page
@@ -4247,7 +4247,7 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
 {
 	const void *a = skb_metadata_end(skb_a);
 	const void *b = skb_metadata_end(skb_b);
-	/* Using more efficient varaiant than plain call to memcmp(). */
+	/* Using more efficient variant than plain call to memcmp(). */
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
 	u64 diffs = 0;
 
-- 
cgit v1.2.3


From ee08acb58fe47fc3bc2c137965985cdb1df40b35 Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Thu, 2 Nov 2023 20:33:18 +0530
Subject: bus: mhi: ep: Add support for async DMA write operation

In order to optimize the data transfer, let's use the async DMA operation
for writing (queuing) data to the host.

In the async path, the completion event for the transfer ring will only be
sent to the host when the controller driver notifies the MHI stack of the
actual transfer completion using the callback (mhi_ep_skb_completion)
supplied in "struct mhi_ep_buf_info".

Also to accommodate the async operation, the transfer ring read offset
(ring->rd_offset) is cached in the "struct mhi_ep_chan" and updated locally
to let the stack queue further ring items to the controller driver. But the
actual read offset of the transfer ring will only be updated in the
completion callback.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi_ep.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h
index 14c6e8d3f573..11bf3212f782 100644
--- a/include/linux/mhi_ep.h
+++ b/include/linux/mhi_ep.h
@@ -51,16 +51,20 @@ struct mhi_ep_db_info {
 
 /**
  * struct mhi_ep_buf_info - MHI Endpoint transfer buffer info
+ * @mhi_dev: MHI device associated with this buffer
  * @dev_addr: Address of the buffer in endpoint
  * @host_addr: Address of the bufffer in host
  * @size: Size of the buffer
+ * @code: Transfer completion code
  * @cb: Callback to be executed by controller drivers after transfer completion (async)
  * @cb_buf: Opaque buffer to be passed to the callback
  */
 struct mhi_ep_buf_info {
+	struct mhi_ep_device *mhi_dev;
 	void *dev_addr;
 	u64 host_addr;
 	size_t size;
+	int code;
 
 	void (*cb)(struct mhi_ep_buf_info *buf_info);
 	void *cb_buf;
-- 
cgit v1.2.3


From 0fe1798968115488c0c02f4633032a015b1faf97 Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Thu, 14 Dec 2023 15:52:29 +0300
Subject: virtio/vsock: send credit update during setting SO_RCVLOWAT

Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
O_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Rename 'set_rcvlowat' callback to 'notify_set_rcvlowat' and set
'sk->sk_rcvlowat' only in one place (i.e. 'vsock_set_rcvlowat'), so the
transport doesn't need to do it.

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_vsock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
-- 
cgit v1.2.3


From 4ad4c1f394b84f9941a10aa8aaf11102478a390b Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 24 Nov 2023 18:10:03 +0000
Subject: dma-mapping: don't store redundant offsets

A bus_dma_region necessarily stores both CPU and DMA base addresses for
a range, so there's no need to also store the difference between them.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-direct.h | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 18aade195884..3eb3589ff43e 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -21,7 +21,6 @@ struct bus_dma_region {
 	phys_addr_t	cpu_start;
 	dma_addr_t	dma_start;
 	u64		size;
-	u64		offset;
 };
 
 static inline dma_addr_t translate_phys_to_dma(struct device *dev,
@@ -29,9 +28,12 @@ static inline dma_addr_t translate_phys_to_dma(struct device *dev,
 {
 	const struct bus_dma_region *m;
 
-	for (m = dev->dma_range_map; m->size; m++)
-		if (paddr >= m->cpu_start && paddr - m->cpu_start < m->size)
-			return (dma_addr_t)paddr - m->offset;
+	for (m = dev->dma_range_map; m->size; m++) {
+		u64 offset = paddr - m->cpu_start;
+
+		if (paddr >= m->cpu_start && offset < m->size)
+			return m->dma_start + offset;
+	}
 
 	/* make sure dma_capable fails when no translation is available */
 	return DMA_MAPPING_ERROR;
@@ -42,9 +44,12 @@ static inline phys_addr_t translate_dma_to_phys(struct device *dev,
 {
 	const struct bus_dma_region *m;
 
-	for (m = dev->dma_range_map; m->size; m++)
-		if (dma_addr >= m->dma_start && dma_addr - m->dma_start < m->size)
-			return (phys_addr_t)dma_addr + m->offset;
+	for (m = dev->dma_range_map; m->size; m++) {
+		u64 offset = dma_addr - m->dma_start;
+
+		if (dma_addr >= m->dma_start && offset < m->size)
+			return m->cpu_start + offset;
+	}
 
 	return (phys_addr_t)-1;
 }
-- 
cgit v1.2.3


From 134c6eaa6087d78c0e289931ca15ae7a5007670d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 13 Dec 2023 15:02:35 -0800
Subject: driver core: Add a guard() definition for the device_lock()

At present there are ~200 usages of device_lock() in the kernel. Some of
those usages lead to "goto unlock;" patterns which have proven to be
error prone. Define a "device" guard() definition to allow for those to
be cleaned up and prevent new ones from appearing.

Link: http://lore.kernel.org/r/657897453dda8_269bd29492@dwillia2-mobl3.amr.corp.intel.com.notmuch
Link: http://lore.kernel.org/r/6577b0c2a02df_a04c5294bb@dwillia2-xfh.jf.intel.com.notmuch
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Link: https://lore.kernel.org/r/170250854466.1522182.17555361077409628655.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index d7a72a8749ea..6c83294395ac 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1007,6 +1007,8 @@ static inline void device_unlock(struct device *dev)
 	mutex_unlock(&dev->mutex);
 }
 
+DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T))
+
 static inline void device_lock_assert(struct device *dev)
 {
 	lockdep_assert_held(&dev->mutex);
-- 
cgit v1.2.3


From 7f38b70042fcaa49219045bd1a9a2836e27a58ac Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Dec 2023 11:15:27 +0000
Subject: of: device: Export of_device_make_bus_id()

This helper is really handy to create unique device names based on their
device tree path, we may need it outside of the OF core (in the NVMEM
subsystem) so let's export it. As this helper has nothing patform
specific, let's move it to of/device.c instead of of/platform.c so we
can add its prototype to of_device.h.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231215111536.316972-2-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/of_device.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 2c7a3d4bc775..a72661e47faa 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -40,6 +40,9 @@ static inline int of_dma_configure(struct device *dev,
 {
 	return of_dma_configure_id(dev, np, force_dma, NULL);
 }
+
+void of_device_make_bus_id(struct device *dev);
+
 #else /* CONFIG_OF */
 
 static inline int of_driver_match_device(struct device *dev,
@@ -82,6 +85,9 @@ static inline int of_dma_configure(struct device *dev,
 {
 	return 0;
 }
+
+static inline void of_device_make_bus_id(struct device *dev) {}
+
 #endif /* CONFIG_OF */
 
 #endif /* _LINUX_OF_DEVICE_H */
-- 
cgit v1.2.3


From 4a1a40233b4a9fc159a5c7a27dc34c5c7bc5be55 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Dec 2023 11:15:28 +0000
Subject: nvmem: Move of_nvmem_layout_get_container() in another header

nvmem-consumer.h is included by consumer devices, extracting data from
NVMEM devices whereas nvmem-provider.h is included by devices providing
NVMEM content.

The only users of of_nvmem_layout_get_container() outside of the core
are layout drivers, so better move its prototype to nvmem-provider.h.

While we do so, we also move the kdoc associated with the function to
the header rather than the .c file.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231215111536.316972-3-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/nvmem-consumer.h |  7 -------
 include/linux/nvmem-provider.h | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h
index 6ec4b9743e25..2d306fa13b1a 100644
--- a/include/linux/nvmem-consumer.h
+++ b/include/linux/nvmem-consumer.h
@@ -247,7 +247,6 @@ struct nvmem_cell *of_nvmem_cell_get(struct device_node *np,
 				     const char *id);
 struct nvmem_device *of_nvmem_device_get(struct device_node *np,
 					 const char *name);
-struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem);
 #else
 static inline struct nvmem_cell *of_nvmem_cell_get(struct device_node *np,
 						   const char *id)
@@ -260,12 +259,6 @@ static inline struct nvmem_device *of_nvmem_device_get(struct device_node *np,
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
-
-static inline struct device_node *
-of_nvmem_layout_get_container(struct nvmem_device *nvmem)
-{
-	return NULL;
-}
 #endif /* CONFIG_NVMEM && CONFIG_OF */
 
 #endif  /* ifndef _LINUX_NVMEM_CONSUMER_H */
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index e3930835235b..e5de21516387 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -244,6 +244,27 @@ nvmem_layout_get_match_data(struct nvmem_device *nvmem,
 
 #endif /* CONFIG_NVMEM */
 
+#if IS_ENABLED(CONFIG_NVMEM) && IS_ENABLED(CONFIG_OF)
+
+/**
+ * of_nvmem_layout_get_container() - Get OF node of layout container
+ *
+ * @nvmem: nvmem device
+ *
+ * Return: a node pointer with refcount incremented or NULL if no
+ * container exists. Use of_node_put() on it when done.
+ */
+struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem);
+
+#else  /* CONFIG_NVMEM && CONFIG_OF */
+
+static inline struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem)
+{
+	return NULL;
+}
+
+#endif /* CONFIG_NVMEM && CONFIG_OF */
+
 #define module_nvmem_layout_driver(__layout_driver)		\
 	module_driver(__layout_driver, nvmem_layout_register,	\
 		      nvmem_layout_unregister)
-- 
cgit v1.2.3


From 1b7c298a4ecbc28cc6ee94005734bff55eb83d22 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Dec 2023 11:15:30 +0000
Subject: nvmem: Simplify the ->add_cells() hook

The layout entry is not used and will anyway be made useless by the new
layout bus infrastructure coming next, so drop it. While at it, clarify
the kdoc entry.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231215111536.316972-5-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/nvmem-provider.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index e5de21516387..3939991b3c5f 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -156,9 +156,8 @@ struct nvmem_cell_table {
  *
  * @name:		Layout name.
  * @of_match_table:	Open firmware match table.
- * @add_cells:		Will be called if a nvmem device is found which
- *			has this layout. The function will add layout
- *			specific cells with nvmem_add_one_cell().
+ * @add_cells:		Called to populate the layout using
+ *			nvmem_add_one_cell().
  * @fixup_cell_info:	Will be called before a cell is added. Can be
  *			used to modify the nvmem_cell_info.
  * @owner:		Pointer to struct module.
@@ -172,8 +171,7 @@ struct nvmem_cell_table {
 struct nvmem_layout {
 	const char *name;
 	const struct of_device_id *of_match_table;
-	int (*add_cells)(struct device *dev, struct nvmem_device *nvmem,
-			 struct nvmem_layout *layout);
+	int (*add_cells)(struct device *dev, struct nvmem_device *nvmem);
 	void (*fixup_cell_info)(struct nvmem_device *nvmem,
 				struct nvmem_layout *layout,
 				struct nvmem_cell_info *cell);
-- 
cgit v1.2.3


From 1172460e716784ac7e1049a537bdca8edbf97360 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Dec 2023 11:15:31 +0000
Subject: nvmem: Move and rename ->fixup_cell_info()

This hook is meant to be used by any provider and instantiating a layout
just for this is useless. Let's instead move this hook to the nvmem
device and add it to the config structure to be easily shared by the
providers.

While at moving this hook, rename it ->fixup_dt_cell_info() to clarify
its main intended purpose.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231215111536.316972-6-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/nvmem-provider.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 3939991b3c5f..36415a602d9e 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -83,6 +83,8 @@ struct nvmem_cell_info {
  * @cells:	Optional array of pre-defined NVMEM cells.
  * @ncells:	Number of elements in cells.
  * @add_legacy_fixed_of_cells:	Read fixed NVMEM cells from old OF syntax.
+ * @fixup_dt_cell_info: Will be called before a cell is added. Can be
+ *		used to modify the nvmem_cell_info.
  * @keepout:	Optional array of keepout ranges (sorted ascending by start).
  * @nkeepout:	Number of elements in the keepout array.
  * @type:	Type of the nvmem storage
@@ -113,6 +115,8 @@ struct nvmem_config {
 	const struct nvmem_cell_info	*cells;
 	int			ncells;
 	bool			add_legacy_fixed_of_cells;
+	void (*fixup_dt_cell_info)(struct nvmem_device *nvmem,
+				   struct nvmem_cell_info *cell);
 	const struct nvmem_keepout *keepout;
 	unsigned int		nkeepout;
 	enum nvmem_type		type;
@@ -158,8 +162,6 @@ struct nvmem_cell_table {
  * @of_match_table:	Open firmware match table.
  * @add_cells:		Called to populate the layout using
  *			nvmem_add_one_cell().
- * @fixup_cell_info:	Will be called before a cell is added. Can be
- *			used to modify the nvmem_cell_info.
  * @owner:		Pointer to struct module.
  * @node:		List node.
  *
@@ -172,9 +174,6 @@ struct nvmem_layout {
 	const char *name;
 	const struct of_device_id *of_match_table;
 	int (*add_cells)(struct device *dev, struct nvmem_device *nvmem);
-	void (*fixup_cell_info)(struct nvmem_device *nvmem,
-				struct nvmem_layout *layout,
-				struct nvmem_cell_info *cell);
 
 	/* private */
 	struct module *owner;
-- 
cgit v1.2.3


From fc29fd821d9ac2ae3d32a722fac39ce874efb883 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Dec 2023 11:15:32 +0000
Subject: nvmem: core: Rework layouts to become regular devices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Current layout support was initially written without modules support in
mind. When the requirement for module support rose, the existing base
was improved to adopt modularization support, but kind of a design flaw
was introduced. With the existing implementation, when a storage device
registers into NVMEM, the core tries to hook a layout (if any) and
populates its cells immediately. This means, if the hardware description
expects a layout to be hooked up, but no driver was provided for that,
the storage medium will fail to probe and try later from
scratch. Even if we consider that the hardware description shall be
correct, we could still probe the storage device (especially if it
contains the rootfs).

One way to overcome this situation is to consider the layouts as
devices, and leverage the native notifier mechanism. When a new NVMEM
device is registered, we can populate its nvmem-layout child, if any,
and wait for the matching to be done in order to get the cells (the
waiting can be easily done with the NVMEM notifiers). If the layout
driver is compiled as a module, it should automatically be loaded. This
way, there is no strong order to enforce, any NVMEM device creation
or NVMEM layout driver insertion will be observed as a new event which
may lead to the creation of additional cells, without disturbing the
probes with costly (and sometimes endless) deferrals.

In order to achieve that goal we create a new bus for the nvmem-layouts
with minimal logic to match nvmem-layout devices with nvmem-layout
drivers. All this infrastructure code is created in the layouts.c file.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Tested-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231215111536.316972-7-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/nvmem-provider.h | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 36415a602d9e..6fe65b35ea97 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -9,6 +9,7 @@
 #ifndef _LINUX_NVMEM_PROVIDER_H
 #define _LINUX_NVMEM_PROVIDER_H
 
+#include <linux/device.h>
 #include <linux/device/driver.h>
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -158,12 +159,11 @@ struct nvmem_cell_table {
 /**
  * struct nvmem_layout - NVMEM layout definitions
  *
- * @name:		Layout name.
- * @of_match_table:	Open firmware match table.
- * @add_cells:		Called to populate the layout using
- *			nvmem_add_one_cell().
- * @owner:		Pointer to struct module.
- * @node:		List node.
+ * @dev:		Device-model layout device.
+ * @nvmem:		The underlying NVMEM device
+ * @add_cells:		Will be called if a nvmem device is found which
+ *			has this layout. The function will add layout
+ *			specific cells with nvmem_add_one_cell().
  *
  * A nvmem device can hold a well defined structure which can just be
  * evaluated during runtime. For example a TLV list, or a list of "name=val"
@@ -171,13 +171,15 @@ struct nvmem_cell_table {
  * cells.
  */
 struct nvmem_layout {
-	const char *name;
-	const struct of_device_id *of_match_table;
+	struct device dev;
+	struct nvmem_device *nvmem;
 	int (*add_cells)(struct device *dev, struct nvmem_device *nvmem);
+};
 
-	/* private */
-	struct module *owner;
-	struct list_head node;
+struct nvmem_layout_driver {
+	struct device_driver driver;
+	int (*probe)(struct nvmem_layout *layout);
+	void (*remove)(struct nvmem_layout *layout);
 };
 
 #if IS_ENABLED(CONFIG_NVMEM)
@@ -194,11 +196,15 @@ void nvmem_del_cell_table(struct nvmem_cell_table *table);
 int nvmem_add_one_cell(struct nvmem_device *nvmem,
 		       const struct nvmem_cell_info *info);
 
-int __nvmem_layout_register(struct nvmem_layout *layout, struct module *owner);
-#define nvmem_layout_register(layout) \
-	__nvmem_layout_register(layout, THIS_MODULE)
+int nvmem_layout_register(struct nvmem_layout *layout);
 void nvmem_layout_unregister(struct nvmem_layout *layout);
 
+int nvmem_layout_driver_register(struct nvmem_layout_driver *drv);
+void nvmem_layout_driver_unregister(struct nvmem_layout_driver *drv);
+#define module_nvmem_layout_driver(__nvmem_layout_driver)		\
+	module_driver(__nvmem_layout_driver, nvmem_layout_driver_register, \
+		      nvmem_layout_driver_unregister)
+
 const void *nvmem_layout_get_match_data(struct nvmem_device *nvmem,
 					struct nvmem_layout *layout);
 
@@ -262,8 +268,4 @@ static inline struct device_node *of_nvmem_layout_get_container(struct nvmem_dev
 
 #endif /* CONFIG_NVMEM && CONFIG_OF */
 
-#define module_nvmem_layout_driver(__layout_driver)		\
-	module_driver(__layout_driver, nvmem_layout_register,	\
-		      nvmem_layout_unregister)
-
 #endif  /* ifndef _LINUX_NVMEM_PROVIDER_H */
-- 
cgit v1.2.3


From 1f78c56007ba61b7b8c3f7dbb6787b6af116d3f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= <theo.lebrun@bootlin.com>
Date: Thu, 7 Dec 2023 18:56:06 +0100
Subject: tty: serial: amba: Use linux/{bits,bitfield}.h macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver uses bit shifts and hexadecimal expressions to declare
constants. Replace that with the BIT(), GENMASK() & FIELD_PREP_CONST()
macros to clarify intent.

include/linux/amba/serial.h gets included from arch/arm/include/debug/pl01x.S.
Avoid includes and macro tricks for the four defines that are involved:
UART01x_DR, UART01x_FR, UART01x_FR_TXFF and UART01x_FR_BUSY.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://lore.kernel.org/r/20231207-mbly-uart-v6-1-e384afa5e78c@bootlin.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/amba/serial.h | 251 +++++++++++++++++++++++---------------------
 1 file changed, 130 insertions(+), 121 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
index 27003ec52114..9120de05ead0 100644
--- a/include/linux/amba/serial.h
+++ b/include/linux/amba/serial.h
@@ -10,6 +10,11 @@
 #ifndef ASM_ARM_HARDWARE_SERIAL_AMBA_H
 #define ASM_ARM_HARDWARE_SERIAL_AMBA_H
 
+#ifndef __ASSEMBLY__
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+#endif
+
 #include <linux/types.h>
 
 /* -------------------------------------------------------------------------------
@@ -70,138 +75,142 @@
 #define ZX_UART011_ICR		0x4c
 #define ZX_UART011_DMACR	0x50
 
-#define UART011_DR_OE		(1 << 11)
-#define UART011_DR_BE		(1 << 10)
-#define UART011_DR_PE		(1 << 9)
-#define UART011_DR_FE		(1 << 8)
-
-#define UART01x_RSR_OE		0x08
-#define UART01x_RSR_BE		0x04
-#define UART01x_RSR_PE		0x02
-#define UART01x_RSR_FE		0x01
-
-#define UART011_FR_RI		0x100
-#define UART011_FR_TXFE		0x080
-#define UART011_FR_RXFF		0x040
-#define UART01x_FR_TXFF		0x020
-#define UART01x_FR_RXFE		0x010
-#define UART01x_FR_BUSY		0x008
-#define UART01x_FR_DCD		0x004
-#define UART01x_FR_DSR		0x002
-#define UART01x_FR_CTS		0x001
+#define UART011_DR_OE		BIT(11)
+#define UART011_DR_BE		BIT(10)
+#define UART011_DR_PE		BIT(9)
+#define UART011_DR_FE		BIT(8)
+
+#define UART01x_RSR_OE		BIT(3)
+#define UART01x_RSR_BE		BIT(2)
+#define UART01x_RSR_PE		BIT(1)
+#define UART01x_RSR_FE		BIT(0)
+
+#define UART011_FR_RI		BIT(8)
+#define UART011_FR_TXFE		BIT(7)
+#define UART011_FR_RXFF		BIT(6)
+#define UART01x_FR_TXFF		(1 << 5)	/* used in ASM */
+#define UART01x_FR_RXFE		BIT(4)
+#define UART01x_FR_BUSY		(1 << 3)	/* used in ASM */
+#define UART01x_FR_DCD		BIT(2)
+#define UART01x_FR_DSR		BIT(1)
+#define UART01x_FR_CTS		BIT(0)
 #define UART01x_FR_TMSK		(UART01x_FR_TXFF + UART01x_FR_BUSY)
 
 /*
  * Some bits of Flag Register on ZTE device have different position from
  * standard ones.
  */
-#define ZX_UART01x_FR_BUSY	0x100
-#define ZX_UART01x_FR_DSR	0x008
-#define ZX_UART01x_FR_CTS	0x002
-#define ZX_UART011_FR_RI	0x001
-
-#define UART011_CR_CTSEN	0x8000	/* CTS hardware flow control */
-#define UART011_CR_RTSEN	0x4000	/* RTS hardware flow control */
-#define UART011_CR_OUT2		0x2000	/* OUT2 */
-#define UART011_CR_OUT1		0x1000	/* OUT1 */
-#define UART011_CR_RTS		0x0800	/* RTS */
-#define UART011_CR_DTR		0x0400	/* DTR */
-#define UART011_CR_RXE		0x0200	/* receive enable */
-#define UART011_CR_TXE		0x0100	/* transmit enable */
-#define UART011_CR_LBE		0x0080	/* loopback enable */
-#define UART010_CR_RTIE		0x0040
-#define UART010_CR_TIE		0x0020
-#define UART010_CR_RIE		0x0010
-#define UART010_CR_MSIE		0x0008
-#define ST_UART011_CR_OVSFACT	0x0008	/* Oversampling factor */
-#define UART01x_CR_IIRLP	0x0004	/* SIR low power mode */
-#define UART01x_CR_SIREN	0x0002	/* SIR enable */
-#define UART01x_CR_UARTEN	0x0001	/* UART enable */
-
-#define UART011_LCRH_SPS	0x80
+#define ZX_UART01x_FR_BUSY	BIT(8)
+#define ZX_UART01x_FR_DSR	BIT(3)
+#define ZX_UART01x_FR_CTS	BIT(1)
+#define ZX_UART011_FR_RI	BIT(0)
+
+#define UART011_CR_CTSEN	BIT(15)	/* CTS hardware flow control */
+#define UART011_CR_RTSEN	BIT(14)	/* RTS hardware flow control */
+#define UART011_CR_OUT2		BIT(13)	/* OUT2 */
+#define UART011_CR_OUT1		BIT(12)	/* OUT1 */
+#define UART011_CR_RTS		BIT(11)	/* RTS */
+#define UART011_CR_DTR		BIT(10)	/* DTR */
+#define UART011_CR_RXE		BIT(9)	/* receive enable */
+#define UART011_CR_TXE		BIT(8)	/* transmit enable */
+#define UART011_CR_LBE		BIT(7)	/* loopback enable */
+#define UART010_CR_RTIE		BIT(6)
+#define UART010_CR_TIE		BIT(5)
+#define UART010_CR_RIE		BIT(4)
+#define UART010_CR_MSIE		BIT(3)
+#define ST_UART011_CR_OVSFACT	BIT(3)	/* Oversampling factor */
+#define UART01x_CR_IIRLP	BIT(2)	/* SIR low power mode */
+#define UART01x_CR_SIREN	BIT(1)	/* SIR enable */
+#define UART01x_CR_UARTEN	BIT(0)	/* UART enable */
+
+#define UART011_LCRH_SPS	BIT(7)
 #define UART01x_LCRH_WLEN_8	0x60
 #define UART01x_LCRH_WLEN_7	0x40
 #define UART01x_LCRH_WLEN_6	0x20
 #define UART01x_LCRH_WLEN_5	0x00
-#define UART01x_LCRH_FEN	0x10
-#define UART01x_LCRH_STP2	0x08
-#define UART01x_LCRH_EPS	0x04
-#define UART01x_LCRH_PEN	0x02
-#define UART01x_LCRH_BRK	0x01
-
-#define ST_UART011_DMAWM_RX_1	(0 << 3)
-#define ST_UART011_DMAWM_RX_2	(1 << 3)
-#define ST_UART011_DMAWM_RX_4	(2 << 3)
-#define ST_UART011_DMAWM_RX_8	(3 << 3)
-#define ST_UART011_DMAWM_RX_16	(4 << 3)
-#define ST_UART011_DMAWM_RX_32	(5 << 3)
-#define ST_UART011_DMAWM_RX_48	(6 << 3)
-#define ST_UART011_DMAWM_TX_1	0
-#define ST_UART011_DMAWM_TX_2	1
-#define ST_UART011_DMAWM_TX_4	2
-#define ST_UART011_DMAWM_TX_8	3
-#define ST_UART011_DMAWM_TX_16	4
-#define ST_UART011_DMAWM_TX_32	5
-#define ST_UART011_DMAWM_TX_48	6
-
-#define UART010_IIR_RTIS	0x08
-#define UART010_IIR_TIS		0x04
-#define UART010_IIR_RIS		0x02
-#define UART010_IIR_MIS		0x01
-
-#define UART011_IFLS_RX1_8	(0 << 3)
-#define UART011_IFLS_RX2_8	(1 << 3)
-#define UART011_IFLS_RX4_8	(2 << 3)
-#define UART011_IFLS_RX6_8	(3 << 3)
-#define UART011_IFLS_RX7_8	(4 << 3)
-#define UART011_IFLS_TX1_8	(0 << 0)
-#define UART011_IFLS_TX2_8	(1 << 0)
-#define UART011_IFLS_TX4_8	(2 << 0)
-#define UART011_IFLS_TX6_8	(3 << 0)
-#define UART011_IFLS_TX7_8	(4 << 0)
+#define UART01x_LCRH_FEN	BIT(4)
+#define UART01x_LCRH_STP2	BIT(3)
+#define UART01x_LCRH_EPS	BIT(2)
+#define UART01x_LCRH_PEN	BIT(1)
+#define UART01x_LCRH_BRK	BIT(0)
+
+#define ST_UART011_DMAWM_RX	GENMASK(5, 3)
+#define ST_UART011_DMAWM_RX_1	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 0)
+#define ST_UART011_DMAWM_RX_2	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 1)
+#define ST_UART011_DMAWM_RX_4	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 2)
+#define ST_UART011_DMAWM_RX_8	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 3)
+#define ST_UART011_DMAWM_RX_16	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 4)
+#define ST_UART011_DMAWM_RX_32	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 5)
+#define ST_UART011_DMAWM_RX_48	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 6)
+#define ST_UART011_DMAWM_TX	GENMASK(2, 0)
+#define ST_UART011_DMAWM_TX_1	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 0)
+#define ST_UART011_DMAWM_TX_2	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 1)
+#define ST_UART011_DMAWM_TX_4	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 2)
+#define ST_UART011_DMAWM_TX_8	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 3)
+#define ST_UART011_DMAWM_TX_16	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 4)
+#define ST_UART011_DMAWM_TX_32	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 5)
+#define ST_UART011_DMAWM_TX_48	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 6)
+
+#define UART010_IIR_RTIS	BIT(3)
+#define UART010_IIR_TIS		BIT(2)
+#define UART010_IIR_RIS		BIT(1)
+#define UART010_IIR_MIS		BIT(0)
+
+#define UART011_IFLS_RXIFLSEL	GENMASK(5, 3)
+#define UART011_IFLS_RX1_8	FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 0)
+#define UART011_IFLS_RX2_8	FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 1)
+#define UART011_IFLS_RX4_8	FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 2)
+#define UART011_IFLS_RX6_8	FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 3)
+#define UART011_IFLS_RX7_8	FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 4)
+#define UART011_IFLS_TXIFLSEL	GENMASK(2, 0)
+#define UART011_IFLS_TX1_8	FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 0)
+#define UART011_IFLS_TX2_8	FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 1)
+#define UART011_IFLS_TX4_8	FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 2)
+#define UART011_IFLS_TX6_8	FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 3)
+#define UART011_IFLS_TX7_8	FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 4)
 /* special values for ST vendor with deeper fifo */
-#define UART011_IFLS_RX_HALF	(5 << 3)
-#define UART011_IFLS_TX_HALF	(5 << 0)
-
-#define UART011_OEIM		(1 << 10)	/* overrun error interrupt mask */
-#define UART011_BEIM		(1 << 9)	/* break error interrupt mask */
-#define UART011_PEIM		(1 << 8)	/* parity error interrupt mask */
-#define UART011_FEIM		(1 << 7)	/* framing error interrupt mask */
-#define UART011_RTIM		(1 << 6)	/* receive timeout interrupt mask */
-#define UART011_TXIM		(1 << 5)	/* transmit interrupt mask */
-#define UART011_RXIM		(1 << 4)	/* receive interrupt mask */
-#define UART011_DSRMIM		(1 << 3)	/* DSR interrupt mask */
-#define UART011_DCDMIM		(1 << 2)	/* DCD interrupt mask */
-#define UART011_CTSMIM		(1 << 1)	/* CTS interrupt mask */
-#define UART011_RIMIM		(1 << 0)	/* RI interrupt mask */
-
-#define UART011_OEIS		(1 << 10)	/* overrun error interrupt status */
-#define UART011_BEIS		(1 << 9)	/* break error interrupt status */
-#define UART011_PEIS		(1 << 8)	/* parity error interrupt status */
-#define UART011_FEIS		(1 << 7)	/* framing error interrupt status */
-#define UART011_RTIS		(1 << 6)	/* receive timeout interrupt status */
-#define UART011_TXIS		(1 << 5)	/* transmit interrupt status */
-#define UART011_RXIS		(1 << 4)	/* receive interrupt status */
-#define UART011_DSRMIS		(1 << 3)	/* DSR interrupt status */
-#define UART011_DCDMIS		(1 << 2)	/* DCD interrupt status */
-#define UART011_CTSMIS		(1 << 1)	/* CTS interrupt status */
-#define UART011_RIMIS		(1 << 0)	/* RI interrupt status */
-
-#define UART011_OEIC		(1 << 10)	/* overrun error interrupt clear */
-#define UART011_BEIC		(1 << 9)	/* break error interrupt clear */
-#define UART011_PEIC		(1 << 8)	/* parity error interrupt clear */
-#define UART011_FEIC		(1 << 7)	/* framing error interrupt clear */
-#define UART011_RTIC		(1 << 6)	/* receive timeout interrupt clear */
-#define UART011_TXIC		(1 << 5)	/* transmit interrupt clear */
-#define UART011_RXIC		(1 << 4)	/* receive interrupt clear */
-#define UART011_DSRMIC		(1 << 3)	/* DSR interrupt clear */
-#define UART011_DCDMIC		(1 << 2)	/* DCD interrupt clear */
-#define UART011_CTSMIC		(1 << 1)	/* CTS interrupt clear */
-#define UART011_RIMIC		(1 << 0)	/* RI interrupt clear */
-
-#define UART011_DMAONERR	(1 << 2)	/* disable dma on error */
-#define UART011_TXDMAE		(1 << 1)	/* enable transmit dma */
-#define UART011_RXDMAE		(1 << 0)	/* enable receive dma */
+#define UART011_IFLS_RX_HALF	FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 5)
+#define UART011_IFLS_TX_HALF	FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 5)
+
+#define UART011_OEIM		BIT(10)	/* overrun error interrupt mask */
+#define UART011_BEIM		BIT(9)	/* break error interrupt mask */
+#define UART011_PEIM		BIT(8)	/* parity error interrupt mask */
+#define UART011_FEIM		BIT(7)	/* framing error interrupt mask */
+#define UART011_RTIM		BIT(6)	/* receive timeout interrupt mask */
+#define UART011_TXIM		BIT(5)	/* transmit interrupt mask */
+#define UART011_RXIM		BIT(4)	/* receive interrupt mask */
+#define UART011_DSRMIM		BIT(3)	/* DSR interrupt mask */
+#define UART011_DCDMIM		BIT(2)	/* DCD interrupt mask */
+#define UART011_CTSMIM		BIT(1)	/* CTS interrupt mask */
+#define UART011_RIMIM		BIT(0)	/* RI interrupt mask */
+
+#define UART011_OEIS		BIT(10)	/* overrun error interrupt status */
+#define UART011_BEIS		BIT(9)	/* break error interrupt status */
+#define UART011_PEIS		BIT(8)	/* parity error interrupt status */
+#define UART011_FEIS		BIT(7)	/* framing error interrupt status */
+#define UART011_RTIS		BIT(6)	/* receive timeout interrupt status */
+#define UART011_TXIS		BIT(5)	/* transmit interrupt status */
+#define UART011_RXIS		BIT(4)	/* receive interrupt status */
+#define UART011_DSRMIS		BIT(3)	/* DSR interrupt status */
+#define UART011_DCDMIS		BIT(2)	/* DCD interrupt status */
+#define UART011_CTSMIS		BIT(1)	/* CTS interrupt status */
+#define UART011_RIMIS		BIT(0)	/* RI interrupt status */
+
+#define UART011_OEIC		BIT(10)	/* overrun error interrupt clear */
+#define UART011_BEIC		BIT(9)	/* break error interrupt clear */
+#define UART011_PEIC		BIT(8)	/* parity error interrupt clear */
+#define UART011_FEIC		BIT(7)	/* framing error interrupt clear */
+#define UART011_RTIC		BIT(6)	/* receive timeout interrupt clear */
+#define UART011_TXIC		BIT(5)	/* transmit interrupt clear */
+#define UART011_RXIC		BIT(4)	/* receive interrupt clear */
+#define UART011_DSRMIC		BIT(3)	/* DSR interrupt clear */
+#define UART011_DCDMIC		BIT(2)	/* DCD interrupt clear */
+#define UART011_CTSMIC		BIT(1)	/* CTS interrupt clear */
+#define UART011_RIMIC		BIT(0)	/* RI interrupt clear */
+
+#define UART011_DMAONERR	BIT(2)	/* disable dma on error */
+#define UART011_TXDMAE		BIT(1)	/* enable transmit dma */
+#define UART011_RXDMAE		BIT(0)	/* enable receive dma */
 
 #define UART01x_RSR_ANY		(UART01x_RSR_OE | UART01x_RSR_BE | UART01x_RSR_PE | UART01x_RSR_FE)
 #define UART01x_FR_MODEM_ANY	(UART01x_FR_DCD | UART01x_FR_DSR | UART01x_FR_CTS)
-- 
cgit v1.2.3


From 0c734c5ea76e333fbb8dd83b5bab46291b38096b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 14 Dec 2023 11:08:15 -0700
Subject: block: improve struct request_queue layout

It's clearly been a while since someone looked at this, so I gave it a
quick shot. There are few issues in here:

- Random bundling of members that are mostly read-only and often written
- Random holes that need not be there

This moves the most frequently used bits into cacheline 1 and 2, with
the 2nd one being more write intensive than the first one, which is
basically read-only.

Outside of making this work a bit more efficiently, it also reduces the
size of struct request_queue for my test setup from 864 bytes (spanning
14 cachelines!) to 832 bytes and 13 cachelines.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/d2b7b61c-4868-45c0-9060-4f9c73de9d7e@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 89 ++++++++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 17c0a7d0d319..185ed3770e3a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -367,59 +367,51 @@ struct blk_independent_access_ranges {
 };
 
 struct request_queue {
-	struct request		*last_merge;
-	struct elevator_queue	*elevator;
-
-	struct percpu_ref	q_usage_counter;
+	/*
+	 * The queue owner gets to use this for whatever they like.
+	 * ll_rw_blk doesn't touch it.
+	 */
+	void			*queuedata;
 
-	struct blk_queue_stats	*stats;
-	struct rq_qos		*rq_qos;
-	struct mutex		rq_qos_mutex;
+	struct elevator_queue	*elevator;
 
 	const struct blk_mq_ops	*mq_ops;
 
 	/* sw queues */
 	struct blk_mq_ctx __percpu	*queue_ctx;
 
+	/*
+	 * various queue flags, see QUEUE_* below
+	 */
+	unsigned long		queue_flags;
+
+	unsigned int		rq_timeout;
+
 	unsigned int		queue_depth;
 
+	refcount_t		refs;
+
 	/* hw dispatch queues */
-	struct xarray		hctx_table;
 	unsigned int		nr_hw_queues;
+	struct xarray		hctx_table;
 
-	/*
-	 * The queue owner gets to use this for whatever they like.
-	 * ll_rw_blk doesn't touch it.
-	 */
-	void			*queuedata;
-
-	/*
-	 * various queue flags, see QUEUE_* below
-	 */
-	unsigned long		queue_flags;
-	/*
-	 * Number of contexts that have called blk_set_pm_only(). If this
-	 * counter is above zero then only RQF_PM requests are processed.
-	 */
-	atomic_t		pm_only;
+	struct percpu_ref	q_usage_counter;
 
-	/*
-	 * ida allocated id for this queue.  Used to index queues from
-	 * ioctx.
-	 */
-	int			id;
+	struct request		*last_merge;
 
 	spinlock_t		queue_lock;
 
-	struct gendisk		*disk;
+	int			quiesce_depth;
 
-	refcount_t		refs;
+	struct gendisk		*disk;
 
 	/*
 	 * mq queue kobject
 	 */
 	struct kobject *mq_kobj;
 
+	struct queue_limits	limits;
+
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
 	struct blk_integrity integrity;
 #endif	/* CONFIG_BLK_DEV_INTEGRITY */
@@ -430,24 +422,40 @@ struct request_queue {
 #endif
 
 	/*
-	 * queue settings
+	 * Number of contexts that have called blk_set_pm_only(). If this
+	 * counter is above zero then only RQF_PM requests are processed.
 	 */
-	unsigned long		nr_requests;	/* Max # of requests */
+	atomic_t		pm_only;
+
+	struct blk_queue_stats	*stats;
+	struct rq_qos		*rq_qos;
+	struct mutex		rq_qos_mutex;
+
+	/*
+	 * ida allocated id for this queue.  Used to index queues from
+	 * ioctx.
+	 */
+	int			id;
 
 	unsigned int		dma_pad_mask;
 
+	/*
+	 * queue settings
+	 */
+	unsigned long		nr_requests;	/* Max # of requests */
+
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 	struct blk_crypto_profile *crypto_profile;
 	struct kobject *crypto_kobject;
 #endif
 
-	unsigned int		rq_timeout;
-
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
 
 	atomic_t		nr_active_requests_shared_tags;
 
+	unsigned int		required_elevator_features;
+
 	struct blk_mq_tags	*sched_shared_tags;
 
 	struct list_head	icq_list;
@@ -458,11 +466,12 @@ struct request_queue {
 	struct mutex		blkcg_mutex;
 #endif
 
-	struct queue_limits	limits;
+	int			node;
 
-	unsigned int		required_elevator_features;
+	spinlock_t		requeue_lock;
+	struct list_head	requeue_list;
+	struct delayed_work	requeue_work;
 
-	int			node;
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	struct blk_trace __rcu	*blk_trace;
 #endif
@@ -472,10 +481,6 @@ struct request_queue {
 	struct blk_flush_queue	*fq;
 	struct list_head	flush_list;
 
-	struct list_head	requeue_list;
-	spinlock_t		requeue_lock;
-	struct delayed_work	requeue_work;
-
 	struct mutex		sysfs_lock;
 	struct mutex		sysfs_dir_lock;
 
@@ -500,8 +505,6 @@ struct request_queue {
 	 */
 	struct mutex		mq_freeze_lock;
 
-	int			quiesce_depth;
-
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
 
-- 
cgit v1.2.3


From 826a5d8c9df9605fb4fdefa45432f95580241a1f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 25 Oct 2023 21:42:57 +0300
Subject: device property: Implement device_is_big_endian()

Some users want to use the struct device pointer to see if the
device is big endian in terms of Open Firmware specifications,
i.e. if it has a "big-endian" property, or if the kernel was
compiled for BE *and* the device has a "native-endian" property.

Provide inline helper for the users.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20231025184259.250588-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/property.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 9f2585d705a8..55c2692ffa8c 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -80,12 +80,38 @@ int fwnode_property_match_string(const struct fwnode_handle *fwnode,
 
 bool fwnode_device_is_available(const struct fwnode_handle *fwnode);
 
+static inline bool fwnode_device_is_big_endian(const struct fwnode_handle *fwnode)
+{
+	if (fwnode_property_present(fwnode, "big-endian"))
+		return true;
+	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) &&
+	    fwnode_property_present(fwnode, "native-endian"))
+		return true;
+	return false;
+}
+
 static inline
 bool fwnode_device_is_compatible(const struct fwnode_handle *fwnode, const char *compat)
 {
 	return fwnode_property_match_string(fwnode, "compatible", compat) >= 0;
 }
 
+/**
+ * device_is_big_endian - check if a device has BE registers
+ * @dev: Pointer to the struct device
+ *
+ * Returns: true if the device has a "big-endian" property, or if the kernel
+ * was compiled for BE *and* the device has a "native-endian" property.
+ * Returns false otherwise.
+ *
+ * Callers would nominally use ioread32be/iowrite32be if
+ * device_is_big_endian() == true, or readl/writel otherwise.
+ */
+static inline bool device_is_big_endian(const struct device *dev)
+{
+	return fwnode_device_is_big_endian(dev_fwnode(dev));
+}
+
 /**
  * device_is_compatible - match 'compatible' property of the device with a given string
  * @dev: Pointer to the struct device
-- 
cgit v1.2.3


From c27dfca4555bf74dd7dd7161d8ef2790ec1c7283 Mon Sep 17 00:00:00 2001
From: Ricky Wu <ricky_wu@realtek.com>
Date: Fri, 8 Dec 2023 11:21:43 +0800
Subject: misc: rtsx: add to support new card reader rts5264 new definition and
 function

in order to support NEW chip rts5264, the definitions of some internal
registers are define in new file rts5264.h, and some callback functions
and the workflow for rts5264 are define in new file rts5264.c

also add rts5264.o to Makefile

Signed-off-by: Ricky Wu <ricky_wu@realtek.com>
Link: https://lore.kernel.org/r/20231208032145.2143580-2-ricky_wu@realtek.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/rtsx_pci.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h
index 534038d962e4..4612ef09a0c7 100644
--- a/include/linux/rtsx_pci.h
+++ b/include/linux/rtsx_pci.h
@@ -60,6 +60,7 @@
 #define   SD_EXIST			(1 << 16)
 #define   DELINK_INT			GPIO0_INT
 #define   MS_OC_INT			(1 << 23)
+#define   SD_OVP_INT		(1 << 23)
 #define   SD_OC_INT			(1 << 22)
 
 #define CARD_INT		(XD_INT | MS_INT | SD_INT)
@@ -80,6 +81,7 @@
 #define   OC_INT_EN			(1 << 23)
 #define   DELINK_INT_EN			GPIO0_INT_EN
 #define   MS_OC_INT_EN			(1 << 23)
+#define   SD_OVP_INT_EN			(1 << 23)
 #define   SD_OC_INT_EN			(1 << 22)
 
 #define RTSX_DUM_REG			0x1C
@@ -583,6 +585,7 @@
 #define   OBFF_DISABLE			0x00
 
 #define CDRESUMECTL			0xFE52
+#define CDGW				0xFE53
 #define WAKE_SEL_CTL			0xFE54
 #define PCLK_CTL			0xFE55
 #define   PCLK_MODE_SEL			0x20
@@ -764,6 +767,9 @@
 #define   SD_VIO_LDO_1V8		0x40
 #define   SD_VIO_LDO_3V3		0x70
 
+#define RTS5264_AUTOLOAD_CFG2		0xFF7D
+#define RTS5264_CHIP_RST_N_SEL		(1 << 6)
+
 #define RTS5260_AUTOLOAD_CFG4		0xFF7F
 #define   RTS5260_MIMO_DISABLE		0x8A
 /*RTS5261*/
@@ -1261,6 +1267,7 @@ struct rtsx_pcr {
 	u8				dma_error_count;
 	u8			ocp_stat;
 	u8			ocp_stat2;
+	u8			ovp_stat;
 	u8			rtd3_en;
 };
 
@@ -1271,6 +1278,7 @@ struct rtsx_pcr {
 #define PID_5260	0x5260
 #define PID_5261	0x5261
 #define PID_5228	0x5228
+#define PID_5264	0x5264
 
 #define CHK_PCI_PID(pcr, pid)		((pcr)->pci->device == (pid))
 #define PCI_VID(pcr)			((pcr)->pci->vendor)
-- 
cgit v1.2.3


From b6e53731e07db7e8d35b789fd83565fe75540180 Mon Sep 17 00:00:00 2001
From: Fei Shao <fshao@chromium.org>
Date: Wed, 6 Dec 2023 15:17:26 -0800
Subject: spmi: Introduce device-managed functions

Utilize the managed resource (devres) framework and add the following
devm_* helpers for the SPMI driver:

- devm_spmi_controller_alloc()
- devm_spmi_controller_add()

[sboyd@kernel.org: Rename to spmi-devres for module niceness, slap on
GPL module license]

Signed-off-by: Fei Shao <fshao@chromium.org>
Link: https://lore.kernel.org/r/20230824104101.4083400-2-fshao@chromium.org
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
Link: https://lore.kernel.org/r/20231206231733.4031901-4-sboyd@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/spmi.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/spmi.h b/include/linux/spmi.h
index 2a4ce4144f9f..28e8c8bd3944 100644
--- a/include/linux/spmi.h
+++ b/include/linux/spmi.h
@@ -120,6 +120,9 @@ static inline void spmi_controller_put(struct spmi_controller *ctrl)
 int spmi_controller_add(struct spmi_controller *ctrl);
 void spmi_controller_remove(struct spmi_controller *ctrl);
 
+struct spmi_controller *devm_spmi_controller_alloc(struct device *parent, size_t size);
+int devm_spmi_controller_add(struct device *parent, struct spmi_controller *ctrl);
+
 /**
  * struct spmi_driver - SPMI slave device driver
  * @driver:	SPMI device drivers should initialize name and owner field of
-- 
cgit v1.2.3


From 8d6608e4f89a0a21caadcf32fb5ed700e2f5682d Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@amd.com>
Date: Thu, 14 Dec 2023 15:53:47 +0100
Subject: firmware: xilinx: Remove clock_setrate and clock_getrate api

As per the current code base, PM_CLOCK_SETRATE and PM_CLOCK_GETRATE
APIs are not supported for the runtime operations. In the case of
ZynqMP returning an error from TF-A when there is any request to
access these APIs and for Versal also it is returning an error like
NO_ACCESS from the firmware. So, just removing the unused code to
avoid the confusion around these APIs.

Also, there is no issue with the backward compatibility as these APIs
were never used since implemented. Hence no need to bump up the
version of the feature check API as well.

Signed-off-by: Ronak Jain <ronak.jain@amd.com>
Signed-off-by: Michal Simek <michal.simek@amd.com>
Link: https://lore.kernel.org/r/6ccbffbafd1f0f48f6574d5a3bf2db6a5603fdb0.1702565618.git.michal.simek@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 6b48294f3c92..c6a7fb1f980b 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -160,8 +160,6 @@ enum pm_api_id {
 	PM_CLOCK_GETSTATE = 38,
 	PM_CLOCK_SETDIVIDER = 39,
 	PM_CLOCK_GETDIVIDER = 40,
-	PM_CLOCK_SETRATE = 41,
-	PM_CLOCK_GETRATE = 42,
 	PM_CLOCK_SETPARENT = 43,
 	PM_CLOCK_GETPARENT = 44,
 	PM_FPGA_READ = 46,
@@ -533,8 +531,6 @@ int zynqmp_pm_clock_disable(u32 clock_id);
 int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state);
 int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider);
 int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider);
-int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate);
-int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate);
 int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id);
 int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id);
 int zynqmp_pm_set_pll_frac_mode(u32 clk_id, u32 mode);
@@ -639,16 +635,6 @@ static inline int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider)
 	return -ENODEV;
 }
 
-static inline int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate)
-{
-	return -ENODEV;
-}
-
-static inline int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate)
-{
-	return -ENODEV;
-}
-
 static inline int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From b9ae996210163e89a2a9aece7c582fb43694485a Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@amd.com>
Date: Thu, 14 Dec 2023 15:53:48 +0100
Subject: firmware: xilinx: Remove zynqmp_pm_pinctrl_get_function()

There is no user for this interface that's why remove it.

Signed-off-by: Michal Simek <michal.simek@amd.com>
Link: https://lore.kernel.org/r/e52a415a004e28a43e6d08e9e22d9e8fef3737df.1702565618.git.michal.simek@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index c6a7fb1f980b..1478f691cc10 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -149,7 +149,6 @@ enum pm_api_id {
 	PM_SECURE_SHA = 26,
 	PM_PINCTRL_REQUEST = 28,
 	PM_PINCTRL_RELEASE = 29,
-	PM_PINCTRL_GET_FUNCTION = 30,
 	PM_PINCTRL_SET_FUNCTION = 31,
 	PM_PINCTRL_CONFIG_PARAM_GET = 32,
 	PM_PINCTRL_CONFIG_PARAM_SET = 33,
@@ -567,7 +566,6 @@ int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype);
 int zynqmp_pm_set_boot_health_status(u32 value);
 int zynqmp_pm_pinctrl_request(const u32 pin);
 int zynqmp_pm_pinctrl_release(const u32 pin);
-int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id);
 int zynqmp_pm_pinctrl_set_function(const u32 pin, const u32 id);
 int zynqmp_pm_pinctrl_get_config(const u32 pin, const u32 param,
 				 u32 *value);
@@ -804,11 +802,6 @@ static inline int zynqmp_pm_pinctrl_release(const u32 pin)
 	return -ENODEV;
 }
 
-static inline int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id)
-{
-	return -ENODEV;
-}
-
 static inline int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From f8fa5d76925991976b3e7076f9d1052515ec1fca Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 15 Dec 2023 13:24:10 -0700
Subject: cred: switch to using atomic_long_t

There are multiple ways to grab references to credentials, and the only
protection we have against overflowing it is the memory required to do
so.

With memory sizes only moving in one direction, let's bump the reference
count to 64-bit and move it outside the realm of feasibly overflowing.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cred.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index af8d353a4b86..a3383f8efb8f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -109,7 +109,7 @@ static inline int groups_search(const struct group_info *group_info, kgid_t grp)
  * same context as task->real_cred.
  */
 struct cred {
-	atomic_t	usage;
+	atomic_long_t	usage;
 #ifdef CONFIG_DEBUG_CREDENTIALS
 	atomic_t	subscribers;	/* number of processes subscribed */
 	void		*put_addr;
@@ -229,7 +229,7 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
  */
 static inline struct cred *get_new_cred_many(struct cred *cred, int nr)
 {
-	atomic_add(nr, &cred->usage);
+	atomic_long_add(nr, &cred->usage);
 	return cred;
 }
 
@@ -288,7 +288,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
 	struct cred *nonconst_cred = (struct cred *) cred;
 	if (!cred)
 		return NULL;
-	if (!atomic_inc_not_zero(&nonconst_cred->usage))
+	if (!atomic_long_inc_not_zero(&nonconst_cred->usage))
 		return NULL;
 	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
@@ -313,7 +313,7 @@ static inline void put_cred_many(const struct cred *_cred, int nr)
 
 	if (cred) {
 		validate_creds(cred);
-		if (atomic_sub_and_test(nr, &cred->usage))
+		if (atomic_long_sub_and_test(nr, &cred->usage))
 			__put_cred(cred);
 	}
 }
-- 
cgit v1.2.3


From ae1914174a63a558113e80d24ccac2773f9f7b2b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 15 Dec 2023 13:40:57 -0700
Subject: cred: get rid of CONFIG_DEBUG_CREDENTIALS

This code is rarely (never?) enabled by distros, and it hasn't caught
anything in decades. Let's kill off this legacy debug code.

Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cred.h | 50 --------------------------------------------------
 1 file changed, 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index a3383f8efb8f..2976f534a7a3 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -110,13 +110,6 @@ static inline int groups_search(const struct group_info *group_info, kgid_t grp)
  */
 struct cred {
 	atomic_long_t	usage;
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	atomic_t	subscribers;	/* number of processes subscribed */
-	void		*put_addr;
-	unsigned	magic;
-#define CRED_MAGIC	0x43736564
-#define CRED_MAGIC_DEAD	0x44656144
-#endif
 	kuid_t		uid;		/* real UID of the task */
 	kgid_t		gid;		/* real GID of the task */
 	kuid_t		suid;		/* saved UID of the task */
@@ -172,46 +165,6 @@ extern int cred_fscmp(const struct cred *, const struct cred *);
 extern void __init cred_init(void);
 extern int set_cred_ucounts(struct cred *);
 
-/*
- * check for validity of credentials
- */
-#ifdef CONFIG_DEBUG_CREDENTIALS
-extern void __noreturn __invalid_creds(const struct cred *, const char *, unsigned);
-extern void __validate_process_creds(struct task_struct *,
-				     const char *, unsigned);
-
-extern bool creds_are_invalid(const struct cred *cred);
-
-static inline void __validate_creds(const struct cred *cred,
-				    const char *file, unsigned line)
-{
-	if (unlikely(creds_are_invalid(cred)))
-		__invalid_creds(cred, file, line);
-}
-
-#define validate_creds(cred)				\
-do {							\
-	__validate_creds((cred), __FILE__, __LINE__);	\
-} while(0)
-
-#define validate_process_creds()				\
-do {								\
-	__validate_process_creds(current, __FILE__, __LINE__);	\
-} while(0)
-
-extern void validate_creds_for_do_exit(struct task_struct *);
-#else
-static inline void validate_creds(const struct cred *cred)
-{
-}
-static inline void validate_creds_for_do_exit(struct task_struct *tsk)
-{
-}
-static inline void validate_process_creds(void)
-{
-}
-#endif
-
 static inline bool cap_ambient_invariant_ok(const struct cred *cred)
 {
 	return cap_issubset(cred->cap_ambient,
@@ -264,7 +217,6 @@ static inline const struct cred *get_cred_many(const struct cred *cred, int nr)
 	struct cred *nonconst_cred = (struct cred *) cred;
 	if (!cred)
 		return cred;
-	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
 	return get_new_cred_many(nonconst_cred, nr);
 }
@@ -290,7 +242,6 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
 		return NULL;
 	if (!atomic_long_inc_not_zero(&nonconst_cred->usage))
 		return NULL;
-	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
 	return cred;
 }
@@ -312,7 +263,6 @@ static inline void put_cred_many(const struct cred *_cred, int nr)
 	struct cred *cred = (struct cred *) _cred;
 
 	if (cred) {
-		validate_creds(cred);
 		if (atomic_long_sub_and_test(nr, &cred->usage))
 			__put_cred(cred);
 	}
-- 
cgit v1.2.3


From 4382159696c9af67ee047ed55f2dbf05480f52f6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Dec 2023 10:12:17 +0100
Subject: cfi: Flip headers

Normal include order is that linux/foo.h should include asm/foo.h, CFI has it
the wrong way around.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Sami Tolvanen <samitolvanen@google.com>
Link: https://lore.kernel.org/r/20231215092707.231038174@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/cfi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index 3552ec82b725..2309d74e77e6 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -9,6 +9,7 @@
 
 #include <linux/bug.h>
 #include <linux/module.h>
+#include <asm/cfi.h>
 
 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
-- 
cgit v1.2.3


From 4f9087f16651aca4a5f32da840a53f6660f0579a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Dec 2023 10:12:18 +0100
Subject: x86/cfi,bpf: Fix BPF JIT call

The current BPF call convention is __nocfi, except when it calls !JIT things,
then it calls regular C functions.

It so happens that with FineIBT the __nocfi and C calling conventions are
incompatible. Specifically __nocfi will call at func+0, while FineIBT will have
endbr-poison there, which is not a valid indirect target. Causing #CP.

Notably this only triggers on IBT enabled hardware, which is probably why this
hasn't been reported (also, most people will have JIT on anyway).

Implement proper CFI prologues for the BPF JIT codegen and drop __nocfi for
x86.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20231215092707.345270396@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 12 ++++++++++--
 include/linux/cfi.h |  7 +++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c87c608a3689..9d84c376851a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -29,6 +29,7 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/static_call.h>
 #include <linux/memcontrol.h>
+#include <linux/cfi.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -1211,7 +1212,11 @@ struct bpf_dispatcher {
 #endif
 };
 
-static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func(
+#ifndef __bpfcall
+#define __bpfcall __nocfi
+#endif
+
+static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func(
 	const void *ctx,
 	const struct bpf_insn *insnsi,
 	bpf_func_t bpf_func)
@@ -1303,7 +1308,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 
 #define DEFINE_BPF_DISPATCHER(name)					\
 	__BPF_DISPATCHER_SC(name);					\
-	noinline __nocfi unsigned int bpf_dispatcher_##name##_func(	\
+	noinline __bpfcall unsigned int bpf_dispatcher_##name##_func(	\
 		const void *ctx,					\
 		const struct bpf_insn *insnsi,				\
 		bpf_func_t bpf_func)					\
@@ -1453,6 +1458,9 @@ struct bpf_prog_aux {
 	struct bpf_kfunc_desc_tab *kfunc_tab;
 	struct bpf_kfunc_btf_tab *kfunc_btf_tab;
 	u32 size_poke_tab;
+#ifdef CONFIG_FINEIBT
+	struct bpf_ksym ksym_prefix;
+#endif
 	struct bpf_ksym ksym;
 	const struct bpf_prog_ops *ops;
 	struct bpf_map **used_maps;
diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index 2309d74e77e6..1ed2d96c0cfc 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -11,6 +11,13 @@
 #include <linux/module.h>
 #include <asm/cfi.h>
 
+#ifndef cfi_get_offset
+static inline int cfi_get_offset(void)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
 				      unsigned long *target, u32 type);
-- 
cgit v1.2.3


From 2cd3e3772e41377f32d6eea643e0590774e9187c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Dec 2023 10:12:20 +0100
Subject: x86/cfi,bpf: Fix bpf_struct_ops CFI

BPF struct_ops uses __arch_prepare_bpf_trampoline() to write
trampolines for indirect function calls. These tramplines much have
matching CFI.

In order to obtain the correct CFI hash for the various methods, add a
matching structure that contains stub functions, the compiler will
generate correct CFI which we can pilfer for the trampolines.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20231215092707.566977112@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9d84c376851a..db46b3359bf5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1060,6 +1060,17 @@ struct btf_func_model {
  */
 #define BPF_TRAMP_F_TAIL_CALL_CTX	BIT(7)
 
+/*
+ * Indicate the trampoline should be suitable to receive indirect calls;
+ * without this indirectly calling the generated code can result in #UD/#CP,
+ * depending on the CFI options.
+ *
+ * Used by bpf_struct_ops.
+ *
+ * Incompatible with FENTRY usage, overloads @func_addr argument.
+ */
+#define BPF_TRAMP_F_INDIRECT		BIT(8)
+
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.
  */
@@ -1697,6 +1708,7 @@ struct bpf_struct_ops {
 	struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
 	u32 type_id;
 	u32 value_id;
+	void *cfi_stubs;
 };
 
 #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
@@ -1710,6 +1722,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 				      struct bpf_tramp_link *link,
 				      const struct btf_func_model *model,
+				      void *stub_func,
 				      void *image, void *image_end);
 static inline bool bpf_try_module_get(const void *data, struct module *owner)
 {
-- 
cgit v1.2.3


From e9d13b9d2f99ccf7afeab490d97eaa5ac9846598 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Dec 2023 10:12:21 +0100
Subject: cfi: Add CFI_NOSEAL()

Add a CFI_NOSEAL() helper to mark functions that need to retain their
CFI information, despite not otherwise leaking their address.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20231215092707.669401084@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/cfi.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index 1ed2d96c0cfc..f0df518e11dd 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -46,4 +46,8 @@ static inline void module_cfi_finalize(const Elf_Ehdr *hdr,
 #endif /* CONFIG_ARCH_USES_CFI_TRAPS */
 #endif /* CONFIG_MODULES */
 
+#ifndef CFI_NOSEAL
+#define CFI_NOSEAL(x)
+#endif
+
 #endif /* _LINUX_CFI_H */
-- 
cgit v1.2.3


From 852486b35f344887786d63250946dd921a05d7e8 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Date: Fri, 15 Dec 2023 10:12:23 +0100
Subject: x86/cfi,bpf: Fix bpf_exception_cb() signature

As per the earlier patches, BPF sub-programs have bpf_callback_t
signature and CFI expects callers to have matching signature. This is
violated by bpf_prog_aux::bpf_exception_cb().

[peterz: Changelog]
Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/CAADnVQ+Z7UcXXBBhMubhcMM=R-dExk-uHtfOLtoLxQ1XxEpqEA@mail.gmail.com
Link: https://lore.kernel.org/r/20231215092707.910319166@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index db46b3359bf5..5e694934cf37 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1484,7 +1484,7 @@ struct bpf_prog_aux {
 	int cgroup_atype; /* enum cgroup_bpf_attach_type */
 	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
-	unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp);
+	u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64);
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
-- 
cgit v1.2.3


From 117211aa739a926e6555cfea883be84bee6f1695 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sat, 16 Dec 2023 00:05:02 +0100
Subject: bpf: Add missing BPF_LINK_TYPE invocations

Pengfei Xu reported [1] Syzkaller/KASAN issue found in bpf_link_show_fdinfo.

The reason is missing BPF_LINK_TYPE invocation for uprobe multi
link and for several other links, adding that.

[1] https://lore.kernel.org/bpf/ZXptoKRSLspnk2ie@xpf.sh.intel.com/

Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link")
Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support")
Fixes: 84601d6ee68a ("bpf: add bpf_link support for BPF_NETFILTER programs")
Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device")
Reported-by: Pengfei Xu <pengfei.xu@intel.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Pengfei Xu <pengfei.xu@intel.com>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20231215230502.2769743-1-jolsa@kernel.org
---
 include/linux/bpf_types.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fc0d6f32c687..94baced5a1ad 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -142,9 +142,13 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
 #ifdef CONFIG_NET
 BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
 BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp)
+BPF_LINK_TYPE(BPF_LINK_TYPE_NETFILTER, netfilter)
+BPF_LINK_TYPE(BPF_LINK_TYPE_TCX, tcx)
+BPF_LINK_TYPE(BPF_LINK_TYPE_NETKIT, netkit)
 #endif
 #ifdef CONFIG_PERF_EVENTS
 BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf)
 #endif
 BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi)
 BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops)
+BPF_LINK_TYPE(BPF_LINK_TYPE_UPROBE_MULTI, uprobe_multi)
-- 
cgit v1.2.3


From bb339db4d363c84e0a8d70827df591397ccd7312 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Fri, 15 Dec 2023 17:56:48 +0000
Subject: arm: perf: Fix ARCH=arm build with GCC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LLVM ignores everything inside the if statement and doesn't generate
errors, but GCC doesn't ignore it, resulting in the following error:

  drivers/perf/arm_pmuv3.c: In function ‘armv8pmu_write_evtype’:
  include/linux/bits.h:34:29: error: left shift count >= width of type [-Werror=shift-count-overflow]
  34 |         (((~UL(0)) - (UL(1) << (l)) + 1) & \

Fix it by using GENMASK_ULL which doesn't overflow on arm32 (even though
the value is never used there).

Fixes: 3115ee021bfb ("arm64: perf: Include threshold control fields in PMEVTYPER mask")
Reported-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Closes: https://lore.kernel.org/linux-arm-kernel/20231215120817.h2f3akgv72zhrtqo@pengutronix.de/
Signed-off-by: James Clark <james.clark@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20231215175648.3397170-2-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmuv3.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 0f4d62ef3a9a..46377e134d67 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -234,8 +234,8 @@
  * PMXEVTYPER: Event selection reg
  */
 #define ARMV8_PMU_EVTYPE_EVENT	GENMASK(15, 0)	/* Mask for EVENT bits */
-#define ARMV8_PMU_EVTYPE_TH	GENMASK(43, 32)
-#define ARMV8_PMU_EVTYPE_TC	GENMASK(63, 61)
+#define ARMV8_PMU_EVTYPE_TH	GENMASK_ULL(43, 32) /* arm64 only */
+#define ARMV8_PMU_EVTYPE_TC	GENMASK_ULL(63, 61) /* arm64 only */
 
 /*
  * Event filters for PMUv3
-- 
cgit v1.2.3


From ebb30ccbbdbd6fae5177b676da4f4ac92bb4f635 Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Fri, 15 Dec 2023 14:15:31 +0100
Subject: net: phy: make addr type u8 in phy_package_shared struct

Switch addr type in phy_package_shared struct to u8.

The value is already checked to be non negative and to be less than
PHY_MAX_ADDR, hence u8 is better suited than using int.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index dbb5e13e3e1b..4b13cc85c4f5 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -338,7 +338,7 @@ struct mdio_bus_stats {
  * phy_package_leave().
  */
 struct phy_package_shared {
-	int addr;
+	u8 addr;
 	refcount_t refcnt;
 	unsigned long flags;
 	size_t priv_size;
-- 
cgit v1.2.3


From 9eea577eb1155fe4a183bc5e7bf269b0b2e7a6ba Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Fri, 15 Dec 2023 14:15:32 +0100
Subject: net: phy: extend PHY package API to support multiple global address

Current API for PHY package are limited to single address to configure
global settings for the PHY package.

It was found that some PHY package (for example the qca807x, a PHY
package that is shipped with a bundle of 5 PHY) requires multiple PHY
address to configure global settings. An example scenario is a PHY that
have a dedicated PHY for PSGMII/serdes calibrarion and have a specific
PHY in the package where the global PHY mode is set and affects every
other PHY in the package.

Change the API in the following way:
- Change phy_package_join() to take the base addr of the PHY package
  instead of the global PHY addr.
- Make __/phy_package_write/read() require an additional arg that
  select what global PHY address to use by passing the offset from the
  base addr passed on phy_package_join().

Each user of this API is updated to follow this new implementation
following a pattern where an enum is defined to declare the offset of the
addr.

We also drop the check if shared is defined as any user of the
phy_package_read/write is expected to use phy_package_join first. Misuse
of this will correctly trigger a kernel panic for NULL pointer
exception.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 64 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 4b13cc85c4f5..d653f660c39d 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -327,7 +327,8 @@ struct mdio_bus_stats {
 
 /**
  * struct phy_package_shared - Shared information in PHY packages
- * @addr: Common PHY address used to combine PHYs in one package
+ * @base_addr: Base PHY address of PHY package used to combine PHYs
+ *   in one package and for offset calculation of phy_package_read/write
  * @refcnt: Number of PHYs connected to this shared data
  * @flags: Initialization of PHY package
  * @priv_size: Size of the shared private data @priv
@@ -338,7 +339,7 @@ struct mdio_bus_stats {
  * phy_package_leave().
  */
 struct phy_package_shared {
-	u8 addr;
+	u8 base_addr;
 	refcount_t refcnt;
 	unsigned long flags;
 	size_t priv_size;
@@ -1976,10 +1977,10 @@ int phy_ethtool_get_link_ksettings(struct net_device *ndev,
 int phy_ethtool_set_link_ksettings(struct net_device *ndev,
 				   const struct ethtool_link_ksettings *cmd);
 int phy_ethtool_nway_reset(struct net_device *ndev);
-int phy_package_join(struct phy_device *phydev, int addr, size_t priv_size);
+int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size);
 void phy_package_leave(struct phy_device *phydev);
 int devm_phy_package_join(struct device *dev, struct phy_device *phydev,
-			  int addr, size_t priv_size);
+			  int base_addr, size_t priv_size);
 
 int __init mdio_bus_init(void);
 void mdio_bus_exit(void);
@@ -2002,46 +2003,65 @@ int __phy_hwtstamp_set(struct phy_device *phydev,
 		       struct kernel_hwtstamp_config *config,
 		       struct netlink_ext_ack *extack);
 
-static inline int phy_package_read(struct phy_device *phydev, u32 regnum)
+static inline int phy_package_address(struct phy_device *phydev,
+				      unsigned int addr_offset)
 {
 	struct phy_package_shared *shared = phydev->shared;
+	u8 base_addr = shared->base_addr;
 
-	if (!shared)
+	if (addr_offset >= PHY_MAX_ADDR - base_addr)
 		return -EIO;
 
-	return mdiobus_read(phydev->mdio.bus, shared->addr, regnum);
+	/* we know that addr will be in the range 0..31 and thus the
+	 * implicit cast to a signed int is not a problem.
+	 */
+	return base_addr + addr_offset;
 }
 
-static inline int __phy_package_read(struct phy_device *phydev, u32 regnum)
+static inline int phy_package_read(struct phy_device *phydev,
+				   unsigned int addr_offset, u32 regnum)
 {
-	struct phy_package_shared *shared = phydev->shared;
+	int addr = phy_package_address(phydev, addr_offset);
 
-	if (!shared)
-		return -EIO;
+	if (addr < 0)
+		return addr;
+
+	return mdiobus_read(phydev->mdio.bus, addr, regnum);
+}
+
+static inline int __phy_package_read(struct phy_device *phydev,
+				     unsigned int addr_offset, u32 regnum)
+{
+	int addr = phy_package_address(phydev, addr_offset);
+
+	if (addr < 0)
+		return addr;
 
-	return __mdiobus_read(phydev->mdio.bus, shared->addr, regnum);
+	return __mdiobus_read(phydev->mdio.bus, addr, regnum);
 }
 
 static inline int phy_package_write(struct phy_device *phydev,
-				    u32 regnum, u16 val)
+				    unsigned int addr_offset, u32 regnum,
+				    u16 val)
 {
-	struct phy_package_shared *shared = phydev->shared;
+	int addr = phy_package_address(phydev, addr_offset);
 
-	if (!shared)
-		return -EIO;
+	if (addr < 0)
+		return addr;
 
-	return mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val);
+	return mdiobus_write(phydev->mdio.bus, addr, regnum, val);
 }
 
 static inline int __phy_package_write(struct phy_device *phydev,
-				      u32 regnum, u16 val)
+				      unsigned int addr_offset, u32 regnum,
+				      u16 val)
 {
-	struct phy_package_shared *shared = phydev->shared;
+	int addr = phy_package_address(phydev, addr_offset);
 
-	if (!shared)
-		return -EIO;
+	if (addr < 0)
+		return addr;
 
-	return __mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val);
+	return __mdiobus_write(phydev->mdio.bus, addr, regnum, val);
 }
 
 static inline bool __phy_package_set_once(struct phy_device *phydev,
-- 
cgit v1.2.3


From d63710fc0f1a501fd75a7025e3070a96ffa1645f Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Fri, 15 Dec 2023 14:15:34 +0100
Subject: net: phy: add support for PHY package MMD read/write

Some PHY in PHY package may require to read/write MMD regs to correctly
configure the PHY package.

Add support for these additional required function in both lock and no
lock variant.

It's assumed that the entire PHY package is either C22 or C45. We use
C22 or C45 way of writing/reading to mmd regs based on the passed phydev
whether it's C22 or C45.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index d653f660c39d..e9e85d347587 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -2064,6 +2064,22 @@ static inline int __phy_package_write(struct phy_device *phydev,
 	return __mdiobus_write(phydev->mdio.bus, addr, regnum, val);
 }
 
+int __phy_package_read_mmd(struct phy_device *phydev,
+			   unsigned int addr_offset, int devad,
+			   u32 regnum);
+
+int phy_package_read_mmd(struct phy_device *phydev,
+			 unsigned int addr_offset, int devad,
+			 u32 regnum);
+
+int __phy_package_write_mmd(struct phy_device *phydev,
+			    unsigned int addr_offset, int devad,
+			    u32 regnum, u16 val);
+
+int phy_package_write_mmd(struct phy_device *phydev,
+			  unsigned int addr_offset, int devad,
+			  u32 regnum, u16 val);
+
 static inline bool __phy_package_set_once(struct phy_device *phydev,
 					  unsigned int b)
 {
-- 
cgit v1.2.3


From 120931db07b49252aba2073096b595482d71857c Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 27 Nov 2023 23:36:52 -0600
Subject: rtc: Add support for configuring the UIP timeout for RTC reads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The UIP timeout is hardcoded to 10ms for all RTC reads, but in some
contexts this might not be enough time. Add a timeout parameter to
mc146818_get_time() and mc146818_get_time_callback().

If UIP timeout is configured by caller to be >=100 ms and a call
takes this long, log a warning.

Make all callers use 10ms to ensure no functional changes.

Cc:  <stable@vger.kernel.org> # 6.1.y
Fixes: ec5895c0f2d8 ("rtc: mc146818-lib: extract mc146818_avoid_UIP")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Tested-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
Reviewed-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
Acked-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
Link: https://lore.kernel.org/r/20231128053653.101798-4-mario.limonciello@amd.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/mc146818rtc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h
index b0da04fe087b..34dfcc77f505 100644
--- a/include/linux/mc146818rtc.h
+++ b/include/linux/mc146818rtc.h
@@ -126,10 +126,11 @@ struct cmos_rtc_board_info {
 #endif /* ARCH_RTC_LOCATION */
 
 bool mc146818_does_rtc_work(void);
-int mc146818_get_time(struct rtc_time *time);
+int mc146818_get_time(struct rtc_time *time, int timeout);
 int mc146818_set_time(struct rtc_time *time);
 
 bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param),
+			int timeout,
 			void *param);
 
 #endif /* _MC146818RTC_H */
-- 
cgit v1.2.3


From 32da0f00ddcb101730cf242289b2b10ede0e1156 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Fri, 15 Dec 2023 14:57:10 -0300
Subject: net: rtnl: introduce rcu_replace_pointer_rtnl

Introduce the rcu_replace_pointer_rtnl helper to lockdep check rtnl lock
rcu replacements, alongside the already existing helpers.

This is a quality of life helper so instead of using:
   rcu_replace_pointer(rp, p, lockdep_rtnl_is_held())
   .. or the open coded..
   rtnl_dereference() / rcu_assign_pointer()
   .. or the lazy check version ..
   rcu_replace_pointer(rp, p, 1)
Use:
   rcu_replace_pointer_rtnl(rp, p)

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 6a8543b34e2c..410529fca18b 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -79,6 +79,18 @@ static inline bool lockdep_rtnl_is_held(void)
 #define rtnl_dereference(p)					\
 	rcu_dereference_protected(p, lockdep_rtnl_is_held())
 
+/**
+ * rcu_replace_pointer_rtnl - replace an RCU pointer under rtnl_lock, returning
+ * its old value
+ * @rp: RCU pointer, whose value is returned
+ * @p: regular pointer
+ *
+ * Perform a replacement under rtnl_lock, where @rp is an RCU-annotated
+ * pointer. The old value of @rp is returned, and @rp is set to @p
+ */
+#define rcu_replace_pointer_rtnl(rp, p)			\
+	rcu_replace_pointer(rp, p, lockdep_rtnl_is_held())
+
 static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
 {
 	return rtnl_dereference(dev->ingress_queue);
-- 
cgit v1.2.3


From b2adbc9cea752539f6421e9d4642408f666c1251 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alvin=20=C5=A0ipraga?= <alsi@bang-olufsen.dk>
Date: Fri, 24 Nov 2023 14:17:44 +0100
Subject: clk: si5351: allow PLLs to be adjusted without reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a new PLL reset mode flag which controls whether or not to
reset a PLL after adjusting its rate. The mode can be configured through
platform data or device tree.

Since commit 6dc669a22c77 ("clk: si5351: Add PLL soft reset"), the
driver unconditionally resets a PLL whenever its rate is adjusted.
The rationale was that a PLL reset was required to get three outputs
working at the same time. Before this change, the driver never reset the
PLLs.

Commit b26ff127c52c ("clk: si5351: Apply PLL soft reset before enabling
the outputs") subsequently introduced an option to reset the PLL when
enabling a clock output that sourced it. Here, the rationale was that
this is required to get a deterministic phase relationship between
multiple output clocks.

This clearly shows that it is useful to reset the PLLs in applications
where multiple clock outputs are used. However, the Si5351 also allows
for glitch-free rate adjustment of its PLLs if one avoids resetting the
PLL. In our audio application where a single Si5351 clock output is used
to supply a runtime adjustable bit clock, this unconditional PLL reset
behaviour introduces unwanted glitches in the clock output.

It would appear that the problem being solved in the former commit
may be solved by using the optional device tree property introduced in
the latter commit, obviating the need for an unconditional PLL reset
after rate adjustment. But it's not OK to break the default behaviour of
the driver, and it cannot be assumed that all device trees are using the
property introduced in the latter commit. Hence, the new behaviour is
made opt-in.

Cc: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Cc: Rabeeh Khoury <rabeeh@solid-run.com>
Cc: Jacob Siverskog <jacob@teenage.engineering>
Cc: Sergej Sawazki <sergej@taudac.com>
Signed-off-by: Alvin Šipraga <alsi@bang-olufsen.dk>
Acked-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Link: https://lore.kernel.org/r/20231124-alvin-clk-si5351-no-pll-reset-v6-3-69b82311cb90@bang-olufsen.dk
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/platform_data/si5351.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/si5351.h b/include/linux/platform_data/si5351.h
index c71a2dd66143..5f412a615532 100644
--- a/include/linux/platform_data/si5351.h
+++ b/include/linux/platform_data/si5351.h
@@ -105,10 +105,12 @@ struct si5351_clkout_config {
  * @clk_xtal: xtal input clock
  * @clk_clkin: clkin input clock
  * @pll_src: array of pll source clock setting
+ * @pll_reset: array indicating if plls should be reset after setting the rate
  * @clkout: array of clkout configuration
  */
 struct si5351_platform_data {
 	enum si5351_pll_src pll_src[2];
+	bool pll_reset[2];
 	struct si5351_clkout_config clkout[8];
 };
 
-- 
cgit v1.2.3


From 5a2a2cda916335fff4d804e58f36b2305926841e Mon Sep 17 00:00:00 2001
From: Wang Jinchao <wangjinchao@xfusion.com>
Date: Mon, 18 Dec 2023 15:16:16 +0800
Subject: gpiolib: remove duplicate inclusions

Remove second `#include <linux/err.h>`. Remove `#include <asm/errno.h>`
too as it's included by `err.h`.

Signed-off-by: Wang Jinchao <wangjinchao@xfusion.com>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio/driver.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index bd9bea7cb270..e846bd4e7559 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -722,7 +722,6 @@ int gpiochip_irqchip_add_domain(struct gpio_chip *gc,
 #else
 
 #include <asm/bug.h>
-#include <asm/errno.h>
 
 static inline int gpiochip_irqchip_add_domain(struct gpio_chip *gc,
 					      struct irq_domain *domain)
@@ -811,8 +810,6 @@ const char *gpio_device_get_label(struct gpio_device *gdev);
 
 #else /* CONFIG_GPIOLIB */
 
-#include <linux/err.h>
-
 #include <asm/bug.h>
 
 static inline struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc)
-- 
cgit v1.2.3


From 58ff9c5acb4aef58e118bbf39736cc4d6c11a3d3 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 22 Nov 2023 15:03:51 +0900
Subject: PCI: Rename PCI_IRQ_LEGACY to PCI_IRQ_INTX

Rename PCI_IRQ_LEGACY to PCI_IRQ_INTX to be more explicit about the type
of IRQ being referenced as well as to match the PCI specifications
terms. Redefine PCI_IRQ_LEGACY as an alias to PCI_IRQ_INTX to avoid the
need for doing the renaming tree-wide. New drivers and new code should
now prefer using PCI_IRQ_INTX instead of PCI_IRQ_LEGACY.

Link: https://lore.kernel.org/r/20231122060406.14695-2-dlemoal@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/pci.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60ca768bc867..9ab4b46c0d19 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1073,11 +1073,13 @@ enum {
 	PCI_SCAN_ALL_PCIE_DEVS	= 0x00000040,	/* Scan all, not just dev 0 */
 };
 
-#define PCI_IRQ_LEGACY		(1 << 0) /* Allow legacy interrupts */
+#define PCI_IRQ_INTX		(1 << 0) /* Allow INTx interrupts */
 #define PCI_IRQ_MSI		(1 << 1) /* Allow MSI interrupts */
 #define PCI_IRQ_MSIX		(1 << 2) /* Allow MSI-X interrupts */
 #define PCI_IRQ_AFFINITY	(1 << 3) /* Auto-assign affinity */
 
+#define PCI_IRQ_LEGACY		PCI_IRQ_INTX /* Deprecated! Use PCI_IRQ_INTX */
+
 /* These external functions are only available when PCI support is enabled */
 #ifdef CONFIG_PCI
 
-- 
cgit v1.2.3


From 74955cb8ccc38539f8c029336e07e6b43b6a942e Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 22 Nov 2023 15:03:52 +0900
Subject: PCI: endpoint: Drop PCI_EPC_IRQ_XXX definitions

linux/pci.h defines the IRQ flags PCI_IRQ_INTX, PCI_IRQ_MSI and
PCI_IRQ_MSIX. Let's use these flags directly instead of the endpoint
definitions provided by enum pci_epc_irq_type. This removes the need
for defining this enum type completely.

Link: https://lore.kernel.org/r/20231122060406.14695-3-dlemoal@kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/pci-epc.h | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 5cb694031072..f498f9aa2ab0 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -19,13 +19,6 @@ enum pci_epc_interface_type {
 	SECONDARY_INTERFACE,
 };
 
-enum pci_epc_irq_type {
-	PCI_EPC_IRQ_UNKNOWN,
-	PCI_EPC_IRQ_LEGACY,
-	PCI_EPC_IRQ_MSI,
-	PCI_EPC_IRQ_MSIX,
-};
-
 static inline const char *
 pci_epc_interface_string(enum pci_epc_interface_type type)
 {
@@ -79,7 +72,7 @@ struct pci_epc_ops {
 			    u16 interrupts, enum pci_barno, u32 offset);
 	int	(*get_msix)(struct pci_epc *epc, u8 func_no, u8 vfunc_no);
 	int	(*raise_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
-			     enum pci_epc_irq_type type, u16 interrupt_num);
+			     unsigned int type, u16 interrupt_num);
 	int	(*map_msi_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
 			       phys_addr_t phys_addr, u8 interrupt_num,
 			       u32 entry_size, u32 *msi_data,
@@ -229,7 +222,7 @@ int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
 			phys_addr_t phys_addr, u8 interrupt_num,
 			u32 entry_size, u32 *msi_data, u32 *msi_addr_offset);
 int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
-		      enum pci_epc_irq_type type, u16 interrupt_num);
+		      unsigned int type, u16 interrupt_num);
 int pci_epc_start(struct pci_epc *epc);
 void pci_epc_stop(struct pci_epc *epc);
 const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc,
-- 
cgit v1.2.3


From 3314f2097dee43defc20554f961a8b17f4787e2d Mon Sep 17 00:00:00 2001
From: Jesse Brandeburg <jesse.brandeburg@intel.com>
Date: Tue, 5 Dec 2023 17:01:01 -0800
Subject: intel: add bit macro includes where needed

This series is introducing the use of FIELD_GET and FIELD_PREP which
requires bitfield.h to be included. Fix all the includes in this one
change, and rearrange includes into alphabetical order to ease
readability and future maintenance.

virtchnl.h and it's usage was modified to have it's own includes as it
should. This required including bits.h for virtchnl.h.

Reviewed-by: Marcin Szycik <marcin.szycik@linux.intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index a44d9dc7e3eb..8e177b67e82f 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -5,6 +5,7 @@
 #define _VIRTCHNL_H_
 
 #include <linux/bitops.h>
+#include <linux/bits.h>
 #include <linux/overflow.h>
 #include <uapi/linux/if_ether.h>
 
-- 
cgit v1.2.3


From d81f0d7b8b23ec79f80be602ed6129ded27862e8 Mon Sep 17 00:00:00 2001
From: Rae Moar <rmoar@google.com>
Date: Wed, 13 Dec 2023 19:44:17 +0000
Subject: kunit: add KUNIT_INIT_TABLE to init linker section

Add KUNIT_INIT_TABLE to the INIT_DATA linker section.

Alter the KUnit macros to create init tests:
kunit_test_init_section_suites

Update lib/kunit/executor.c to run both the suites in KUNIT_TABLE and
KUNIT_INIT_TABLE.

Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Rae Moar <rmoar@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/linux/module.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index a98e188cf37b..9cd0009bd050 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -540,6 +540,8 @@ struct module {
 	struct static_call_site *static_call_sites;
 #endif
 #if IS_ENABLED(CONFIG_KUNIT)
+	int num_kunit_init_suites;
+	struct kunit_suite **kunit_init_suites;
 	int num_kunit_suites;
 	struct kunit_suite **kunit_suites;
 #endif
-- 
cgit v1.2.3


From 86362293044b382aece355f9e4e3f7116dcd1eae Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Sat, 22 Jul 2023 16:08:44 -0700
Subject: PCI: endpoint: Make struct pci_epf_ops in pci_epf_driver const
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pci_epf_ops struct contains a set of callbacks that are used by the
pci_epf_driver, and is never modified by the EPF core itself.

Marking the struct pointer const allows EPF drivers to declare their
pci_epf_ops struct to be const.

This allows the struct to be placed in the read-only section. Which
for example brings some security benefits as the callbacks can not be
overwritten.

[kwilczynski: commit log]
Link: https://lore.kernel.org/linux-pci/20230722230848.589428-1-lars@metafoo.de
Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
---
 include/linux/pci-epf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 3f44b6aec477..764dc62b3acc 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -98,7 +98,7 @@ struct pci_epf_driver {
 	void	(*remove)(struct pci_epf *epf);
 
 	struct device_driver	driver;
-	struct pci_epf_ops	*ops;
+	const struct pci_epf_ops *ops;
 	struct module		*owner;
 	struct list_head	epf_group;
 	const struct pci_epf_device_id	*id_table;
-- 
cgit v1.2.3


From d23569979ca1cd139a42c410e0c7b9e6014c3b3a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 13 Dec 2023 09:37:01 -0500
Subject: tracing: Allow creating instances with specified system events

A trace instance may only need to enable specific events. As the eventfs
directory of an instance currently creates all events which adds overhead,
allow internal instances to be created with just the events in systems
that they care about. This currently only deals with systems and not
individual events, but this should bring down the overhead of creating
instances for specific use cases quite bit.

The trace_array_get_by_name() now has another parameter "systems". This
parameter is a const string pointer of a comma/space separated list of
event systems that should be created by the trace_array. (Note if the
trace_array already exists, this parameter is ignored).

The list of systems is saved and if a module is loaded, its events will
not be added unless the system for those events also match the systems
string.

Link: https://lore.kernel.org/linux-trace-kernel/20231213093701.03fddec0@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Sean Paul <seanpaul@chromium.org>
Cc: Arun Easi   <aeasi@marvell.com>
Cc: Daniel Wagner <dwagner@suse.de>
Tested-by: Dmytro Maluka <dmaluka@chromium.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace.h b/include/linux/trace.h
index 2a70a447184c..fdcd76b7be83 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -51,7 +51,7 @@ int trace_array_printk(struct trace_array *tr, unsigned long ip,
 		       const char *fmt, ...);
 int trace_array_init_printk(struct trace_array *tr);
 void trace_array_put(struct trace_array *tr);
-struct trace_array *trace_array_get_by_name(const char *name);
+struct trace_array *trace_array_get_by_name(const char *name, const char *systems);
 int trace_array_destroy(struct trace_array *tr);
 
 /* For osnoise tracer */
@@ -84,7 +84,7 @@ static inline int trace_array_init_printk(struct trace_array *tr)
 static inline void trace_array_put(struct trace_array *tr)
 {
 }
-static inline struct trace_array *trace_array_get_by_name(const char *name)
+static inline struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From 8ec90be7f15fac42992ea821be929d3b06cd0fd9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 12 Dec 2023 13:19:01 -0500
Subject: tracing: Allow for max buffer data size trace_marker writes

Allow a trace write to be as big as the ring buffer tracing data will
allow. Currently, it only allows writes of 1KB in size, but there's no
reason that it cannot allow what the ring buffer can hold.

Link: https://lore.kernel.org/linux-trace-kernel/20231212131901.5f501e72@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 782e14f62201..b1b03b2c0f08 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -141,6 +141,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
 bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter);
 
 unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu);
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer);
 
 void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu);
 void ring_buffer_reset_online_cpus(struct trace_buffer *buffer);
-- 
cgit v1.2.3


From 40fc60e36c60ba85b2974e507b67df40c94e9578 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Sat, 9 Dec 2023 17:52:20 -0500
Subject: trace_seq: Increase the buffer size to almost two pages

Now that trace_marker can hold more than 1KB string, and can write as much
as the ring buffer can hold, the trace_seq is not big enough to hold
writes:

 ~# a="1234567890"
 ~# cnt=4080
 ~# s=""
 ~# while [ $cnt -gt 10 ]; do
 ~#	s="${s}${a}"
 ~#	cnt=$((cnt-10))
 ~# done
 ~# echo $s > trace_marker
 ~# cat trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 2/2   #P:8
 #
 #                                _-----=> irqs-off/BH-disabled
 #                               / _----=> need-resched
 #                              | / _---=> hardirq/softirq
 #                              || / _--=> preempt-depth
 #                              ||| / _-=> migrate-disable
 #                              |||| /     delay
 #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
 #              | |         |   |||||     |         |
            <...>-860     [002] .....   105.543465: tracing_mark_write[LINE TOO BIG]
            <...>-860     [002] .....   105.543496: tracing_mark_write: 789012345678901234567890

By increasing the trace_seq buffer to almost two pages, it can now print
out the first line.

This also subtracts the rest of the trace_seq fields from the buffer, so
that the entire trace_seq is now PAGE_SIZE aligned.

Link: https://lore.kernel.org/linux-trace-kernel/20231209175220.19867af4@gandalf.local.home

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_seq.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 3691e0e76a1a..9ec229dfddaa 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -8,11 +8,14 @@
 
 /*
  * Trace sequences are used to allow a function to call several other functions
- * to create a string of data to use (up to a max of PAGE_SIZE).
+ * to create a string of data to use.
  */
 
+#define TRACE_SEQ_BUFFER_SIZE	(PAGE_SIZE * 2 - \
+	(sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int)))
+
 struct trace_seq {
-	char			buffer[PAGE_SIZE];
+	char			buffer[TRACE_SEQ_BUFFER_SIZE];
 	struct seq_buf		seq;
 	size_t			readpos;
 	int			full;
@@ -21,7 +24,7 @@ struct trace_seq {
 static inline void
 trace_seq_init(struct trace_seq *s)
 {
-	seq_buf_init(&s->seq, s->buffer, PAGE_SIZE);
+	seq_buf_init(&s->seq, s->buffer, TRACE_SEQ_BUFFER_SIZE);
 	s->full = 0;
 	s->readpos = 0;
 }
-- 
cgit v1.2.3


From b9670ee2e975e1cb6751019d5dc5c193aecd8ba2 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Sun, 17 Dec 2023 12:07:02 -0800
Subject: Revert "iio: hid-sensor-als: Add light chromaticity support"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit ee3710f39f9d0ae5137a866138d005fe1ad18132.

This commit assumes that every HID descriptor for ALS sensor has
presence of usage id ID HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X and
HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y. When the above usage ids are
absent,  driver probe fails. This breaks ALS sensor functionality on
many platforms.

Till we have a good solution, revert this commit.

Reported-by: Thomas Weißschuh <thomas@t-8ch.de>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218223
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc:  <stable@vger.kernel.org>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/20231217200703.719876-2-srinivas.pandruvada@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/hid-sensor-ids.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index 6730ee900ee1..8af4fb3e0254 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -22,9 +22,6 @@
 #define HID_USAGE_SENSOR_DATA_LIGHT				0x2004d0
 #define HID_USAGE_SENSOR_LIGHT_ILLUM				0x2004d1
 #define HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE		0x2004d2
-#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY			0x2004d3
-#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X			0x2004d4
-#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y			0x2004d5
 
 /* PROX (200011) */
 #define HID_USAGE_SENSOR_PROX                                   0x200011
-- 
cgit v1.2.3


From d4005431673929a1259ad791db87408fcf85d2cc Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Sun, 17 Dec 2023 12:07:03 -0800
Subject: Revert "iio: hid-sensor-als: Add light color temperature support"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 5f05285df691b1e82108eead7165feae238c95ef.

This commit assumes that every HID descriptor for ALS sensor has
presence of usage id ID HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE.
When the above usage id is absent,  driver probe fails. This breaks
ALS sensor functionality on many platforms.

Till we have a good solution, revert this commit.

Reported-by: Thomas Weißschuh <thomas@t-8ch.de>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218223
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc:  <stable@vger.kernel.org>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/20231217200703.719876-3-srinivas.pandruvada@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/hid-sensor-ids.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index 8af4fb3e0254..13b1e65fbdcc 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -21,7 +21,6 @@
 #define HID_USAGE_SENSOR_ALS					0x200041
 #define HID_USAGE_SENSOR_DATA_LIGHT				0x2004d0
 #define HID_USAGE_SENSOR_LIGHT_ILLUM				0x2004d1
-#define HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE		0x2004d2
 
 /* PROX (200011) */
 #define HID_USAGE_SENSOR_PROX                                   0x200011
-- 
cgit v1.2.3


From 7259eb7b534735b9c1153654c0bb4c5f059c0dd3 Mon Sep 17 00:00:00 2001
From: Moti Haimovski <mhaimovski@habana.ai>
Date: Sun, 12 Nov 2023 18:07:10 +0200
Subject: accel/habanalabs/gaudi2: add signed dev info uAPI

User will provide a nonce via the INFO ioctl, and will retrieve
the signed device info generated using given nonce.

Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 include/linux/habanalabs/cpucp_if.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
index 86ea7c63a0d2..f316c8d0f3fc 100644
--- a/include/linux/habanalabs/cpucp_if.h
+++ b/include/linux/habanalabs/cpucp_if.h
@@ -659,6 +659,12 @@ enum pq_init_status {
  *       number (nonce) provided by the host to prevent replay attacks.
  *       public key and certificate also provided as part of the FW response.
  *
+ * CPUCP_PACKET_INFO_SIGNED_GET -
+ *       Get the device information signed by the Trusted Platform device.
+ *       device info data is also hashed with some unique number (nonce) provided
+ *       by the host to prevent replay attacks. public key and certificate also
+ *       provided as part of the FW response.
+ *
  * CPUCP_PACKET_MONITOR_DUMP_GET -
  *       Get monitors registers dump from the CpuCP kernel.
  *       The CPU will put the registers dump in the a buffer allocated by the driver
@@ -733,7 +739,7 @@ enum cpucp_packet_id {
 	CPUCP_PACKET_ENGINE_CORE_ASID_SET,	/* internal */
 	CPUCP_PACKET_RESERVED2,			/* not used */
 	CPUCP_PACKET_SEC_ATTEST_GET,		/* internal */
-	CPUCP_PACKET_RESERVED3,			/* not used */
+	CPUCP_PACKET_INFO_SIGNED_GET,		/* internal */
 	CPUCP_PACKET_RESERVED4,			/* not used */
 	CPUCP_PACKET_MONITOR_DUMP_GET,		/* debugfs */
 	CPUCP_PACKET_RESERVED5,			/* not used */
-- 
cgit v1.2.3


From 403863e985e8eba608d53b2907caaf37b6176290 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Sat, 16 Dec 2023 13:29:58 +0100
Subject: netlink: introduce typedef for filter function

Make the code using filter function a bit nicer by consolidating the
filter function arguments using typedef.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/connector.h | 3 +--
 include/linux/netlink.h   | 6 ++++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/connector.h b/include/linux/connector.h
index cec2d99ae902..70bc1160f3d8 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -100,8 +100,7 @@ void cn_del_callback(const struct cb_id *id);
  */
 int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid,
 			 u32 group, gfp_t gfp_mask,
-			 int (*filter)(struct sock *dsk, struct sk_buff *skb,
-				       void *data),
+			 netlink_filter_fn filter,
 			 void *filter_data);
 
 /**
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index abe91ed6b9aa..1a4445bf2ab9 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -228,10 +228,12 @@ bool netlink_strict_get_check(struct sk_buff *skb);
 int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
 int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
 		      __u32 group, gfp_t allocation);
+
+typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data);
+
 int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb,
 			       __u32 portid, __u32 group, gfp_t allocation,
-			       int (*filter)(struct sock *dsk,
-					     struct sk_buff *skb, void *data),
+			       netlink_filter_fn filter,
 			       void *filter_data);
 int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code);
 int netlink_register_notifier(struct notifier_block *nb);
-- 
cgit v1.2.3


From 67ba055dd7758c34f6e64c9d35132362c1e1f0b5 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 19 Dec 2023 17:40:12 +0200
Subject: regulator: Reuse LINEAR_RANGE() in REGULATOR_LINEAR_RANGE()

REGULATOR_LINEAR_RANGE() repeats what LINEAR_RANGE() provides.
Deduplicate the former by using the latter. No functional change
intended.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://msgid.link/r/20231219154012.2478688-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 4b7eceb3828b..22a07c0900a4 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -51,12 +51,7 @@ enum regulator_detection_severity {
 
 /* Initialize struct linear_range for regulators */
 #define REGULATOR_LINEAR_RANGE(_min_uV, _min_sel, _max_sel, _step_uV)	\
-{									\
-	.min		= _min_uV,					\
-	.min_sel	= _min_sel,					\
-	.max_sel	= _max_sel,					\
-	.step		= _step_uV,					\
-}
+	LINEAR_RANGE(_min_uV, _min_sel, _max_sel, _step_uV)
 
 /**
  * struct regulator_ops - regulator operations.
-- 
cgit v1.2.3


From d17aff807f845cf93926c28705216639c7279110 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 19 Dec 2023 07:37:35 -0800
Subject: Revert BPF token-related functionality

This patch includes the following revert (one  conflicting BPF FS
patch and three token patch sets, represented by merge commits):
  - revert 0f5d5454c723 "Merge branch 'bpf-fs-mount-options-parsing-follow-ups'";
  - revert 750e785796bb "bpf: Support uid and gid when mounting bpffs";
  - revert 733763285acf "Merge branch 'bpf-token-support-in-libbpf-s-bpf-object'";
  - revert c35919dcce28 "Merge branch 'bpf-token-and-bpf-fs-based-delegation'".

Link: https://lore.kernel.org/bpf/CAHk-=wg7JuFYwGy=GOMbRCtOL+jwSQsdUaBsRWkDVYbxipbM5A@mail.gmail.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf.h           | 85 +++++--------------------------------------
 include/linux/filter.h        |  2 +-
 include/linux/lsm_hook_defs.h | 15 ++------
 include/linux/security.h      | 43 ++++------------------
 4 files changed, 22 insertions(+), 123 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2f54cc0436c4..7a8d4c81a39a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -52,10 +52,6 @@ struct module;
 struct bpf_func_state;
 struct ftrace_ops;
 struct cgroup;
-struct bpf_token;
-struct user_namespace;
-struct super_block;
-struct inode;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -1488,7 +1484,6 @@ struct bpf_prog_aux {
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
-	struct bpf_token *token;
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
 	struct bpf_func_info *func_info;
@@ -1613,31 +1608,6 @@ struct bpf_link_primer {
 	u32 id;
 };
 
-struct bpf_mount_opts {
-	kuid_t uid;
-	kgid_t gid;
-	umode_t mode;
-
-	/* BPF token-related delegation options */
-	u64 delegate_cmds;
-	u64 delegate_maps;
-	u64 delegate_progs;
-	u64 delegate_attachs;
-};
-
-struct bpf_token {
-	struct work_struct work;
-	atomic64_t refcnt;
-	struct user_namespace *userns;
-	u64 allowed_cmds;
-	u64 allowed_maps;
-	u64 allowed_progs;
-	u64 allowed_attachs;
-#ifdef CONFIG_SECURITY
-	void *security;
-#endif
-};
-
 struct bpf_struct_ops_value;
 struct btf_member;
 
@@ -2097,7 +2067,6 @@ static inline void bpf_enable_instrumentation(void)
 	migrate_enable();
 }
 
-extern const struct super_operations bpf_super_ops;
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 extern const struct file_operations bpf_iter_fops;
@@ -2232,26 +2201,24 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map)
 
 extern int sysctl_unprivileged_bpf_disabled;
 
-bool bpf_token_capable(const struct bpf_token *token, int cap);
-
-static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token)
+static inline bool bpf_allow_ptr_leaks(void)
 {
-	return bpf_token_capable(token, CAP_PERFMON);
+	return perfmon_capable();
 }
 
-static inline bool bpf_allow_uninit_stack(const struct bpf_token *token)
+static inline bool bpf_allow_uninit_stack(void)
 {
-	return bpf_token_capable(token, CAP_PERFMON);
+	return perfmon_capable();
 }
 
-static inline bool bpf_bypass_spec_v1(const struct bpf_token *token)
+static inline bool bpf_bypass_spec_v1(void)
 {
-	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
+	return cpu_mitigations_off() || perfmon_capable();
 }
 
-static inline bool bpf_bypass_spec_v4(const struct bpf_token *token)
+static inline bool bpf_bypass_spec_v4(void)
 {
-	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
+	return cpu_mitigations_off() || perfmon_capable();
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
@@ -2268,21 +2235,8 @@ int bpf_link_new_fd(struct bpf_link *link);
 struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
 
-void bpf_token_inc(struct bpf_token *token);
-void bpf_token_put(struct bpf_token *token);
-int bpf_token_create(union bpf_attr *attr);
-struct bpf_token *bpf_token_get_from_fd(u32 ufd);
-
-bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
-bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
-bool bpf_token_allow_prog_type(const struct bpf_token *token,
-			       enum bpf_prog_type prog_type,
-			       enum bpf_attach_type attach_type);
-
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
-struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir,
-			    umode_t mode);
 
 #define BPF_ITER_FUNC_PREFIX "bpf_iter_"
 #define DEFINE_BPF_ITER_FUNC(target, args...)			\
@@ -2526,8 +2480,7 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type
 struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
 
-const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id,
-						 const struct bpf_prog *prog);
+const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
 void bpf_task_storage_free(struct task_struct *task);
 void bpf_cgrp_storage_free(struct cgroup *cgroup);
 bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
@@ -2646,24 +2599,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 	return -EOPNOTSUPP;
 }
 
-static inline bool bpf_token_capable(const struct bpf_token *token, int cap)
-{
-	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
-}
-
-static inline void bpf_token_inc(struct bpf_token *token)
-{
-}
-
-static inline void bpf_token_put(struct bpf_token *token)
-{
-}
-
-static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
 static inline void __dev_flush(void)
 {
 }
@@ -2787,7 +2722,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log,
 }
 
 static inline const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+bpf_base_func_proto(enum bpf_func_id func_id)
 {
 	return NULL;
 }
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 12d907f17d36..68fb6c8142fe 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
 		return false;
 	if (!bpf_jit_harden)
 		return false;
-	if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF))
+	if (bpf_jit_harden == 1 && bpf_capable())
 		return false;
 
 	return true;
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 3fdd00b452ac..ff217a5ce552 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -398,17 +398,10 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule)
 LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size)
 LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
 LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
-LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr,
-	 struct bpf_token *token)
-LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
-LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
-	 struct bpf_token *token)
-LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
-LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr,
-	 struct path *path)
-LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token)
-LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd)
-LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap)
+LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
+LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
+LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux)
+LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux)
 #endif /* CONFIG_BPF_SYSCALL */
 
 LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)
diff --git a/include/linux/security.h b/include/linux/security.h
index 00809d2d5c38..1d1df326c881 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -32,7 +32,6 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/sockptr.h>
-#include <linux/bpf.h>
 
 struct linux_binprm;
 struct cred;
@@ -2021,22 +2020,15 @@ static inline void securityfs_remove(struct dentry *dentry)
 union bpf_attr;
 struct bpf_map;
 struct bpf_prog;
-struct bpf_token;
+struct bpf_prog_aux;
 #ifdef CONFIG_SECURITY
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
 extern int security_bpf_prog(struct bpf_prog *prog);
-extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
-				   struct bpf_token *token);
+extern int security_bpf_map_alloc(struct bpf_map *map);
 extern void security_bpf_map_free(struct bpf_map *map);
-extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
-				  struct bpf_token *token);
-extern void security_bpf_prog_free(struct bpf_prog *prog);
-extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
-				     struct path *path);
-extern void security_bpf_token_free(struct bpf_token *token);
-extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
-extern int security_bpf_token_capable(const struct bpf_token *token, int cap);
+extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux);
+extern void security_bpf_prog_free(struct bpf_prog_aux *aux);
 #else
 static inline int security_bpf(int cmd, union bpf_attr *attr,
 					     unsigned int size)
@@ -2054,8 +2046,7 @@ static inline int security_bpf_prog(struct bpf_prog *prog)
 	return 0;
 }
 
-static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
-					  struct bpf_token *token)
+static inline int security_bpf_map_alloc(struct bpf_map *map)
 {
 	return 0;
 }
@@ -2063,33 +2054,13 @@ static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *a
 static inline void security_bpf_map_free(struct bpf_map *map)
 { }
 
-static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
-					 struct bpf_token *token)
+static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
 {
 	return 0;
 }
 
-static inline void security_bpf_prog_free(struct bpf_prog *prog)
+static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
 { }
-
-static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
-				     struct path *path)
-{
-	return 0;
-}
-
-static inline void security_bpf_token_free(struct bpf_token *token)
-{ }
-
-static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
-{
-	return 0;
-}
-
-static inline int security_bpf_token_capable(const struct bpf_token *token, int cap)
-{
-	return 0;
-}
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_BPF_SYSCALL */
 
-- 
cgit v1.2.3


From fd27ef6b44bec26915c5b2b22c13856d9f0ba17a Mon Sep 17 00:00:00 2001
From: Feng Liu <feliu@nvidia.com>
Date: Tue, 19 Dec 2023 11:32:40 +0200
Subject: virtio-pci: Introduce admin virtqueue

Introduce support for the admin virtqueue. By negotiating
VIRTIO_F_ADMIN_VQ feature, driver detects capability and creates one
administration virtqueue. Administration virtqueue implementation in
virtio pci generic layer, enables multiple types of upper layer
drivers such as vfio, net, blk to utilize it.

Signed-off-by: Feng Liu <feliu@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20231219093247.170936-3-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/virtio_config.h     | 4 ++++
 include/linux/virtio_pci_modern.h | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 2b3438de2c4d..da9b271b54db 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -93,6 +93,8 @@ typedef void vq_callback_t(struct virtqueue *);
  *	Returns 0 on success or error status
  *	If disable_vq_and_reset is set, then enable_vq_after_reset must also be
  *	set.
+ * @create_avq: create admin virtqueue resource.
+ * @destroy_avq: destroy admin virtqueue resource.
  */
 struct virtio_config_ops {
 	void (*get)(struct virtio_device *vdev, unsigned offset,
@@ -120,6 +122,8 @@ struct virtio_config_ops {
 			       struct virtio_shm_region *region, u8 id);
 	int (*disable_vq_and_reset)(struct virtqueue *vq);
 	int (*enable_vq_after_reset)(struct virtqueue *vq);
+	int (*create_avq)(struct virtio_device *vdev);
+	void (*destroy_avq)(struct virtio_device *vdev);
 };
 
 /* If driver didn't advertise the feature, it will never appear. */
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index a09e13a577a9..c0b1b1ca1163 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -125,4 +125,6 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev);
 void vp_modern_remove(struct virtio_pci_modern_device *mdev);
 int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index);
 void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index);
+u16 vp_modern_avq_num(struct virtio_pci_modern_device *mdev);
+u16 vp_modern_avq_index(struct virtio_pci_modern_device *mdev);
 #endif
-- 
cgit v1.2.3


From 92792ac752aa80d5ee71bc291d90edd06cd76bd1 Mon Sep 17 00:00:00 2001
From: Feng Liu <feliu@nvidia.com>
Date: Tue, 19 Dec 2023 11:32:41 +0200
Subject: virtio-pci: Introduce admin command sending function

Add support for sending admin command through admin virtqueue interface.
Abort any inflight admin commands once device reset completes. Activate
admin queue when device becomes ready; deactivate on device reset.

To comply to the below specification statement [1], the admin virtqueue
is activated for upper layer users only after setting DRIVER_OK status.

[1] The driver MUST NOT send any buffer available notifications to the
device before setting DRIVER_OK.

Signed-off-by: Feng Liu <feliu@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20231219093247.170936-4-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/virtio.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 4cc614a38376..b0201747a263 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -103,6 +103,14 @@ int virtqueue_resize(struct virtqueue *vq, u32 num,
 int virtqueue_reset(struct virtqueue *vq,
 		    void (*recycle)(struct virtqueue *vq, void *buf));
 
+struct virtio_admin_cmd {
+	__le16 opcode;
+	__le16 group_type;
+	__le64 group_member_id;
+	struct scatterlist *data_sg;
+	struct scatterlist *result_sg;
+};
+
 /**
  * struct virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus
-- 
cgit v1.2.3


From c3fc3e098bd64c560dde49a6e72b21b055150abe Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Tue, 19 Dec 2023 11:32:44 +0200
Subject: virtio-pci: Introduce APIs to execute legacy IO admin commands

Introduce APIs to execute legacy IO admin commands.

It includes: io_legacy_read/write for both common and the device
configuration, io_legacy_notify_info.

In addition, exposing an API to check whether the legacy IO commands are
supported. (i.e. virtio_pci_admin_has_legacy_io()).

Those APIs will be used by the next patches from this series.

Note:
Unlike modern drivers which support hardware virtio devices, legacy
drivers assume software-based devices: e.g. they don't use proper memory
barriers on ARM, use big endian on PPC, etc. X86 drivers are mostly ok
though, more or less by chance. For now, only support legacy IO on X86.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20231219093247.170936-7-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/virtio_pci_admin.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 include/linux/virtio_pci_admin.h

(limited to 'include/linux')

diff --git a/include/linux/virtio_pci_admin.h b/include/linux/virtio_pci_admin.h
new file mode 100644
index 000000000000..f4a100a0fe2e
--- /dev/null
+++ b/include/linux/virtio_pci_admin.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VIRTIO_PCI_ADMIN_H
+#define _LINUX_VIRTIO_PCI_ADMIN_H
+
+#include <linux/types.h>
+#include <linux/pci.h>
+
+#ifdef CONFIG_VIRTIO_PCI_ADMIN_LEGACY
+bool virtio_pci_admin_has_legacy_io(struct pci_dev *pdev);
+int virtio_pci_admin_legacy_common_io_write(struct pci_dev *pdev, u8 offset,
+					    u8 size, u8 *buf);
+int virtio_pci_admin_legacy_common_io_read(struct pci_dev *pdev, u8 offset,
+					   u8 size, u8 *buf);
+int virtio_pci_admin_legacy_device_io_write(struct pci_dev *pdev, u8 offset,
+					    u8 size, u8 *buf);
+int virtio_pci_admin_legacy_device_io_read(struct pci_dev *pdev, u8 offset,
+					   u8 size, u8 *buf);
+int virtio_pci_admin_legacy_io_notify_info(struct pci_dev *pdev,
+					   u8 req_bar_flags, u8 *bar,
+					   u64 *bar_offset);
+#endif
+
+#endif /* _LINUX_VIRTIO_PCI_ADMIN_H */
-- 
cgit v1.2.3


From 8bccc5b80678c69f7729ce4cd232c0aa98fa6277 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Tue, 19 Dec 2023 11:32:45 +0200
Subject: vfio/pci: Expose vfio_pci_core_setup_barmap()

Expose vfio_pci_core_setup_barmap() to be used by drivers.

This will let drivers to mmap a BAR and re-use it from both vfio and the
driver when it's applicable.

This API will be used in the next patches by the vfio/virtio coming
driver.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/r/20231219093247.170936-8-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/vfio_pci_core.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 562e8754869d..67ac58e20e1d 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -127,6 +127,7 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
 int vfio_pci_core_enable(struct vfio_pci_core_device *vdev);
 void vfio_pci_core_disable(struct vfio_pci_core_device *vdev);
 void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev);
+int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar);
 pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
 						pci_channel_state_t state);
 
-- 
cgit v1.2.3


From 8486ae162b3b6cc1055366f044495cf1966231f1 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Tue, 19 Dec 2023 11:32:46 +0200
Subject: vfio/pci: Expose vfio_pci_core_iowrite/read##size()

Expose vfio_pci_core_iowrite/read##size() to let it be used by drivers.

This functionality is needed to enable direct access to some physical
BAR of the device with the proper locks/checks in place.

The next patches from this series will use this functionality on a data
path flow when a direct access to the BAR is needed.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/r/20231219093247.170936-9-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/vfio_pci_core.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 67ac58e20e1d..85e84b92751b 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -131,4 +131,23 @@ int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar);
 pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
 						pci_channel_state_t state);
 
+#define VFIO_IOWRITE_DECLATION(size) \
+int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev,	\
+			bool test_mem, u##size val, void __iomem *io);
+
+VFIO_IOWRITE_DECLATION(8)
+VFIO_IOWRITE_DECLATION(16)
+VFIO_IOWRITE_DECLATION(32)
+#ifdef iowrite64
+VFIO_IOWRITE_DECLATION(64)
+#endif
+
+#define VFIO_IOREAD_DECLATION(size) \
+int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev,	\
+			bool test_mem, u##size *val, void __iomem *io);
+
+VFIO_IOREAD_DECLATION(8)
+VFIO_IOREAD_DECLATION(16)
+VFIO_IOREAD_DECLATION(32)
+
 #endif /* VFIO_PCI_CORE_H */
-- 
cgit v1.2.3


From a4104821ad651d8a0b374f0b2474c345bbb42f82 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 19 Dec 2023 12:30:43 -0700
Subject: io_uring/unix: drop usage of io_uring socket

Since we no longer allow sending io_uring fds over SCM_RIGHTS, move to
using io_is_uring_fops() to detect whether this is a io_uring fd or not.
With that done, kill off io_uring_get_socket() as nobody calls it
anymore.

This is in preparation to yanking out the rest of the core related to
unix gc with io_uring.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index d8fc93492dc5..68ed6697fece 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -7,12 +7,12 @@
 #include <uapi/linux/io_uring.h>
 
 #if defined(CONFIG_IO_URING)
-struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
 void io_uring_unreg_ringfd(void);
 const char *io_uring_get_opcode(u8 opcode);
 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
+bool io_is_uring_fops(struct file *file);
 
 static inline void io_uring_files_cancel(void)
 {
@@ -32,10 +32,6 @@ static inline void io_uring_free(struct task_struct *tsk)
 		__io_uring_free(tsk);
 }
 #else
-static inline struct sock *io_uring_get_socket(struct file *file)
-{
-	return NULL;
-}
 static inline void io_uring_task_cancel(void)
 {
 }
@@ -54,6 +50,10 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
 {
 	return -EOPNOTSUPP;
 }
+static inline bool io_is_uring_fops(struct file *file)
+{
+	return false;
+}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 6e5e6d274956305f1fc0340522b38f5f5be74bdb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 19 Dec 2023 12:36:34 -0700
Subject: io_uring: drop any code related to SCM_RIGHTS

This is dead code after we dropped support for passing io_uring fds
over SCM_RIGHTS, get rid of it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index bebab36abce8..fc8f2570b92b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -389,9 +389,6 @@ struct io_ring_ctx {
 	struct wait_queue_head		rsrc_quiesce_wq;
 	unsigned			rsrc_quiesce;
 
-	#if defined(CONFIG_UNIX)
-		struct socket		*ring_sock;
-	#endif
 	/* hashed buffered write serialization */
 	struct io_wq_hash		*hash_map;
 
-- 
cgit v1.2.3


From ee0cf5e07f44a10fce8f1bfa9db226c0b5ecf880 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= <theo.lebrun@bootlin.com>
Date: Mon, 18 Dec 2023 18:14:16 +0100
Subject: clk: fixed-rate: fix
 clk_hw_register_fixed_rate_with_accuracy_parent_hw
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add missing comma and remove extraneous NULL argument. The macro is
currently used by no one which explains why the typo slipped by.

Fixes: 2d34f09e79c9 ("clk: fixed-rate: Add support for specifying parents via DT/pointers")
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://lore.kernel.org/r/20231218-mbly-clk-v1-1-44ce54108f06@bootlin.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk-provider.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index ace3a4ce2fc9..1293c38ddb7f 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -448,8 +448,8 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name,
  */
 #define clk_hw_register_fixed_rate_with_accuracy_parent_hw(dev, name,	      \
 		parent_hw, flags, fixed_rate, fixed_accuracy)		      \
-	__clk_hw_register_fixed_rate((dev), NULL, (name), NULL, (parent_hw)   \
-				     NULL, NULL, (flags), (fixed_rate),	      \
+	__clk_hw_register_fixed_rate((dev), NULL, (name), NULL, (parent_hw),  \
+				     NULL, (flags), (fixed_rate),	      \
 				     (fixed_accuracy), 0, false)
 /**
  * clk_hw_register_fixed_rate_with_accuracy_parent_data - register fixed-rate
-- 
cgit v1.2.3


From 4ba1d0f23414135e4f426dae4cb5cdc2ce246f89 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:25 -0800
Subject: bpf: abstract away global subprog arg preparation logic from reg
 state setup

btf_prepare_func_args() is used to understand expectations and
restrictions on global subprog arguments. But current implementation is
hard to extend, as it intermixes BTF-based func prototype parsing and
interpretation logic with setting up register state at subprog entry.

Worse still, those registers are not completely set up inside
btf_prepare_func_args(), requiring some more logic later in
do_check_common(). Like calling mark_reg_unknown() and similar
initialization operations.

This intermixing of BTF interpretation and register state setup is
problematic. First, it causes duplication of BTF parsing logic for global
subprog verification (to set up initial state of global subprog) and
global subprog call sites analysis (when we need to check that whatever
is being passed into global subprog matches expectations), performed in
btf_check_subprog_call().

Given we want to extend global func argument with tags later, this
duplication is problematic. So refactor btf_prepare_func_args() to do
only BTF-based func proto and args parsing, returning high-level
argument "expectations" only, with no regard to specifics of register
state. I.e., if it's a context argument, instead of setting register
state to PTR_TO_CTX, we return ARG_PTR_TO_CTX enum for that argument as
"an argument specification" for further processing inside
do_check_common(). Similarly for SCALAR arguments, PTR_TO_MEM, etc.

This allows to reuse btf_prepare_func_args() in following patches at
global subprog call site analysis time. It also keeps register setup
code consistently in one place, do_check_common().

Besides all this, we cache this argument specs information inside
env->subprog_info, eliminating the need to redo these potentially
expensive BTF traversals, especially if BPF program's BTF is big and/or
there are lots of global subprog calls.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  3 +--
 include/linux/bpf_verifier.h | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7a8d4c81a39a..c050c82cc9a5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2470,8 +2470,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 				struct bpf_reg_state *regs);
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
-int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *reg, u32 *nargs);
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
 const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c2819a6579a5..5742e9c0a7b8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -606,6 +606,13 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 
 #define BPF_MAX_SUBPROGS 256
 
+struct bpf_subprog_arg_info {
+	enum bpf_arg_type arg_type;
+	union {
+		u32 mem_size;
+	};
+};
+
 struct bpf_subprog_info {
 	/* 'start' has to be the first field otherwise find_subprog() won't work */
 	u32 start; /* insn idx of function entry point */
@@ -617,6 +624,10 @@ struct bpf_subprog_info {
 	bool is_cb: 1;
 	bool is_async_cb: 1;
 	bool is_exception_cb: 1;
+	bool args_cached: 1;
+
+	u8 arg_cnt;
+	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
 };
 
 struct bpf_verifier_env;
@@ -727,6 +738,11 @@ struct bpf_verifier_env {
 	char tmp_str_buf[TMP_STR_BUF_LEN];
 };
 
+static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
+{
+	return &env->subprog_info[subprog];
+}
+
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
 				      const char *fmt, va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
-- 
cgit v1.2.3


From 5eccd2db42d77e3570619c32d39e39bf486607cf Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:26 -0800
Subject: bpf: reuse btf_prepare_func_args() check for main program BTF
 validation

Instead of btf_check_subprog_arg_match(), use btf_prepare_func_args()
logic to validate "trustworthiness" of main BPF program's BTF information,
if it is present.

We ignored results of original BTF check anyway, often times producing
confusing and ominously-sounding "reg type unsupported for arg#0
function" message, which has no apparent effect on program correctness
and verification process.

All the -EFAULT returning sanity checks are already performed in
check_btf_info_early(), so there is zero reason to have this duplication
of logic between btf_check_subprog_call() and btf_check_subprog_arg_match().
Dropping btf_check_subprog_arg_match() simplifies
btf_check_func_arg_match() further removing `bool processing_call` flag.

One subtle bit that was done by btf_check_subprog_arg_match() was
potentially marking main program's BTF as unreliable. We do this
explicitly now with a dedicated simple check, preserving the original
behavior, but now based on well factored btf_prepare_func_args() logic.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c050c82cc9a5..d0d7eff22b8a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2466,8 +2466,6 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 			   struct btf_func_model *m);
 
 struct bpf_reg_state;
-int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
-				struct bpf_reg_state *regs);
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
-- 
cgit v1.2.3


From e26080d0da87f20222ca6712b65f95a856fadee0 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:27 -0800
Subject: bpf: prepare btf_prepare_func_args() for handling static subprogs

Generalize btf_prepare_func_args() to support both global and static
subprogs. We are going to utilize this property in the next patch,
reusing btf_prepare_func_args() for subprog call logic instead of
reparsing BTF information in a completely separate implementation.

btf_prepare_func_args() now detects whether subprog is global or static
makes slight logic adjustments for static func cases, like not failing
fatally (-EFAULT) for conditions that are allowable for static subprogs.

Somewhat subtle (but major!) difference is the handling of pointer arguments.
Both global and static functions need to handle special context
arguments (which are pointers to predefined type names), but static
subprogs give up on any other pointers, falling back to marking subprog
as "unreliable", disabling the use of BTF type information altogether.

For global functions, though, we are assuming that such pointers to
unrecognized types are just pointers to fixed-sized memory region (or
error out if size cannot be established, like for `void *` pointers).

This patch accommodates these small differences and sets up a stage for
refactoring in the next patch, eliminating a separate BTF-based parsing
logic in btf_check_func_arg_match().

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5742e9c0a7b8..d3ea9ef04767 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -738,6 +738,11 @@ struct bpf_verifier_env {
 	char tmp_str_buf[TMP_STR_BUF_LEN];
 };
 
+static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
+{
+	return &env->prog->aux->func_info_aux[subprog];
+}
+
 static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
 {
 	return &env->subprog_info[subprog];
-- 
cgit v1.2.3


From c5a7244759b1eeacc59d0426fb73859afa942d0d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:28 -0800
Subject: bpf: move subprog call logic back to verifier.c

Subprog call logic in btf_check_subprog_call() currently has both a lot
of BTF parsing logic (which is, presumably, what justified putting it
into btf.c), but also a bunch of register state checks, some of each
utilize deep verifier logic helpers, necessarily exported from
verifier.c: check_ptr_off_reg(), check_func_arg_reg_off(),
and check_mem_reg().

Going forward, btf_check_subprog_call() will have a minimum of
BTF-related logic, but will get more internal verifier logic related to
register state manipulation. So move it into verifier.c to minimize
amount of verifier-specific logic exposed to btf.c.

We do this move before refactoring btf_check_func_arg_match() to
preserve as much history post-refactoring as possible.

No functional changes.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          | 2 --
 include/linux/bpf_verifier.h | 8 --------
 2 files changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d0d7eff22b8a..7671530d6e4e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2466,8 +2466,6 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 			   struct btf_func_model *m);
 
 struct bpf_reg_state;
-int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
-			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d3ea9ef04767..d07d857ca67f 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -785,14 +785,6 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
 void
 bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt);
 
-int check_ptr_off_reg(struct bpf_verifier_env *env,
-		      const struct bpf_reg_state *reg, int regno);
-int check_func_arg_reg_off(struct bpf_verifier_env *env,
-			   const struct bpf_reg_state *reg, int regno,
-			   enum bpf_arg_type arg_type);
-int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-		   u32 regno, u32 mem_size);
-
 /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
 static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
 					     struct btf *btf, u32 btf_id)
-- 
cgit v1.2.3


From 7437bb73f087e5f216f9c6603f5149d354e315af Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 17 Dec 2023 17:53:57 +0100
Subject: block: remove support for the host aware zone model

When zones were first added the SCSI and ATA specs, two different
models were supported (in addition to the drive managed one that
is invisible to the host):

 - host managed where non-conventional zones there is strict requirement
   to write at the write pointer, or else an error is returned
 - host aware where a write point is maintained if writes always happen
   at it, otherwise it is left in an under-defined state and the
   sequential write preferred zones behave like conventional zones
   (probably very badly performing ones, though)

Not surprisingly this lukewarm model didn't prove to be very useful and
was finally removed from the ZBC and SBC specs (NVMe never implemented
it).  Due to to the easily disappearing write pointer host software
could never rely on the write pointer to actually be useful for say
recovery.

Fortunately only a few HDD prototypes shipped using this model which
never made it to mass production.  Drop the support before it is too
late.  Note that any such host aware prototype HDD can still be used
with Linux as we'll now treat it as a conventional HDD.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20231217165359.604246-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 37 +++----------------------------------
 1 file changed, 3 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 185ed3770e3a..28cda9fb239e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -263,18 +263,6 @@ static inline bool blk_op_is_passthrough(blk_opf_t op)
 	return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
 }
 
-/*
- * Zoned block device models (zoned limit).
- *
- * Note: This needs to be ordered from the least to the most severe
- * restrictions for the inheritance in blk_stack_limits() to work.
- */
-enum blk_zoned_model {
-	BLK_ZONED_NONE = 0,	/* Regular block device */
-	BLK_ZONED_HA,		/* Host-aware zoned block device */
-	BLK_ZONED_HM,		/* Host-managed zoned block device */
-};
-
 /*
  * BLK_BOUNCE_NONE:	never bounce (default)
  * BLK_BOUNCE_HIGH:	bounce all highmem pages
@@ -316,7 +304,7 @@ struct queue_limits {
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
 	unsigned char		raid_partial_stripes_expensive;
-	enum blk_zoned_model	zoned;
+	bool			zoned;
 
 	/*
 	 * Drivers that set dma_alignment to less than 511 must be prepared to
@@ -329,7 +317,7 @@ struct queue_limits {
 typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 			       void *data);
 
-void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model);
+void disk_set_zoned(struct gendisk *disk, bool zoned);
 
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
@@ -617,23 +605,9 @@ static inline enum rpm_status queue_rpm_status(struct request_queue *q)
 }
 #endif
 
-static inline enum blk_zoned_model
-blk_queue_zoned_model(struct request_queue *q)
-{
-	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
-		return q->limits.zoned;
-	return BLK_ZONED_NONE;
-}
-
 static inline bool blk_queue_is_zoned(struct request_queue *q)
 {
-	switch (blk_queue_zoned_model(q)) {
-	case BLK_ZONED_HA:
-	case BLK_ZONED_HM:
-		return true;
-	default:
-		return false;
-	}
+	return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && q->limits.zoned;
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
@@ -1260,11 +1234,6 @@ static inline bool bdev_nowait(struct block_device *bdev)
 	return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
 }
 
-static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
-{
-	return blk_queue_zoned_model(bdev_get_queue(bdev));
-}
-
 static inline bool bdev_is_zoned(struct block_device *bdev)
 {
 	return blk_queue_is_zoned(bdev_get_queue(bdev));
-- 
cgit v1.2.3


From d73e93b4dfab10c80688b061c30048df05585c7e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 17 Dec 2023 17:53:58 +0100
Subject: block: simplify disk_set_zoned

Only use disk_set_zoned to actually enable zoned device support.
For clearing it, call disk_clear_zoned, which is renamed from
disk_clear_zone_settings and now directly clears the zoned flag as
well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20231217165359.604246-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 28cda9fb239e..bc236e77d85e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -317,7 +317,8 @@ struct queue_limits {
 typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 			       void *data);
 
-void disk_set_zoned(struct gendisk *disk, bool zoned);
+void disk_set_zoned(struct gendisk *disk);
+void disk_clear_zoned(struct gendisk *disk);
 
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
-- 
cgit v1.2.3


From 8be0c877fb3b671dac0cf56d1f1f9e65f9a9fb81 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 16:43:05 +0100
Subject: thunderbolt: make tb_bus_type const

Now that the driver core can properly handle constant struct bus_type,
move the tb_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: Andreas Noever <andreas.noever@gmail.com>
Cc: Michael Jamet <michael.jamet@intel.com>
Cc: Yehezkel Bernat <YehezkelShB@gmail.com>
Cc:  <linux-usb@vger.kernel.org>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Link: https://lore.kernel.org/r/2023121904-utopia-broadcast-06d1@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/thunderbolt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 6151c210d987..2c835e5c41f6 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -86,7 +86,7 @@ struct tb {
 	unsigned long privdata[];
 };
 
-extern struct bus_type tb_bus_type;
+extern const struct bus_type tb_bus_type;
 extern struct device_type tb_service_type;
 extern struct device_type tb_xdomain_type;
 
-- 
cgit v1.2.3


From 1a36e0f50f963465e9b2b980d250ab38b8fcd7a3 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Sun, 17 Dec 2023 10:32:38 +0200
Subject: net: Add MDB bulk deletion device operation

Add MDB net device operation that will be invoked by rtnetlink code in
response to received 'RTM_DELMDB' messages with the 'NLM_F_BULK' flag
set. Subsequent patches will implement the operation in the bridge and
VXLAN drivers.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1b935ee341b4..75c7725e5e4f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1329,6 +1329,9 @@ struct netdev_net_notifier {
  * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[],
  *		      struct netlink_ext_ack *extack);
  *	Deletes the MDB entry from dev.
+ * int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[],
+ *			   struct netlink_ext_ack *extack);
+ *	Bulk deletes MDB entries from dev.
  * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb,
  *		       struct netlink_callback *cb);
  *	Dumps MDB entries from dev. The first argument (marker) in the netlink
@@ -1611,6 +1614,9 @@ struct net_device_ops {
 	int			(*ndo_mdb_del)(struct net_device *dev,
 					       struct nlattr *tb[],
 					       struct netlink_ext_ack *extack);
+	int			(*ndo_mdb_del_bulk)(struct net_device *dev,
+						    struct nlattr *tb[],
+						    struct netlink_ext_ack *extack);
 	int			(*ndo_mdb_dump)(struct net_device *dev,
 						struct sk_buff *skb,
 						struct netlink_callback *cb);
-- 
cgit v1.2.3


From 40ca4ee3136d2d09977d1cab8c0c0e1582c3359d Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.ibm.com>
Date: Tue, 12 Dec 2023 06:12:43 -0500
Subject: evm: don't copy up 'security.evm' xattr

The security.evm HMAC and the original file signatures contain
filesystem specific data.  As a result, the HMAC and signature
are not the same on the stacked and backing filesystems.

Don't copy up 'security.evm'.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/evm.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/evm.h b/include/linux/evm.h
index 01fc495a83e2..36ec884320d9 100644
--- a/include/linux/evm.h
+++ b/include/linux/evm.h
@@ -31,6 +31,7 @@ extern void evm_inode_post_setxattr(struct dentry *dentry,
 				    const char *xattr_name,
 				    const void *xattr_value,
 				    size_t xattr_value_len);
+extern int evm_inode_copy_up_xattr(const char *name);
 extern int evm_inode_removexattr(struct mnt_idmap *idmap,
 				 struct dentry *dentry, const char *xattr_name);
 extern void evm_inode_post_removexattr(struct dentry *dentry,
@@ -117,6 +118,11 @@ static inline void evm_inode_post_setxattr(struct dentry *dentry,
 	return;
 }
 
+static inline int  evm_inode_copy_up_xattr(const char *name)
+{
+	return 0;
+}
+
 static inline int evm_inode_removexattr(struct mnt_idmap *idmap,
 					struct dentry *dentry,
 					const char *xattr_name)
-- 
cgit v1.2.3


From cd708c938f055c9eb5a366ec1c8edcefa28afc28 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.ibm.com>
Date: Mon, 18 Dec 2023 08:06:40 -0500
Subject: evm: add support to disable EVM on unsupported filesystems

Identify EVM unsupported filesystems by defining a new flag
SB_I_EVM_UNSUPPORTED.

Don't verify, write, remove or update 'security.evm' on unsupported
filesystems.

Acked-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..1474f36e9b38 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1164,6 +1164,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE	0x00000020
 #define SB_I_UNTRUSTED_MOUNTER		0x00000040
+#define SB_I_EVM_UNSUPPORTED		0x00000080
 
 #define SB_I_SKIP_SYNC	0x00000100	/* Skip superblock at global sync */
 #define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
-- 
cgit v1.2.3


From 139f84002145d8624f0195fb090b3a7670744a13 Mon Sep 17 00:00:00 2001
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Date: Tue, 19 Dec 2023 13:54:16 -0500
Subject: ring-buffer: Page size per ring buffer

Currently the size of one sub buffer page is global for all buffers and
it is hard coded to one system page. In order to introduce configurable
ring buffer sub page size, the internal logic should be refactored to
work with sub page size per ring buffer.

Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-3-tz.stoyanov@gmail.com
Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.009147038@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index b1b03b2c0f08..ce46218ce46d 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -200,7 +200,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
 struct trace_seq;
 
 int ring_buffer_print_entry_header(struct trace_seq *s);
-int ring_buffer_print_page_header(struct trace_seq *s);
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
 
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
-- 
cgit v1.2.3


From 2808e31ec12e5fbe2ae25acc027fcdc67b1fb7f0 Mon Sep 17 00:00:00 2001
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Date: Tue, 19 Dec 2023 13:54:17 -0500
Subject: ring-buffer: Add interface for configuring trace sub buffer size

The trace ring buffer sub page size can be configured, per trace
instance. A new ftrace file "buffer_subbuf_order" is added to get and
set the size of the ring buffer sub page for current trace instance.
The size must be an order of system page size, that's why the new
interface works with system page order, instead of absolute page size:
0 means the ring buffer sub page is equal to 1 system page and so
forth:
0 - 1 system page
1 - 2 system pages
2 - 4 system pages
...
The ring buffer sub page size is limited between 1 and 128 system
pages. The default value is 1 system page.
New ring buffer APIs are introduced:
 ring_buffer_subbuf_order_set()
 ring_buffer_subbuf_order_get()
 ring_buffer_subbuf_size_get()

Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-4-tz.stoyanov@gmail.com
Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.298324722@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index ce46218ce46d..12573306b889 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -202,6 +202,10 @@ struct trace_seq;
 int ring_buffer_print_entry_header(struct trace_seq *s);
 int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
 
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer);
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order);
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer);
+
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
 };
-- 
cgit v1.2.3


From bce761d757452ba5eb77e11fecc37a04b67494e7 Mon Sep 17 00:00:00 2001
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Date: Tue, 19 Dec 2023 13:54:19 -0500
Subject: ring-buffer: Read and write to ring buffers with custom sub buffer
 size

As the size of the ring sub buffer page can be changed dynamically,
the logic that reads and writes to the buffer should be fixed to take
that into account. Some internal ring buffer APIs are changed:
 ring_buffer_alloc_read_page()
 ring_buffer_free_read_page()
 ring_buffer_read_page()
A new API is introduced:
 ring_buffer_read_page_data()

Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-6-tz.stoyanov@gmail.com
Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.875145995@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
[ Fixed kerneldoc on data_page parameter in ring_buffer_free_read_page() ]
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 12573306b889..fa802db216f9 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -192,10 +192,15 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer);
 size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu);
 size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu);
 
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu);
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data);
-int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
+struct buffer_data_read_page;
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu);
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+				struct buffer_data_read_page *page);
+int ring_buffer_read_page(struct trace_buffer *buffer,
+			  struct buffer_data_read_page *data_page,
 			  size_t len, int cpu, int full);
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page);
 
 struct trace_seq;
 
-- 
cgit v1.2.3


From f3e25e68ceb2abaeefcac8f930c940c4494705d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Tue, 14 Nov 2023 12:20:11 +0100
Subject: pwm: Drop unused member "pwm" from struct pwm_device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This member is only assigned to and never read. So drop it.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index cda3597b84f2..8cadf9ee8d26 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -79,7 +79,6 @@ struct pwm_device {
 	const char *label;
 	unsigned long flags;
 	unsigned int hwpwm;
-	unsigned int pwm;
 	struct pwm_chip *chip;
 
 	struct pwm_args args;
-- 
cgit v1.2.3


From 54c86dd20bba23109e32e4e2f94ff93dd9863bc3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Tue, 14 Nov 2023 12:20:12 +0100
Subject: pwm: Replace PWM chip unique base by unique ID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Traditionally each PWM device had a unique ID stored in the "pwm" member
of struct pwm_device. However this number was hardly used and dropped
in the previous commit. To identify a certain PWM you're supposed to use
the chip's ID and the hwpwm of the PWM device now.

With the PWM chip base gone PWM chips can get their IDs better and
simpler using an idr.

This is expected to change the numbering of PWM chips, but nothing
should rely on the numbering anyhow.

Other than that the side effects are:

 - The PWM chip IDs are smaller and in most cases consecutive.
 - The ordering in /sys/kernel/debug/pwm is ordered by ascending PWM
   chip ID.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 8cadf9ee8d26..c27a4bb76012 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -292,7 +292,7 @@ struct pwm_chip {
 	struct device *dev;
 	const struct pwm_ops *ops;
 	struct module *owner;
-	int base;
+	unsigned int id;
 	unsigned int npwm;
 
 	struct pwm_device * (*of_xlate)(struct pwm_chip *chip,
@@ -300,7 +300,6 @@ struct pwm_chip {
 	unsigned int of_pwm_n_cells;
 
 	/* only used internally by the PWM framework */
-	struct list_head list;
 	struct pwm_device *pwms;
 };
 
-- 
cgit v1.2.3


From 2d91123ae5614b8737abd3a519b81265309a1ac3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Wed, 29 Nov 2023 11:18:32 +0100
Subject: pwm: Update kernel doc for struct pwm_chip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit c572f3b9c8b7 ("pwm: Replace PWM chip unique base by unique ID")
changed the members of struct pwm_chip, but failed to update the
documentation accordingly. Catch up and document the new member and drop
description for the two removed ones.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index c27a4bb76012..f87655c06c82 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -281,11 +281,10 @@ struct pwm_ops {
  * @dev: device providing the PWMs
  * @ops: callbacks for this PWM controller
  * @owner: module providing this chip
- * @base: number of first PWM controlled by this chip
+ * @id: unique number of this PWM chip
  * @npwm: number of PWMs controlled by this chip
  * @of_xlate: request a PWM device given a device tree PWM specifier
  * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier
- * @list: list node for internal use
  * @pwms: array of PWM devices allocated by the framework
  */
 struct pwm_chip {
-- 
cgit v1.2.3


From c748a6d77c06a78651030e17da6beb278a1c9470 Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Tue, 19 Dec 2023 16:30:24 +0000
Subject: pwm: Rename pwm_apply_state() to pwm_apply_might_sleep()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to introduce a pwm api which can be used from atomic context,
we will need two functions for applying pwm changes:

	int pwm_apply_might_sleep(struct pwm *, struct pwm_state *);
	int pwm_apply_atomic(struct pwm *, struct pwm_state *);

This commit just deals with renaming pwm_apply_state(), a following
commit will introduce the pwm_apply_atomic() function.

Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> # for input
Acked-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Acked-by: Lee Jones <lee@kernel.org>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index f87655c06c82..b64b8a82415c 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -92,8 +92,8 @@ struct pwm_device {
  * @state: state to fill with the current PWM state
  *
  * The returned PWM state represents the state that was applied by a previous call to
- * pwm_apply_state(). Drivers may have to slightly tweak that state before programming it to
- * hardware. If pwm_apply_state() was never called, this returns either the current hardware
+ * pwm_apply_might_sleep(). Drivers may have to slightly tweak that state before programming it to
+ * hardware. If pwm_apply_might_sleep() was never called, this returns either the current hardware
  * state (if supported) or the default settings.
  */
 static inline void pwm_get_state(const struct pwm_device *pwm,
@@ -157,20 +157,20 @@ static inline void pwm_get_args(const struct pwm_device *pwm,
 }
 
 /**
- * pwm_init_state() - prepare a new state to be applied with pwm_apply_state()
+ * pwm_init_state() - prepare a new state to be applied with pwm_apply_might_sleep()
  * @pwm: PWM device
  * @state: state to fill with the prepared PWM state
  *
  * This functions prepares a state that can later be tweaked and applied
- * to the PWM device with pwm_apply_state(). This is a convenient function
+ * to the PWM device with pwm_apply_might_sleep(). This is a convenient function
  * that first retrieves the current PWM state and the replaces the period
  * and polarity fields with the reference values defined in pwm->args.
  * Once the function returns, you can adjust the ->enabled and ->duty_cycle
- * fields according to your needs before calling pwm_apply_state().
+ * fields according to your needs before calling pwm_apply_might_sleep().
  *
  * ->duty_cycle is initially set to zero to avoid cases where the current
  * ->duty_cycle value exceed the pwm_args->period one, which would trigger
- * an error if the user calls pwm_apply_state() without adjusting ->duty_cycle
+ * an error if the user calls pwm_apply_might_sleep() without adjusting ->duty_cycle
  * first.
  */
 static inline void pwm_init_state(const struct pwm_device *pwm,
@@ -226,7 +226,7 @@ pwm_get_relative_duty_cycle(const struct pwm_state *state, unsigned int scale)
  *
  * pwm_init_state(pwm, &state);
  * pwm_set_relative_duty_cycle(&state, 50, 100);
- * pwm_apply_state(pwm, &state);
+ * pwm_apply_might_sleep(pwm, &state);
  *
  * This functions returns -EINVAL if @duty_cycle and/or @scale are
  * inconsistent (@scale == 0 or @duty_cycle > @scale).
@@ -304,7 +304,7 @@ struct pwm_chip {
 
 #if IS_ENABLED(CONFIG_PWM)
 /* PWM user APIs */
-int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state);
+int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state);
 int pwm_adjust_config(struct pwm_device *pwm);
 
 /**
@@ -332,7 +332,7 @@ static inline int pwm_config(struct pwm_device *pwm, int duty_ns,
 
 	state.duty_cycle = duty_ns;
 	state.period = period_ns;
-	return pwm_apply_state(pwm, &state);
+	return pwm_apply_might_sleep(pwm, &state);
 }
 
 /**
@@ -353,7 +353,7 @@ static inline int pwm_enable(struct pwm_device *pwm)
 		return 0;
 
 	state.enabled = true;
-	return pwm_apply_state(pwm, &state);
+	return pwm_apply_might_sleep(pwm, &state);
 }
 
 /**
@@ -372,7 +372,7 @@ static inline void pwm_disable(struct pwm_device *pwm)
 		return;
 
 	state.enabled = false;
-	pwm_apply_state(pwm, &state);
+	pwm_apply_might_sleep(pwm, &state);
 }
 
 /* PWM provider APIs */
@@ -403,8 +403,8 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev,
 				       struct fwnode_handle *fwnode,
 				       const char *con_id);
 #else
-static inline int pwm_apply_state(struct pwm_device *pwm,
-				  const struct pwm_state *state)
+static inline int pwm_apply_might_sleep(struct pwm_device *pwm,
+					const struct pwm_state *state)
 {
 	might_sleep();
 	return -ENOTSUPP;
@@ -521,7 +521,7 @@ static inline void pwm_apply_args(struct pwm_device *pwm)
 	state.period = pwm->args.period;
 	state.usage_power = false;
 
-	pwm_apply_state(pwm, &state);
+	pwm_apply_might_sleep(pwm, &state);
 }
 
 struct pwm_lookup {
-- 
cgit v1.2.3


From dc518b378dced419baa95d76a85f4c8c405722bc Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Tue, 19 Dec 2023 16:30:25 +0000
Subject: pwm: Replace ENOTSUPP with EOPNOTSUPP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to Documentation/dev-tools/checkpatch.rst ENOTSUPP is
not recommended and EOPNOTSUPP should be used instead.

Signed-off-by: Sean Young <sean@mess.org>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index b64b8a82415c..c9cb87b59ac8 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -407,12 +407,12 @@ static inline int pwm_apply_might_sleep(struct pwm_device *pwm,
 					const struct pwm_state *state)
 {
 	might_sleep();
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline int pwm_adjust_config(struct pwm_device *pwm)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline int pwm_config(struct pwm_device *pwm, int duty_ns,
-- 
cgit v1.2.3


From 7170d3beafc2373dd76b6b5d6e617d89e4e42b8b Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Tue, 19 Dec 2023 16:30:27 +0000
Subject: pwm: Make it possible to apply PWM changes in atomic context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some PWM devices require sleeping, for example if the pwm device is
connected over I2C. However, many PWM devices could be used from atomic
context, e.g. memory mapped PWM. This is useful for, for example, the
pwm-ir-tx driver which requires precise timing. Sleeping causes havoc
with the generated IR signal.

Since not all PWM devices can support atomic context, we also add a
pwm_might_sleep() function to check if is not supported.

Signed-off-by: Sean Young <sean@mess.org>
Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index c9cb87b59ac8..495af3627939 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -285,6 +285,7 @@ struct pwm_ops {
  * @npwm: number of PWMs controlled by this chip
  * @of_xlate: request a PWM device given a device tree PWM specifier
  * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier
+ * @atomic: can the driver's ->apply() be called in atomic context
  * @pwms: array of PWM devices allocated by the framework
  */
 struct pwm_chip {
@@ -297,6 +298,7 @@ struct pwm_chip {
 	struct pwm_device * (*of_xlate)(struct pwm_chip *chip,
 					const struct of_phandle_args *args);
 	unsigned int of_pwm_n_cells;
+	bool atomic;
 
 	/* only used internally by the PWM framework */
 	struct pwm_device *pwms;
@@ -305,6 +307,7 @@ struct pwm_chip {
 #if IS_ENABLED(CONFIG_PWM)
 /* PWM user APIs */
 int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state);
+int pwm_apply_atomic(struct pwm_device *pwm, const struct pwm_state *state);
 int pwm_adjust_config(struct pwm_device *pwm);
 
 /**
@@ -375,6 +378,17 @@ static inline void pwm_disable(struct pwm_device *pwm)
 	pwm_apply_might_sleep(pwm, &state);
 }
 
+/**
+ * pwm_might_sleep() - is pwm_apply_atomic() supported?
+ * @pwm: PWM device
+ *
+ * Returns: false if pwm_apply_atomic() can be called from atomic context.
+ */
+static inline bool pwm_might_sleep(struct pwm_device *pwm)
+{
+	return !pwm->chip->atomic;
+}
+
 /* PWM provider APIs */
 int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result,
 		unsigned long timeout);
@@ -403,6 +417,11 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev,
 				       struct fwnode_handle *fwnode,
 				       const char *con_id);
 #else
+static inline bool pwm_might_sleep(struct pwm_device *pwm)
+{
+	return true;
+}
+
 static inline int pwm_apply_might_sleep(struct pwm_device *pwm,
 					const struct pwm_state *state)
 {
@@ -410,6 +429,12 @@ static inline int pwm_apply_might_sleep(struct pwm_device *pwm,
 	return -EOPNOTSUPP;
 }
 
+static inline int pwm_apply_atomic(struct pwm_device *pwm,
+				   const struct pwm_state *state)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int pwm_adjust_config(struct pwm_device *pwm)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From fe22944cf05ede8e6f841cfecdb7093a53a3d9b3 Mon Sep 17 00:00:00 2001
From: xiaoming Wang <xiaoming.wang@intel.com>
Date: Tue, 19 Dec 2023 11:34:11 +0800
Subject: cpu/hotplug: Increase the number of dynamic states

The dynamically allocatable hotplug state space can be exhausted by
the existing drivers and infrastructure which install CPU hotplug
states dynamically. That prevents new drivers and infrastructure from
installing dynamically allocated states.

Increase the size of the CPUHP_AP_ONLINE_DYN state by 10 to make
room.

Signed-off-by: Xiaoming Wang <xiaoming.wang@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231219033411.816100-1-xiaoming.wang@intel.com
---
 include/linux/cpuhotplug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index af6c21aab985..8bd454dfe453 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -239,7 +239,7 @@ enum cpuhp_state {
 	CPUHP_AP_RCUTREE_ONLINE,
 	CPUHP_AP_BASE_CACHEINFO_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
-	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 30,
+	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 40,
 	CPUHP_AP_X86_HPET_ONLINE,
 	CPUHP_AP_X86_KVM_CLK_ONLINE,
 	CPUHP_AP_ACTIVE,
-- 
cgit v1.2.3


From 80e4a9987999e682366d60f43a7b2adefc48e222 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Sun, 10 Dec 2023 00:00:47 +0100
Subject: pwm: Drop two unused API functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These functions are unused. Also I think there is no valid use case
where these are correct to be called. So drop them.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 495af3627939..5dd665d8c909 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -111,12 +111,6 @@ static inline bool pwm_is_enabled(const struct pwm_device *pwm)
 	return state.enabled;
 }
 
-static inline void pwm_set_period(struct pwm_device *pwm, u64 period)
-{
-	if (pwm)
-		pwm->state.period = period;
-}
-
 static inline u64 pwm_get_period(const struct pwm_device *pwm)
 {
 	struct pwm_state state;
@@ -126,12 +120,6 @@ static inline u64 pwm_get_period(const struct pwm_device *pwm)
 	return state.period;
 }
 
-static inline void pwm_set_duty_cycle(struct pwm_device *pwm, unsigned int duty)
-{
-	if (pwm)
-		pwm->state.duty_cycle = duty;
-}
-
 static inline u64 pwm_get_duty_cycle(const struct pwm_device *pwm)
 {
 	struct pwm_state state;
-- 
cgit v1.2.3


From eba2591d99d1f14a04c8a8a845ab0795b93f5646 Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti <alexghiti@rivosinc.com>
Date: Wed, 13 Dec 2023 21:29:59 +0100
Subject: mm: Introduce pudp/p4dp/pgdp_get() functions

Instead of directly dereferencing page tables entries, which can cause
issues (see commit 20a004e7b017 ("arm64: mm: Use READ_ONCE/WRITE_ONCE when
accessing page tables"), let's introduce new functions to get the
pud/p4d/pgd entries (the pte and pmd versions already exist).

Note that arm pgd_t is actually an array so pgdp_get() is defined as a
macro to avoid a build error.

Those new functions will be used in subsequent commits by the riscv
architecture.

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20231213203001.179237-3-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 include/linux/pgtable.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index af7639c3b0a3..8b7daccd11be 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -292,6 +292,27 @@ static inline pmd_t pmdp_get(pmd_t *pmdp)
 }
 #endif
 
+#ifndef pudp_get
+static inline pud_t pudp_get(pud_t *pudp)
+{
+	return READ_ONCE(*pudp);
+}
+#endif
+
+#ifndef p4dp_get
+static inline p4d_t p4dp_get(p4d_t *p4dp)
+{
+	return READ_ONCE(*p4dp);
+}
+#endif
+
+#ifndef pgdp_get
+static inline pgd_t pgdp_get(pgd_t *pgdp)
+{
+	return READ_ONCE(*pgdp);
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 					    unsigned long address,
-- 
cgit v1.2.3


From 7d7ef0a4686abe43cd76a141b340a348f45ecdf2 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Wed, 29 Nov 2023 03:21:53 +0000
Subject: mm: memcg: restore subtree stats flushing

Stats flushing for memcg currently follows the following rules:
- Always flush the entire memcg hierarchy (i.e. flush the root).
- Only one flusher is allowed at a time. If someone else tries to flush
  concurrently, they skip and return immediately.
- A periodic flusher flushes all the stats every 2 seconds.

The reason this approach is followed is because all flushes are serialized
by a global rstat spinlock.  On the memcg side, flushing is invoked from
userspace reads as well as in-kernel flushers (e.g.  reclaim, refault,
etc).  This approach aims to avoid serializing all flushers on the global
lock, which can cause a significant performance hit under high
concurrency.

This approach has the following problems:
- Occasionally a userspace read of the stats of a non-root cgroup will
  be too expensive as it has to flush the entire hierarchy [1].
- Sometimes the stats accuracy are compromised if there is an ongoing
  flush, and we skip and return before the subtree of interest is
  actually flushed, yielding stale stats (by up to 2s due to periodic
  flushing). This is more visible when reading stats from userspace,
  but can also affect in-kernel flushers.

The latter problem is particulary a concern when userspace reads stats
after an event occurs, but gets stats from before the event. Examples:
- When memory usage / pressure spikes, a userspace OOM handler may look
  at the stats of different memcgs to select a victim based on various
  heuristics (e.g. how much private memory will be freed by killing
  this). Reading stale stats from before the usage spike in this case
  may cause a wrongful OOM kill.
- A proactive reclaimer may read the stats after writing to
  memory.reclaim to measure the success of the reclaim operation. Stale
  stats from before reclaim may give a false negative.
- Reading the stats of a parent and a child memcg may be inconsistent
  (child larger than parent), if the flush doesn't happen when the
  parent is read, but happens when the child is read.

As for in-kernel flushers, they will occasionally get stale stats.  No
regressions are currently known from this, but if there are regressions,
they would be very difficult to debug and link to the source of the
problem.

This patch aims to fix these problems by restoring subtree flushing, and
removing the unified/coalesced flushing logic that skips flushing if there
is an ongoing flush.  This change would introduce a significant regression
with global stats flushing thresholds.  With per-memcg stats flushing
thresholds, this seems to perform really well.  The thresholds protect the
underlying lock from unnecessary contention.

This patch was tested in two ways to ensure the latency of flushing is
up to par, on a machine with 384 cpus:

- A synthetic test with 5000 concurrent workers in 500 cgroups doing
  allocations and reclaim, as well as 1000 readers for memory.stat
  (variation of [2]). No regressions were noticed in the total runtime.
  Note that significant regressions in this test are observed with
  global stats thresholds, but not with per-memcg thresholds.

- A synthetic stress test for concurrently reading memcg stats while
  memory allocation/freeing workers are running in the background,
  provided by Wei Xu [3]. With 250k threads reading the stats every
  100ms in 50k cgroups, 99.9% of reads take <= 50us. Less than 0.01%
  of reads take more than 1ms, and no reads take more than 100ms.

[1] https://lore.kernel.org/lkml/CABWYdi0c6__rh-K7dcM_pkf9BJdTRtAU08M43KO9ME4-dsgfoQ@mail.gmail.com/
[2] https://lore.kernel.org/lkml/CAJD7tka13M-zVZTyQJYL1iUAYvuQ1fcHbCjcOBZcz6POYTV-4g@mail.gmail.com/
[3] https://lore.kernel.org/lkml/CAAPL-u9D2b=iF5Lf_cRnKxUfkiEe0AMDTu6yhrUAzX0b6a6rDg@mail.gmail.com/

[akpm@linux-foundation.org: fix mm/zswap.c]
[yosryahmed@google.com: remove stats flushing mutex]
  Link: https://lkml.kernel.org/r/CAJD7tkZgP3m-VVPn+fF_YuvXeQYK=tZZjJHj=dzD=CcSSpp2qg@mail.gmail.com
Link: https://lkml.kernel.org/r/20231129032154.3710765-6-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Tested-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Ivan Babrou <ivan@cloudflare.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutny <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a308c8eacf20..43b77363ab8e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1051,8 +1051,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 	return x;
 }
 
-void mem_cgroup_flush_stats(void);
-void mem_cgroup_flush_stats_ratelimited(void);
+void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
+void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
 
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			      int val);
@@ -1563,11 +1563,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 	return node_page_state(lruvec_pgdat(lruvec), idx);
 }
 
-static inline void mem_cgroup_flush_stats(void)
+static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
 {
 }
 
-static inline void mem_cgroup_flush_stats_ratelimited(void)
+static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
 {
 }
 
-- 
cgit v1.2.3


From 3485b88390b0af9e05dc2c3f57e9936f41e159a0 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Thu, 7 Dec 2023 16:12:04 +0000
Subject: mm: thp: introduce multi-size THP sysfs interface

In preparation for adding support for anonymous multi-size THP, introduce
new sysfs structure that will be used to control the new behaviours.  A
new directory is added under transparent_hugepage for each supported THP
size, and contains an `enabled` file, which can be set to "inherit" (to
inherit the global setting), "always", "madvise" or "never".  For now, the
kernel still only supports PMD-sized anonymous THP, so only 1 directory is
populated.

The first half of the change converts transhuge_vma_suitable() and
hugepage_vma_check() so that they take a bitfield of orders for which the
user wants to determine support, and the functions filter out all the
orders that can't be supported, given the current sysfs configuration and
the VMA dimensions.  The resulting functions are renamed to
thp_vma_suitable_orders() and thp_vma_allowable_orders() respectively.
Convenience functions that take a single, unencoded order and return a
boolean are also defined as thp_vma_suitable_order() and
thp_vma_allowable_order().

The second half of the change implements the new sysfs interface.  It has
been done so that each supported THP size has a `struct thpsize`, which
describes the relevant metadata and is itself a kobject.  This is pretty
minimal for now, but should make it easy to add new per-thpsize files to
the interface if needed in future (e.g.  per-size defrag).  Rather than
keep the `enabled` state directly in the struct thpsize, I've elected to
directly encode it into huge_anon_orders_[always|madvise|inherit]
bitfields since this reduces the amount of work required in
thp_vma_allowable_orders() which is called for every page fault.

See Documentation/admin-guide/mm/transhuge.rst, as modified by this
commit, for details of how the new sysfs interface works.

[ryan.roberts@arm.com: fix build warning when CONFIG_SYSFS is disabled]
  Link: https://lkml.kernel.org/r/20231211125320.3997543-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20231207161211.2374093-4-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Barry Song <v-songbaohua@oppo.com>
Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Tested-by: John Hubbard <jhubbard@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Itaru Kitayama <itaru.kitayama@gmail.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 181 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 155 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fa0350b0812a..609c153bae57 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -67,6 +67,24 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
+/*
+ * Mask of all large folio orders supported for anonymous THP.
+ */
+#define THP_ORDERS_ALL_ANON	BIT(PMD_ORDER)
+
+/*
+ * Mask of all large folio orders supported for file THP.
+ */
+#define THP_ORDERS_ALL_FILE	(BIT(PMD_ORDER) | BIT(PUD_ORDER))
+
+/*
+ * Mask of all large folio orders supported for THP.
+ */
+#define THP_ORDERS_ALL		(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)
+
+#define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \
+	(!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order)))
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define HPAGE_PMD_SHIFT PMD_SHIFT
 #define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
@@ -77,45 +95,105 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
 
 extern unsigned long transparent_hugepage_flags;
+extern unsigned long huge_anon_orders_always;
+extern unsigned long huge_anon_orders_madvise;
+extern unsigned long huge_anon_orders_inherit;
 
-#define hugepage_flags_enabled()					       \
-	(transparent_hugepage_flags &				       \
-	 ((1<<TRANSPARENT_HUGEPAGE_FLAG) |		       \
-	  (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
-#define hugepage_flags_always()				\
-	(transparent_hugepage_flags &			\
-	 (1<<TRANSPARENT_HUGEPAGE_FLAG))
+static inline bool hugepage_global_enabled(void)
+{
+	return transparent_hugepage_flags &
+			((1<<TRANSPARENT_HUGEPAGE_FLAG) |
+			(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG));
+}
+
+static inline bool hugepage_global_always(void)
+{
+	return transparent_hugepage_flags &
+			(1<<TRANSPARENT_HUGEPAGE_FLAG);
+}
+
+static inline bool hugepage_flags_enabled(void)
+{
+	/*
+	 * We cover both the anon and the file-backed case here; we must return
+	 * true if globally enabled, even when all anon sizes are set to never.
+	 * So we don't need to look at huge_anon_orders_inherit.
+	 */
+	return hugepage_global_enabled() ||
+	       huge_anon_orders_always ||
+	       huge_anon_orders_madvise;
+}
+
+static inline int highest_order(unsigned long orders)
+{
+	return fls_long(orders) - 1;
+}
+
+static inline int next_order(unsigned long *orders, int prev)
+{
+	*orders &= ~BIT(prev);
+	return highest_order(*orders);
+}
 
 /*
  * Do the below checks:
  *   - For file vma, check if the linear page offset of vma is
- *     HPAGE_PMD_NR aligned within the file.  The hugepage is
- *     guaranteed to be hugepage-aligned within the file, but we must
- *     check that the PMD-aligned addresses in the VMA map to
- *     PMD-aligned offsets within the file, else the hugepage will
- *     not be PMD-mappable.
- *   - For all vmas, check if the haddr is in an aligned HPAGE_PMD_SIZE
+ *     order-aligned within the file.  The hugepage is
+ *     guaranteed to be order-aligned within the file, but we must
+ *     check that the order-aligned addresses in the VMA map to
+ *     order-aligned offsets within the file, else the hugepage will
+ *     not be mappable.
+ *   - For all vmas, check if the haddr is in an aligned hugepage
  *     area.
  */
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
-		unsigned long addr)
+static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
+		unsigned long addr, int order)
 {
+	unsigned long hpage_size = PAGE_SIZE << order;
 	unsigned long haddr;
 
 	/* Don't have to check pgoff for anonymous vma */
 	if (!vma_is_anonymous(vma)) {
 		if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
-				HPAGE_PMD_NR))
+				hpage_size >> PAGE_SHIFT))
 			return false;
 	}
 
-	haddr = addr & HPAGE_PMD_MASK;
+	haddr = ALIGN_DOWN(addr, hpage_size);
 
-	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+	if (haddr < vma->vm_start || haddr + hpage_size > vma->vm_end)
 		return false;
 	return true;
 }
 
+/*
+ * Filter the bitfield of input orders to the ones suitable for use in the vma.
+ * See thp_vma_suitable_order().
+ * All orders that pass the checks are returned as a bitfield.
+ */
+static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
+		unsigned long addr, unsigned long orders)
+{
+	int order;
+
+	/*
+	 * Iterate over orders, highest to lowest, removing orders that don't
+	 * meet alignment requirements from the set. Exit loop at first order
+	 * that meets requirements, since all lower orders must also meet
+	 * requirements.
+	 */
+
+	order = highest_order(orders);
+
+	while (orders) {
+		if (thp_vma_suitable_order(vma, addr, order))
+			break;
+		order = next_order(&orders, order);
+	}
+
+	return orders;
+}
+
 static inline bool file_thp_enabled(struct vm_area_struct *vma)
 {
 	struct inode *inode;
@@ -130,8 +208,52 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 	       !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
 }
 
-bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
-			bool smaps, bool in_pf, bool enforce_sysfs);
+unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
+					 unsigned long vm_flags, bool smaps,
+					 bool in_pf, bool enforce_sysfs,
+					 unsigned long orders);
+
+/**
+ * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
+ * @vma:  the vm area to check
+ * @vm_flags: use these vm_flags instead of vma->vm_flags
+ * @smaps: whether answer will be used for smaps file
+ * @in_pf: whether answer will be used by page fault handler
+ * @enforce_sysfs: whether sysfs config should be taken into account
+ * @orders: bitfield of all orders to consider
+ *
+ * Calculates the intersection of the requested hugepage orders and the allowed
+ * hugepage orders for the provided vma. Permitted orders are encoded as a set
+ * bit at the corresponding bit position (bit-2 corresponds to order-2, bit-3
+ * corresponds to order-3, etc). Order-0 is never considered a hugepage order.
+ *
+ * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage
+ * orders are allowed.
+ */
+static inline
+unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
+				       unsigned long vm_flags, bool smaps,
+				       bool in_pf, bool enforce_sysfs,
+				       unsigned long orders)
+{
+	/* Optimization to check if required orders are enabled early. */
+	if (enforce_sysfs && vma_is_anonymous(vma)) {
+		unsigned long mask = READ_ONCE(huge_anon_orders_always);
+
+		if (vm_flags & VM_HUGEPAGE)
+			mask |= READ_ONCE(huge_anon_orders_madvise);
+		if (hugepage_global_always() ||
+		    ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
+			mask |= READ_ONCE(huge_anon_orders_inherit);
+
+		orders &= mask;
+		if (!orders)
+			return 0;
+	}
+
+	return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
+					  enforce_sysfs, orders);
+}
 
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
@@ -267,17 +389,24 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
 	return false;
 }
 
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
-		unsigned long addr)
+static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
+		unsigned long addr, int order)
 {
 	return false;
 }
 
-static inline bool hugepage_vma_check(struct vm_area_struct *vma,
-				      unsigned long vm_flags, bool smaps,
-				      bool in_pf, bool enforce_sysfs)
+static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
+		unsigned long addr, unsigned long orders)
 {
-	return false;
+	return 0;
+}
+
+static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
+					unsigned long vm_flags, bool smaps,
+					bool in_pf, bool enforce_sysfs,
+					unsigned long orders)
+{
+	return 0;
 }
 
 static inline void folio_prep_large_rmappable(struct folio *folio) {}
-- 
cgit v1.2.3


From 19eaf44954df64f9bc8dec398219e15ad0811497 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Thu, 7 Dec 2023 16:12:05 +0000
Subject: mm: thp: support allocation of anonymous multi-size THP

Introduce the logic to allow THP to be configured (through the new sysfs
interface we just added) to allocate large folios to back anonymous
memory, which are larger than the base page size but smaller than
PMD-size.  We call this new THP extension "multi-size THP" (mTHP).

mTHP continues to be PTE-mapped, but in many cases can still provide
similar benefits to traditional PMD-sized THP: Page faults are
significantly reduced (by a factor of e.g.  4, 8, 16, etc.  depending on
the configured order), but latency spikes are much less prominent because
the size of each page isn't as huge as the PMD-sized variant and there is
less memory to clear in each page fault.  The number of per-page
operations (e.g.  ref counting, rmap management, lru list management) are
also significantly reduced since those ops now become per-folio.

Some architectures also employ TLB compression mechanisms to squeeze more
entries in when a set of PTEs are virtually and physically contiguous and
approporiately aligned.  In this case, TLB misses will occur less often.

The new behaviour is disabled by default, but can be enabled at runtime by
writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled (see
documentation in previous commit).  The long term aim is to change the
default to include suitable lower orders, but there are some risks around
internal fragmentation that need to be better understood first.

[ryan.roberts@arm.com: resolve some multi-size THP review nits]
  Link: https://lkml.kernel.org/r/20231214160251.3574571-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Tested-by: John Hubbard <jhubbard@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Itaru Kitayama <itaru.kitayama@gmail.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 609c153bae57..fa7a38a30fc6 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
 /*
- * Mask of all large folio orders supported for anonymous THP.
+ * Mask of all large folio orders supported for anonymous THP; all orders up to
+ * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
+ * (which is a limitation of the THP implementation).
  */
-#define THP_ORDERS_ALL_ANON	BIT(PMD_ORDER)
+#define THP_ORDERS_ALL_ANON	((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
 
 /*
  * Mask of all large folio orders supported for file THP.
-- 
cgit v1.2.3


From 6ad59a3838cd0a8536721e60b8e4fbe5fdeb233a Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Wed, 13 Dec 2023 19:03:33 +0000
Subject: mm/damon: update email of SeongJae

Patch series "mm/damon: misc updates for 6.8".

Update comments, tests, and documents for DAMON.


This patch (of 6):

SeongJae is using his kernel.org account for DAMON development.  Update
the old email addresses on the comments of DAMON source files.

Link: https://lkml.kernel.org/r/20231213190338.54146-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20231213190338.54146-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 12510d8c51c6..5881e4ac30be 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -2,7 +2,7 @@
 /*
  * DAMON api
  *
- * Author: SeongJae Park <sjpark@amazon.de>
+ * Author: SeongJae Park <sj@kernel.org>
  */
 
 #ifndef _DAMON_H_
-- 
cgit v1.2.3


From 0abfa8efad8dccc3899f64dafa985a251714a709 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 12 Dec 2023 20:33:16 -0800
Subject: gfp: gfp_types.h: fix typos & punctuation

Correct typos/spellos and punctutation.

Link: https://lkml.kernel.org/r/20231213043316.10128-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp_types.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index ae994534a12a..1b6053da8754 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -162,25 +162,25 @@ typedef unsigned int __bitwise gfp_t;
  * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
  *
  * The default allocator behavior depends on the request size. We have a concept
- * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
+ * of so-called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
  * !costly allocations are too essential to fail so they are implicitly
  * non-failing by default (with some exceptions like OOM victims might fail so
  * the caller still has to check for failures) while costly requests try to be
  * not disruptive and back off even without invoking the OOM killer.
  * The following three modifiers might be used to override some of these
- * implicit rules
+ * implicit rules.
  *
  * %__GFP_NORETRY: The VM implementation will try only very lightweight
  * memory direct reclaim to get some memory under memory pressure (thus
  * it can sleep). It will avoid disruptive actions like OOM killer. The
  * caller must handle the failure which is quite likely to happen under
  * heavy memory pressure. The flag is suitable when failure can easily be
- * handled at small cost, such as reduced throughput
+ * handled at small cost, such as reduced throughput.
  *
  * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
  * procedures that have previously failed if there is some indication
- * that progress has been made else where.  It can wait for other
- * tasks to attempt high level approaches to freeing memory such as
+ * that progress has been made elsewhere.  It can wait for other
+ * tasks to attempt high-level approaches to freeing memory such as
  * compaction (which removes fragmentation) and page-out.
  * There is still a definite limit to the number of retries, but it is
  * a larger limit than with %__GFP_NORETRY.
@@ -230,7 +230,7 @@ typedef unsigned int __bitwise gfp_t;
  * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that
  * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting
  * memory tags at the same time as zeroing memory has minimal additional
- * performace impact.
+ * performance impact.
  *
  * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation.
  * Used for userspace and vmalloc pages; the latter are unpoisoned by
-- 
cgit v1.2.3


From cbc2fe9d9cb226347365753f50d81bc48cc3c52e Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 13 Dec 2023 13:57:41 +0800
Subject: kexec_file: add kexec_file flag to control debug printing

Patch series "kexec_file: print out debugging message if required", v4.

Currently, specifying '-d' on kexec command will print a lot of debugging
informationabout kexec/kdump loading with kexec_load interface.

However, kexec_file_load prints nothing even though '-d' is specified.
It's very inconvenient to debug or analyze the kexec/kdump loading when
something wrong happened with kexec/kdump itself or develper want to check
the kexec/kdump loading.

In this patchset, a kexec_file flag is KEXEC_FILE_DEBUG added and checked
in code.  If it's passed in, debugging message of kexec_file code will be
printed out and can be seen from console and dmesg.  Otherwise, the
debugging message is printed like beofre when pr_debug() is taken.

Note:
****
=====
1) The code in kexec-tools utility also need be changed to support
passing KEXEC_FILE_DEBUG to kernel when 'kexec -s -d' is specified.
The patch link is here:
=========
[PATCH] kexec_file: add kexec_file flag to support debug printing
http://lists.infradead.org/pipermail/kexec/2023-November/028505.html

2) s390 also has kexec_file code, while I am not sure what debugging
information is necessary. So leave it to s390 developer.

Test:
****
====
Testing was done in v1 on x86_64 and arm64. For v4, tested on x86_64
again. And on x86_64, the printed messages look like below:
--------------------------------------------------------------
kexec measurement buffer for the loaded kernel at 0x207fffe000.
Loaded purgatory at 0x207fff9000
Loaded boot_param, command line and misc at 0x207fff3000 bufsz=0x1180 memsz=0x1180
Loaded 64bit kernel at 0x207c000000 bufsz=0xc88200 memsz=0x3c4a000
Loaded initrd at 0x2079e79000 bufsz=0x2186280 memsz=0x2186280
Final command line is: root=/dev/mapper/fedora_intel--knightslanding--lb--02-root ro
rd.lvm.lv=fedora_intel-knightslanding-lb-02/root console=ttyS0,115200N81 crashkernel=256M
E820 memmap:
0000000000000000-000000000009a3ff (1)
000000000009a400-000000000009ffff (2)
00000000000e0000-00000000000fffff (2)
0000000000100000-000000006ff83fff (1)
000000006ff84000-000000007ac50fff (2)
......
000000207fff6150-000000207fff615f (128)
000000207fff6160-000000207fff714f (1)
000000207fff7150-000000207fff715f (128)
000000207fff7160-000000207fff814f (1)
000000207fff8150-000000207fff815f (128)
000000207fff8160-000000207fffffff (1)
nr_segments = 5
segment[0]: buf=0x000000004e5ece74 bufsz=0x211 mem=0x207fffe000 memsz=0x1000
segment[1]: buf=0x000000009e871498 bufsz=0x4000 mem=0x207fff9000 memsz=0x5000
segment[2]: buf=0x00000000d879f1fe bufsz=0x1180 mem=0x207fff3000 memsz=0x2000
segment[3]: buf=0x000000001101cd86 bufsz=0xc88200 mem=0x207c000000 memsz=0x3c4a000
segment[4]: buf=0x00000000c6e38ac7 bufsz=0x2186280 mem=0x2079e79000 memsz=0x2187000
kexec_file_load: type:0, start:0x207fff91a0 head:0x109e004002 flags:0x8
---------------------------------------------------------------------------


This patch (of 7):

When specifying 'kexec -c -d', kexec_load interface will print loading
information, e.g the regions where kernel/initrd/purgatory/cmdline are
put, the memmap passed to 2nd kernel taken as system RAM ranges, and
printing all contents of struct kexec_segment, etc.  These are very
helpful for analyzing or positioning what's happening when kexec/kdump
itself failed.  The debugging printing for kexec_load interface is made in
user space utility kexec-tools.

Whereas, with kexec_file_load interface, 'kexec -s -d' print nothing.
Because kexec_file code is mostly implemented in kernel space, and the
debugging printing functionality is missed.  It's not convenient when
debugging kexec/kdump loading and jumping with kexec_file_load interface.

Now add KEXEC_FILE_DEBUG to kexec_file flag to control the debugging
message printing.  And add global variable kexec_file_dbg_print and macro
kexec_dprintk() to facilitate the printing.

This is a preparation, later kexec_dprintk() will be used to replace the
existing pr_debug().  Once 'kexec -s -d' is specified, it will print out
kexec/kdump loading information.  If '-d' is not specified, it regresses
to pr_debug().

Link: https://lkml.kernel.org/r/20231213055747.61826-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20231213055747.61826-2-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Joe Perches <joe@perches.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 8227455192b7..400cb6c02176 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -403,7 +403,7 @@ bool kexec_load_permitted(int kexec_image_type);
 
 /* List of defined/legal kexec file flags */
 #define KEXEC_FILE_FLAGS	(KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
-				 KEXEC_FILE_NO_INITRAMFS)
+				 KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG)
 
 /* flag to track if kexec reboot is in progress */
 extern bool kexec_in_progress;
@@ -500,6 +500,13 @@ static inline int crash_hotplug_memory_support(void) { return 0; }
 static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; }
 #endif
 
+extern bool kexec_file_dbg_print;
+
+#define kexec_dprintk(fmt, ...)					\
+	printk("%s" fmt,					\
+	       kexec_file_dbg_print ? KERN_INFO : KERN_DEBUG,	\
+	       ##__VA_ARGS__)
+
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
-- 
cgit v1.2.3


From f6120d527b8611aeaa1a34a33337f530d78a789c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 19:56:59 -0500
Subject: task_stack.h: add missing include

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/sched/task_stack.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index f158b025c175..ccd72b978e1f 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -8,6 +8,7 @@
 
 #include <linux/sched.h>
 #include <linux/magic.h>
+#include <linux/refcount.h>
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 
-- 
cgit v1.2.3


From 6a2623b17634688cfec58dd44041c5db2143719d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 20:00:15 -0500
Subject: nsproxy.h: add missing include

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/nsproxy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 771cb0285872..5601d14e2886 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_NSPROXY_H
 #define _LINUX_NSPROXY_H
 
+#include <linux/refcount.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 
-- 
cgit v1.2.3


From 316aa04d1ffa61f73ce2679d1ae1dca8747aeb1f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:00:59 -0500
Subject: kmsan: add missing types.h dependency

more header dependency pruning/fixing

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/kmsan_types.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h
index 8bfa6c98176d..929287981afe 100644
--- a/include/linux/kmsan_types.h
+++ b/include/linux/kmsan_types.h
@@ -9,6 +9,8 @@
 #ifndef _LINUX_KMSAN_TYPES_H
 #define _LINUX_KMSAN_TYPES_H
 
+#include <linux/types.h>
+
 /* These constants are defined in the MSan LLVM instrumentation pass. */
 #define KMSAN_RETVAL_SIZE 800
 #define KMSAN_PARAM_SIZE 800
-- 
cgit v1.2.3


From 058e0529d12ae9fed34c6eeef700cd294f9622a1 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 20:22:22 -0500
Subject: time_namespace.h: fix missing include

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/time_namespace.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 03d9c5ac01d1..5258d81cef17 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -7,6 +7,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ns_common.h>
 #include <linux/err.h>
+#include <linux/time64.h>
 
 struct user_namespace;
 extern struct user_namespace init_user_ns;
-- 
cgit v1.2.3


From ea115c248a478ce1acbf4776e4666fb663285b2f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 17 Dec 2023 20:01:01 -0500
Subject: torture: add missing dependency on hrtimer.h

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/torture.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index c98d0c83d117..1541454da03e 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -21,6 +21,7 @@
 #include <linux/debugobjects.h>
 #include <linux/bug.h>
 #include <linux/compiler.h>
+#include <linux/hrtimer.h>
 
 /* Definitions for a non-string torture-test module parameter. */
 #define torture_param(type, name, init, msg) \
-- 
cgit v1.2.3


From bea32141764bc76db2d75c9484b71ded56119ab4 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 6 Dec 2022 14:32:23 -0500
Subject: nodemask: Split out include/linux/nodemask_types.h

sched.h, which defines task_struct, needs nodemask_t - but sched.h is a
frequently used header and ideally shouldn't be pulling in any more code
that it needs to.

This splits out nodemask_types.h which has the definition sched.h needs,
which will avoid a circular header dependency in the alloc tagging patch
series, and as a bonus should speed up kernel build times.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/nodemask.h       |  2 +-
 include/linux/nodemask_types.h | 10 ++++++++++
 include/linux/sched.h          |  2 +-
 3 files changed, 12 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/nodemask_types.h

(limited to 'include/linux')

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 8d07116caaf1..b61438313a73 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -93,10 +93,10 @@
 #include <linux/threads.h>
 #include <linux/bitmap.h>
 #include <linux/minmax.h>
+#include <linux/nodemask_types.h>
 #include <linux/numa.h>
 #include <linux/random.h>
 
-typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
 extern nodemask_t _unused_nodemask_arg_;
 
 /**
diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h
new file mode 100644
index 000000000000..6b28d97ea6ed
--- /dev/null
+++ b/include/linux/nodemask_types.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_NODEMASK_TYPES_H
+#define __LINUX_NODEMASK_TYPES_H
+
+#include <linux/bitops.h>
+#include <linux/numa.h>
+
+typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
+
+#endif /* __LINUX_NODEMASK_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..5a5b7b122682 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -20,7 +20,7 @@
 #include <linux/hrtimer.h>
 #include <linux/irqflags.h>
 #include <linux/seccomp.h>
-#include <linux/nodemask.h>
+#include <linux/nodemask_types.h>
 #include <linux/rcupdate.h>
 #include <linux/refcount.h>
 #include <linux/resource.h>
-- 
cgit v1.2.3


From d9f29deb7fe8137fd1954871443cbbc1b6125832 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 23 Jan 2023 21:32:18 -0500
Subject: prandom: Remove unused include

prandom.h doesn't use percpu.h - this fixes some circular header issues.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/prandom.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/prandom.h b/include/linux/prandom.h
index f2ed5b72b3d6..f7f1e5251c67 100644
--- a/include/linux/prandom.h
+++ b/include/linux/prandom.h
@@ -10,7 +10,6 @@
 
 #include <linux/types.h>
 #include <linux/once.h>
-#include <linux/percpu.h>
 #include <linux/random.h>
 
 struct rnd_state {
-- 
cgit v1.2.3


From 6060ef31f1162fb91a1688fa5098b38c4b9c680c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 2 Dec 2022 14:39:55 -0500
Subject: timekeeping: Kill percpu.h dependency

Slimming down recursive header includes.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h        | 2 +-
 include/linux/time_namespace.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index f2044d5a652b..02d264ca9dce 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -16,7 +16,7 @@
 #include <linux/rbtree.h>
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/percpu.h>
+#include <linux/percpu-defs.h>
 #include <linux/seqlock.h>
 #include <linux/timer.h>
 #include <linux/timerqueue.h>
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 5258d81cef17..876e31b4461d 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -12,6 +12,8 @@
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
+struct vm_area_struct;
+
 struct timens_offsets {
 	struct timespec64 monotonic;
 	struct timespec64 boottime;
-- 
cgit v1.2.3


From d7a73e3f089204aee3393687e23fd45a22657b08 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:27:00 -0500
Subject: kernel/numa.c: Move logging out of numa.h

Moving these stub functions to a .c file means we can kill a sched.h
dependency on printk.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/numa.h | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/numa.h b/include/linux/numa.h
index a904861de800..915033a75731 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_NUMA_H
 #define _LINUX_NUMA_H
+#include <linux/init.h>
 #include <linux/types.h>
 
 #ifdef CONFIG_NODES_SHIFT
@@ -22,34 +23,26 @@
 #endif
 
 #ifdef CONFIG_NUMA
-#include <linux/printk.h>
 #include <asm/sparsemem.h>
 
 /* Generic implementation available */
 int numa_nearest_node(int node, unsigned int state);
 
 #ifndef memory_add_physaddr_to_nid
-static inline int memory_add_physaddr_to_nid(u64 start)
-{
-	pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
-			start);
-	return 0;
-}
+int memory_add_physaddr_to_nid(u64 start);
 #endif
+
 #ifndef phys_to_target_node
-static inline int phys_to_target_node(u64 start)
-{
-	pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
-			start);
-	return 0;
-}
+int phys_to_target_node(u64 start);
 #endif
+
 #ifndef numa_fill_memblks
 static inline int __init numa_fill_memblks(u64 start, u64 end)
 {
 	return NUMA_NO_MEMBLK;
 }
 #endif
+
 #else /* !CONFIG_NUMA */
 static inline int numa_nearest_node(int node, unsigned int state)
 {
-- 
cgit v1.2.3


From d1d71b30e1f85e8b5d7c0d8edc16869bdc4d535f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:05:04 -0500
Subject: sched.h: Move (spin|rwlock)_needbreak() to spinlock.h

This lets us kill the dependency on spinlock.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/sched.h    | 31 -------------------------------
 include/linux/spinlock.h | 31 +++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a5b7b122682..7501a3451a20 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2227,37 +2227,6 @@ static inline bool preempt_model_preemptible(void)
 	return preempt_model_full() || preempt_model_rt();
 }
 
-/*
- * Does a critical section need to be broken due to another
- * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
- * but a general need for low latency)
- */
-static inline int spin_needbreak(spinlock_t *lock)
-{
-#ifdef CONFIG_PREEMPTION
-	return spin_is_contended(lock);
-#else
-	return 0;
-#endif
-}
-
-/*
- * Check if a rwlock is contended.
- * Returns non-zero if there is another task waiting on the rwlock.
- * Returns zero if the lock is not contended or the system / underlying
- * rwlock implementation does not support contention detection.
- * Technically does not depend on CONFIG_PREEMPTION, but a general need
- * for low latency.
- */
-static inline int rwlock_needbreak(rwlock_t *lock)
-{
-#ifdef CONFIG_PREEMPTION
-	return rwlock_is_contended(lock);
-#else
-	return 0;
-#endif
-}
-
 static __always_inline bool need_resched(void)
 {
 	return unlikely(tif_need_resched());
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 31d3d747a9db..0c71f06454d9 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -449,6 +449,37 @@ static __always_inline int spin_is_contended(spinlock_t *lock)
 	return raw_spin_is_contended(&lock->rlock);
 }
 
+/*
+ * Does a critical section need to be broken due to another
+ * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
+ * but a general need for low latency)
+ */
+static inline int spin_needbreak(spinlock_t *lock)
+{
+#ifdef CONFIG_PREEMPTION
+	return spin_is_contended(lock);
+#else
+	return 0;
+#endif
+}
+
+/*
+ * Check if a rwlock is contended.
+ * Returns non-zero if there is another task waiting on the rwlock.
+ * Returns zero if the lock is not contended or the system / underlying
+ * rwlock implementation does not support contention detection.
+ * Technically does not depend on CONFIG_PREEMPTION, but a general need
+ * for low latency.
+ */
+static inline int rwlock_needbreak(rwlock_t *lock)
+{
+#ifdef CONFIG_PREEMPTION
+	return rwlock_is_contended(lock);
+#else
+	return 0;
+#endif
+}
+
 #define assert_spin_locked(lock)	assert_raw_spin_locked(&(lock)->rlock)
 
 #else  /* !CONFIG_PREEMPT_RT */
-- 
cgit v1.2.3


From 2e346b19aab9ee40e5e429667a0a515f1d68b714 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:11:05 -0500
Subject: ktime.h: move ktime_t to types.h

ktime.h pulls in quite a few headers recursively (including printk.h) -
this is going to help with trimming sched.h dependencies.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/ktime.h | 8 +++-----
 include/linux/types.h | 3 +++
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 73f20deb497d..3a4e723eae0f 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -21,12 +21,10 @@
 #ifndef _LINUX_KTIME_H
 #define _LINUX_KTIME_H
 
-#include <linux/time.h>
-#include <linux/jiffies.h>
 #include <asm/bug.h>
-
-/* Nanosecond scalar representation for kernel time values */
-typedef s64	ktime_t;
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/types.h>
 
 /**
  * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value
diff --git a/include/linux/types.h b/include/linux/types.h
index 253168bb3fe1..2bc8766ba20c 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -120,6 +120,9 @@ typedef s64			int64_t;
 #define aligned_be64		__aligned_be64
 #define aligned_le64		__aligned_le64
 
+/* Nanosecond scalar representation for kernel time values */
+typedef s64	ktime_t;
+
 /**
  * The type used for indexing onto a disc or disc partition.
  *
-- 
cgit v1.2.3


From 50d91c76582513852e38eb80491f54d44cfb51fc Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 11:52:02 -0500
Subject: hrtimers: Split out hrtimer_types.h

We need to reduce the scope of what's included in sched.h: task_struct
includes a hrtimer, so split out the core types into their own header.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h       | 44 ++-----------------------------------
 include/linux/hrtimer_types.h | 50 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h         |  2 +-
 3 files changed, 53 insertions(+), 43 deletions(-)
 create mode 100644 include/linux/hrtimer_types.h

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 02d264ca9dce..87e3bedf8eb0 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -13,13 +13,13 @@
 #define _LINUX_HRTIMER_H
 
 #include <linux/hrtimer_defs.h>
-#include <linux/rbtree.h>
+#include <linux/hrtimer_types.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/percpu-defs.h>
+#include <linux/rbtree.h>
 #include <linux/seqlock.h>
 #include <linux/timer.h>
-#include <linux/timerqueue.h>
 
 struct hrtimer_clock_base;
 struct hrtimer_cpu_base;
@@ -59,14 +59,6 @@ enum hrtimer_mode {
 	HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
 };
 
-/*
- * Return values for the callback function
- */
-enum hrtimer_restart {
-	HRTIMER_NORESTART,	/* Timer is not restarted */
-	HRTIMER_RESTART,	/* Timer must be restarted */
-};
-
 /*
  * Values to track state of the timer
  *
@@ -94,38 +86,6 @@ enum hrtimer_restart {
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
 
-/**
- * struct hrtimer - the basic hrtimer structure
- * @node:	timerqueue node, which also manages node.expires,
- *		the absolute expiry time in the hrtimers internal
- *		representation. The time is related to the clock on
- *		which the timer is based. Is setup by adding
- *		slack to the _softexpires value. For non range timers
- *		identical to _softexpires.
- * @_softexpires: the absolute earliest expiry time of the hrtimer.
- *		The time which was given as expiry time when the timer
- *		was armed.
- * @function:	timer expiry callback function
- * @base:	pointer to the timer base (per cpu and per clock)
- * @state:	state information (See bit values above)
- * @is_rel:	Set if the timer was armed relative
- * @is_soft:	Set if hrtimer will be expired in soft interrupt context.
- * @is_hard:	Set if hrtimer will be expired in hard interrupt context
- *		even on RT.
- *
- * The hrtimer structure must be initialized by hrtimer_init()
- */
-struct hrtimer {
-	struct timerqueue_node		node;
-	ktime_t				_softexpires;
-	enum hrtimer_restart		(*function)(struct hrtimer *);
-	struct hrtimer_clock_base	*base;
-	u8				state;
-	u8				is_rel;
-	u8				is_soft;
-	u8				is_hard;
-};
-
 /**
  * struct hrtimer_sleeper - simple sleeper structure
  * @timer:	embedded timer structure
diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h
new file mode 100644
index 000000000000..f4ef391b96a7
--- /dev/null
+++ b/include/linux/hrtimer_types.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_HRTIMER_TYPES_H
+#define _LINUX_HRTIMER_TYPES_H
+
+#include <linux/types.h>
+#include <linux/timerqueue.h>
+
+struct hrtimer_clock_base;
+
+/*
+ * Return values for the callback function
+ */
+enum hrtimer_restart {
+	HRTIMER_NORESTART,	/* Timer is not restarted */
+	HRTIMER_RESTART,	/* Timer must be restarted */
+};
+
+/**
+ * struct hrtimer - the basic hrtimer structure
+ * @node:	timerqueue node, which also manages node.expires,
+ *		the absolute expiry time in the hrtimers internal
+ *		representation. The time is related to the clock on
+ *		which the timer is based. Is setup by adding
+ *		slack to the _softexpires value. For non range timers
+ *		identical to _softexpires.
+ * @_softexpires: the absolute earliest expiry time of the hrtimer.
+ *		The time which was given as expiry time when the timer
+ *		was armed.
+ * @function:	timer expiry callback function
+ * @base:	pointer to the timer base (per cpu and per clock)
+ * @state:	state information (See bit values above)
+ * @is_rel:	Set if the timer was armed relative
+ * @is_soft:	Set if hrtimer will be expired in soft interrupt context.
+ * @is_hard:	Set if hrtimer will be expired in hard interrupt context
+ *		even on RT.
+ *
+ * The hrtimer structure must be initialized by hrtimer_init()
+ */
+struct hrtimer {
+	struct timerqueue_node		node;
+	ktime_t				_softexpires;
+	enum hrtimer_restart		(*function)(struct hrtimer *);
+	struct hrtimer_clock_base	*base;
+	u8				state;
+	u8				is_rel;
+	u8				is_soft;
+	u8				is_hard;
+};
+
+#endif /* _LINUX_HRTIMER_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7501a3451a20..3762809652da 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -17,7 +17,7 @@
 #include <linux/kmsan_types.h>
 #include <linux/mutex.h>
 #include <linux/plist.h>
-#include <linux/hrtimer.h>
+#include <linux/hrtimer_types.h>
 #include <linux/irqflags.h>
 #include <linux/seccomp.h>
 #include <linux/nodemask_types.h>
-- 
cgit v1.2.3


From d84f317915172c6511fd6c14ea3f70c9d67fdf67 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 12:35:44 -0500
Subject: locking/mutex: split out mutex_types.h

Trimming down sched.h dependencies: we don't want to include more than
the base types.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/mutex.h       | 52 +--------------------------------
 include/linux/mutex_types.h | 71 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h       |  2 +-
 3 files changed, 73 insertions(+), 52 deletions(-)
 create mode 100644 include/linux/mutex_types.h

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index a33aa9eb9fc3..0dfba5df6524 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -20,6 +20,7 @@
 #include <linux/osq_lock.h>
 #include <linux/debug_locks.h>
 #include <linux/cleanup.h>
+#include <linux/mutex_types.h>
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __DEP_MAP_MUTEX_INITIALIZER(lockname)			\
@@ -33,49 +34,6 @@
 
 #ifndef CONFIG_PREEMPT_RT
 
-/*
- * Simple, straightforward mutexes with strict semantics:
- *
- * - only one task can hold the mutex at a time
- * - only the owner can unlock the mutex
- * - multiple unlocks are not permitted
- * - recursive locking is not permitted
- * - a mutex object must be initialized via the API
- * - a mutex object must not be initialized via memset or copying
- * - task may not exit with mutex held
- * - memory areas where held locks reside must not be freed
- * - held mutexes must not be reinitialized
- * - mutexes may not be used in hardware or software interrupt
- *   contexts such as tasklets and timers
- *
- * These semantics are fully enforced when DEBUG_MUTEXES is
- * enabled. Furthermore, besides enforcing the above rules, the mutex
- * debugging code also implements a number of additional features
- * that make lock debugging easier and faster:
- *
- * - uses symbolic names of mutexes, whenever they are printed in debug output
- * - point-of-acquire tracking, symbolic lookup of function names
- * - list of all locks held in the system, printout of them
- * - owner tracking
- * - detects self-recursing locks and prints out all relevant info
- * - detects multi-task circular deadlocks and prints out all affected
- *   locks and tasks (and only those tasks)
- */
-struct mutex {
-	atomic_long_t		owner;
-	raw_spinlock_t		wait_lock;
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-	struct optimistic_spin_queue osq; /* Spinner MCS lock */
-#endif
-	struct list_head	wait_list;
-#ifdef CONFIG_DEBUG_MUTEXES
-	void			*magic;
-#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-};
-
 #ifdef CONFIG_DEBUG_MUTEXES
 
 #define __DEBUG_MUTEX_INITIALIZER(lockname)				\
@@ -131,14 +89,6 @@ extern bool mutex_is_locked(struct mutex *lock);
 /*
  * Preempt-RT variant based on rtmutexes.
  */
-#include <linux/rtmutex.h>
-
-struct mutex {
-	struct rt_mutex_base	rtmutex;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-};
 
 #define __MUTEX_INITIALIZER(mutexname)					\
 {									\
diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h
new file mode 100644
index 000000000000..fdf7f515fde8
--- /dev/null
+++ b/include/linux/mutex_types.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_MUTEX_TYPES_H
+#define __LINUX_MUTEX_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/lockdep_types.h>
+#include <linux/osq_lock.h>
+#include <linux/spinlock_types.h>
+#include <linux/types.h>
+
+#ifndef CONFIG_PREEMPT_RT
+
+/*
+ * Simple, straightforward mutexes with strict semantics:
+ *
+ * - only one task can hold the mutex at a time
+ * - only the owner can unlock the mutex
+ * - multiple unlocks are not permitted
+ * - recursive locking is not permitted
+ * - a mutex object must be initialized via the API
+ * - a mutex object must not be initialized via memset or copying
+ * - task may not exit with mutex held
+ * - memory areas where held locks reside must not be freed
+ * - held mutexes must not be reinitialized
+ * - mutexes may not be used in hardware or software interrupt
+ *   contexts such as tasklets and timers
+ *
+ * These semantics are fully enforced when DEBUG_MUTEXES is
+ * enabled. Furthermore, besides enforcing the above rules, the mutex
+ * debugging code also implements a number of additional features
+ * that make lock debugging easier and faster:
+ *
+ * - uses symbolic names of mutexes, whenever they are printed in debug output
+ * - point-of-acquire tracking, symbolic lookup of function names
+ * - list of all locks held in the system, printout of them
+ * - owner tracking
+ * - detects self-recursing locks and prints out all relevant info
+ * - detects multi-task circular deadlocks and prints out all affected
+ *   locks and tasks (and only those tasks)
+ */
+struct mutex {
+	atomic_long_t		owner;
+	raw_spinlock_t		wait_lock;
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+	struct optimistic_spin_queue osq; /* Spinner MCS lock */
+#endif
+	struct list_head	wait_list;
+#ifdef CONFIG_DEBUG_MUTEXES
+	void			*magic;
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
+};
+
+#else /* !CONFIG_PREEMPT_RT */
+/*
+ * Preempt-RT variant based on rtmutexes.
+ */
+#include <linux/rtmutex.h>
+
+struct mutex {
+	struct rt_mutex_base	rtmutex;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
+};
+
+#endif /* CONFIG_PREEMPT_RT */
+
+#endif /* __LINUX_MUTEX_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3762809652da..e8892789969b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -15,7 +15,7 @@
 #include <linux/sem.h>
 #include <linux/shm.h>
 #include <linux/kmsan_types.h>
-#include <linux/mutex.h>
+#include <linux/mutex_types.h>
 #include <linux/plist.h>
 #include <linux/hrtimer_types.h>
 #include <linux/irqflags.h>
-- 
cgit v1.2.3


From 53d31ba842d9cc391032d051a210c3c9941f1529 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 12:43:30 -0500
Subject: posix-cpu-timers: Split out posix-timers_types.h

Trimming down sched.h dependencies: we don't want to include more than
the base types.

Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/posix-timers.h       | 69 ++------------------------------
 include/linux/posix-timers_types.h | 80 ++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h              |  2 +-
 3 files changed, 84 insertions(+), 67 deletions(-)
 create mode 100644 include/linux/posix-timers_types.h

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index d607f51404fc..dc7b738de299 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -2,40 +2,16 @@
 #ifndef _linux_POSIX_TIMERS_H
 #define _linux_POSIX_TIMERS_H
 
-#include <linux/spinlock.h>
+#include <linux/alarmtimer.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
-#include <linux/alarmtimer.h>
+#include <linux/posix-timers_types.h>
+#include <linux/spinlock.h>
 #include <linux/timerqueue.h>
 
 struct kernel_siginfo;
 struct task_struct;
 
-/*
- * Bit fields within a clockid:
- *
- * The most significant 29 bits hold either a pid or a file descriptor.
- *
- * Bit 2 indicates whether a cpu clock refers to a thread or a process.
- *
- * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
- *
- * A clockid is invalid if bits 2, 1, and 0 are all set.
- */
-#define CPUCLOCK_PID(clock)		((pid_t) ~((clock) >> 3))
-#define CPUCLOCK_PERTHREAD(clock) \
-	(((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
-
-#define CPUCLOCK_PERTHREAD_MASK	4
-#define CPUCLOCK_WHICH(clock)	((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
-#define CPUCLOCK_CLOCK_MASK	3
-#define CPUCLOCK_PROF		0
-#define CPUCLOCK_VIRT		1
-#define CPUCLOCK_SCHED		2
-#define CPUCLOCK_MAX		3
-#define CLOCKFD			CPUCLOCK_MAX
-#define CLOCKFD_MASK		(CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
-
 static inline clockid_t make_process_cpuclock(const unsigned int pid,
 		const clockid_t clock)
 {
@@ -109,44 +85,6 @@ static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp)
 	ctmr->node.expires = exp;
 }
 
-/**
- * posix_cputimer_base - Container per posix CPU clock
- * @nextevt:		Earliest-expiration cache
- * @tqhead:		timerqueue head for cpu_timers
- */
-struct posix_cputimer_base {
-	u64			nextevt;
-	struct timerqueue_head	tqhead;
-};
-
-/**
- * posix_cputimers - Container for posix CPU timer related data
- * @bases:		Base container for posix CPU clocks
- * @timers_active:	Timers are queued.
- * @expiry_active:	Timer expiry is active. Used for
- *			process wide timers to avoid multiple
- *			task trying to handle expiry concurrently
- *
- * Used in task_struct and signal_struct
- */
-struct posix_cputimers {
-	struct posix_cputimer_base	bases[CPUCLOCK_MAX];
-	unsigned int			timers_active;
-	unsigned int			expiry_active;
-};
-
-/**
- * posix_cputimers_work - Container for task work based posix CPU timer expiry
- * @work:	The task work to be scheduled
- * @mutex:	Mutex held around expiry in context of this task work
- * @scheduled:  @work has been scheduled already, no further processing
- */
-struct posix_cputimers_work {
-	struct callback_head	work;
-	struct mutex		mutex;
-	unsigned int		scheduled;
-};
-
 static inline void posix_cputimers_init(struct posix_cputimers *pct)
 {
 	memset(pct, 0, sizeof(*pct));
@@ -179,7 +117,6 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
 		.bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases),	\
 	},
 #else
-struct posix_cputimers { };
 struct cpu_timer { };
 #define INIT_CPU_TIMERS(s)
 static inline void posix_cputimers_init(struct posix_cputimers *pct) { }
diff --git a/include/linux/posix-timers_types.h b/include/linux/posix-timers_types.h
new file mode 100644
index 000000000000..4783fa17bfeb
--- /dev/null
+++ b/include/linux/posix-timers_types.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _linux_POSIX_TIMERS_TYPES_H
+#define _linux_POSIX_TIMERS_TYPES_H
+
+#include <linux/mutex_types.h>
+#include <linux/timerqueue.h>
+#include <linux/types.h>
+
+/*
+ * Bit fields within a clockid:
+ *
+ * The most significant 29 bits hold either a pid or a file descriptor.
+ *
+ * Bit 2 indicates whether a cpu clock refers to a thread or a process.
+ *
+ * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
+ *
+ * A clockid is invalid if bits 2, 1, and 0 are all set.
+ */
+#define CPUCLOCK_PID(clock)		((pid_t) ~((clock) >> 3))
+#define CPUCLOCK_PERTHREAD(clock) \
+	(((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
+
+#define CPUCLOCK_PERTHREAD_MASK	4
+#define CPUCLOCK_WHICH(clock)	((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
+#define CPUCLOCK_CLOCK_MASK	3
+#define CPUCLOCK_PROF		0
+#define CPUCLOCK_VIRT		1
+#define CPUCLOCK_SCHED		2
+#define CPUCLOCK_MAX		3
+#define CLOCKFD			CPUCLOCK_MAX
+#define CLOCKFD_MASK		(CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
+
+#ifdef CONFIG_POSIX_TIMERS
+
+/**
+ * posix_cputimer_base - Container per posix CPU clock
+ * @nextevt:		Earliest-expiration cache
+ * @tqhead:		timerqueue head for cpu_timers
+ */
+struct posix_cputimer_base {
+	u64			nextevt;
+	struct timerqueue_head	tqhead;
+};
+
+/**
+ * posix_cputimers - Container for posix CPU timer related data
+ * @bases:		Base container for posix CPU clocks
+ * @timers_active:	Timers are queued.
+ * @expiry_active:	Timer expiry is active. Used for
+ *			process wide timers to avoid multiple
+ *			task trying to handle expiry concurrently
+ *
+ * Used in task_struct and signal_struct
+ */
+struct posix_cputimers {
+	struct posix_cputimer_base	bases[CPUCLOCK_MAX];
+	unsigned int			timers_active;
+	unsigned int			expiry_active;
+};
+
+/**
+ * posix_cputimers_work - Container for task work based posix CPU timer expiry
+ * @work:	The task work to be scheduled
+ * @mutex:	Mutex held around expiry in context of this task work
+ * @scheduled:  @work has been scheduled already, no further processing
+ */
+struct posix_cputimers_work {
+	struct callback_head	work;
+	struct mutex		mutex;
+	unsigned int		scheduled;
+};
+
+#else /* CONFIG_POSIX_TIMERS */
+
+struct posix_cputimers { };
+
+#endif /* CONFIG_POSIX_TIMERS */
+
+#endif /* _linux_POSIX_TIMERS_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e8892789969b..6d803d0904d9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -31,7 +31,7 @@
 #include <linux/syscall_user_dispatch.h>
 #include <linux/mm_types_task.h>
 #include <linux/task_io_accounting.h>
-#include <linux/posix-timers.h>
+#include <linux/posix-timers_types.h>
 #include <linux/rseq.h>
 #include <linux/seqlock.h>
 #include <linux/kcsan.h>
-- 
cgit v1.2.3


From f038cc1379c0ff462d83895cae8beb75a0f6bf02 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:01:06 -0500
Subject: locking/seqlock: Split out seqlock_types.h

Trimming down sched.h dependencies: we don't want to include more than
the base types.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/sched.h         |  2 +-
 include/linux/seqlock.h       | 79 +-----------------------------------
 include/linux/seqlock_types.h | 93 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 96 insertions(+), 78 deletions(-)
 create mode 100644 include/linux/seqlock_types.h

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6d803d0904d9..436f7ce1450a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -33,7 +33,7 @@
 #include <linux/task_io_accounting.h>
 #include <linux/posix-timers_types.h>
 #include <linux/rseq.h>
-#include <linux/seqlock.h>
+#include <linux/seqlock_types.h>
 #include <linux/kcsan.h>
 #include <linux/rv.h>
 #include <linux/livepatch_sched.h>
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index e92f9d5577ba..d90d8ee29d81 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -18,6 +18,7 @@
 #include <linux/lockdep.h>
 #include <linux/mutex.h>
 #include <linux/preempt.h>
+#include <linux/seqlock_types.h>
 #include <linux/spinlock.h>
 
 #include <asm/processor.h>
@@ -37,37 +38,6 @@
  */
 #define KCSAN_SEQLOCK_REGION_MAX 1000
 
-/*
- * Sequence counters (seqcount_t)
- *
- * This is the raw counting mechanism, without any writer protection.
- *
- * Write side critical sections must be serialized and non-preemptible.
- *
- * If readers can be invoked from hardirq or softirq contexts,
- * interrupts or bottom halves must also be respectively disabled before
- * entering the write section.
- *
- * This mechanism can't be used if the protected data contains pointers,
- * as the writer can invalidate a pointer that a reader is following.
- *
- * If the write serialization mechanism is one of the common kernel
- * locking primitives, use a sequence counter with associated lock
- * (seqcount_LOCKNAME_t) instead.
- *
- * If it's desired to automatically handle the sequence counter writer
- * serialization and non-preemptibility requirements, use a sequential
- * lock (seqlock_t) instead.
- *
- * See Documentation/locking/seqlock.rst
- */
-typedef struct seqcount {
-	unsigned sequence;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map dep_map;
-#endif
-} seqcount_t;
-
 static inline void __seqcount_init(seqcount_t *s, const char *name,
 					  struct lock_class_key *key)
 {
@@ -131,28 +101,6 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
  * See Documentation/locking/seqlock.rst
  */
 
-/*
- * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot
- * disable preemption. It can lead to higher latencies, and the write side
- * sections will not be able to acquire locks which become sleeping locks
- * (e.g. spinlock_t).
- *
- * To remain preemptible while avoiding a possible livelock caused by the
- * reader preempting the writer, use a different technique: let the reader
- * detect if a seqcount_LOCKNAME_t writer is in progress. If that is the
- * case, acquire then release the associated LOCKNAME writer serialization
- * lock. This will allow any possibly-preempted writer to make progress
- * until the end of its writer serialization lock critical section.
- *
- * This lock-unlock technique must be implemented for all of PREEMPT_RT
- * sleeping locks.  See Documentation/locking/locktypes.rst
- */
-#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT)
-#define __SEQ_LOCK(expr)	expr
-#else
-#define __SEQ_LOCK(expr)
-#endif
-
 /*
  * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated
  * @seqcount:	The real sequence counter
@@ -194,11 +142,6 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
  * @lockbase:		prefix for associated lock/unlock
  */
 #define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase)	\
-typedef struct seqcount_##lockname {					\
-	seqcount_t		seqcount;				\
-	__SEQ_LOCK(locktype	*lock);					\
-} seqcount_##lockname##_t;						\
-									\
 static __always_inline seqcount_t *					\
 __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)			\
 {									\
@@ -284,6 +227,7 @@ SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    raw_spin)
 SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, spin)
 SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, read)
 SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
+#undef SEQCOUNT_LOCKNAME
 
 /*
  * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
@@ -794,25 +738,6 @@ static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
 	smp_wmb();      /* increment "sequence" before following stores */
 }
 
-/*
- * Sequential locks (seqlock_t)
- *
- * Sequence counters with an embedded spinlock for writer serialization
- * and non-preemptibility.
- *
- * For more info, see:
- *    - Comments on top of seqcount_t
- *    - Documentation/locking/seqlock.rst
- */
-typedef struct {
-	/*
-	 * Make sure that readers don't starve writers on PREEMPT_RT: use
-	 * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK().
-	 */
-	seqcount_spinlock_t seqcount;
-	spinlock_t lock;
-} seqlock_t;
-
 #define __SEQLOCK_UNLOCKED(lockname)					\
 	{								\
 		.seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \
diff --git a/include/linux/seqlock_types.h b/include/linux/seqlock_types.h
new file mode 100644
index 000000000000..dfdf43e3fa3d
--- /dev/null
+++ b/include/linux/seqlock_types.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_SEQLOCK_TYPES_H
+#define __LINUX_SEQLOCK_TYPES_H
+
+#include <linux/lockdep_types.h>
+#include <linux/mutex_types.h>
+#include <linux/spinlock_types.h>
+
+/*
+ * Sequence counters (seqcount_t)
+ *
+ * This is the raw counting mechanism, without any writer protection.
+ *
+ * Write side critical sections must be serialized and non-preemptible.
+ *
+ * If readers can be invoked from hardirq or softirq contexts,
+ * interrupts or bottom halves must also be respectively disabled before
+ * entering the write section.
+ *
+ * This mechanism can't be used if the protected data contains pointers,
+ * as the writer can invalidate a pointer that a reader is following.
+ *
+ * If the write serialization mechanism is one of the common kernel
+ * locking primitives, use a sequence counter with associated lock
+ * (seqcount_LOCKNAME_t) instead.
+ *
+ * If it's desired to automatically handle the sequence counter writer
+ * serialization and non-preemptibility requirements, use a sequential
+ * lock (seqlock_t) instead.
+ *
+ * See Documentation/locking/seqlock.rst
+ */
+typedef struct seqcount {
+	unsigned sequence;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;
+#endif
+} seqcount_t;
+
+/*
+ * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot
+ * disable preemption. It can lead to higher latencies, and the write side
+ * sections will not be able to acquire locks which become sleeping locks
+ * (e.g. spinlock_t).
+ *
+ * To remain preemptible while avoiding a possible livelock caused by the
+ * reader preempting the writer, use a different technique: let the reader
+ * detect if a seqcount_LOCKNAME_t writer is in progress. If that is the
+ * case, acquire then release the associated LOCKNAME writer serialization
+ * lock. This will allow any possibly-preempted writer to make progress
+ * until the end of its writer serialization lock critical section.
+ *
+ * This lock-unlock technique must be implemented for all of PREEMPT_RT
+ * sleeping locks.  See Documentation/locking/locktypes.rst
+ */
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT)
+#define __SEQ_LOCK(expr)	expr
+#else
+#define __SEQ_LOCK(expr)
+#endif
+
+#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase)	\
+typedef struct seqcount_##lockname {					\
+	seqcount_t		seqcount;				\
+	__SEQ_LOCK(locktype	*lock);					\
+} seqcount_##lockname##_t;
+
+SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    raw_spin)
+SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, spin)
+SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, read)
+SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
+#undef SEQCOUNT_LOCKNAME
+
+/*
+ * Sequential locks (seqlock_t)
+ *
+ * Sequence counters with an embedded spinlock for writer serialization
+ * and non-preemptibility.
+ *
+ * For more info, see:
+ *    - Comments on top of seqcount_t
+ *    - Documentation/locking/seqlock.rst
+ */
+typedef struct {
+	/*
+	 * Make sure that readers don't starve writers on PREEMPT_RT: use
+	 * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK().
+	 */
+	seqcount_spinlock_t seqcount;
+	spinlock_t lock;
+} seqlock_t;
+
+#endif /* __LINUX_SEQLOCK_TYPES_H */
-- 
cgit v1.2.3


From 6d5e9d63683042a8d344cd5d6f9cf23613864a29 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:03:22 -0500
Subject: pid: Split out pid_types.h

Trimming down sched.h dependencies: we dont't want to include more than
the base types.

Cc: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/pid.h       | 15 ++-------------
 include/linux/pid_types.h | 16 ++++++++++++++++
 include/linux/sched.h     |  2 +-
 include/linux/seccomp.h   |  2 ++
 4 files changed, 21 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/pid_types.h

(limited to 'include/linux')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 653a527574c4..f254c3a45b9b 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -2,18 +2,10 @@
 #ifndef _LINUX_PID_H
 #define _LINUX_PID_H
 
+#include <linux/pid_types.h>
 #include <linux/rculist.h>
-#include <linux/wait.h>
 #include <linux/refcount.h>
-
-enum pid_type
-{
-	PIDTYPE_PID,
-	PIDTYPE_TGID,
-	PIDTYPE_PGID,
-	PIDTYPE_SID,
-	PIDTYPE_MAX,
-};
+#include <linux/wait.h>
 
 /*
  * What is struct pid?
@@ -110,9 +102,6 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old);
 extern void transfer_pid(struct task_struct *old, struct task_struct *new,
 			 enum pid_type);
 
-struct pid_namespace;
-extern struct pid_namespace init_pid_ns;
-
 extern int pid_max;
 extern int pid_max_min, pid_max_max;
 
diff --git a/include/linux/pid_types.h b/include/linux/pid_types.h
new file mode 100644
index 000000000000..c2aee1d91dcf
--- /dev/null
+++ b/include/linux/pid_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PID_TYPES_H
+#define _LINUX_PID_TYPES_H
+
+enum pid_type {
+	PIDTYPE_PID,
+	PIDTYPE_TGID,
+	PIDTYPE_PGID,
+	PIDTYPE_SID,
+	PIDTYPE_MAX,
+};
+
+struct pid_namespace;
+extern struct pid_namespace init_pid_ns;
+
+#endif /* _LINUX_PID_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 436f7ce1450a..37cc9d257073 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -11,7 +11,7 @@
 
 #include <asm/current.h>
 
-#include <linux/pid.h>
+#include <linux/pid_types.h>
 #include <linux/sem.h>
 #include <linux/shm.h>
 #include <linux/kmsan_types.h>
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 175079552f68..1ec0d8dc4b69 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -126,6 +126,8 @@ static inline long seccomp_get_metadata(struct task_struct *task,
 
 #ifdef CONFIG_SECCOMP_CACHE_DEBUG
 struct seq_file;
+struct pid_namespace;
+struct pid;
 
 int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
 			   struct pid *pid, struct task_struct *task);
-- 
cgit v1.2.3


From f551103cb964e9e6f5c03b3b8723424723731e76 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 17:49:24 -0500
Subject: sched.h: move pid helpers to pid.h

This is needed for killing the sched.h dependency on rcupdate.h, and
pid.h is a better place for this code anyways.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/pid.h          | 125 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h        | 122 -----------------------------------------
 include/linux/sched/signal.h |   1 +
 3 files changed, 126 insertions(+), 122 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index f254c3a45b9b..395cacce1179 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -4,7 +4,9 @@
 
 #include <linux/pid_types.h>
 #include <linux/rculist.h>
+#include <linux/rcupdate.h>
 #include <linux/refcount.h>
+#include <linux/sched.h>
 #include <linux/wait.h>
 
 /*
@@ -204,4 +206,127 @@ pid_t pid_vnr(struct pid *pid);
 		}							\
 		task = tg___;						\
 	} while_each_pid_task(pid, type, task)
+
+static inline struct pid *task_pid(struct task_struct *task)
+{
+	return task->thread_pid;
+}
+
+/*
+ * the helpers to get the task's different pids as they are seen
+ * from various namespaces
+ *
+ * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
+ * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
+ *                     current.
+ * task_xid_nr_ns()  : id seen from the ns specified;
+ *
+ * see also pid_nr() etc in include/linux/pid.h
+ */
+pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
+
+static inline pid_t task_pid_nr(struct task_struct *tsk)
+{
+	return tsk->pid;
+}
+
+static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
+}
+
+static inline pid_t task_pid_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
+}
+
+
+static inline pid_t task_tgid_nr(struct task_struct *tsk)
+{
+	return tsk->tgid;
+}
+
+/**
+ * pid_alive - check that a task structure is not stale
+ * @p: Task structure to be checked.
+ *
+ * Test if a process is not yet dead (at most zombie state)
+ * If pid_alive fails, then pointers within the task structure
+ * can be stale and must not be dereferenced.
+ *
+ * Return: 1 if the process is alive. 0 otherwise.
+ */
+static inline int pid_alive(const struct task_struct *p)
+{
+	return p->thread_pid != NULL;
+}
+
+static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
+}
+
+static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
+}
+
+
+static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
+}
+
+static inline pid_t task_session_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
+}
+
+static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
+}
+
+static inline pid_t task_tgid_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
+}
+
+static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
+{
+	pid_t pid = 0;
+
+	rcu_read_lock();
+	if (pid_alive(tsk))
+		pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
+	rcu_read_unlock();
+
+	return pid;
+}
+
+static inline pid_t task_ppid_nr(const struct task_struct *tsk)
+{
+	return task_ppid_nr_ns(tsk, &init_pid_ns);
+}
+
+/* Obsolete, do not use: */
+static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+{
+	return task_pgrp_nr_ns(tsk, &init_pid_ns);
+}
+
+/**
+ * is_global_init - check if a task structure is init. Since init
+ * is free to have sub-threads we need to check tgid.
+ * @tsk: Task structure to be checked.
+ *
+ * Check if a task structure is the first user space task the kernel created.
+ *
+ * Return: 1 if the task structure is init. 0 otherwise.
+ */
+static inline int is_global_init(struct task_struct *tsk)
+{
+	return task_tgid_nr(tsk) == 1;
+}
+
 #endif /* _LINUX_PID_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 37cc9d257073..9e2708c2cfa6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1561,114 +1561,6 @@ struct task_struct {
 	 */
 };
 
-static inline struct pid *task_pid(struct task_struct *task)
-{
-	return task->thread_pid;
-}
-
-/*
- * the helpers to get the task's different pids as they are seen
- * from various namespaces
- *
- * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
- * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
- *                     current.
- * task_xid_nr_ns()  : id seen from the ns specified;
- *
- * see also pid_nr() etc in include/linux/pid.h
- */
-pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
-
-static inline pid_t task_pid_nr(struct task_struct *tsk)
-{
-	return tsk->pid;
-}
-
-static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
-}
-
-static inline pid_t task_pid_vnr(struct task_struct *tsk)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
-}
-
-
-static inline pid_t task_tgid_nr(struct task_struct *tsk)
-{
-	return tsk->tgid;
-}
-
-/**
- * pid_alive - check that a task structure is not stale
- * @p: Task structure to be checked.
- *
- * Test if a process is not yet dead (at most zombie state)
- * If pid_alive fails, then pointers within the task structure
- * can be stale and must not be dereferenced.
- *
- * Return: 1 if the process is alive. 0 otherwise.
- */
-static inline int pid_alive(const struct task_struct *p)
-{
-	return p->thread_pid != NULL;
-}
-
-static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
-}
-
-static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
-}
-
-
-static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
-}
-
-static inline pid_t task_session_vnr(struct task_struct *tsk)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
-}
-
-static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
-}
-
-static inline pid_t task_tgid_vnr(struct task_struct *tsk)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
-}
-
-static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
-{
-	pid_t pid = 0;
-
-	rcu_read_lock();
-	if (pid_alive(tsk))
-		pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
-	rcu_read_unlock();
-
-	return pid;
-}
-
-static inline pid_t task_ppid_nr(const struct task_struct *tsk)
-{
-	return task_ppid_nr_ns(tsk, &init_pid_ns);
-}
-
-/* Obsolete, do not use: */
-static inline pid_t task_pgrp_nr(struct task_struct *tsk)
-{
-	return task_pgrp_nr_ns(tsk, &init_pid_ns);
-}
-
 #define TASK_REPORT_IDLE	(TASK_REPORT + 1)
 #define TASK_REPORT_MAX		(TASK_REPORT_IDLE << 1)
 
@@ -1712,20 +1604,6 @@ static inline char task_state_to_char(struct task_struct *tsk)
 	return task_index_to_char(task_state_index(tsk));
 }
 
-/**
- * is_global_init - check if a task structure is init. Since init
- * is free to have sub-threads we need to check tgid.
- * @tsk: Task structure to be checked.
- *
- * Check if a task structure is the first user space task the kernel created.
- *
- * Return: 1 if the task structure is init. 0 otherwise.
- */
-static inline int is_global_init(struct task_struct *tsk)
-{
-	return task_tgid_nr(tsk) == 1;
-}
-
 extern struct pid *cad_pid;
 
 /*
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3499c1a8b929..b847d8fa75a9 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -9,6 +9,7 @@
 #include <linux/sched/task.h>
 #include <linux/cred.h>
 #include <linux/refcount.h>
+#include <linux/pid.h>
 #include <linux/posix-timers.h>
 #include <linux/mm_types.h>
 #include <asm/ptrace.h>
-- 
cgit v1.2.3


From 8b7787a543cde905e53eaf29172c9472fe8a6a75 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:12:49 -0500
Subject: plist: Split out plist_types.h

Trimming down sched.h dependencies: we don't want to include more than
the base types.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/plist.h       | 12 +-----------
 include/linux/plist_types.h | 17 +++++++++++++++++
 include/linux/sched.h       |  2 +-
 3 files changed, 19 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/plist_types.h

(limited to 'include/linux')

diff --git a/include/linux/plist.h b/include/linux/plist.h
index 0f352c1d3c80..8c1c8adf7fe9 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -75,20 +75,10 @@
 
 #include <linux/container_of.h>
 #include <linux/list.h>
-#include <linux/types.h>
+#include <linux/plist_types.h>
 
 #include <asm/bug.h>
 
-struct plist_head {
-	struct list_head node_list;
-};
-
-struct plist_node {
-	int			prio;
-	struct list_head	prio_list;
-	struct list_head	node_list;
-};
-
 /**
  * PLIST_HEAD_INIT - static struct plist_head initializer
  * @head:	struct plist_head variable name
diff --git a/include/linux/plist_types.h b/include/linux/plist_types.h
new file mode 100644
index 000000000000..c37e784330af
--- /dev/null
+++ b/include/linux/plist_types.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_PLIST_TYPES_H
+#define _LINUX_PLIST_TYPES_H
+
+#include <linux/types.h>
+
+struct plist_head {
+	struct list_head node_list;
+};
+
+struct plist_node {
+	int			prio;
+	struct list_head	prio_list;
+	struct list_head	node_list;
+};
+
+#endif /* _LINUX_PLIST_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9e2708c2cfa6..8c230f24688b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -16,7 +16,7 @@
 #include <linux/shm.h>
 #include <linux/kmsan_types.h>
 #include <linux/mutex_types.h>
-#include <linux/plist.h>
+#include <linux/plist_types.h>
 #include <linux/hrtimer_types.h>
 #include <linux/irqflags.h>
 #include <linux/seccomp.h>
-- 
cgit v1.2.3


From 6dfeff09d5ad331905c7066207053d286d58ac83 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 11 Dec 2023 18:14:41 +0000
Subject: wait: Remove uapi header file from main header file

There's really no overlap between uapi/linux/wait.h and linux/wait.h.
There are two files which rely on the uapi file being implcitly included,
so explicitly include it there and remove it from the main header file.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Reviewed-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/wait.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 3473b663176f..8aa3372f21a0 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -9,7 +9,6 @@
 #include <linux/spinlock.h>
 
 #include <asm/current.h>
-#include <uapi/linux/wait.h>
 
 typedef struct wait_queue_entry wait_queue_entry_t;
 
-- 
cgit v1.2.3


From 097691960f7084ca82adb2e866d03a81753f0cb7 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:28:46 -0500
Subject: rslib: kill bogus dependency on list.h

list_head is defined in types.h, not list.h - this kills a sched.h
dependency.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/rslib.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rslib.h b/include/linux/rslib.h
index 238bb85243d3..a04dacbdc8ae 100644
--- a/include/linux/rslib.h
+++ b/include/linux/rslib.h
@@ -10,7 +10,6 @@
 #ifndef _RSLIB_H_
 #define _RSLIB_H_
 
-#include <linux/list.h>
 #include <linux/types.h>	/* for gfp_t */
 #include <linux/gfp.h>		/* for GFP_KERNEL */
 
-- 
cgit v1.2.3


From eee51b0ae5c52a77ed65ad59b55002d1397b40d5 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:32:55 -0500
Subject: timerqueue: Split out timerqueue_types.h

Trimming down sched.h dependencies: timerqueue_types can include just
rbtree_types.h instead of pulling in rbtree.h.

Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/hrtimer_types.h      |  2 +-
 include/linux/posix-timers_types.h |  2 +-
 include/linux/timerqueue.h         | 13 +------------
 include/linux/timerqueue_types.h   | 17 +++++++++++++++++
 4 files changed, 20 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/timerqueue_types.h

(limited to 'include/linux')

diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h
index f4ef391b96a7..ad66a3081735 100644
--- a/include/linux/hrtimer_types.h
+++ b/include/linux/hrtimer_types.h
@@ -3,7 +3,7 @@
 #define _LINUX_HRTIMER_TYPES_H
 
 #include <linux/types.h>
-#include <linux/timerqueue.h>
+#include <linux/timerqueue_types.h>
 
 struct hrtimer_clock_base;
 
diff --git a/include/linux/posix-timers_types.h b/include/linux/posix-timers_types.h
index 4783fa17bfeb..a4712c1008c9 100644
--- a/include/linux/posix-timers_types.h
+++ b/include/linux/posix-timers_types.h
@@ -3,7 +3,7 @@
 #define _linux_POSIX_TIMERS_TYPES_H
 
 #include <linux/mutex_types.h>
-#include <linux/timerqueue.h>
+#include <linux/timerqueue_types.h>
 #include <linux/types.h>
 
 /*
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
index adc80e29168e..62973f7d4610 100644
--- a/include/linux/timerqueue.h
+++ b/include/linux/timerqueue.h
@@ -3,18 +3,7 @@
 #define _LINUX_TIMERQUEUE_H
 
 #include <linux/rbtree.h>
-#include <linux/ktime.h>
-
-
-struct timerqueue_node {
-	struct rb_node node;
-	ktime_t expires;
-};
-
-struct timerqueue_head {
-	struct rb_root_cached rb_root;
-};
-
+#include <linux/timerqueue_types.h>
 
 extern bool timerqueue_add(struct timerqueue_head *head,
 			   struct timerqueue_node *node);
diff --git a/include/linux/timerqueue_types.h b/include/linux/timerqueue_types.h
new file mode 100644
index 000000000000..dc298d0923e3
--- /dev/null
+++ b/include/linux/timerqueue_types.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TIMERQUEUE_TYPES_H
+#define _LINUX_TIMERQUEUE_TYPES_H
+
+#include <linux/rbtree_types.h>
+#include <linux/types.h>
+
+struct timerqueue_node {
+	struct rb_node node;
+	ktime_t expires;
+};
+
+struct timerqueue_head {
+	struct rb_root_cached rb_root;
+};
+
+#endif /* _LINUX_TIMERQUEUE_TYPES_H */
-- 
cgit v1.2.3


From 22c336d0d3118824fed08834069568c57c5641a6 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:34:45 -0500
Subject: signal: Kill bogus dependency on list.h

list_head is in types.h, not list.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/signal.h       | 1 +
 include/linux/signal_types.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 3b98e7a28538..f19816832f05 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -3,6 +3,7 @@
 #define _LINUX_SIGNAL_H
 
 #include <linux/bug.h>
+#include <linux/list.h>
 #include <linux/signal_types.h>
 #include <linux/string.h>
 
diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h
index a70b2bdbf4d9..caf4f7a59ab9 100644
--- a/include/linux/signal_types.h
+++ b/include/linux/signal_types.h
@@ -6,7 +6,7 @@
  * Basic signal handling related data type definitions:
  */
 
-#include <linux/list.h>
+#include <linux/types.h>
 #include <uapi/linux/signal.h>
 
 typedef struct kernel_siginfo {
-- 
cgit v1.2.3


From dff0fd233a5104337069603d201f8cad74bc0e5a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:53:34 -0500
Subject: timers: Split out timer_types.h

Cutting down on sched.h dependencies: this is going to be used in
workqueue_types.h in the next patch, so we can kill the sched.h
dependency on workqueue.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/timer.h       | 16 +---------------
 include/linux/timer_types.h | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+), 15 deletions(-)
 create mode 100644 include/linux/timer_types.h

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 26a545bb0153..f18a2f1eb79e 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -7,21 +7,7 @@
 #include <linux/stddef.h>
 #include <linux/debugobjects.h>
 #include <linux/stringify.h>
-
-struct timer_list {
-	/*
-	 * All fields that change during normal runtime grouped to the
-	 * same cacheline
-	 */
-	struct hlist_node	entry;
-	unsigned long		expires;
-	void			(*function)(struct timer_list *);
-	u32			flags;
-
-#ifdef CONFIG_LOCKDEP
-	struct lockdep_map	lockdep_map;
-#endif
-};
+#include <linux/timer_types.h>
 
 #ifdef CONFIG_LOCKDEP
 /*
diff --git a/include/linux/timer_types.h b/include/linux/timer_types.h
new file mode 100644
index 000000000000..fae5a388f914
--- /dev/null
+++ b/include/linux/timer_types.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TIMER_TYPES_H
+#define _LINUX_TIMER_TYPES_H
+
+#include <linux/lockdep_types.h>
+#include <linux/types.h>
+
+struct timer_list {
+	/*
+	 * All fields that change during normal runtime grouped to the
+	 * same cacheline
+	 */
+	struct hlist_node	entry;
+	unsigned long		expires;
+	void			(*function)(struct timer_list *);
+	u32			flags;
+
+#ifdef CONFIG_LOCKDEP
+	struct lockdep_map	lockdep_map;
+#endif
+};
+
+#endif /* _LINUX_TIMER_TYPES_H */
-- 
cgit v1.2.3


From b2fa8443db320c4873feca2588b957439e350890 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:55:01 -0500
Subject: workqueue: Split out workqueue_types.h

More sched.h dependency culling - this lets us kill a rhashtable-types.h
dependency on workqueue.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/dma-fence.h        |  1 +
 include/linux/rhashtable-types.h |  2 +-
 include/linux/timekeeping.h      |  1 +
 include/linux/workqueue.h        | 16 +---------------
 include/linux/workqueue_types.h  | 25 +++++++++++++++++++++++++
 5 files changed, 29 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/workqueue_types.h

(limited to 'include/linux')

diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index b3772edca2e6..e06bad467f55 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -21,6 +21,7 @@
 #include <linux/sched.h>
 #include <linux/printk.h>
 #include <linux/rcupdate.h>
+#include <linux/timekeeping.h>
 
 struct dma_fence;
 struct dma_fence_ops;
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 57467cbf4c5b..b6f3797277ff 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -12,7 +12,7 @@
 #include <linux/atomic.h>
 #include <linux/compiler.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
+#include <linux/workqueue_types.h>
 
 struct rhash_head {
 	struct rhash_head __rcu		*next;
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index fe1e467ba046..7c43e98cf211 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -4,6 +4,7 @@
 
 #include <linux/errno.h>
 #include <linux/clocksource_ids.h>
+#include <linux/ktime.h>
 
 /* Included from linux/ktime.h */
 
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 24b1e5070f4d..f1bb2e35301f 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -14,12 +14,7 @@
 #include <linux/atomic.h>
 #include <linux/cpumask.h>
 #include <linux/rcupdate.h>
-
-struct workqueue_struct;
-
-struct work_struct;
-typedef void (*work_func_t)(struct work_struct *work);
-void delayed_work_timer_fn(struct timer_list *t);
+#include <linux/workqueue_types.h>
 
 /*
  * The first word is the work queue pointer and the flags rolled into
@@ -95,15 +90,6 @@ enum {
 #define WORK_STRUCT_FLAG_MASK    ((1ul << WORK_STRUCT_FLAG_BITS) - 1)
 #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
 
-struct work_struct {
-	atomic_long_t data;
-	struct list_head entry;
-	work_func_t func;
-#ifdef CONFIG_LOCKDEP
-	struct lockdep_map lockdep_map;
-#endif
-};
-
 #define WORK_DATA_INIT()	ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
 #define WORK_DATA_STATIC_INIT()	\
 	ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))
diff --git a/include/linux/workqueue_types.h b/include/linux/workqueue_types.h
new file mode 100644
index 000000000000..4c38824f3ab4
--- /dev/null
+++ b/include/linux/workqueue_types.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_WORKQUEUE_TYPES_H
+#define _LINUX_WORKQUEUE_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/lockdep_types.h>
+#include <linux/timer_types.h>
+#include <linux/types.h>
+
+struct workqueue_struct;
+
+struct work_struct;
+typedef void (*work_func_t)(struct work_struct *work);
+void delayed_work_timer_fn(struct timer_list *t);
+
+struct work_struct {
+	atomic_long_t data;
+	struct list_head entry;
+	work_func_t func;
+#ifdef CONFIG_LOCKDEP
+	struct lockdep_map lockdep_map;
+#endif
+};
+
+#endif /* _LINUX_WORKQUEUE_TYPES_H */
-- 
cgit v1.2.3


From bc46ef3cea3d6f63952d7e29a324e889c34970a8 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:58:25 -0500
Subject: shm: Slim down dependencies

list_head is in types.h, not list.h., and the uapi header wasn't needed.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/shm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shm.h b/include/linux/shm.h
index d8e69aed3d32..c55bef0538e5 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -2,12 +2,12 @@
 #ifndef _LINUX_SHM_H_
 #define _LINUX_SHM_H_
 
-#include <linux/list.h>
+#include <linux/types.h>
 #include <asm/page.h>
-#include <uapi/linux/shm.h>
 #include <asm/shmparam.h>
 
 struct file;
+struct task_struct;
 
 #ifdef CONFIG_SYSVIPC
 struct sysv_shm {
-- 
cgit v1.2.3


From 72375a8864ebc0a20ca4a35f382441b01a0b85b9 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:00:10 -0500
Subject: ipc: Kill bogus dependency on spinlock.h

pruning sched.h dependencies, headers shouldn't pull in more than they
need.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/ipc.h   | 2 +-
 include/linux/sched.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index e1c9eea6015b..9b1434247aab 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -2,7 +2,7 @@
 #ifndef _LINUX_IPC_H
 #define _LINUX_IPC_H
 
-#include <linux/spinlock.h>
+#include <linux/spinlock_types.h>
 #include <linux/uidgid.h>
 #include <linux/rhashtable-types.h>
 #include <uapi/linux/ipc.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8c230f24688b..34400b16e57c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2139,6 +2139,8 @@ extern bool sched_task_on_rq(struct task_struct *p);
 extern unsigned long get_wchan(struct task_struct *p);
 extern struct task_struct *cpu_curr_snapshot(int cpu);
 
+#include <linux/spinlock.h>
+
 /*
  * In order to reduce various lock holder preemption latencies provide an
  * interface to see if a vCPU is currently running or not.
-- 
cgit v1.2.3


From 9983deb26d9021aecd971d25abf4cd263c72c385 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:01:25 -0500
Subject: Split out irqflags_types.h

We're working on only pulling in type definitions to sched.h whenever
possible.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/irqflags.h       | 14 +-------------
 include/linux/irqflags_types.h | 22 ++++++++++++++++++++++
 include/linux/sched.h          |  2 +-
 3 files changed, 24 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/irqflags_types.h

(limited to 'include/linux')

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 2b665c32f5fe..147feebd508c 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -12,6 +12,7 @@
 #ifndef _LINUX_TRACE_IRQFLAGS_H
 #define _LINUX_TRACE_IRQFLAGS_H
 
+#include <linux/irqflags_types.h>
 #include <linux/typecheck.h>
 #include <linux/cleanup.h>
 #include <asm/irqflags.h>
@@ -34,19 +35,6 @@
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 
-/* Per-task IRQ trace events information. */
-struct irqtrace_events {
-	unsigned int	irq_events;
-	unsigned long	hardirq_enable_ip;
-	unsigned long	hardirq_disable_ip;
-	unsigned int	hardirq_enable_event;
-	unsigned int	hardirq_disable_event;
-	unsigned long	softirq_disable_ip;
-	unsigned long	softirq_enable_ip;
-	unsigned int	softirq_disable_event;
-	unsigned int	softirq_enable_event;
-};
-
 DECLARE_PER_CPU(int, hardirqs_enabled);
 DECLARE_PER_CPU(int, hardirq_context);
 
diff --git a/include/linux/irqflags_types.h b/include/linux/irqflags_types.h
new file mode 100644
index 000000000000..c13f0d915097
--- /dev/null
+++ b/include/linux/irqflags_types.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IRQFLAGS_TYPES_H
+#define _LINUX_IRQFLAGS_TYPES_H
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+
+/* Per-task IRQ trace events information. */
+struct irqtrace_events {
+	unsigned int	irq_events;
+	unsigned long	hardirq_enable_ip;
+	unsigned long	hardirq_disable_ip;
+	unsigned int	hardirq_enable_event;
+	unsigned int	hardirq_disable_event;
+	unsigned long	softirq_disable_ip;
+	unsigned long	softirq_enable_ip;
+	unsigned int	softirq_disable_event;
+	unsigned int	softirq_enable_event;
+};
+
+#endif
+
+#endif /* _LINUX_IRQFLAGS_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 34400b16e57c..026390e6fe59 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -11,6 +11,7 @@
 
 #include <asm/current.h>
 
+#include <linux/irqflags_types.h>
 #include <linux/pid_types.h>
 #include <linux/sem.h>
 #include <linux/shm.h>
@@ -18,7 +19,6 @@
 #include <linux/mutex_types.h>
 #include <linux/plist_types.h>
 #include <linux/hrtimer_types.h>
-#include <linux/irqflags.h>
 #include <linux/seccomp.h>
 #include <linux/nodemask_types.h>
 #include <linux/rcupdate.h>
-- 
cgit v1.2.3


From 959d8dc8046186ffea5410f51fcb309880f0dfaa Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:15:35 -0500
Subject: mm_types_task.h: Trim dependencies

more sched.h header dependency trimming

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/mm_types_task.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index aa44fff8bb9d..a2f6179b672b 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -9,9 +9,6 @@
  */
 
 #include <linux/types.h>
-#include <linux/threads.h>
-#include <linux/atomic.h>
-#include <linux/cpumask.h>
 
 #include <asm/page.h>
 
@@ -36,6 +33,8 @@ enum {
 	NR_MM_COUNTERS
 };
 
+struct page;
+
 struct page_frag {
 	struct page *page;
 #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
-- 
cgit v1.2.3


From 55b899aa3e7d0dc02ff9075b883d29eb2d0cb49a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:25:40 -0500
Subject: syscall_user_dispatch.h: split out *_types.h

thread_info.h pulls in a lot of junk that sched.h that we don't need; in
particular, this helps to kill the printk.h dependency.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/sched.h                       |  2 +-
 include/linux/syscall_user_dispatch.h       |  9 +--------
 include/linux/syscall_user_dispatch_types.h | 22 ++++++++++++++++++++++
 3 files changed, 24 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/syscall_user_dispatch_types.h

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 026390e6fe59..925711edd7f7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -28,7 +28,7 @@
 #include <linux/sched/prio.h>
 #include <linux/sched/types.h>
 #include <linux/signal_types.h>
-#include <linux/syscall_user_dispatch.h>
+#include <linux/syscall_user_dispatch_types.h>
 #include <linux/mm_types_task.h>
 #include <linux/task_io_accounting.h>
 #include <linux/posix-timers_types.h>
diff --git a/include/linux/syscall_user_dispatch.h b/include/linux/syscall_user_dispatch.h
index 641ca8880995..3858a6ffdd5c 100644
--- a/include/linux/syscall_user_dispatch.h
+++ b/include/linux/syscall_user_dispatch.h
@@ -6,16 +6,10 @@
 #define _SYSCALL_USER_DISPATCH_H
 
 #include <linux/thread_info.h>
+#include <linux/syscall_user_dispatch_types.h>
 
 #ifdef CONFIG_GENERIC_ENTRY
 
-struct syscall_user_dispatch {
-	char __user	*selector;
-	unsigned long	offset;
-	unsigned long	len;
-	bool		on_dispatch;
-};
-
 int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
 			      unsigned long len, char __user *selector);
 
@@ -29,7 +23,6 @@ int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long siz
 				     void __user *data);
 
 #else
-struct syscall_user_dispatch {};
 
 static inline int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
 					    unsigned long len, char __user *selector)
diff --git a/include/linux/syscall_user_dispatch_types.h b/include/linux/syscall_user_dispatch_types.h
new file mode 100644
index 000000000000..3be36b06c7d7
--- /dev/null
+++ b/include/linux/syscall_user_dispatch_types.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _SYSCALL_USER_DISPATCH_TYPES_H
+#define _SYSCALL_USER_DISPATCH_TYPES_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_GENERIC_ENTRY
+
+struct syscall_user_dispatch {
+	char __user	*selector;
+	unsigned long	offset;
+	unsigned long	len;
+	bool		on_dispatch;
+};
+
+#else
+
+struct syscall_user_dispatch {};
+
+#endif
+
+#endif /* _SYSCALL_USER_DISPATCH_TYPES_H */
-- 
cgit v1.2.3


From f9d6966b7f4182f612208f9dad9e2cfaaf667ba3 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 15:15:38 -0500
Subject: refcount: Split out refcount_types.h

More trimming of sched.h dependencies.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/refcount.h       | 13 +------------
 include/linux/refcount_types.h | 19 +++++++++++++++++++
 include/linux/sched.h          |  2 +-
 include/linux/sched/task.h     |  1 +
 4 files changed, 22 insertions(+), 13 deletions(-)
 create mode 100644 include/linux/refcount_types.h

(limited to 'include/linux')

diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index a62fcca97486..85c6df0d1bef 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -96,22 +96,11 @@
 #include <linux/bug.h>
 #include <linux/compiler.h>
 #include <linux/limits.h>
+#include <linux/refcount_types.h>
 #include <linux/spinlock_types.h>
 
 struct mutex;
 
-/**
- * typedef refcount_t - variant of atomic_t specialized for reference counts
- * @refs: atomic_t counter field
- *
- * The counter saturates at REFCOUNT_SATURATED and will not move once
- * there. This avoids wrapping the counter and causing 'spurious'
- * use-after-free bugs.
- */
-typedef struct refcount_struct {
-	atomic_t refs;
-} refcount_t;
-
 #define REFCOUNT_INIT(n)	{ .refs = ATOMIC_INIT(n), }
 #define REFCOUNT_MAX		INT_MAX
 #define REFCOUNT_SATURATED	(INT_MIN / 2)
diff --git a/include/linux/refcount_types.h b/include/linux/refcount_types.h
new file mode 100644
index 000000000000..162004f06edf
--- /dev/null
+++ b/include/linux/refcount_types.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_REFCOUNT_TYPES_H
+#define _LINUX_REFCOUNT_TYPES_H
+
+#include <linux/types.h>
+
+/**
+ * typedef refcount_t - variant of atomic_t specialized for reference counts
+ * @refs: atomic_t counter field
+ *
+ * The counter saturates at REFCOUNT_SATURATED and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free bugs.
+ */
+typedef struct refcount_struct {
+	atomic_t refs;
+} refcount_t;
+
+#endif /* _LINUX_REFCOUNT_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 925711edd7f7..414e4df701ec 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -22,7 +22,7 @@
 #include <linux/seccomp.h>
 #include <linux/nodemask_types.h>
 #include <linux/rcupdate.h>
-#include <linux/refcount.h>
+#include <linux/refcount_types.h>
 #include <linux/resource.h>
 #include <linux/latencytop.h>
 #include <linux/sched/prio.h>
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a23af225c898..1880ae21a9cb 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -7,6 +7,7 @@
  * functionality:
  */
 
+#include <linux/refcount.h>
 #include <linux/sched.h>
 #include <linux/uaccess.h>
 
-- 
cgit v1.2.3


From a6e1420ce4fc91da56c0a2444c4482245e7617d4 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 15:30:14 -0500
Subject: seccomp: Split out seccomp_types.h

More pruning of sched.h dependencies.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/sched.h         |  2 +-
 include/linux/seccomp.h       | 22 +---------------------
 include/linux/seccomp_types.h | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+), 22 deletions(-)
 create mode 100644 include/linux/seccomp_types.h

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 414e4df701ec..db6e3e629606 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -19,7 +19,7 @@
 #include <linux/mutex_types.h>
 #include <linux/plist_types.h>
 #include <linux/hrtimer_types.h>
-#include <linux/seccomp.h>
+#include <linux/seccomp_types.h>
 #include <linux/nodemask_types.h>
 #include <linux/rcupdate.h>
 #include <linux/refcount_types.h>
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 1ec0d8dc4b69..709ad84809e1 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -3,6 +3,7 @@
 #define _LINUX_SECCOMP_H
 
 #include <uapi/linux/seccomp.h>
+#include <linux/seccomp_types.h>
 
 #define SECCOMP_FILTER_FLAG_MASK	(SECCOMP_FILTER_FLAG_TSYNC | \
 					 SECCOMP_FILTER_FLAG_LOG | \
@@ -21,25 +22,6 @@
 #include <linux/atomic.h>
 #include <asm/seccomp.h>
 
-struct seccomp_filter;
-/**
- * struct seccomp - the state of a seccomp'ed process
- *
- * @mode:  indicates one of the valid values above for controlled
- *         system calls available to a process.
- * @filter_count: number of seccomp filters
- * @filter: must always point to a valid seccomp-filter or NULL as it is
- *          accessed without locking during system call entry.
- *
- *          @filter must only be accessed from the context of current as there
- *          is no read locking.
- */
-struct seccomp {
-	int mode;
-	atomic_t filter_count;
-	struct seccomp_filter *filter;
-};
-
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
 extern int __secure_computing(const struct seccomp_data *sd);
 static inline int secure_computing(void)
@@ -64,8 +46,6 @@ static inline int seccomp_mode(struct seccomp *s)
 
 #include <linux/errno.h>
 
-struct seccomp { };
-struct seccomp_filter { };
 struct seccomp_data;
 
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
diff --git a/include/linux/seccomp_types.h b/include/linux/seccomp_types.h
new file mode 100644
index 000000000000..cf0a0355024f
--- /dev/null
+++ b/include/linux/seccomp_types.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SECCOMP_TYPES_H
+#define _LINUX_SECCOMP_TYPES_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_SECCOMP
+
+struct seccomp_filter;
+/**
+ * struct seccomp - the state of a seccomp'ed process
+ *
+ * @mode:  indicates one of the valid values above for controlled
+ *         system calls available to a process.
+ * @filter_count: number of seccomp filters
+ * @filter: must always point to a valid seccomp-filter or NULL as it is
+ *          accessed without locking during system call entry.
+ *
+ *          @filter must only be accessed from the context of current as there
+ *          is no read locking.
+ */
+struct seccomp {
+	int mode;
+	atomic_t filter_count;
+	struct seccomp_filter *filter;
+};
+
+#else
+
+struct seccomp { };
+struct seccomp_filter { };
+
+#endif
+
+#endif /* _LINUX_SECCOMP_TYPES_H */
-- 
cgit v1.2.3


From af6da56a223831cb74d1cf006f5742db6403398e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 15:51:30 -0500
Subject: uidgid: Split out uidgid_types.h

More sched.h dependency pruning.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Reviewed-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/sched.h        |  1 +
 include/linux/uidgid.h       | 11 +----------
 include/linux/uidgid_types.h | 15 +++++++++++++++
 3 files changed, 17 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/uidgid_types.h

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index db6e3e629606..10e25c7fc793 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -37,6 +37,7 @@
 #include <linux/kcsan.h>
 #include <linux/rv.h>
 #include <linux/livepatch_sched.h>
+#include <linux/uidgid_types.h>
 #include <asm/kmap_size.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index b0542cd11aeb..ba20b62f13e1 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -12,21 +12,12 @@
  * to detect when we overlook these differences.
  *
  */
-#include <linux/types.h>
+#include <linux/uidgid_types.h>
 #include <linux/highuid.h>
 
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
-typedef struct {
-	uid_t val;
-} kuid_t;
-
-
-typedef struct {
-	gid_t val;
-} kgid_t;
-
 #define KUIDT_INIT(value) (kuid_t){ value }
 #define KGIDT_INIT(value) (kgid_t){ value }
 
diff --git a/include/linux/uidgid_types.h b/include/linux/uidgid_types.h
new file mode 100644
index 000000000000..b35ac4955a33
--- /dev/null
+++ b/include/linux/uidgid_types.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UIDGID_TYPES_H
+#define _LINUX_UIDGID_TYPES_H
+
+#include <linux/types.h>
+
+typedef struct {
+	uid_t val;
+} kuid_t;
+
+typedef struct {
+	gid_t val;
+} kgid_t;
+
+#endif /* _LINUX_UIDGID_TYPES_H */
-- 
cgit v1.2.3


From e034d49eb01c7c83a08a3ce2a1091b55f806b26b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 15:52:17 -0500
Subject: sem: Split out sem_types.h

More sched.h dependency pruning.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/audit.h     |  1 +
 include/linux/sched.h     |  4 +++-
 include/linux/sem.h       | 10 +---------
 include/linux/sem_types.h | 13 +++++++++++++
 4 files changed, 18 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/sem_types.h

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 51b1b7054a23..0050ef288ab3 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -36,6 +36,7 @@ struct mqstat;
 struct audit_watch;
 struct audit_tree;
 struct sk_buff;
+struct kern_ipc_perm;
 
 struct audit_krule {
 	u32			pflags;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 10e25c7fc793..d341a6c302d6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -11,14 +11,16 @@
 
 #include <asm/current.h>
 
+#include <linux/cache.h>
 #include <linux/irqflags_types.h>
 #include <linux/pid_types.h>
-#include <linux/sem.h>
+#include <linux/sem_types.h>
 #include <linux/shm.h>
 #include <linux/kmsan_types.h>
 #include <linux/mutex_types.h>
 #include <linux/plist_types.h>
 #include <linux/hrtimer_types.h>
+#include <linux/timer_types.h>
 #include <linux/seccomp_types.h>
 #include <linux/nodemask_types.h>
 #include <linux/rcupdate.h>
diff --git a/include/linux/sem.h b/include/linux/sem.h
index 5608a500c43e..c4deefe42aeb 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -3,25 +3,17 @@
 #define _LINUX_SEM_H
 
 #include <uapi/linux/sem.h>
+#include <linux/sem_types.h>
 
 struct task_struct;
-struct sem_undo_list;
 
 #ifdef CONFIG_SYSVIPC
 
-struct sysv_sem {
-	struct sem_undo_list *undo_list;
-};
-
 extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
 extern void exit_sem(struct task_struct *tsk);
 
 #else
 
-struct sysv_sem {
-	/* empty */
-};
-
 static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
 {
 	return 0;
diff --git a/include/linux/sem_types.h b/include/linux/sem_types.h
new file mode 100644
index 000000000000..73df1971a7ae
--- /dev/null
+++ b/include/linux/sem_types.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SEM_TYPES_H
+#define _LINUX_SEM_TYPES_H
+
+struct sem_undo_list;
+
+struct sysv_sem {
+#ifdef CONFIG_SYSVIPC
+	struct sem_undo_list *undo_list;
+#endif
+};
+
+#endif /* _LINUX_SEM_TYPES_H */
-- 
cgit v1.2.3


From 99bac36667b6b20b9b0a20dc976365d23f90628b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 16:58:51 -0500
Subject: lockdep: move held_lock to lockdep_types.h

held_lock is embedded in task_struct, and we don't want sched.h pulling
in all of lockdep.h

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Acked-by: Waiman Long <longman@redhat.com>
---
 include/linux/lockdep.h       | 57 -------------------------------------------
 include/linux/lockdep_types.h | 57 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index dc2844b071c2..08b0d1d9d78b 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -82,63 +82,6 @@ struct lock_chain {
 	u64				chain_key;
 };
 
-#define MAX_LOCKDEP_KEYS_BITS		13
-#define MAX_LOCKDEP_KEYS		(1UL << MAX_LOCKDEP_KEYS_BITS)
-#define INITIAL_CHAIN_KEY		-1
-
-struct held_lock {
-	/*
-	 * One-way hash of the dependency chain up to this point. We
-	 * hash the hashes step by step as the dependency chain grows.
-	 *
-	 * We use it for dependency-caching and we skip detection
-	 * passes and dependency-updates if there is a cache-hit, so
-	 * it is absolutely critical for 100% coverage of the validator
-	 * to have a unique key value for every unique dependency path
-	 * that can occur in the system, to make a unique hash value
-	 * as likely as possible - hence the 64-bit width.
-	 *
-	 * The task struct holds the current hash value (initialized
-	 * with zero), here we store the previous hash value:
-	 */
-	u64				prev_chain_key;
-	unsigned long			acquire_ip;
-	struct lockdep_map		*instance;
-	struct lockdep_map		*nest_lock;
-#ifdef CONFIG_LOCK_STAT
-	u64 				waittime_stamp;
-	u64				holdtime_stamp;
-#endif
-	/*
-	 * class_idx is zero-indexed; it points to the element in
-	 * lock_classes this held lock instance belongs to. class_idx is in
-	 * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive.
-	 */
-	unsigned int			class_idx:MAX_LOCKDEP_KEYS_BITS;
-	/*
-	 * The lock-stack is unified in that the lock chains of interrupt
-	 * contexts nest ontop of process context chains, but we 'separate'
-	 * the hashes by starting with 0 if we cross into an interrupt
-	 * context, and we also keep do not add cross-context lock
-	 * dependencies - the lock usage graph walking covers that area
-	 * anyway, and we'd just unnecessarily increase the number of
-	 * dependencies otherwise. [Note: hardirq and softirq contexts
-	 * are separated from each other too.]
-	 *
-	 * The following field is used to detect when we cross into an
-	 * interrupt context:
-	 */
-	unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
-	unsigned int trylock:1;						/* 16 bits */
-
-	unsigned int read:2;        /* see lock_acquire() comment */
-	unsigned int check:1;       /* see lock_acquire() comment */
-	unsigned int hardirqs_off:1;
-	unsigned int sync:1;
-	unsigned int references:11;					/* 32 bits */
-	unsigned int pin_count;
-};
-
 /*
  * Initialization, self-test and debugging-output methods:
  */
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 2ebc323d345a..9c533c8d701e 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -198,6 +198,63 @@ struct lockdep_map {
 
 struct pin_cookie { unsigned int val; };
 
+#define MAX_LOCKDEP_KEYS_BITS		13
+#define MAX_LOCKDEP_KEYS		(1UL << MAX_LOCKDEP_KEYS_BITS)
+#define INITIAL_CHAIN_KEY		-1
+
+struct held_lock {
+	/*
+	 * One-way hash of the dependency chain up to this point. We
+	 * hash the hashes step by step as the dependency chain grows.
+	 *
+	 * We use it for dependency-caching and we skip detection
+	 * passes and dependency-updates if there is a cache-hit, so
+	 * it is absolutely critical for 100% coverage of the validator
+	 * to have a unique key value for every unique dependency path
+	 * that can occur in the system, to make a unique hash value
+	 * as likely as possible - hence the 64-bit width.
+	 *
+	 * The task struct holds the current hash value (initialized
+	 * with zero), here we store the previous hash value:
+	 */
+	u64				prev_chain_key;
+	unsigned long			acquire_ip;
+	struct lockdep_map		*instance;
+	struct lockdep_map		*nest_lock;
+#ifdef CONFIG_LOCK_STAT
+	u64 				waittime_stamp;
+	u64				holdtime_stamp;
+#endif
+	/*
+	 * class_idx is zero-indexed; it points to the element in
+	 * lock_classes this held lock instance belongs to. class_idx is in
+	 * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive.
+	 */
+	unsigned int			class_idx:MAX_LOCKDEP_KEYS_BITS;
+	/*
+	 * The lock-stack is unified in that the lock chains of interrupt
+	 * contexts nest ontop of process context chains, but we 'separate'
+	 * the hashes by starting with 0 if we cross into an interrupt
+	 * context, and we also keep do not add cross-context lock
+	 * dependencies - the lock usage graph walking covers that area
+	 * anyway, and we'd just unnecessarily increase the number of
+	 * dependencies otherwise. [Note: hardirq and softirq contexts
+	 * are separated from each other too.]
+	 *
+	 * The following field is used to detect when we cross into an
+	 * interrupt context:
+	 */
+	unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
+	unsigned int trylock:1;						/* 16 bits */
+
+	unsigned int read:2;        /* see lock_acquire() comment */
+	unsigned int check:1;       /* see lock_acquire() comment */
+	unsigned int hardirqs_off:1;
+	unsigned int sync:1;
+	unsigned int references:11;					/* 32 bits */
+	unsigned int pin_count;
+};
+
 #else /* !CONFIG_LOCKDEP */
 
 /*
-- 
cgit v1.2.3


From cba6167f0adb07b6cd1b8758dd67718c772e108c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 17:00:41 -0500
Subject: restart_block: Trim includes

We don't actually use any timekeeping types, no need to pull in
time64.h.

Also, sched.h uses restart_block; add it as a direct dependency.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/restart_block.h | 2 +-
 include/linux/sched.h         | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h
index 980a65594412..13f17676c5f4 100644
--- a/include/linux/restart_block.h
+++ b/include/linux/restart_block.h
@@ -7,8 +7,8 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
-#include <linux/time64.h>
 
+struct __kernel_timespec;
 struct timespec;
 struct old_timespec32;
 struct pollfd;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d341a6c302d6..dd002d193726 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,7 @@
 #include <linux/mm_types_task.h>
 #include <linux/task_io_accounting.h>
 #include <linux/posix-timers_types.h>
+#include <linux/restart_block.h>
 #include <linux/rseq.h>
 #include <linux/seqlock_types.h>
 #include <linux/kcsan.h>
-- 
cgit v1.2.3


From a7e7b40c4bc115dbf2a2bb453d7bbb2e0ea99703 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@nvidia.com>
Date: Fri, 15 Dec 2023 19:31:14 -0800
Subject: net/mlx5e: Use the correct lag ports number when creating TISes

The cited commit moved the code of mlx5e_create_tises() and changed the
loop to create TISes over MLX5_MAX_PORTS constant value, instead of
getting the correct lag ports supported by the device, which can cause
FW errors on devices with less than MLX5_MAX_PORTS ports.

Change that back to mlx5e_get_num_lag_ports(mdev).

Also IPoIB interfaces create there own TISes, they don't use the eth
TISes, pass a flag to indicate that.

Fixes: b25bd37c859f ("net/mlx5: Move TISes from priv to mdev HW resources")
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7ee5b79ff3d6..aafb36c9e5d9 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -681,6 +681,7 @@ struct mlx5e_resources {
 		struct mlx5_sq_bfreg       bfreg;
 #define MLX5_MAX_NUM_TC 8
 		u32                        tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC];
+		bool			   tisn_valid;
 	} hw_objs;
 	struct net_device *uplink_netdev;
 	struct mutex uplink_netdev_lock;
-- 
cgit v1.2.3


From e04984a37398b3f4f5a79c993b94c6b1224184cc Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Tue, 19 Dec 2023 14:46:20 +0200
Subject: net/mlx5: Fix query of sd_group field

The sd_group field moved in the HW spec from the MPIR register
to the vport context.
Align the query accordingly.

Fixes: f5e956329960 ("net/mlx5: Expose Management PCIe Index Register (MPIR)")
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 10 +++++++---
 include/linux/mlx5/vport.h    |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index fee20fc010c2..bf2d51952e48 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4030,8 +4030,13 @@ struct mlx5_ifc_nic_vport_context_bits {
 	u8	   affiliation_criteria[0x4];
 	u8	   affiliated_vhca_id[0x10];
 
-	u8	   reserved_at_60[0xd0];
+	u8	   reserved_at_60[0xa0];
 
+	u8	   reserved_at_100[0x1];
+	u8         sd_group[0x3];
+	u8	   reserved_at_104[0x1c];
+
+	u8	   reserved_at_120[0x10];
 	u8         mtu[0x10];
 
 	u8         system_image_guid[0x40];
@@ -10116,8 +10121,7 @@ struct mlx5_ifc_mpir_reg_bits {
 	u8         reserved_at_20[0x20];
 
 	u8         local_port[0x8];
-	u8         reserved_at_28[0x15];
-	u8         sd_group[0x3];
+	u8         reserved_at_28[0x18];
 
 	u8         reserved_at_60[0x20];
 };
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index fbb9bf447889..c36cc6d82926 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -72,6 +72,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu);
 int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu);
 int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
 					   u64 *system_image_guid);
+int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group);
 int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid);
 int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev,
 				    u16 vport, u64 node_guid);
-- 
cgit v1.2.3


From c88c49ac9c18fb7c3fa431126de1d8f8f555e912 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Tue, 5 Dec 2023 23:54:21 +0200
Subject: net/mlx5: Enable SD feature

Have an actual mlx5_sd instance in the core device, and fix the getter
accordingly. This allows SD stuff to flow, the feature becomes supported
only here.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index aafb36c9e5d9..cd286b681970 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -822,6 +822,7 @@ struct mlx5_core_dev {
 	struct blocking_notifier_head macsec_nh;
 #endif
 	u64 num_ipsec_offloads;
+	struct mlx5_sd          *sd;
 };
 
 struct mlx5_db {
-- 
cgit v1.2.3


From 22c4640698a1d47606b5a4264a584e8046641784 Mon Sep 17 00:00:00 2001
From: Armen Ratner <armeng@nvidia.com>
Date: Fri, 8 Sep 2023 14:53:09 -0500
Subject: net/mlx5: Implement management PF Ethernet profile

Add management PF modules, which introduce support for the structures
needed to create the resources for the MGMT PF to work.
Also, add the necessary calls and functions to establish this
functionality.

Signed-off-by: Armen Ratner <armeng@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Daniel Jurgens <danielj@nvidia.com>
---
 include/linux/mlx5/driver.h   |  8 ++++++++
 include/linux/mlx5/mlx5_ifc.h | 14 +++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index cd286b681970..2bba88c67f58 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1224,6 +1224,14 @@ static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev)
 	return dev->caps.embedded_cpu;
 }
 
+static inline bool mlx5_core_is_mgmt_pf(const struct mlx5_core_dev *dev)
+{
+	if (!MLX5_CAP_GEN_2(dev, local_mng_port_valid))
+		return false;
+
+	return MLX5_CAP_GEN_2(dev, local_mng_port);
+}
+
 static inline bool
 mlx5_core_is_ecpf_esw_manager(const struct mlx5_core_dev *dev)
 {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index bf2d51952e48..586569209254 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1954,8 +1954,10 @@ enum {
 struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_0[0x80];
 
-	u8         migratable[0x1];
-	u8         reserved_at_81[0x1f];
+	u8	   migratable[0x1];
+	u8	   reserved_at_81[0x19];
+	u8	   local_mng_port[0x1];
+	u8	   reserved_at_9b[0x5];
 
 	u8	   max_reformat_insert_size[0x8];
 	u8	   max_reformat_insert_offset[0x8];
@@ -1973,7 +1975,13 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 
 	u8	   allowed_object_for_other_vhca_access[0x40];
 
-	u8	   reserved_at_140[0x60];
+	u8	   reserved_at_140[0x20];
+
+	u8	   reserved_at_160[0xa];
+	u8	   local_mng_port_valid[0x1];
+	u8	   reserved_at_16b[0x15];
+
+	u8	   reserved_at_180[0x20];
 
 	u8	   flow_table_type_2_type[0x8];
 	u8	   reserved_at_1a8[0x3];
-- 
cgit v1.2.3


From ee9ec49046951eff704752669f0c388b506ddbdf Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Tue, 19 Dec 2023 18:50:02 +0100
Subject: iio: buffer-dma: Get rid of outgoing queue

The buffer-dma code was using two queues, incoming and outgoing, to
manage the state of the blocks in use.

While this totally works, it adds some complexity to the code,
especially since the code only manages 2 blocks. It is much easier to
just check each block's state manually, and keep a counter for the next
block to dequeue.

Since the new DMABUF based API wouldn't use the outgoing queue anyway,
getting rid of it now makes the upcoming changes simpler.

With this change, the IIO_BLOCK_STATE_DEQUEUED is now useless, and can
be removed.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Link: https://lore.kernel.org/r/20231219175009.65482-2-paul@crapouillou.net
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/buffer-dma.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h
index 6564bdcdac66..18d3702fa95d 100644
--- a/include/linux/iio/buffer-dma.h
+++ b/include/linux/iio/buffer-dma.h
@@ -19,14 +19,12 @@ struct device;
 
 /**
  * enum iio_block_state - State of a struct iio_dma_buffer_block
- * @IIO_BLOCK_STATE_DEQUEUED: Block is not queued
  * @IIO_BLOCK_STATE_QUEUED: Block is on the incoming queue
  * @IIO_BLOCK_STATE_ACTIVE: Block is currently being processed by the DMA
  * @IIO_BLOCK_STATE_DONE: Block is on the outgoing queue
  * @IIO_BLOCK_STATE_DEAD: Block has been marked as to be freed
  */
 enum iio_block_state {
-	IIO_BLOCK_STATE_DEQUEUED,
 	IIO_BLOCK_STATE_QUEUED,
 	IIO_BLOCK_STATE_ACTIVE,
 	IIO_BLOCK_STATE_DONE,
@@ -73,12 +71,15 @@ struct iio_dma_buffer_block {
  * @active_block: Block being used in read()
  * @pos: Read offset in the active block
  * @block_size: Size of each block
+ * @next_dequeue: index of next block that will be dequeued
  */
 struct iio_dma_buffer_queue_fileio {
 	struct iio_dma_buffer_block *blocks[2];
 	struct iio_dma_buffer_block *active_block;
 	size_t pos;
 	size_t block_size;
+
+	unsigned int next_dequeue;
 };
 
 /**
@@ -93,7 +94,6 @@ struct iio_dma_buffer_queue_fileio {
  *   list and typically also a list of active blocks in the part that handles
  *   the DMA controller
  * @incoming: List of buffers on the incoming queue
- * @outgoing: List of buffers on the outgoing queue
  * @active: Whether the buffer is currently active
  * @fileio: FileIO state
  */
@@ -105,7 +105,6 @@ struct iio_dma_buffer_queue {
 	struct mutex lock;
 	spinlock_t list_lock;
 	struct list_head incoming;
-	struct list_head outgoing;
 
 	bool active;
 
-- 
cgit v1.2.3


From 3efdc78fdc21ab82694707eb234ab93f28d13ba8 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 13 Dec 2023 22:44:38 -0800
Subject: fs/proc: show correct device and inode numbers in /proc/pid/maps

/proc/pid/maps shows device and inode numbers of vma->vm_file-s. Here is
an issue. If a mapped file is on a stackable file system (e.g.,
overlayfs), vma->vm_file is a backing file whose f_inode is on the
underlying filesystem. To show correct numbers, we need to get a user
file and shows its numbers. The same trick is used to show file paths in
/proc/pid/maps.

Cc: Alexander Mikhalitsyn <alexander@mihalicyn.com>
Suggested-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Andrei Vagin <avagin@google.com>
Link: https://lore.kernel.org/r/20231214064439.1023011-1-avagin@google.com
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index f171505940ff..a3a48a5d8728 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2523,20 +2523,28 @@ struct file *backing_file_open(const struct path *user_path, int flags,
 struct path *backing_file_user_path(struct file *f);
 
 /*
- * file_user_path - get the path to display for memory mapped file
- *
  * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
  * stored in ->vm_file is a backing file whose f_inode is on the underlying
- * filesystem.  When the mapped file path is displayed to user (e.g. via
- * /proc/<pid>/maps), this helper should be used to get the path to display
- * to the user, which is the path of the fd that user has requested to map.
+ * filesystem.  When the mapped file path and inode number are displayed to
+ * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the
+ * path and inode number to display to the user, which is the path of the fd
+ * that user has requested to map and the inode number that would be returned
+ * by fstat() on that same fd.
  */
+/* Get the path to display in /proc/<pid>/maps */
 static inline const struct path *file_user_path(struct file *f)
 {
 	if (unlikely(f->f_mode & FMODE_BACKING))
 		return backing_file_user_path(f);
 	return &f->f_path;
 }
+/* Get the inode whose inode number to display in /proc/<pid>/maps */
+static inline const struct inode *file_user_inode(struct file *f)
+{
+	if (unlikely(f->f_mode & FMODE_BACKING))
+		return d_inode(backing_file_user_path(f)->dentry);
+	return file_inode(f);
+}
 
 static inline struct file *file_clone_open(struct file *file)
 {
-- 
cgit v1.2.3


From 5ae81209491ed3718fee798db6fb2cc81214824c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 14:52:36 +0100
Subject: driver core: bus: make bus_sort_breadthfirst() take a const pointer

For some reason, during the big "clean up the driver core for a const
struct bus_type" work, the bus_sort_breadthfirst() call was missed.  Fix
this up by changing the type to be a const * as it should be.

Cc: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/2023121935-stinking-ditzy-fd5d@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/bus.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index ae10c4322754..25127f750349 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -232,7 +232,7 @@ bus_find_device_by_acpi_dev(const struct bus_type *bus, const void *adev)
 
 int bus_for_each_drv(const struct bus_type *bus, struct device_driver *start,
 		     void *data, int (*fn)(struct device_driver *, void *));
-void bus_sort_breadthfirst(struct bus_type *bus,
+void bus_sort_breadthfirst(const struct bus_type *bus,
 			   int (*compare)(const struct device *a,
 					  const struct device *b));
 /*
-- 
cgit v1.2.3


From 32f78abe59c740b6ec34c89dc10a09208eae7e1f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 14:15:09 +0100
Subject: driver core: bus: constantify subsys_register() calls

The functions subsys_register() and subsys_virtual_register() should be
taking a constant pointer to a struct bus_type, as they do not actually
modify anything in it, so fix up the function definitions to do so
properly.

This also changes the pointer type in struct subsys_interface to be
constant as well, as again, that's the proper signature of it.

Cc: Rafael J. Wysocki <rafael@kernel.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/2023121908-grove-genetics-f8af@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 4aa34c8d1361..aefc5ca7f1cf 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -63,7 +63,7 @@ struct msi_device_data;
  */
 struct subsys_interface {
 	const char *name;
-	struct bus_type *subsys;
+	const struct bus_type *subsys;
 	struct list_head node;
 	int (*add_dev)(struct device *dev, struct subsys_interface *sif);
 	void (*remove_dev)(struct device *dev, struct subsys_interface *sif);
@@ -72,9 +72,9 @@ struct subsys_interface {
 int subsys_interface_register(struct subsys_interface *sif);
 void subsys_interface_unregister(struct subsys_interface *sif);
 
-int subsys_system_register(struct bus_type *subsys,
+int subsys_system_register(const struct bus_type *subsys,
 			   const struct attribute_group **groups);
-int subsys_virtual_register(struct bus_type *subsys,
+int subsys_virtual_register(const struct bus_type *subsys,
 			    const struct attribute_group **groups);
 
 /*
-- 
cgit v1.2.3


From dedb868994d8308c6c4650203e190ec619005806 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 16:03:20 +0100
Subject: driver core: container: make container_subsys const

Now that the driver core can properly handle constant struct bus_type,
move the container_subsys variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/2023121919-chatter-grumbling-9ef3@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/container.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/container.h b/include/linux/container.h
index 2566a1baa736..dd00cc918a92 100644
--- a/include/linux/container.h
+++ b/include/linux/container.h
@@ -12,7 +12,7 @@
 #include <linux/device.h>
 
 /* drivers/base/power/container.c */
-extern struct bus_type container_subsys;
+extern const struct bus_type container_subsys;
 
 struct container_dev {
 	struct device dev;
-- 
cgit v1.2.3


From 39299bdd2546688d92ed9db4948f6219ca1b9542 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 9 Dec 2023 00:41:55 +0000
Subject: keys, dns: Allow key types (eg. DNS) to be reclaimed immediately on
 expiry

If a key has an expiration time, then when that time passes, the key is
left around for a certain amount of time before being collected (5 mins by
default) so that EKEYEXPIRED can be returned instead of ENOKEY.  This is a
problem for DNS keys because we want to redo the DNS lookup immediately at
that point.

Fix this by allowing key types to be marked such that keys of that type
don't have this extra period, but are reclaimed as soon as they expire and
turn this on for dns_resolver-type keys.  To make this easier to handle,
key->expiry is changed to be permanent if TIME64_MAX rather than 0.

Furthermore, give such new-style negative DNS results a 1s default expiry
if no other expiry time is set rather than allowing it to stick around
indefinitely.  This shouldn't be zero as ls will follow a failing stat call
immediately with a second with AT_SYMLINK_NOFOLLOW added.

Fixes: 1a4240f4764a ("DNS: Separate out CIFS DNS Resolver code")
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Markus Suvanto <markus.suvanto@gmail.com>
cc: Wang Lei <wang840925@gmail.com>
cc: Jeff Layton <jlayton@redhat.com>
cc: Steve French <smfrench@gmail.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Jarkko Sakkinen <jarkko@kernel.org>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: linux-nfs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: keyrings@vger.kernel.org
cc: netdev@vger.kernel.org
---
 include/linux/key-type.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/key-type.h b/include/linux/key-type.h
index 7d985a1dfe4a..5caf3ce82373 100644
--- a/include/linux/key-type.h
+++ b/include/linux/key-type.h
@@ -73,6 +73,7 @@ struct key_type {
 
 	unsigned int flags;
 #define KEY_TYPE_NET_DOMAIN	0x00000001 /* Keys of this type have a net namespace domain */
+#define KEY_TYPE_INSTANT_REAP	0x00000002 /* Keys of this type don't have a delay after expiring */
 
 	/* vet a description */
 	int (*vet_description)(const char *description);
-- 
cgit v1.2.3


From 4515d08a742c76612b65d2f47a87d12860519842 Mon Sep 17 00:00:00 2001
From: Marco Pagani <marpagan@redhat.com>
Date: Thu, 21 Dec 2023 17:58:47 +0100
Subject: kernel/module: improve documentation for try_module_get()

The sentence "this call will fail if the module is already being
removed" is potentially confusing and may contradict the rest of the
documentation. If one tries to get a module that has already been
removed using a stale pointer, the kernel will crash.

Signed-off-by: Marco Pagani <marpagan@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/module.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index a98e188cf37b..08364d5cbc07 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -668,7 +668,7 @@ extern void __module_get(struct module *module);
  * @module: the module we should check for
  *
  * Only try to get a module reference count if the module is not being removed.
- * This call will fail if the module is already being removed.
+ * This call will fail if the module is in the process of being removed.
  *
  * Care must also be taken to ensure the module exists and is alive prior to
  * usage of this call. This can be gauranteed through two means:
-- 
cgit v1.2.3


From 645f3d85129d8aac3b896ba685fbc20a31c2c036 Mon Sep 17 00:00:00 2001
From: Mukesh Sisodiya <mukesh.sisodiya@intel.com>
Date: Wed, 20 Dec 2023 13:41:38 +0200
Subject: wifi: cfg80211: handle UHB AP and STA power type

UHB AP send supported power type(LPI, SP, VLP)
in beacon and probe response IE and STA should
connect to these AP only if their regulatory support
the AP power type.

Beacon/Probe response are reported to userspace
with reason "STA regulatory not supporting to connect to AP
based on transmitted power type" and it should
not connect to AP.

Signed-off-by: Mukesh Sisodiya <mukesh.sisodiya@intel.com>
Reviewed-by: Gregory Greenman <gregory.greenman@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20231220133549.cbfbef9170a9.I432f78438de18aa9f5c9006be12e41dc34cc47c5@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 8ad008591e32..2f5554482047 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2720,6 +2720,7 @@ static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len)
 
 #define IEEE80211_6GHZ_CTRL_REG_LPI_AP	0
 #define IEEE80211_6GHZ_CTRL_REG_SP_AP	1
+#define IEEE80211_6GHZ_CTRL_REG_VLP_AP	2
 
 /**
  * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field
-- 
cgit v1.2.3


From d68019471995ba47e56a9da355df13a1cdb5bf7e Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 18 Dec 2023 08:45:18 +0100
Subject: entry: Move exit to usermode functions to header file

To allow inlining, move exit_to_user_mode() to
entry-common.h.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218074520.1998026-2-svens@linux.ibm.com
---
 include/linux/entry-common.h | 53 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index d95ab85f96ba..6a6e98f3805f 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -7,6 +7,10 @@
 #include <linux/syscalls.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
+#include <linux/context_tracking.h>
+#include <linux/livepatch.h>
+#include <linux/resume_user_mode.h>
+#include <linux/tick.h>
 
 #include <asm/entry-common.h>
 
@@ -258,6 +262,43 @@ static __always_inline void arch_exit_to_user_mode(void) { }
  */
 void arch_do_signal_or_restart(struct pt_regs *regs);
 
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ */
+unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+				     unsigned long ti_work);
+
+/**
+ * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * @regs:	Pointer to pt_regs on entry stack
+ *
+ * 1) check that interrupts are disabled
+ * 2) call tick_nohz_user_enter_prepare()
+ * 3) call exit_to_user_mode_loop() if any flags from
+ *    EXIT_TO_USER_MODE_WORK are set
+ * 4) check that interrupts are still disabled
+ */
+static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+	unsigned long ti_work;
+
+	lockdep_assert_irqs_disabled();
+
+	/* Flush pending rcuog wakeup before the last need_resched() check */
+	tick_nohz_user_enter_prepare();
+
+	ti_work = read_thread_flags();
+	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
+		ti_work = exit_to_user_mode_loop(regs, ti_work);
+
+	arch_exit_to_user_mode_prepare(regs, ti_work);
+
+	/* Ensure that kernel state is sane for a return to userspace */
+	kmap_assert_nomap();
+	lockdep_assert_irqs_disabled();
+	lockdep_sys_exit();
+}
+
 /**
  * exit_to_user_mode - Fixup state when exiting to user mode
  *
@@ -276,7 +317,17 @@ void arch_do_signal_or_restart(struct pt_regs *regs);
  * non-instrumentable.
  * The caller has to invoke syscall_exit_to_user_mode_work() before this.
  */
-void exit_to_user_mode(void);
+static __always_inline void exit_to_user_mode(void)
+{
+	instrumentation_begin();
+	trace_hardirqs_on_prepare();
+	lockdep_hardirqs_on_prepare();
+	instrumentation_end();
+
+	user_enter_irqoff();
+	arch_exit_to_user_mode();
+	lockdep_hardirqs_on(CALLER_ADDR0);
+}
 
 /**
  * syscall_exit_to_user_mode_work - Handle work before returning to user mode
-- 
cgit v1.2.3


From caf4062e35b21cd7d3d35ac2f58f9765d02d32a0 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 18 Dec 2023 08:45:19 +0100
Subject: entry: Move enter_from_user_mode() to header file

To allow inlining of enter_from_user_mode(), move it to
entry-common.h.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218074520.1998026-3-svens@linux.ibm.com
---
 include/linux/entry-common.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 6a6e98f3805f..c4205390448e 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -11,6 +11,7 @@
 #include <linux/livepatch.h>
 #include <linux/resume_user_mode.h>
 #include <linux/tick.h>
+#include <linux/kmsan.h>
 
 #include <asm/entry-common.h>
 
@@ -102,7 +103,19 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {}
  * done between establishing state and enabling interrupts. The caller must
  * enable interrupts before invoking syscall_enter_from_user_mode_work().
  */
-void enter_from_user_mode(struct pt_regs *regs);
+static __always_inline void enter_from_user_mode(struct pt_regs *regs)
+{
+	arch_enter_from_user_mode(regs);
+	lockdep_hardirqs_off(CALLER_ADDR0);
+
+	CT_WARN_ON(__ct_state() != CONTEXT_USER);
+	user_exit_irqoff();
+
+	instrumentation_begin();
+	kmsan_unpoison_entry_regs(regs);
+	trace_hardirqs_off_finish();
+	instrumentation_end();
+}
 
 /**
  * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
-- 
cgit v1.2.3


From 221a164035fd8b554a44bd7c4bf8e7715a497561 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 18 Dec 2023 08:45:20 +0100
Subject: entry: Move syscall_enter_from_user_mode() to header file

To allow inlining of syscall_enter_from_user_mode(), move it
to entry-common.h.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218074520.1998026-4-svens@linux.ibm.com
---
 include/linux/entry-common.h | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index c4205390448e..b0fb775a600d 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -134,6 +134,9 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
  */
 void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
 
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
+			 unsigned long work);
+
 /**
  * syscall_enter_from_user_mode_work - Check and handle work before invoking
  *				       a syscall
@@ -157,7 +160,15 @@ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
  *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
  *  2) Invocation of audit_syscall_entry()
  */
-long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);
+static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
+{
+	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
+
+	if (work & SYSCALL_WORK_ENTER)
+		syscall = syscall_trace_enter(regs, syscall, work);
+
+	return syscall;
+}
 
 /**
  * syscall_enter_from_user_mode - Establish state and check and handle work
@@ -176,7 +187,19 @@ long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);
  * Returns: The original or a modified syscall number. See
  * syscall_enter_from_user_mode_work() for further explanation.
  */
-long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
+static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+{
+	long ret;
+
+	enter_from_user_mode(regs);
+
+	instrumentation_begin();
+	local_irq_enable();
+	ret = syscall_enter_from_user_mode_work(regs, syscall);
+	instrumentation_end();
+
+	return ret;
+}
 
 /**
  * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
-- 
cgit v1.2.3


From bbcd80f53a5e8c27c2511f539fec8c373f500cf4 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Dec 2023 13:32:05 +0100
Subject: mtd: rawnand: Prevent crossing LUN boundaries during sequential reads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ONFI specification states that devices do not need to support
sequential reads across LUN boundaries. In order to prevent such event
from happening and possibly failing, let's introduce the concept of
"pause" in the sequential read to handle these cases. The first/last
pages remain the same but any time we cross a LUN boundary we will end
and restart (if relevant) the sequential read operation.

Cc: stable@vger.kernel.org
Fixes: 003fe4b9545b ("mtd: rawnand: Support for sequential cache reads")
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Tested-by: Martin Hundebøll <martin@geanix.com>
Link: https://lore.kernel.org/linux-mtd/20231215123208.516590-2-miquel.raynal@bootlin.com
---
 include/linux/mtd/rawnand.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index c29ace15a053..9d0fc5109af6 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1265,6 +1265,7 @@ struct nand_secure_region {
  * @cont_read: Sequential page read internals
  * @cont_read.ongoing: Whether a continuous read is ongoing or not
  * @cont_read.first_page: Start of the continuous read operation
+ * @cont_read.pause_page: End of the current sequential cache read operation
  * @cont_read.last_page: End of the continuous read operation
  * @controller: The hardware controller	structure which is shared among multiple
  *              independent devices
@@ -1321,6 +1322,7 @@ struct nand_chip {
 	struct {
 		bool ongoing;
 		unsigned int first_page;
+		unsigned int pause_page;
 		unsigned int last_page;
 	} cont_read;
 
-- 
cgit v1.2.3


From 8e6e83d77227d9ba39e0c7b50693f1b4f8728006 Mon Sep 17 00:00:00 2001
From: Kundan Kumar <kundan.kumar@samsung.com>
Date: Fri, 22 Dec 2023 15:47:07 +0530
Subject: block: skip start/end time stamping for passthrough IO

commit 41fa722239b4 ("blk-mq: do not include passthrough requests in I/O
accounting")' disables I/O accounting for passthrough requests. Since tools
like 'iostat' do not show anything useful for passthrough I/O, it's
wasteful to do start/end time-stamping. So do away with that.

Avoiding the time-stamping improves the I/O performance by ~7%

Signed-off-by: Kundan Kumar <kundan.kumar@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Link: https://lore.kernel.org/r/20231222101707.6921-1-kundan.kumar@samsung.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1ab3081c82ed..a676e116085f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -830,6 +830,12 @@ void blk_mq_end_request_batch(struct io_comp_batch *ib);
  */
 static inline bool blk_mq_need_time_stamp(struct request *rq)
 {
+	/*
+	 * passthrough io doesn't use iostat accounting, cgroup stats
+	 * and io scheduler functionalities.
+	 */
+	if (blk_rq_is_passthrough(rq))
+		return false;
 	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
 }
 
-- 
cgit v1.2.3


From 60e43fe5285e2077ce9904d78cd42a230d03b788 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 21 Dec 2023 15:02:31 -0700
Subject: lib/firmware_table: tables: Add CDAT table parsing support

The CDAT table is very similar to ACPI tables when it comes to sub-table
and entry structures. The helper functions can be also used to parse the
CDAT table. Add support to the helper functions to deal with an external
CDAT table, and also handle the endieness since CDAT can be processed by a
BE host. Export a function cdat_table_parse() for CXL driver to parse
a CDAT table.

In order to minimize ACPICA code changes, __force is being utilized to deal
with the case of a big endian (BE) host parsing a CDAT. All CDAT data
structure variables are being force casted to __leX as appropriate.

Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Len Brown <lenb@kernel.org>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/170319615131.2212653.10932785667981494238.stgit@djiang5-mobl3
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/fw_table.h | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fw_table.h b/include/linux/fw_table.h
index ca49947f0a77..95421860397a 100644
--- a/include/linux/fw_table.h
+++ b/include/linux/fw_table.h
@@ -25,16 +25,35 @@ struct acpi_subtable_proc {
 	int count;
 };
 
+union fw_table_header {
+	struct acpi_table_header acpi;
+	struct acpi_table_cdat cdat;
+};
+
 union acpi_subtable_headers {
 	struct acpi_subtable_header common;
 	struct acpi_hmat_structure hmat;
 	struct acpi_prmt_module_header prmt;
 	struct acpi_cedt_header cedt;
+	struct acpi_cdat_header cdat;
 };
 
 int acpi_parse_entries_array(char *id, unsigned long table_size,
-			     struct acpi_table_header *table_header,
+			     union fw_table_header *table_header,
 			     struct acpi_subtable_proc *proc,
 			     int proc_num, unsigned int max_entries);
 
+int cdat_table_parse(enum acpi_cdat_type type,
+		     acpi_tbl_entry_handler_arg handler_arg, void *arg,
+		     struct acpi_table_cdat *table_header);
+
+/* CXL is the only non-ACPI consumer of the FIRMWARE_TABLE library */
+#if IS_ENABLED(CONFIG_ACPI) && !IS_ENABLED(CONFIG_CXL_BUS)
+#define EXPORT_SYMBOL_FWTBL_LIB(x) EXPORT_SYMBOL_ACPI_LIB(x)
+#define __init_or_fwtbl_lib __init_or_acpilib
+#else
+#define EXPORT_SYMBOL_FWTBL_LIB(x) EXPORT_SYMBOL_NS_GPL(x, CXL)
+#define __init_or_fwtbl_lib
+#endif
+
 #endif
-- 
cgit v1.2.3


From 6a954e94d038f41d79c4e04348c95774d1c9337d Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 21 Dec 2023 15:02:37 -0700
Subject: base/node / acpi: Change 'node_hmem_attrs' to 'access_coordinates'

Dan Williams suggested changing the struct 'node_hmem_attrs' to
'access_coordinates' [1]. The struct is a container of r/w-latency and
r/w-bandwidth numbers. Moving forward, this container will also be used by
CXL to store the performance characteristics of each link hop in
the PCIE/CXL topology. So, where node_hmem_attrs is just the access
parameters of a memory-node, access_coordinates applies more broadly
to hardware topology characteristics. The observation is that seemed like
an exercise in having the application identify "where" it falls on a
spectrum of bandwidth and latency needs. For the tuple of
read/write-latency and read/write-bandwidth, "coordinates" is not a perfect
fit. Sometimes it is just conveying values in isolation and not a
"location" relative to other performance points, but in the end this data
is used to identify the performance operation point of a given memory-node.
[2]

Link: http://lore.kernel.org/r/64471313421f7_1b66294d5@dwillia2-xfh.jf.intel.com.notmuch/
Link: https://lore.kernel.org/linux-cxl/645e6215ee0de_1e6f2945e@dwillia2-xfh.jf.intel.com.notmuch/
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/170319615734.2212653.15319394025985499185.stgit@djiang5-mobl3
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/memory-tiers.h | 10 +++++-----
 include/linux/node.h         |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 1e39d27bee41..69e781900082 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -33,7 +33,7 @@ struct memory_dev_type {
 	struct kref kref;
 };
 
-struct node_hmem_attrs;
+struct access_coordinate;
 
 #ifdef CONFIG_NUMA
 extern bool numa_demotion_enabled;
@@ -45,9 +45,9 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype);
 int register_mt_adistance_algorithm(struct notifier_block *nb);
 int unregister_mt_adistance_algorithm(struct notifier_block *nb);
 int mt_calc_adistance(int node, int *adist);
-int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
 			     const char *source);
-int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist);
+int mt_perf_to_adistance(struct access_coordinate *perf, int *adist);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -126,13 +126,13 @@ static inline int mt_calc_adistance(int node, int *adist)
 	return NOTIFY_DONE;
 }
 
-static inline int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+static inline int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
 					   const char *source)
 {
 	return -EIO;
 }
 
-static inline int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
+static inline int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
 {
 	return -EIO;
 }
diff --git a/include/linux/node.h b/include/linux/node.h
index 427a5975cf40..25b66d705ee2 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -20,14 +20,14 @@
 #include <linux/list.h>
 
 /**
- * struct node_hmem_attrs - heterogeneous memory performance attributes
+ * struct access_coordinate - generic performance coordinates container
  *
  * @read_bandwidth:	Read bandwidth in MB/s
  * @write_bandwidth:	Write bandwidth in MB/s
  * @read_latency:	Read latency in nanoseconds
  * @write_latency:	Write latency in nanoseconds
  */
-struct node_hmem_attrs {
+struct access_coordinate {
 	unsigned int read_bandwidth;
 	unsigned int write_bandwidth;
 	unsigned int read_latency;
@@ -65,7 +65,7 @@ struct node_cache_attrs {
 
 #ifdef CONFIG_HMEM_REPORTING
 void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs);
-void node_set_perf_attrs(unsigned int nid, struct node_hmem_attrs *hmem_attrs,
+void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord,
 			 unsigned access);
 #else
 static inline void node_add_cache(unsigned int nid,
@@ -74,7 +74,7 @@ static inline void node_add_cache(unsigned int nid,
 }
 
 static inline void node_set_perf_attrs(unsigned int nid,
-				       struct node_hmem_attrs *hmem_attrs,
+				       struct access_coordinate *coord,
 				       unsigned access)
 {
 }
-- 
cgit v1.2.3


From ca53543d8e340070fb37fde93f36ed9012c76b90 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 21 Dec 2023 15:03:07 -0700
Subject: acpi: numa: Add helper function to retrieve the performance
 attributes

Add helper to retrieve the performance attributes based on the device
handle.  The helper function is exported so the CXL driver can use that
to acquire the performance data between the CPU and the CXL host bridge.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/170319618721.2212653.5552947472849081786.stgit@djiang5-mobl3
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/acpi.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4db54e928b36..8b0761c682f9 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -15,6 +15,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/property.h>
 #include <linux/uuid.h>
+#include <linux/node.h>
 
 struct irq_domain;
 struct irq_domain_ops;
@@ -424,6 +425,16 @@ extern int acpi_blacklisted(void);
 extern void acpi_osi_setup(char *str);
 extern bool acpi_osi_is_win8(void);
 
+#ifdef CONFIG_ACPI_HMAT
+int acpi_get_genport_coordinates(u32 uid, struct access_coordinate *coord);
+#else
+static inline int acpi_get_genport_coordinates(u32 uid,
+					       struct access_coordinate *coord)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 #ifdef CONFIG_ACPI_NUMA
 int acpi_map_pxm_to_node(int pxm);
 int acpi_get_node(acpi_handle handle);
-- 
cgit v1.2.3


From 4d07a05397c8c15c37c8c3abb7afaea1dcd2f0e7 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 21 Dec 2023 15:03:39 -0700
Subject: cxl: Calculate and store PCI link latency for the downstream ports

The latency is calculated by dividing the flit size over the bandwidth. Add
support to retrieve the flit size for the CXL switch device and calculate
the latency of the PCIe link. Cache the latency number with cxl_dport.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/170319621931.2212653.6800240203604822886.stgit@djiang5-mobl3
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index dea043bc1e38..504a4ba2c29e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1364,6 +1364,7 @@ int pcie_set_mps(struct pci_dev *dev, int mps);
 u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
 			     enum pci_bus_speed *speed,
 			     enum pcie_link_width *width);
+int pcie_link_speed_mbps(struct pci_dev *pdev);
 void pcie_print_link_status(struct pci_dev *dev);
 int pcie_reset_flr(struct pci_dev *dev, bool probe);
 int pcie_flr(struct pci_dev *dev);
-- 
cgit v1.2.3


From dcc3e46472d678f4af5ce1194a23649231c5d241 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Mon, 18 Dec 2023 17:26:26 -0700
Subject: net: skbuff: Remove some excess struct-member documentation

Remove documentation for nonexistent structure members, addressing these
warnings:

  ./include/linux/skbuff.h:1063: warning: Excess struct member 'sp' description in 'sk_buff'
  ./include/linux/skbuff.h:1063: warning: Excess struct member 'nf_bridge' description in 'sk_buff'

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ea5c8ab3ed00..50e92c8471dc 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -754,7 +754,6 @@ typedef unsigned char *sk_buff_data_t;
  *	@dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
  *	@cb: Control buffer. Free for use by every layer. Put private vars here
  *	@_skb_refdst: destination entry (with norefcount bit)
- *	@sp: the security path, used for xfrm
  *	@len: Length of actual data
  *	@data_len: Data length
  *	@mac_len: Length of link layer header
@@ -788,7 +787,6 @@ typedef unsigned char *sk_buff_data_t;
  *	@tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
  *	@_sk_redir: socket redirection information for skmsg
  *	@_nfct: Associated connection, if any (with nfctinfo bits)
- *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
  *	@tc_index: Traffic control index
  *	@hash: the packet hash
-- 
cgit v1.2.3


From 520adf3ba4a4bdd41450c57b17ef01f8a069fbfe Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:05:22 -0800
Subject: driver core: class: fix Excess kernel-doc description warning

Remove the @p: lines to prevent the kernel-doc warning:

include/linux/device/class.h:72: warning: Excess struct member 'p' description in 'class'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231223050522.13867-1-rdunlap@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/class.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index abf3d3bfb6fe..c576b49c55c2 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -40,8 +40,6 @@ struct fwnode_handle;
  *		for the devices belonging to the class. Usually tied to
  *		device's namespace.
  * @pm:		The default device power management operations of this class.
- * @p:		The private data of the driver core, no one other than the
- *		driver core can touch this.
  *
  * A class is a higher-level view of a device that abstracts out low-level
  * implementation details. Drivers may see a SCSI disk or an ATA disk, but,
-- 
cgit v1.2.3


From ae4d90f7ca49eb71f8a3dca64d06d4c4e2193705 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:05:32 -0800
Subject: driver core: device.h: fix Excess kernel-doc description warning

Remove the @knode_class: line to prevent the kernel-doc warning:

include/linux/device.h:807: warning: Excess struct member 'knode_class' description in 'device'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231223050532.13881-1-rdunlap@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index aefc5ca7f1cf..ed600dbf950e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -662,7 +662,6 @@ struct device_physical_location {
  * @id:		device instance
  * @devres_lock: Spinlock to protect the resource of the device.
  * @devres_head: The resources list of the device.
- * @knode_class: The node used to add the device to the class list.
  * @class:	The class of the device.
  * @groups:	Optional attribute groups.
  * @release:	Callback to free the device after all references have
-- 
cgit v1.2.3


From 1760bfa7d7ca490cf8a61fe50ddeb1769cadd89e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:06:36 -0800
Subject: usb: linux/usb.h: fix Excess kernel-doc description warning

Remove the @removable: line to prevent the kernel-doc warning:

include/linux/usb.h:732: warning: Excess struct member 'removable' description in 'usb_device'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: linux-usb@vger.kernel.org
Link: https://lore.kernel.org/r/20231223050636.14022-1-rdunlap@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 618e5a0b1a22..07556341ba2b 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -632,7 +632,6 @@ struct usb3_lpm_parameters {
  * @reset_resume: needs reset instead of resume
  * @port_is_suspended: the upstream port is suspended (L2 or U3)
  * @slot_id: Slot ID assigned by xHCI
- * @removable: Device can be physically removed from this port
  * @l1_params: best effor service latency for USB2 L1 LPM state, and L1 timeout.
  * @u1_params: exit latencies for USB3 U1 LPM state, and hub-initiated timeout.
  * @u2_params: exit latencies for USB3 U2 LPM state, and hub-initiated timeout.
-- 
cgit v1.2.3


From f6847807c22f6944c71c981b630b9fff30801e73 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Wed, 22 Nov 2023 23:18:11 +0100
Subject: linux/export: Fix alignment for 64-bit ksymtab entries

An alignment of 4 bytes is wrong for 64-bit platforms which don't define
CONFIG_HAVE_ARCH_PREL32_RELOCATIONS (which then store 64-bit pointers).
Fix their alignment to 8 bytes.

Fixes: ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost")
Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/export-internal.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h
index 69501e0ec239..cd253eb51d6c 100644
--- a/include/linux/export-internal.h
+++ b/include/linux/export-internal.h
@@ -16,10 +16,13 @@
  * and eliminates the need for absolute relocations that require runtime
  * processing on relocatable kernels.
  */
+#define __KSYM_ALIGN		".balign 4"
 #define __KSYM_REF(sym)		".long " #sym "- ."
 #elif defined(CONFIG_64BIT)
+#define __KSYM_ALIGN		".balign 8"
 #define __KSYM_REF(sym)		".quad " #sym
 #else
+#define __KSYM_ALIGN		".balign 4"
 #define __KSYM_REF(sym)		".long " #sym
 #endif
 
@@ -42,7 +45,7 @@
 	    "	.asciz \"" ns "\""					"\n"	\
 	    "	.previous"						"\n"	\
 	    "	.section \"___ksymtab" sec "+" #name "\", \"a\""	"\n"	\
-	    "	.balign	4"						"\n"	\
+		__KSYM_ALIGN						"\n"	\
 	    "__ksymtab_" #name ":"					"\n"	\
 		__KSYM_REF(sym)						"\n"	\
 		__KSYM_REF(__kstrtab_ ##name)				"\n"	\
-- 
cgit v1.2.3


From f91a704f7161c2cf0fcd41fa9fbec4355b813fff Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 2 Oct 2023 17:19:46 +0300
Subject: fs: prepare for stackable filesystems backing file helpers

In preparation for factoring out some backing file io helpers from
overlayfs, move backing_file_open() into a new file fs/backing-file.c
and header.

Add a MAINTAINERS entry for stackable filesystems and add a Kconfig
FS_STACK which stackable filesystems need to select.

For now, the backing_file struct, the backing_file alloc/free functions
and the backing_file_real_path() accessor remain internal to file_table.c.
We may change that in the future.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 include/linux/backing-file.h | 17 +++++++++++++++++
 include/linux/fs.h           |  3 ---
 2 files changed, 17 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/backing-file.h

(limited to 'include/linux')

diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
new file mode 100644
index 000000000000..55c9e804f780
--- /dev/null
+++ b/include/linux/backing-file.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common helpers for stackable filesystems and backing files.
+ *
+ * Copyright (C) 2023 CTERA Networks.
+ */
+
+#ifndef _LINUX_BACKING_FILE_H
+#define _LINUX_BACKING_FILE_H
+
+#include <linux/file.h>
+
+struct file *backing_file_open(const struct path *user_path, int flags,
+			       const struct path *real_path,
+			       const struct cred *cred);
+
+#endif /* _LINUX_BACKING_FILE_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 900d0cd55b50..db5d07e6e02e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2575,9 +2575,6 @@ struct file *dentry_open(const struct path *path, int flags,
 			 const struct cred *creds);
 struct file *dentry_create(const struct path *path, int flags, umode_t mode,
 			   const struct cred *cred);
-struct file *backing_file_open(const struct path *user_path, int flags,
-			       const struct path *real_path,
-			       const struct cred *cred);
 struct path *backing_file_user_path(struct file *f);
 
 /*
-- 
cgit v1.2.3


From a6293b3e285cd0d7692141d7981a5f144f0e2f0b Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 22 Nov 2023 17:48:52 +0200
Subject: fs: factor out backing_file_{read,write}_iter() helpers

Overlayfs submits files io to backing files on other filesystems.
Factor out some common helpers to perform io to backing files, into
fs/backing-file.c.

Suggested-by: Miklos Szeredi <miklos@szeredi.hu>
Link: https://lore.kernel.org/r/CAJfpeguhmZbjP3JLqtUy0AdWaHOkAPWeP827BBWwRFEAUgnUcQ@mail.gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 include/linux/backing-file.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
index 55c9e804f780..0648d548a418 100644
--- a/include/linux/backing-file.h
+++ b/include/linux/backing-file.h
@@ -9,9 +9,24 @@
 #define _LINUX_BACKING_FILE_H
 
 #include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/fs.h>
+
+struct backing_file_ctx {
+	const struct cred *cred;
+	struct file *user_file;
+	void (*accessed)(struct file *);
+	void (*end_write)(struct file *);
+};
 
 struct file *backing_file_open(const struct path *user_path, int flags,
 			       const struct path *real_path,
 			       const struct cred *cred);
+ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
+			       struct kiocb *iocb, int flags,
+			       struct backing_file_ctx *ctx);
+ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
+				struct kiocb *iocb, int flags,
+				struct backing_file_ctx *ctx);
 
 #endif /* _LINUX_BACKING_FILE_H */
-- 
cgit v1.2.3


From 9b7e9e2f5d5c3d079ec46bc71b114012e362ea6e Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 13 Oct 2023 12:13:12 +0300
Subject: fs: factor out backing_file_splice_{read,write}() helpers

There is not much in those helpers, but it makes sense to have them
logically next to the backing_file_{read,write}_iter() helpers as they
may grow more common logic in the future.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 include/linux/backing-file.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
index 0648d548a418..0546d5b1c9f5 100644
--- a/include/linux/backing-file.h
+++ b/include/linux/backing-file.h
@@ -28,5 +28,13 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
 ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
 				struct kiocb *iocb, int flags,
 				struct backing_file_ctx *ctx);
+ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
+				 struct pipe_inode_info *pipe, size_t len,
+				 unsigned int flags,
+				 struct backing_file_ctx *ctx);
+ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
+				  struct file *out, loff_t *ppos, size_t len,
+				  unsigned int flags,
+				  struct backing_file_ctx *ctx);
 
 #endif /* _LINUX_BACKING_FILE_H */
-- 
cgit v1.2.3


From f567377e406c032fff0799bde4fdf4a977529b84 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 13 Oct 2023 12:49:37 +0300
Subject: fs: factor out backing_file_mmap() helper

Assert that the file object is allocated in a backing_file container
so that file_user_path() could be used to display the user path and
not the backing file's path in /proc/<pid>/maps.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 include/linux/backing-file.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
index 0546d5b1c9f5..3f1fe1774f1b 100644
--- a/include/linux/backing-file.h
+++ b/include/linux/backing-file.h
@@ -36,5 +36,7 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
 				  struct file *out, loff_t *ppos, size_t len,
 				  unsigned int flags,
 				  struct backing_file_ctx *ctx);
+int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
+		      struct backing_file_ctx *ctx);
 
 #endif /* _LINUX_BACKING_FILE_H */
-- 
cgit v1.2.3


From 9942cb22ea458c34fa17b73d143ea32d4df1caca Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 11 Dec 2023 11:48:49 +0100
Subject: sched/topology: Add a new arch_scale_freq_ref() method

Create a new method to get a unique and fixed max frequency. Currently
cpuinfo.max_freq or the highest (or last) state of performance domain are
used as the max frequency when computing the frequency for a level of
utilization, but:

  - cpuinfo_max_freq can change at runtime. boost is one example of
    such change.

  - cpuinfo.max_freq and last item of the PD can be different leading to
    different results between cpufreq and energy model.

We need to save the reference frequency that has been used when computing
the CPUs capacity and use this fixed and coherent value to convert between
frequency and CPU's capacity.

In fact, we already save the frequency that has been used when computing
the capacity of each CPU. We extend the precision to save kHz instead of
MHz currently and we modify the type to be aligned with other variables
used when converting frequency to capacity and the other way.

[ mingo: Minor edits. ]

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/20231211104855.558096-2-vincent.guittot@linaro.org
---
 include/linux/arch_topology.h  | 7 +++++++
 include/linux/sched/topology.h | 8 ++++++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index a07b510e7dc5..32c24ff4f2a8 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -27,6 +27,13 @@ static inline unsigned long topology_get_cpu_scale(int cpu)
 
 void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
 
+DECLARE_PER_CPU(unsigned long, capacity_freq_ref);
+
+static inline unsigned long topology_get_freq_ref(int cpu)
+{
+	return per_cpu(capacity_freq_ref, cpu);
+}
+
 DECLARE_PER_CPU(unsigned long, arch_freq_scale);
 
 static inline unsigned long topology_get_freq_scale(int cpu)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index de545ba85218..a6e04b4a21d7 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -279,6 +279,14 @@ void arch_update_thermal_pressure(const struct cpumask *cpus,
 { }
 #endif
 
+#ifndef arch_scale_freq_ref
+static __always_inline
+unsigned int arch_scale_freq_ref(int cpu)
+{
+	return 0;
+}
+#endif
+
 static inline int task_node(const struct task_struct *p)
 {
 	return cpu_to_node(task_cpu(p));
-- 
cgit v1.2.3


From 599457ba15403037b489fe536266a3d5f9efaed7 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 11 Dec 2023 11:48:50 +0100
Subject: cpufreq: Use the fixed and coherent frequency for scaling capacity

cpuinfo.max_freq can change at runtime because of boost as an example. This
implies that the value could be different from the frequency that has been
used to compute the capacity of a CPU.

The new arch_scale_freq_ref() returns a fixed and coherent frequency
that can be used to compute the capacity for a given frequency.

[ Also fix a arch_set_freq_scale()  newline style wart in <linux/cpufreq.h>. ]

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231211104855.558096-3-vincent.guittot@linaro.org
---
 include/linux/cpufreq.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 1c5ca92a0555..afda5f24d3dd 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1203,6 +1203,7 @@ void arch_set_freq_scale(const struct cpumask *cpus,
 {
 }
 #endif
+
 /* the following are really really optional */
 extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
 extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;
-- 
cgit v1.2.3


From 15cbbd1d317e07b4e5c6aca5d4c5579539a82784 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 11 Dec 2023 11:48:52 +0100
Subject: energy_model: Use a fixed reference frequency

The last item of a performance domain is not always the performance point
that has been used to compute CPU's capacity. This can lead to different
target frequency compared with other part of the system like schedutil and
would result in wrong energy estimation.

A new arch_scale_freq_ref() is available to return a fixed and coherent
frequency reference that can be used when computing the CPU's frequency
for an level of utilization. Use this function to get this reference
frequency.

Energy model is never used without defining arch_scale_freq_ref() but
can be compiled. Define a default arch_scale_freq_ref() returning 0
in such case.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lore.kernel.org/r/20231211104855.558096-5-vincent.guittot@linaro.org
---
 include/linux/energy_model.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index adec808b371a..88d91e087471 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -224,7 +224,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 				unsigned long max_util, unsigned long sum_util,
 				unsigned long allowed_cpu_cap)
 {
-	unsigned long freq, scale_cpu;
+	unsigned long freq, ref_freq, scale_cpu;
 	struct em_perf_state *ps;
 	int cpu;
 
@@ -241,10 +241,10 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 */
 	cpu = cpumask_first(to_cpumask(pd->cpus));
 	scale_cpu = arch_scale_cpu_capacity(cpu);
-	ps = &pd->table[pd->nr_perf_states - 1];
+	ref_freq = arch_scale_freq_ref(cpu);
 
 	max_util = min(max_util, allowed_cpu_cap);
-	freq = map_util_freq(max_util, ps->frequency, scale_cpu);
+	freq = map_util_freq(max_util, ref_freq, scale_cpu);
 
 	/*
 	 * Find the lowest performance state of the Energy Model above the
-- 
cgit v1.2.3


From 1f023007f5e782bda19ad9104830c404fd622c5d Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 11 Dec 2023 11:48:55 +0100
Subject: arm64/amu: Use capacity_ref_freq() to set AMU ratio

Use the new capacity_ref_freq() method to set the ratio that is used by AMU for
computing the arch_scale_freq_capacity().
This helps to keep everything aligned using the same reference for
computing CPUs capacity.

The default value of the ratio (stored in per_cpu(arch_max_freq_scale))
ensures that arch_scale_freq_capacity() returns max capacity until it is
set to its correct value with the cpu capacity and capacity_ref_freq().

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20231211104855.558096-8-vincent.guittot@linaro.org
---
 include/linux/arch_topology.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 32c24ff4f2a8..a63d61ca55af 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -99,6 +99,7 @@ void update_siblings_masks(unsigned int cpu);
 void remove_cpu_topology(unsigned int cpuid);
 void reset_cpu_topology(void);
 int parse_acpi_topology(void);
+void freq_inv_set_max_ratio(int cpu, u64 max_rate);
 #endif
 
 #endif /* _LINUX_ARCH_TOPOLOGY_H_ */
-- 
cgit v1.2.3


From 11137d384996bb05cf33c8163db271e1bac3f4bf Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 1 Dec 2023 17:16:52 +0100
Subject: sched/fair: Simplify util_est

With UTIL_EST_FASTUP now being permanent, we can take advantage of the
fact that the ewma jumps directly to a higher utilization at dequeue to
simplify util_est and remove the enqueued field.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reviewed-by: Hongyan Xia <hongyan.xia2@arm.com>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/20231201161652.1241695-3-vincent.guittot@linaro.org
---
 include/linux/sched.h | 49 ++++++++++++-------------------------------------
 1 file changed, 12 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d258162deb0..03bfe9ab2951 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -415,42 +415,6 @@ struct load_weight {
 	u32				inv_weight;
 };
 
-/**
- * struct util_est - Estimation utilization of FAIR tasks
- * @enqueued: instantaneous estimated utilization of a task/cpu
- * @ewma:     the Exponential Weighted Moving Average (EWMA)
- *            utilization of a task
- *
- * Support data structure to track an Exponential Weighted Moving Average
- * (EWMA) of a FAIR task's utilization. New samples are added to the moving
- * average each time a task completes an activation. Sample's weight is chosen
- * so that the EWMA will be relatively insensitive to transient changes to the
- * task's workload.
- *
- * The enqueued attribute has a slightly different meaning for tasks and cpus:
- * - task:   the task's util_avg at last task dequeue time
- * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
- * Thus, the util_est.enqueued of a task represents the contribution on the
- * estimated utilization of the CPU where that task is currently enqueued.
- *
- * Only for tasks we track a moving average of the past instantaneous
- * estimated utilization. This allows to absorb sporadic drops in utilization
- * of an otherwise almost periodic task.
- *
- * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
- * updates. When a task is dequeued, its util_est should not be updated if its
- * util_avg has not been updated in the meantime.
- * This information is mapped into the MSB bit of util_est.enqueued at dequeue
- * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
- * for a task) it is safe to use MSB.
- */
-struct util_est {
-	unsigned int			enqueued;
-	unsigned int			ewma;
-#define UTIL_EST_WEIGHT_SHIFT		2
-#define UTIL_AVG_UNCHANGED		0x80000000
-} __attribute__((__aligned__(sizeof(u64))));
-
 /*
  * The load/runnable/util_avg accumulates an infinite geometric series
  * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -505,9 +469,20 @@ struct sched_avg {
 	unsigned long			load_avg;
 	unsigned long			runnable_avg;
 	unsigned long			util_avg;
-	struct util_est			util_est;
+	unsigned int			util_est;
 } ____cacheline_aligned;
 
+/*
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est at dequeue time.
+ * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
+ * it is safe to use MSB.
+ */
+#define UTIL_EST_WEIGHT_SHIFT		2
+#define UTIL_AVG_UNCHANGED		0x80000000
+
 struct sched_statistics {
 #ifdef CONFIG_SCHEDSTATS
 	u64				wait_start;
-- 
cgit v1.2.3


From 4498a8eccc97de3d65f876b6fdeddb439ef73abc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 20 Nov 2023 17:09:47 +0000
Subject: netfs, fscache: Remove ->begin_cache_operation

Remove ->begin_cache_operation() in favour of just calling fscache directly.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: Christian Brauner <christian@brauner.io>
cc: linux-fsdevel@vger.kernel.org
cc: linux-cachefs@redhat.com
---
 include/linux/fscache.h | 3 ---
 include/linux/netfs.h   | 4 +---
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 8e312c8323a8..9ed6696aee7a 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -437,9 +437,6 @@ const struct netfs_cache_ops *fscache_operation_valid(const struct netfs_cache_r
  * indicates the cache resources to which the operation state should be
  * attached; @cookie indicates the cache object that will be accessed.
  *
- * This is intended to be called from the ->begin_cache_operation() netfs lib
- * operation as implemented by the network filesystem.
- *
  * @cres->inval_counter is set from @cookie->inval_counter for comparison at
  * the end of the operation.  This allows invalidation during the operation to
  * be detected by the caller.
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index b11a84f6c32b..d294ff8f9ae4 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -208,7 +208,6 @@ struct netfs_io_request {
 struct netfs_request_ops {
 	int (*init_request)(struct netfs_io_request *rreq, struct file *file);
 	void (*free_request)(struct netfs_io_request *rreq);
-	int (*begin_cache_operation)(struct netfs_io_request *rreq);
 
 	void (*expand_readahead)(struct netfs_io_request *rreq);
 	bool (*clamp_length)(struct netfs_io_subrequest *subreq);
@@ -229,8 +228,7 @@ enum netfs_read_from_hole {
 };
 
 /*
- * Table of operations for access to a cache.  This is obtained by
- * rreq->ops->begin_cache_operation().
+ * Table of operations for access to a cache.
  */
 struct netfs_cache_ops {
 	/* End an operation */
-- 
cgit v1.2.3


From 7eb5b3e3a0a55f2d166ca949ef47ca6e0c704aab Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 21 Nov 2023 15:43:52 +0000
Subject: netfs, fscache: Move /proc/fs/fscache to /proc/fs/netfs and put in a
 symlink

Rename /proc/fs/fscache to "netfs" and make a symlink from fscache to that.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: Christian Brauner <christian@brauner.io>
cc: linux-fsdevel@vger.kernel.org
cc: linux-cachefs@redhat.com
---
 include/linux/netfs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index d294ff8f9ae4..9bd91cd615d5 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -294,7 +294,6 @@ void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
 			  enum netfs_sreq_ref_trace what);
 void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
 			  bool was_async, enum netfs_sreq_ref_trace what);
-void netfs_stats_show(struct seq_file *);
 ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
 				struct iov_iter *new,
 				iov_iter_extraction_t extraction_flags);
-- 
cgit v1.2.3


From c9c4ff12df110feb1b91951010f673f4b16e49e8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 27 Nov 2023 13:58:07 +0000
Subject: netfs: Move pinning-for-writeback from fscache to netfs

Move the resource pinning-for-writeback from fscache code to netfslib code.
This is used to keep a cache backing object pinned whilst we have dirty
pages on the netfs inode in the pagecache such that VM writeback will be
able to reach it.

Whilst we're at it, switch the parameters of netfs_unpin_writeback() to
match ->write_inode() so that it can be used for that directly.

Note that this mechanism could be more generically useful than that for
network filesystems.  Quite often they have to keep around other resources
(e.g. authentication tokens or network connections) until the writeback is
complete.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/fs.h        |  2 +-
 include/linux/fscache.h   | 42 ------------------------------------------
 include/linux/netfs.h     |  3 +++
 include/linux/writeback.h |  2 +-
 4 files changed, 5 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..68a957261694 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2294,7 +2294,7 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
 #define I_CREATING		(1 << 15)
 #define I_DONTCACHE		(1 << 16)
 #define I_SYNC_QUEUED		(1 << 17)
-#define I_PINNING_FSCACHE_WB	(1 << 18)
+#define I_PINNING_NETFS_WB	(1 << 18)
 
 #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 9ed6696aee7a..6e8562cbcc43 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -626,48 +626,6 @@ static inline void fscache_write_to_cache(struct fscache_cookie *cookie,
 
 }
 
-#if __fscache_available
-bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio,
-		struct fscache_cookie *cookie);
-#else
-#define fscache_dirty_folio(MAPPING, FOLIO, COOKIE) \
-		filemap_dirty_folio(MAPPING, FOLIO)
-#endif
-
-/**
- * fscache_unpin_writeback - Unpin writeback resources
- * @wbc: The writeback control
- * @cookie: The cookie referring to the cache object
- *
- * Unpin the writeback resources pinned by fscache_dirty_folio().  This is
- * intended to be called by the netfs's ->write_inode() method.
- */
-static inline void fscache_unpin_writeback(struct writeback_control *wbc,
-					   struct fscache_cookie *cookie)
-{
-	if (wbc->unpinned_fscache_wb)
-		fscache_unuse_cookie(cookie, NULL, NULL);
-}
-
-/**
- * fscache_clear_inode_writeback - Clear writeback resources pinned by an inode
- * @cookie: The cookie referring to the cache object
- * @inode: The inode to clean up
- * @aux: Auxiliary data to apply to the inode
- *
- * Clear any writeback resources held by an inode when the inode is evicted.
- * This must be called before clear_inode() is called.
- */
-static inline void fscache_clear_inode_writeback(struct fscache_cookie *cookie,
-						 struct inode *inode,
-						 const void *aux)
-{
-	if (inode->i_state & I_PINNING_FSCACHE_WB) {
-		loff_t i_size = i_size_read(inode);
-		fscache_unuse_cookie(cookie, aux, &i_size);
-	}
-}
-
 /**
  * fscache_note_page_release - Note that a netfs page got released
  * @cookie: The cookie corresponding to the file
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 9bd91cd615d5..32faf6c89702 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -288,6 +288,9 @@ int netfs_read_folio(struct file *, struct folio *);
 int netfs_write_begin(struct netfs_inode *, struct file *,
 		struct address_space *, loff_t pos, unsigned int len,
 		struct folio **, void **fsdata);
+bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
+int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
+void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
 
 void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool);
 void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 083387c00f0c..1e08392fb43e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -60,7 +60,7 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
-	unsigned unpinned_fscache_wb:1;	/* Cleared I_PINNING_FSCACHE_WB */
+	unsigned unpinned_netfs_wb:1;	/* Cleared I_PINNING_NETFS_WB */
 
 	/*
 	 * When writeback IOs are bounced through async layers, only the
-- 
cgit v1.2.3


From 87b57a048964abfd5f3d8b79bc55687327f5a380 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2022 10:34:27 +0000
Subject: netfs: Add a procfile to list in-progress requests

Add a procfile, /proc/fs/netfs/requests, to list in-progress netfslib I/O
requests.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 32faf6c89702..7244ddebd974 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -175,10 +175,14 @@ enum netfs_io_origin {
  * operations to a variety of data stores and then stitch the result together.
  */
 struct netfs_io_request {
-	struct work_struct	work;
+	union {
+		struct work_struct work;
+		struct rcu_head rcu;
+	};
 	struct inode		*inode;		/* The file being accessed */
 	struct address_space	*mapping;	/* The mapping being accessed */
 	struct netfs_cache_resources cache_resources;
+	struct list_head	proc_link;	/* Link in netfs_iorequests */
 	struct list_head	subrequests;	/* Contributory I/O operations */
 	void			*netfs_priv;	/* Private data for the netfs */
 	unsigned int		debug_id;
-- 
cgit v1.2.3


From cc3cb0a18da46a51d9fc173155576ba1d068e536 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 9 Mar 2022 11:01:12 +0000
Subject: netfs: Allow the netfs to make the io (sub)request alloc larger

Allow the network filesystem to specify extra space to be allocated on the
end of the io (sub)request.  This allows cifs, for example, to use this
space rather than allocating its own cifs_readdata struct.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 7244ddebd974..d6f27000eeb0 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -210,6 +210,8 @@ struct netfs_io_request {
  * Operations the network filesystem can/must provide to the helpers.
  */
 struct netfs_request_ops {
+	unsigned int	io_request_size;	/* Alloc size for netfs_io_request struct */
+	unsigned int	io_subrequest_size;	/* Alloc size for netfs_io_subrequest struct */
 	int (*init_request)(struct netfs_io_request *rreq, struct file *file);
 	void (*free_request)(struct netfs_io_request *rreq);
 
-- 
cgit v1.2.3


From 5f5ce7ba15e7e6a6539ac8e1f845757aaebecf0d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 25 Feb 2022 11:19:14 +0000
Subject: netfs: Add a ->free_subrequest() op

Add a ->free_subrequest() op so that the netfs can clean up data attached
to a subrequest.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index d6f27000eeb0..06f57d9d09f6 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -214,6 +214,7 @@ struct netfs_request_ops {
 	unsigned int	io_subrequest_size;	/* Alloc size for netfs_io_subrequest struct */
 	int (*init_request)(struct netfs_io_request *rreq, struct file *file);
 	void (*free_request)(struct netfs_io_request *rreq);
+	void (*free_subrequest)(struct netfs_io_subrequest *rreq);
 
 	void (*expand_readahead)(struct netfs_io_request *rreq);
 	bool (*clamp_length)(struct netfs_io_subrequest *subreq);
-- 
cgit v1.2.3


From c1ec4d7c2e13471558cfea302b7583856284f94c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 20 Aug 2021 17:08:30 +0100
Subject: netfs: Provide invalidate_folio and release_folio calls

Provide default invalidate_folio and release_folio calls.  These will need
to interact with invalidation correctly at some point.  They will be needed
if netfslib is to make use of folio->private for its own purposes.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 06f57d9d09f6..8efbfd3b2820 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -293,11 +293,13 @@ struct readahead_control;
 void netfs_readahead(struct readahead_control *);
 int netfs_read_folio(struct file *, struct folio *);
 int netfs_write_begin(struct netfs_inode *, struct file *,
-		struct address_space *, loff_t pos, unsigned int len,
-		struct folio **, void **fsdata);
+		      struct address_space *, loff_t pos, unsigned int len,
+		      struct folio **, void **fsdata);
 bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
 int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
 void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
+void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
+bool netfs_release_folio(struct folio *folio, gfp_t gfp);
 
 void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool);
 void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
-- 
cgit v1.2.3


From 46ed60dcd4f2c94d27735743ce55cd8d6b93cc1d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 11 Oct 2023 15:34:07 +0100
Subject: netfs: Implement unbuffered/DIO vs buffered I/O locking

Borrow NFS's direct-vs-buffered I/O locking into netfslib.  Similar code is
also used in ceph.

Modify it to have the correct checker annotations for i_rwsem lock
acquisition/release and to return -ERESTARTSYS if waits are interrupted.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 8efbfd3b2820..fc6d9756a029 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -129,6 +129,8 @@ struct netfs_inode {
 	struct fscache_cookie	*cache;
 #endif
 	loff_t			remote_i_size;	/* Size of the remote file */
+	unsigned long		flags;
+#define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 };
 
 /*
@@ -310,6 +312,13 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
 				struct iov_iter *new,
 				iov_iter_extraction_t extraction_flags);
 
+int netfs_start_io_read(struct inode *inode);
+void netfs_end_io_read(struct inode *inode);
+int netfs_start_io_write(struct inode *inode);
+void netfs_end_io_write(struct inode *inode);
+int netfs_start_io_direct(struct inode *inode);
+void netfs_end_io_direct(struct inode *inode);
+
 /**
  * netfs_inode - Get the netfs inode context from the inode
  * @inode: The inode to query
@@ -335,6 +344,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx,
 {
 	ctx->ops = ops;
 	ctx->remote_i_size = i_size_read(&ctx->inode);
+	ctx->flags = 0;
 #if IS_ENABLED(CONFIG_FSCACHE)
 	ctx->cache = NULL;
 #endif
-- 
cgit v1.2.3


From 92b6cc5d1e7cbe569f00e9c1249ac8214fd5e2d2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 26 Sep 2023 17:42:26 +0100
Subject: netfs: Add iov_iters to (sub)requests to describe various buffers

Add three iov_iter structs:

 (1) Add an iov_iter (->iter) to the I/O request to describe the
     unencrypted-side buffer.

 (2) Add an iov_iter (->io_iter) to the I/O request to describe the
     encrypted-side I/O buffer.  This may be a different size to the buffer
     in (1).

 (3) Add an iov_iter (->io_iter) to the I/O subrequest to describe the part
     of the I/O buffer for that subrequest.

This will allow future patches to point to a bounce buffer instead for
purposes of handling oversize writes, decryption (where we want to save the
encrypted data to the cache) and decompression.

These iov_iters persist for the lifetime of the (sub)request, and so can be
accessed multiple times without worrying about them being deallocated upon
return to the caller.

The network filesystem must appropriately advance the iterator before
terminating the request.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index fc6d9756a029..3da962e977f5 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -150,6 +150,7 @@ struct netfs_cache_resources {
 struct netfs_io_subrequest {
 	struct netfs_io_request *rreq;		/* Supervising I/O request */
 	struct list_head	rreq_link;	/* Link in rreq->subrequests */
+	struct iov_iter		io_iter;	/* Iterator for this subrequest */
 	loff_t			start;		/* Where to start the I/O */
 	size_t			len;		/* Size of the I/O */
 	size_t			transferred;	/* Amount of data transferred */
@@ -186,6 +187,8 @@ struct netfs_io_request {
 	struct netfs_cache_resources cache_resources;
 	struct list_head	proc_link;	/* Link in netfs_iorequests */
 	struct list_head	subrequests;	/* Contributory I/O operations */
+	struct iov_iter		iter;		/* Unencrypted-side iterator */
+	struct iov_iter		io_iter;	/* I/O (Encrypted-side) iterator */
 	void			*netfs_priv;	/* Private data for the netfs */
 	unsigned int		debug_id;
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
-- 
cgit v1.2.3


From f1bb47a31dff6d4b34fb14e99850860ee74bb003 Mon Sep 17 00:00:00 2001
From: Alfred Piccioni <alpic@google.com>
Date: Tue, 19 Dec 2023 10:09:09 +0100
Subject: lsm: new security_file_ioctl_compat() hook

Some ioctl commands do not require ioctl permission, but are routed to
other permissions such as FILE_GETATTR or FILE_SETATTR. This routing is
done by comparing the ioctl cmd to a set of 64-bit flags (FS_IOC_*).

However, if a 32-bit process is running on a 64-bit kernel, it emits
32-bit flags (FS_IOC32_*) for certain ioctl operations. These flags are
being checked erroneously, which leads to these ioctl operations being
routed to the ioctl permission, rather than the correct file
permissions.

This was also noted in a RED-PEN finding from a while back -
"/* RED-PEN how should LSM module know it's handling 32bit? */".

This patch introduces a new hook, security_file_ioctl_compat(), that is
called from the compat ioctl syscall. All current LSMs have been changed
to support this hook.

Reviewing the three places where we are currently using
security_file_ioctl(), it appears that only SELinux needs a dedicated
compat change; TOMOYO and SMACK appear to be functional without any
change.

Cc: stable@vger.kernel.org
Fixes: 0b24dcb7f2f7 ("Revert "selinux: simplify ioctl checking"")
Signed-off-by: Alfred Piccioni <alpic@google.com>
Reviewed-by: Stephen Smalley <stephen.smalley.work@gmail.com>
[PM: subject tweak, line length fixes, and alignment corrections]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 ++
 include/linux/security.h      | 9 +++++++++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index c925a0d26edf..185924c56378 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -171,6 +171,8 @@ LSM_HOOK(int, 0, file_alloc_security, struct file *file)
 LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file)
 LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd,
 	 unsigned long arg)
+LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd,
+	 unsigned long arg)
 LSM_HOOK(int, 0, mmap_addr, unsigned long addr)
 LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot,
 	 unsigned long prot, unsigned long flags)
diff --git a/include/linux/security.h b/include/linux/security.h
index 750130a7b9dd..d0eb20f90b26 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -394,6 +394,8 @@ int security_file_permission(struct file *file, int mask);
 int security_file_alloc(struct file *file);
 void security_file_free(struct file *file);
 int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int security_file_ioctl_compat(struct file *file, unsigned int cmd,
+			       unsigned long arg);
 int security_mmap_file(struct file *file, unsigned long prot,
 			unsigned long flags);
 int security_mmap_addr(unsigned long addr);
@@ -1002,6 +1004,13 @@ static inline int security_file_ioctl(struct file *file, unsigned int cmd,
 	return 0;
 }
 
+static inline int security_file_ioctl_compat(struct file *file,
+					     unsigned int cmd,
+					     unsigned long arg)
+{
+	return 0;
+}
+
 static inline int security_mmap_file(struct file *file, unsigned long prot,
 				     unsigned long flags)
 {
-- 
cgit v1.2.3


From 8645e659e2d227f6ce8fcea1ac640c324fbbb3e6 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:05:56 -0800
Subject: iio: linux/iio.h: fix Excess kernel-doc description warning

Remove the @of_xlate: lines to prevent the kernel-doc warning:

include/linux/iio/iio.h:534: warning: Excess struct member 'of_xlate' description in 'iio_info'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: Lars-Peter Clausen <lars@metafoo.de>
Cc: linux-iio@vger.kernel.org
Link: https://lore.kernel.org/r/20231223050556.13948-1-rdunlap@infradead.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index d0ce3b71106a..c5b36d2c1e73 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -434,13 +434,7 @@ struct iio_trigger; /* forward declaration */
  * @update_scan_mode:	function to configure device and scan buffer when
  *			channels have changed
  * @debugfs_reg_access:	function to read or write register value of device
- * @of_xlate:		function pointer to obtain channel specifier index.
- *			When #iio-cells is greater than '0', the driver could
- *			provide a custom of_xlate function that reads the
- *			*args* and returns the appropriate index in registered
- *			IIO channels array.
  * @fwnode_xlate:	fwnode based function pointer to obtain channel specifier index.
- *			Functionally the same as @of_xlate.
  * @hwfifo_set_watermark: function pointer to set the current hardware
  *			fifo watermark level; see hwfifo_* entries in
  *			Documentation/ABI/testing/sysfs-bus-iio for details on
-- 
cgit v1.2.3


From 02d374f3418df577c850f0cd45c3da9245ead547 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 26 Dec 2023 08:15:24 +0000
Subject: block: renumber QUEUE_FLAG_HW_WC

For the QUEUE_FLAG_HW_WC to actually work, it needs to have a separate
number from QUEUE_FLAG_FUA, doh.

Fixes: 43c9835b144c ("block: don't allow enabling a cache on devices that don't support it")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231226081524.180289-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 51fa7ffdee83..88e9dd4b71fb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -538,7 +538,7 @@ struct request_queue {
 #define QUEUE_FLAG_ADD_RANDOM	10	/* Contributes to random pool */
 #define QUEUE_FLAG_SYNCHRONOUS	11	/* always completes in submit context */
 #define QUEUE_FLAG_SAME_FORCE	12	/* force complete on same CPU */
-#define QUEUE_FLAG_HW_WC	18	/* Write back caching supported */
+#define QUEUE_FLAG_HW_WC	13	/* Write back caching supported */
 #define QUEUE_FLAG_INIT_DONE	14	/* queue is initialized */
 #define QUEUE_FLAG_STABLE_WRITES 15	/* don't modify blks until WB is done */
 #define QUEUE_FLAG_POLL		16	/* IO polling enabled if set */
-- 
cgit v1.2.3


From 1c042f8d4bc342b7985b1de3d76836f1a1083b65 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Dec 2023 08:05:38 +0100
Subject: block: reject invalid operation in submit_bio_noacct

submit_bio_noacct allows completely invalid operations, or operations
that are not supported in the bio path.  Extent the existing switch
statement to rejcect all invalid types.

Move the code point for REQ_OP_ZONE_APPEND so that it's not right in the
middle of the zone management operations and the switch statement can
follow the numerical order of the operations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231221070538.1112446-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d5c5e59ddbd2..68c9eb2374a4 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -378,6 +378,8 @@ enum req_op {
 	REQ_OP_DISCARD		= (__force blk_opf_t)3,
 	/* securely erase sectors */
 	REQ_OP_SECURE_ERASE	= (__force blk_opf_t)5,
+	/* write data at the current zone write pointer */
+	REQ_OP_ZONE_APPEND	= (__force blk_opf_t)7,
 	/* write the zero filled sector many times */
 	REQ_OP_WRITE_ZEROES	= (__force blk_opf_t)9,
 	/* Open a zone */
@@ -386,12 +388,10 @@ enum req_op {
 	REQ_OP_ZONE_CLOSE	= (__force blk_opf_t)11,
 	/* Transition a zone to full */
 	REQ_OP_ZONE_FINISH	= (__force blk_opf_t)12,
-	/* write data at the current zone write pointer */
-	REQ_OP_ZONE_APPEND	= (__force blk_opf_t)13,
 	/* reset a zone write pointer */
-	REQ_OP_ZONE_RESET	= (__force blk_opf_t)15,
+	REQ_OP_ZONE_RESET	= (__force blk_opf_t)13,
 	/* reset all the zone present on the device */
-	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)17,
+	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)15,
 
 	/* Driver private requests */
 	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
-- 
cgit v1.2.3


From b3bf76024f645369e1fc45e0b08a2bd24f200d9b Mon Sep 17 00:00:00 2001
From: Wen Gu <guwen@linux.alibaba.com>
Date: Tue, 19 Dec 2023 22:26:16 +0800
Subject: net/smc: manage system EID in SMC stack instead of ISM driver

The System EID (SEID) is an internal EID that is used by the SMCv2
software stack that has a predefined and constant value representing
the s390 physical machine that the OS is executing on. So it should
be managed by SMC stack instead of ISM driver and be consistent for
all ISMv2 device (including virtual ISM devices) on s390 architecture.

Suggested-by: Alexandra Winter <wintera@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Reviewed-and-tested-by: Wenjia Zhang <wenjia@linux.ibm.com>
Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ism.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ism.h b/include/linux/ism.h
index 9a4c204df3da..5428edd90982 100644
--- a/include/linux/ism.h
+++ b/include/linux/ism.h
@@ -86,7 +86,6 @@ int  ism_register_dmb(struct ism_dev *dev, struct ism_dmb *dmb,
 int  ism_unregister_dmb(struct ism_dev *dev, struct ism_dmb *dmb);
 int  ism_move(struct ism_dev *dev, u64 dmb_tok, unsigned int idx, bool sf,
 	      unsigned int offset, void *data, unsigned int size);
-u8  *ism_get_seid(void);
 
 const struct smcd_ops *ism_get_smcd_ops(void);
 
-- 
cgit v1.2.3


From 0942155a48e4cfc2c83e514c86a3de8f78f6af02 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@grsecurity.net>
Date: Wed, 20 Dec 2023 14:35:05 +0100
Subject: PCI: Remove unused 'node' member from struct pci_driver

Remove the unused 'node' member. It got replaced by device_driver chaining
more than 20 years ago in commit 4b4a837f2b57 ("PCI: start to use common
fields of struct device_driver more...") of the history.git tree.

Link: https://lore.kernel.org/r/20231220133505.8798-1-minipli@grsecurity.net
Signed-off-by: Mathias Krause <minipli@grsecurity.net>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Kalle Valo <kvalo@kernel.org>
---
 include/linux/pci.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60ca768bc867..1a89dc66f89a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -885,7 +885,6 @@ struct module;
 
 /**
  * struct pci_driver - PCI driver structure
- * @node:	List of driver structures.
  * @name:	Driver name.
  * @id_table:	Pointer to table of device IDs the driver is
  *		interested in.  Most drivers should export this
@@ -940,7 +939,6 @@ struct module;
  *		own I/O address space.
  */
 struct pci_driver {
-	struct list_head	node;
 	const char		*name;
 	const struct pci_device_id *id_table;	/* Must be non-NULL for probe to be called */
 	int  (*probe)(struct pci_dev *dev, const struct pci_device_id *id);	/* New device inserted */
-- 
cgit v1.2.3


From ba367479c7ad0b870461024cd5ae7a1ea6e1e3db Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 19 Dec 2023 11:32:39 +0530
Subject: OPP: The level field is always of unsigned int type

By mistake, dev_pm_opp_find_level_floor() used the level parameter as
unsigned long instead of unsigned int. Fix it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/pm_opp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 81dff7facdc9..74768c47d790 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -163,7 +163,7 @@ struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
 					      unsigned int *level);
 
 struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev,
-					       unsigned long *level);
+					       unsigned int *level);
 
 struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev,
 					   unsigned int *bw, int index);
@@ -330,7 +330,7 @@ static inline struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
 }
 
 static inline struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev,
-							     unsigned long *level)
+							     unsigned int *level)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
-- 
cgit v1.2.3


From 90abde49ea85a8af9a56bbab8c419aefc77f919a Mon Sep 17 00:00:00 2001
From: "Radu Pirea (NXP OSS)" <radu-nicolae.pirea@oss.nxp.com>
Date: Tue, 19 Dec 2023 16:53:25 +0200
Subject: net: rename dsa_realloc_skb to skb_ensure_writable_head_tail

Rename dsa_realloc_skb to skb_ensure_writable_head_tail and move it to
skbuff.c to use it as helper.

Signed-off-by: Radu Pirea (NXP OSS) <radu-nicolae.pirea@oss.nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 50e92c8471dc..a5ae952454c8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4007,6 +4007,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features
 				 unsigned int offset);
 struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len);
+int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev);
 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
 int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
-- 
cgit v1.2.3


From 932562a6045ed613d45bd100db37114273c22077 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 15:58:20 -0500
Subject: rseq: Split out rseq.h from sched.h

We're trying to get sched.h down to more or less just types only, not
code - rseq can live in its own header.

This helps us kill the dependency on preempt.h in sched.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/resume_user_mode.h |   1 +
 include/linux/rseq.h             | 131 +++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h            | 125 +------------------------------------
 3 files changed, 133 insertions(+), 124 deletions(-)
 create mode 100644 include/linux/rseq.h

(limited to 'include/linux')

diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h
index f8f3e958e9cf..e0135e0adae0 100644
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/task_work.h>
 #include <linux/memcontrol.h>
+#include <linux/rseq.h>
 #include <linux/blk-cgroup.h>
 
 /**
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
new file mode 100644
index 000000000000..bc8af3eb5598
--- /dev/null
+++ b/include/linux/rseq.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _LINUX_RSEQ_H
+#define _LINUX_RSEQ_H
+
+#ifdef CONFIG_RSEQ
+
+#include <linux/preempt.h>
+#include <linux/sched.h>
+
+/*
+ * Map the event mask on the user-space ABI enum rseq_cs_flags
+ * for direct mask checks.
+ */
+enum rseq_event_mask_bits {
+	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
+	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
+	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
+};
+
+enum rseq_event_mask {
+	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
+	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
+	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
+};
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+	if (t->rseq)
+		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+}
+
+void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
+
+static inline void rseq_handle_notify_resume(struct ksignal *ksig,
+					     struct pt_regs *regs)
+{
+	if (current->rseq)
+		__rseq_handle_notify_resume(ksig, regs);
+}
+
+static inline void rseq_signal_deliver(struct ksignal *ksig,
+				       struct pt_regs *regs)
+{
+	preempt_disable();
+	__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
+	preempt_enable();
+	rseq_handle_notify_resume(ksig, regs);
+}
+
+/* rseq_preempt() requires preemption to be disabled. */
+static inline void rseq_preempt(struct task_struct *t)
+{
+	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
+	rseq_set_notify_resume(t);
+}
+
+/* rseq_migrate() requires preemption to be disabled. */
+static inline void rseq_migrate(struct task_struct *t)
+{
+	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
+	rseq_set_notify_resume(t);
+}
+
+/*
+ * If parent process has a registered restartable sequences area, the
+ * child inherits. Unregister rseq for a clone with CLONE_VM set.
+ */
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+	if (clone_flags & CLONE_VM) {
+		t->rseq = NULL;
+		t->rseq_len = 0;
+		t->rseq_sig = 0;
+		t->rseq_event_mask = 0;
+	} else {
+		t->rseq = current->rseq;
+		t->rseq_len = current->rseq_len;
+		t->rseq_sig = current->rseq_sig;
+		t->rseq_event_mask = current->rseq_event_mask;
+	}
+}
+
+static inline void rseq_execve(struct task_struct *t)
+{
+	t->rseq = NULL;
+	t->rseq_len = 0;
+	t->rseq_sig = 0;
+	t->rseq_event_mask = 0;
+}
+
+#else
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+}
+static inline void rseq_handle_notify_resume(struct ksignal *ksig,
+					     struct pt_regs *regs)
+{
+}
+static inline void rseq_signal_deliver(struct ksignal *ksig,
+				       struct pt_regs *regs)
+{
+}
+static inline void rseq_preempt(struct task_struct *t)
+{
+}
+static inline void rseq_migrate(struct task_struct *t)
+{
+}
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+}
+static inline void rseq_execve(struct task_struct *t)
+{
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_RSEQ
+
+void rseq_syscall(struct pt_regs *regs);
+
+#else
+
+static inline void rseq_syscall(struct pt_regs *regs)
+{
+}
+
+#endif
+
+#endif /* _LINUX_RSEQ_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dd002d193726..a588b94988bc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -35,7 +35,7 @@
 #include <linux/task_io_accounting.h>
 #include <linux/posix-timers_types.h>
 #include <linux/restart_block.h>
-#include <linux/rseq.h>
+#include <uapi/linux/rseq.h>
 #include <linux/seqlock_types.h>
 #include <linux/kcsan.h>
 #include <linux/rv.h>
@@ -2181,129 +2181,6 @@ static inline bool owner_on_cpu(struct task_struct *owner)
 unsigned long sched_cpu_util(int cpu);
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_RSEQ
-
-/*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
-enum rseq_event_mask_bits {
-	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
-	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
-	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
-};
-
-enum rseq_event_mask {
-	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
-	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
-	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
-};
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-	if (t->rseq)
-		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
-}
-
-void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
-
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
-					     struct pt_regs *regs)
-{
-	if (current->rseq)
-		__rseq_handle_notify_resume(ksig, regs);
-}
-
-static inline void rseq_signal_deliver(struct ksignal *ksig,
-				       struct pt_regs *regs)
-{
-	preempt_disable();
-	__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
-	preempt_enable();
-	rseq_handle_notify_resume(ksig, regs);
-}
-
-/* rseq_preempt() requires preemption to be disabled. */
-static inline void rseq_preempt(struct task_struct *t)
-{
-	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
-	rseq_set_notify_resume(t);
-}
-
-/* rseq_migrate() requires preemption to be disabled. */
-static inline void rseq_migrate(struct task_struct *t)
-{
-	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
-	rseq_set_notify_resume(t);
-}
-
-/*
- * If parent process has a registered restartable sequences area, the
- * child inherits. Unregister rseq for a clone with CLONE_VM set.
- */
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
-{
-	if (clone_flags & CLONE_VM) {
-		t->rseq = NULL;
-		t->rseq_len = 0;
-		t->rseq_sig = 0;
-		t->rseq_event_mask = 0;
-	} else {
-		t->rseq = current->rseq;
-		t->rseq_len = current->rseq_len;
-		t->rseq_sig = current->rseq_sig;
-		t->rseq_event_mask = current->rseq_event_mask;
-	}
-}
-
-static inline void rseq_execve(struct task_struct *t)
-{
-	t->rseq = NULL;
-	t->rseq_len = 0;
-	t->rseq_sig = 0;
-	t->rseq_event_mask = 0;
-}
-
-#else
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-}
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
-					     struct pt_regs *regs)
-{
-}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
-				       struct pt_regs *regs)
-{
-}
-static inline void rseq_preempt(struct task_struct *t)
-{
-}
-static inline void rseq_migrate(struct task_struct *t)
-{
-}
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
-{
-}
-static inline void rseq_execve(struct task_struct *t)
-{
-}
-
-#endif
-
-#ifdef CONFIG_DEBUG_RSEQ
-
-void rseq_syscall(struct pt_regs *regs);
-
-#else
-
-static inline void rseq_syscall(struct pt_regs *regs)
-{
-}
-
-#endif
-
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
 extern void sched_core_fork(struct task_struct *p);
-- 
cgit v1.2.3


From 2b010a69350f2c995f40585fb801904874c85dd1 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 16:04:03 -0500
Subject: preempt.h: Kill dependency on list.h

We really only need types.h, list.h is big.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/preempt.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 9aa6358a1a16..7233e9cf1bab 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -9,7 +9,7 @@
 
 #include <linux/linkage.h>
 #include <linux/cleanup.h>
-#include <linux/list.h>
+#include <linux/types.h>
 
 /*
  * We put the hardirq and softirq counter into the preemption
@@ -360,7 +360,9 @@ void preempt_notifier_unregister(struct preempt_notifier *notifier);
 static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 				     struct preempt_ops *ops)
 {
-	INIT_HLIST_NODE(&notifier->link);
+	/* INIT_HLIST_NODE() open coded, to avoid dependency on list.h */
+	notifier->link.next = NULL;
+	notifier->link.pprev = NULL;
 	notifier->ops = ops;
 }
 
-- 
cgit v1.2.3


From 1e2f2d31997a9496f99e2b43255d6a48b06fbcc2 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 15:51:54 -0500
Subject: Kill sched.h dependency on rcupdate.h

by moving cond_resched_rcu() to rcupdate_wait.h, we can kill another big
sched.h dependency.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/rcupdate_wait.h | 10 ++++++++++
 include/linux/sched.h         | 15 +++++----------
 include/linux/sched/task.h    |  1 +
 3 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h
index 5e0f74f2f8ca..d07f0848802e 100644
--- a/include/linux/rcupdate_wait.h
+++ b/include/linux/rcupdate_wait.h
@@ -8,6 +8,7 @@
 
 #include <linux/rcupdate.h>
 #include <linux/completion.h>
+#include <linux/sched.h>
 
 /*
  * Structure allowing asynchronous waiting on RCU.
@@ -55,4 +56,13 @@ do {									\
 #define synchronize_rcu_mult(...) \
 	_wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__)
 
+static inline void cond_resched_rcu(void)
+{
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+#endif
+}
+
 #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a588b94988bc..814bfdafbc1c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -10,9 +10,14 @@
 #include <uapi/linux/sched.h>
 
 #include <asm/current.h>
+#include <asm/processor.h>
+#include <linux/thread_info.h>
+#include <linux/preempt.h>
+#include <linux/cpumask.h>
 
 #include <linux/cache.h>
 #include <linux/irqflags_types.h>
+#include <linux/smp_types.h>
 #include <linux/pid_types.h>
 #include <linux/sem_types.h>
 #include <linux/shm.h>
@@ -23,7 +28,6 @@
 #include <linux/timer_types.h>
 #include <linux/seccomp_types.h>
 #include <linux/nodemask_types.h>
-#include <linux/rcupdate.h>
 #include <linux/refcount_types.h>
 #include <linux/resource.h>
 #include <linux/latencytop.h>
@@ -2059,15 +2063,6 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
 	__cond_resched_rwlock_write(lock);					\
 })
 
-static inline void cond_resched_rcu(void)
-{
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
-	rcu_read_unlock();
-	cond_resched();
-	rcu_read_lock();
-#endif
-}
-
 #ifdef CONFIG_PREEMPT_DYNAMIC
 
 extern bool preempt_model_none(void);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 1880ae21a9cb..538cdfbe895f 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -7,6 +7,7 @@
  * functionality:
  */
 
+#include <linux/rcupdate.h>
 #include <linux/refcount.h>
 #include <linux/sched.h>
 #include <linux/uaccess.h>
-- 
cgit v1.2.3


From d6b9f4e6f7fb589d8024a31cc4883d15d0c8def4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 27 Dec 2023 09:23:05 +0000
Subject: block: rename and document BLK_DEF_MAX_SECTORS

Give BLK_DEF_MAX_SECTORS a _CAP postfix and document what it is used for.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231227092305.279567-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bc236e77d85e..94701a63ad8a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1056,7 +1056,14 @@ enum blk_default_limits {
 	BLK_SEG_BOUNDARY_MASK	= 0xFFFFFFFFUL,
 };
 
-#define BLK_DEF_MAX_SECTORS 2560u
+/*
+ * Default upper limit for the software max_sectors limit used for
+ * regular file system I/O.  This can be increased through sysfs.
+ *
+ * Not to be confused with the max_hw_sector limit that is entirely
+ * controlled by the driver, usually based on hardware limits.
+ */
+#define BLK_DEF_MAX_SECTORS_CAP	2560u
 
 static inline unsigned long queue_segment_boundary(const struct request_queue *q)
 {
-- 
cgit v1.2.3


From 21d706d5cf570917594b21edee81893bdce09ab8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 9 Jul 2021 08:41:17 +0100
Subject: netfs: Add support for DIO buffering

Add a bvec array pointer and an iterator to netfs_io_request for either
holding a copy of a DIO iterator or a list of all the bits of buffer
pointed to by a DIO iterator.

There are two problems:  Firstly, if an iovec-class iov_iter is passed to
->read_iter() or ->write_iter(), this cannot be passed directly to
kernel_sendmsg() or kernel_recvmsg() as that may cause locking recursion if
a fault is generated, so we need to keep track of the pages involved
separately.

Secondly, if the I/O is asynchronous, we must copy the iov_iter describing
the buffer before returning to the caller as it may be immediately
deallocated.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 3da962e977f5..2bb1273b38f4 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -190,6 +190,8 @@ struct netfs_io_request {
 	struct iov_iter		iter;		/* Unencrypted-side iterator */
 	struct iov_iter		io_iter;	/* I/O (Encrypted-side) iterator */
 	void			*netfs_priv;	/* Private data for the netfs */
+	struct bio_vec		*direct_bv;	/* DIO buffer list (when handling iovec-iter) */
+	unsigned int		direct_bv_count; /* Number of elements in direct_bv[] */
 	unsigned int		debug_id;
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
@@ -197,6 +199,7 @@ struct netfs_io_request {
 	size_t			len;		/* Length of the request */
 	short			error;		/* 0 or error that occurred */
 	enum netfs_io_origin	origin;		/* Origin of the request */
+	bool			direct_bv_unpin; /* T if direct_bv[] must be unpinned */
 	loff_t			i_size;		/* Size of the file */
 	loff_t			start;		/* Start position */
 	pgoff_t			no_unlock_folio; /* Don't unlock this folio after read */
-- 
cgit v1.2.3


From 7d828a06634799aba0fa392913c7fe2953eb64a6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 22 Sep 2023 13:25:22 +0100
Subject: netfs: Provide tools to create a buffer in an xarray

Provide tools to create a buffer in an xarray, with a function to add new
folios with a mark.  This will be used to create bounce buffer and can be
used more easily to create a list of folios the span of which would require
more than a page's worth of bio_vec structs.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 2bb1273b38f4..c05365e3f428 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -109,6 +109,10 @@ static inline int wait_on_page_fscache_killable(struct page *page)
 	return folio_wait_private_2_killable(page_folio(page));
 }
 
+/* Marks used on xarray-based buffers */
+#define NETFS_BUF_PUT_MARK	XA_MARK_0	/* - Page needs putting  */
+#define NETFS_BUF_PAGECACHE_MARK XA_MARK_1	/* - Page needs wb/dirty flag wrangling */
+
 enum netfs_io_source {
 	NETFS_FILL_WITH_ZEROES,
 	NETFS_DOWNLOAD_FROM_SERVER,
-- 
cgit v1.2.3


From cae932d3aee55035a54415dcea8e7ecf2ec469b5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 22 Sep 2023 14:49:47 +0100
Subject: netfs: Add func to calculate pagecount/size-limited span of an
 iterator

Add a function to work out how much of an ITER_BVEC or ITER_XARRAY iterator
we can use in a pagecount-limited and size-limited span.  This will be
used, for example, to limit the number of segments in a subrequest to the
maximum number of elements that an RDMA transfer can handle.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index c05365e3f428..d673d0785b9d 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -321,6 +321,8 @@ void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
 ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
 				struct iov_iter *new,
 				iov_iter_extraction_t extraction_flags);
+size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
+			size_t max_size, size_t max_segs);
 
 int netfs_start_io_read(struct inode *inode);
 void netfs_end_io_read(struct inode *inode);
-- 
cgit v1.2.3


From 768ddb1eacf5dd997ecf393e7bab9796bad047e0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 27 May 2022 13:45:28 +0100
Subject: netfs: Limit subrequest by size or number of segments

Limit a subrequest to a maximum size and/or a maximum number of contiguous
physical regions.  This permits, for instance, an subreq's iterator to be
limited to the number of DMA'able segments that a large RDMA request can
handle.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index d673d0785b9d..44cd13ad695a 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -161,6 +161,7 @@ struct netfs_io_subrequest {
 	refcount_t		ref;
 	short			error;		/* 0 or error that occurred */
 	unsigned short		debug_index;	/* Index in list (for debugging output) */
+	unsigned int		max_nr_segs;	/* 0 or max number of segments in an iterator */
 	enum netfs_io_source	source;		/* Where to read from/write to */
 	unsigned long		flags;
 #define NETFS_SREQ_COPY_TO_CACHE	0	/* Set if should copy the data to the cache */
-- 
cgit v1.2.3


From 16af134ca4b7051b1587108f2066ec90ae029f74 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 9 Feb 2022 19:52:13 +0000
Subject: netfs: Extend the netfs_io_*request structs to handle writes

Modify the netfs_io_request struct to act as a point around which writes
can be coordinated.  It represents and pins a range of pages that need
writing and a list of regions of dirty data in that range of pages.

If RMW is required, the original data can be downloaded into the bounce
buffer, decrypted if necessary, the modifications made, then the modified
data can be reencrypted/recompressed and sent back to the server.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 44cd13ad695a..f302123a3e38 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -118,6 +118,9 @@ enum netfs_io_source {
 	NETFS_DOWNLOAD_FROM_SERVER,
 	NETFS_READ_FROM_CACHE,
 	NETFS_INVALID_READ,
+	NETFS_UPLOAD_TO_SERVER,
+	NETFS_WRITE_TO_CACHE,
+	NETFS_INVALID_WRITE,
 } __mode(byte);
 
 typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
@@ -149,9 +152,14 @@ struct netfs_cache_resources {
 };
 
 /*
- * Descriptor for a single component subrequest.
+ * Descriptor for a single component subrequest.  Each operation represents an
+ * individual read/write from/to a server, a cache, a journal, etc..
+ *
+ * The buffer iterator is persistent for the life of the subrequest struct and
+ * the pages it points to can be relied on to exist for the duration.
  */
 struct netfs_io_subrequest {
+	struct work_struct	work;
 	struct netfs_io_request *rreq;		/* Supervising I/O request */
 	struct list_head	rreq_link;	/* Link in rreq->subrequests */
 	struct iov_iter		io_iter;	/* Iterator for this subrequest */
@@ -176,6 +184,8 @@ enum netfs_io_origin {
 	NETFS_READAHEAD,		/* This read was triggered by readahead */
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
+	NETFS_WRITEBACK,		/* This write was triggered by writepages */
+	nr__netfs_io_origin
 } __mode(byte);
 
 /*
@@ -198,6 +208,7 @@ struct netfs_io_request {
 	struct bio_vec		*direct_bv;	/* DIO buffer list (when handling iovec-iter) */
 	unsigned int		direct_bv_count; /* Number of elements in direct_bv[] */
 	unsigned int		debug_id;
+	unsigned int		subreq_counter;	/* Next subreq->debug_index */
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
 	size_t			submitted;	/* Amount submitted for I/O so far */
@@ -216,6 +227,8 @@ struct netfs_io_request {
 #define NETFS_RREQ_DONT_UNLOCK_FOLIOS	3	/* Don't unlock the folios on completion */
 #define NETFS_RREQ_FAILED		4	/* The request failed */
 #define NETFS_RREQ_IN_PROGRESS		5	/* Unlocked when the request completes */
+#define NETFS_RREQ_WRITE_TO_CACHE	7	/* Need to write to the cache */
+#define NETFS_RREQ_UPLOAD_TO_SERVER	8	/* Need to write to the server */
 	const struct netfs_request_ops *netfs_ops;
 };
 
-- 
cgit v1.2.3


From c6dc54dd91bbf597942b4975b8adec660a16827d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 25 Feb 2022 12:27:53 +0000
Subject: netfs: Add a hook to allow tell the netfs to update its i_size

Add a hook for netfslib's write helpers to call to tell the network
filesystem that it should update its i_size.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index f302123a3e38..3fc41f616621 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -242,6 +242,7 @@ struct netfs_request_ops {
 	void (*free_request)(struct netfs_io_request *rreq);
 	void (*free_subrequest)(struct netfs_io_subrequest *rreq);
 
+	/* Read request handling */
 	void (*expand_readahead)(struct netfs_io_request *rreq);
 	bool (*clamp_length)(struct netfs_io_subrequest *subreq);
 	void (*issue_read)(struct netfs_io_subrequest *subreq);
@@ -249,6 +250,9 @@ struct netfs_request_ops {
 	int (*check_write_begin)(struct file *file, loff_t pos, unsigned len,
 				 struct folio **foliop, void **_fsdata);
 	void (*done)(struct netfs_io_request *rreq);
+
+	/* Modification handling */
+	void (*update_i_size)(struct inode *inode, loff_t i_size);
 };
 
 /*
-- 
cgit v1.2.3


From 9ebff83e648148b9ece97d4e4890dd84ca54d6ce Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 29 Sep 2023 17:28:25 +0100
Subject: netfs: Prep to use folio->private for write grouping and streaming
 write

Prepare to use folio->private to hold information write grouping and
streaming write.  These are implemented in the same commit as they both
make use of folio->private and will be both checked at the same time in
several places.

"Write grouping" involves ordering the writeback of groups of writes, such
as is needed for ceph snaps.  A group is represented by a
filesystem-supplied object which must contain a netfs_group struct.  This
contains just a refcount and a pointer to a destructor.

"Streaming write" is the storage of data in folios that are marked dirty,
but not uptodate, to avoid unnecessary reads of data.  This is represented
by a netfs_folio struct.  This contains the offset and length of the
modified region plus the otherwise displaced write grouping pointer.

The way folio->private is multiplexed is:

 (1) If private is NULL then neither is in operation on a dirty folio.

 (2) If private is set, with bit 0 clear, then this points to a group.

 (3) If private is set, with bit 0 set, then this points to a netfs_folio
     struct (with bit 0 AND'ed out).

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 3fc41f616621..cfba83e3e3d2 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -140,6 +140,47 @@ struct netfs_inode {
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 };
 
+/*
+ * A netfs group - for instance a ceph snap.  This is marked on dirty pages and
+ * pages marked with a group must be flushed before they can be written under
+ * the domain of another group.
+ */
+struct netfs_group {
+	refcount_t		ref;
+	void (*free)(struct netfs_group *netfs_group);
+};
+
+/*
+ * Information about a dirty page (attached only if necessary).
+ * folio->private
+ */
+struct netfs_folio {
+	struct netfs_group	*netfs_group;	/* Filesystem's grouping marker (or NULL). */
+	unsigned int		dirty_offset;	/* Write-streaming dirty data offset */
+	unsigned int		dirty_len;	/* Write-streaming dirty data length */
+};
+#define NETFS_FOLIO_INFO	0x1UL	/* OR'd with folio->private. */
+
+static inline struct netfs_folio *netfs_folio_info(struct folio *folio)
+{
+	void *priv = folio_get_private(folio);
+
+	if ((unsigned long)priv & NETFS_FOLIO_INFO)
+		return (struct netfs_folio *)((unsigned long)priv & ~NETFS_FOLIO_INFO);
+	return NULL;
+}
+
+static inline struct netfs_group *netfs_folio_group(struct folio *folio)
+{
+	struct netfs_folio *finfo;
+	void *priv = folio_get_private(folio);
+
+	finfo = netfs_folio_info(folio);
+	if (finfo)
+		return finfo->netfs_group;
+	return priv;
+}
+
 /*
  * Resources required to do operations on a cache.
  */
-- 
cgit v1.2.3


From 0e0f2dfe880fb19e4b15a7ca468623eb0b4ba586 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 29 Jun 2021 22:31:48 +0100
Subject: netfs: Dispatch write requests to process a writeback slice

Dispatch one or more write reqeusts to process a writeback slice, where a
slice is tailored more to logical block divisions within the file (such as
crypto blocks, an object layout or cache granules) than the protocol RPC
maximum capacity.

The dispatch doesn't happen until throttling allows, at which point the
entire writeback slice is processed and queued.  A slice may be written to
multiple destinations (one or more servers and the local cache) and the
writes to each destination might be split up along different lines.

The writeback slice holds the required folios pinned.  An iov_iter is
provided in netfs_write_request that describes the buffer to be used.  This
may be part of the pagecache, may have auxiliary padding pages attached or
may be a bounce buffer resulting from crypto or compression.  Consequently,
the filesystem must not twiddle the folio markings directly.

The following API is available to the filesystem:

 (1) The ->create_write_requests() method is called to ask the filesystem
     to create the requests it needs.  This is passed the writeback slice
     to be processed.

 (2) The filesystem should then call netfs_create_write_request() to create
     the requests it needs.

 (3) Once a request is initialised, netfs_queue_write_request() can be
     called to dispatch it asynchronously, if not completed immediately.

 (4) netfs_write_request_completed() should be called to note the
     completion of a request.

 (5) netfs_get_write_request() and netfs_put_write_request() are provided
     to refcount a request.  These take constants from the netfs_wreq_trace
     enum for logging into ftrace.

 (6) The ->free_write_request is method is called to ask the filesystem to
     clean up a request.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index cfba83e3e3d2..890a5d8b2299 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -249,6 +249,7 @@ struct netfs_io_request {
 	struct bio_vec		*direct_bv;	/* DIO buffer list (when handling iovec-iter) */
 	unsigned int		direct_bv_count; /* Number of elements in direct_bv[] */
 	unsigned int		debug_id;
+	unsigned int		wsize;		/* Maximum write size (0 for none) */
 	unsigned int		subreq_counter;	/* Next subreq->debug_index */
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
@@ -271,6 +272,7 @@ struct netfs_io_request {
 #define NETFS_RREQ_WRITE_TO_CACHE	7	/* Need to write to the cache */
 #define NETFS_RREQ_UPLOAD_TO_SERVER	8	/* Need to write to the server */
 	const struct netfs_request_ops *netfs_ops;
+	void (*cleanup)(struct netfs_io_request *req);
 };
 
 /*
@@ -294,6 +296,11 @@ struct netfs_request_ops {
 
 	/* Modification handling */
 	void (*update_i_size)(struct inode *inode, loff_t i_size);
+
+	/* Write request handling */
+	void (*create_write_requests)(struct netfs_io_request *wreq,
+				      loff_t start, size_t len);
+	void (*invalidate_cache)(struct netfs_io_request *wreq);
 };
 
 /*
@@ -382,6 +389,12 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
 				iov_iter_extraction_t extraction_flags);
 size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
 			size_t max_size, size_t max_segs);
+struct netfs_io_subrequest *netfs_create_write_request(
+	struct netfs_io_request *wreq, enum netfs_io_source dest,
+	loff_t start, size_t len, work_func_t worker);
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+				       bool was_async);
+void netfs_queue_write_request(struct netfs_io_subrequest *subreq);
 
 int netfs_start_io_read(struct inode *inode);
 void netfs_end_io_read(struct inode *inode);
-- 
cgit v1.2.3


From c38f4e96e605f17990e871214e6ea1496bc4e65f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 17 Jun 2021 13:09:21 +0100
Subject: netfs: Provide func to copy data to pagecache for buffered write

Provide a netfs write helper, netfs_perform_write() to buffer data to be
written in the pagecache and mark the modified folios dirty.

It will perform "streaming writes" for folios that aren't currently
resident, if possible, storing data in partially modified folios that are
marked dirty, but not uptodate.  It will also tag pages as belonging to
fs-specific write groups if so directed by the filesystem.

This is derived from generic_perform_write(), but doesn't use
->write_begin() and ->write_end(), having that logic rolled in instead.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 890a5d8b2299..70f578cf3715 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -367,6 +367,11 @@ struct netfs_cache_ops {
 			       loff_t *_data_start, size_t *_data_len);
 };
 
+/* High-level write API */
+ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
+			    struct netfs_group *netfs_group);
+
+/* Address operations API */
 struct readahead_control;
 void netfs_readahead(struct readahead_control *);
 int netfs_read_folio(struct file *, struct folio *);
-- 
cgit v1.2.3


From 016dc8516aec8719641e7aaaacd78d344759178e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Jan 2022 17:39:55 +0000
Subject: netfs: Implement unbuffered/DIO read support

Implement support for unbuffered and DIO reads in the netfs library,
utilising the existing read helper code to do block splitting and
individual queuing.  The code also handles extraction of the destination
buffer from the supplied iterator, allowing async unbuffered reads to take
place.

The read will be split up according to the rsize setting and, if supplied,
the ->clamp_length() method.  Note that the next subrequest will be issued
as soon as issue_op returns, without waiting for previous ones to finish.
The network filesystem needs to pause or handle queuing them if it doesn't
want to fire them all at the server simultaneously.

Once all the subrequests have finished, the state will be assessed and the
amount of data to be indicated as having being obtained will be
determined.  As the subrequests may finish in any order, if an intermediate
subrequest is short, any further subrequests may be copied into the buffer
and then abandoned.

In the future, this will also take care of doing an unbuffered read from
encrypted content, with the decryption being done by the library.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 70f578cf3715..7c1309568459 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -226,6 +226,7 @@ enum netfs_io_origin {
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
+	NETFS_DIO_READ,			/* This is a direct I/O read */
 	nr__netfs_io_origin
 } __mode(byte);
 
@@ -240,6 +241,7 @@ struct netfs_io_request {
 	};
 	struct inode		*inode;		/* The file being accessed */
 	struct address_space	*mapping;	/* The mapping being accessed */
+	struct kiocb		*iocb;		/* AIO completion vector */
 	struct netfs_cache_resources cache_resources;
 	struct list_head	proc_link;	/* Link in netfs_iorequests */
 	struct list_head	subrequests;	/* Contributory I/O operations */
@@ -249,12 +251,14 @@ struct netfs_io_request {
 	struct bio_vec		*direct_bv;	/* DIO buffer list (when handling iovec-iter) */
 	unsigned int		direct_bv_count; /* Number of elements in direct_bv[] */
 	unsigned int		debug_id;
+	unsigned int		rsize;		/* Maximum read size (0 for none) */
 	unsigned int		wsize;		/* Maximum write size (0 for none) */
 	unsigned int		subreq_counter;	/* Next subreq->debug_index */
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
 	size_t			submitted;	/* Amount submitted for I/O so far */
 	size_t			len;		/* Length of the request */
+	size_t			transferred;	/* Amount to be indicated as transferred */
 	short			error;		/* 0 or error that occurred */
 	enum netfs_io_origin	origin;		/* Origin of the request */
 	bool			direct_bv_unpin; /* T if direct_bv[] must be unpinned */
@@ -271,6 +275,8 @@ struct netfs_io_request {
 #define NETFS_RREQ_IN_PROGRESS		5	/* Unlocked when the request completes */
 #define NETFS_RREQ_WRITE_TO_CACHE	7	/* Need to write to the cache */
 #define NETFS_RREQ_UPLOAD_TO_SERVER	8	/* Need to write to the server */
+#define NETFS_RREQ_NONBLOCK		9	/* Don't block if possible (O_NONBLOCK) */
+#define NETFS_RREQ_BLOCKED		10	/* We blocked */
 	const struct netfs_request_ops *netfs_ops;
 	void (*cleanup)(struct netfs_io_request *req);
 };
@@ -367,6 +373,9 @@ struct netfs_cache_ops {
 			       loff_t *_data_start, size_t *_data_len);
 };
 
+/* High-level read API. */
+ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+
 /* High-level write API */
 ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 			    struct netfs_group *netfs_group);
-- 
cgit v1.2.3


From 153a9961b551101cd38e94e26cd92fbfd198b19b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 21 Feb 2022 11:38:17 +0000
Subject: netfs: Implement unbuffered/DIO write support

Implement support for unbuffered writes and direct I/O writes.  If the
write is misaligned with respect to the fscrypt block size, then RMW cycles
are performed if necessary.  DIO writes are a special case of unbuffered
writes with extra restriction imposed, such as block size alignment
requirements.

Also provide a field that can tell the code to add some extra space onto
the bounce buffer for use by the filesystem in the case of a
content-encrypted file.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 7c1309568459..e1dfd6775c2c 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -138,6 +138,7 @@ struct netfs_inode {
 	loff_t			remote_i_size;	/* Size of the remote file */
 	unsigned long		flags;
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
+#define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
 };
 
 /*
@@ -226,7 +227,9 @@ enum netfs_io_origin {
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
+	NETFS_UNBUFFERED_WRITE,		/* This is an unbuffered write */
 	NETFS_DIO_READ,			/* This is a direct I/O read */
+	NETFS_DIO_WRITE,		/* This is a direct I/O write */
 	nr__netfs_io_origin
 } __mode(byte);
 
@@ -379,6 +382,7 @@ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
 /* High-level write API */
 ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 			    struct netfs_group *netfs_group);
+ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from);
 
 /* Address operations API */
 struct readahead_control;
-- 
cgit v1.2.3


From 938e13a73b244278a3777f38fa915bd239b2efd2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 17 Jun 2021 13:09:21 +0100
Subject: netfs: Implement buffered write API

Institute a netfs write helper, netfs_file_write_iter(), to be pointed at
by the network filesystem ->write_iter() call.  Make it handled buffered
writes by calling the previously defined netfs_perform_write() to copy the
source data into the pagecache.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index e1dfd6775c2c..0948ecf69aa5 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -382,7 +382,10 @@ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
 /* High-level write API */
 ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 			    struct netfs_group *netfs_group);
+ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
+					 struct netfs_group *netfs_group);
 ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from);
+ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from);
 
 /* Address operations API */
 struct readahead_control;
-- 
cgit v1.2.3


From 102a7e2c598c22bd2621fa97eb1c93c89d469a12 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 15 Feb 2022 23:15:57 +0000
Subject: netfs: Allow buffered shared-writeable mmap through
 netfs_page_mkwrite()

Provide an entry point to delegate a filesystem's ->page_mkwrite() to.
This checks for conflicting writes, then attached any netfs-specific group
marking (e.g. ceph snap) to the page to be considered dirty.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 0948ecf69aa5..d7f324c7c22a 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -400,6 +400,10 @@ void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
 void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
 bool netfs_release_folio(struct folio *folio, gfp_t gfp);
 
+/* VMA operations API. */
+vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group);
+
+/* (Sub)request management API. */
 void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool);
 void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
 			  enum netfs_sreq_ref_trace what);
-- 
cgit v1.2.3


From 80645bd4aa33a5c325f11b8dc6b38b38410ad5c0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 11 Oct 2023 09:29:43 +0100
Subject: netfs: Provide netfs_file_read_iter()

Provide a top-level-ish function that can be pointed to directly by
->read_iter file op.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index d7f324c7c22a..19a41c437af3 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -378,6 +378,8 @@ struct netfs_cache_ops {
 
 /* High-level read API. */
 ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
 
 /* High-level write API */
 ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
-- 
cgit v1.2.3


From e0ace6ca98bef0d8d354040f13ffc0a498813ee9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 22 Nov 2023 17:18:17 +0000
Subject: netfs, cachefiles: Pass upper bound length to allow expansion

Make netfslib pass the maximum length to the ->prepare_write() op to tell
the cache how much it can expand the length of a write to.  This allows a
write to the server at the end of a file to be limited to a few bytes
whilst writing an entire block to the cache (something required by direct
I/O).

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 19a41c437af3..2856389f4694 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -261,6 +261,7 @@ struct netfs_io_request {
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
 	size_t			submitted;	/* Amount submitted for I/O so far */
 	size_t			len;		/* Length of the request */
+	size_t			upper_len;	/* Length can be extended to here */
 	size_t			transferred;	/* Amount to be indicated as transferred */
 	short			error;		/* 0 or error that occurred */
 	enum netfs_io_origin	origin;		/* Origin of the request */
@@ -357,8 +358,8 @@ struct netfs_cache_ops {
 	 * actually do.
 	 */
 	int (*prepare_write)(struct netfs_cache_resources *cres,
-			     loff_t *_start, size_t *_len, loff_t i_size,
-			     bool no_space_allocated_yet);
+			     loff_t *_start, size_t *_len, size_t upper_len,
+			     loff_t i_size, bool no_space_allocated_yet);
 
 	/* Prepare an on-demand read operation, shortening it to a cached/uncached
 	 * boundary as appropriate.
-- 
cgit v1.2.3


From 62c3b7481b9a108cb99ef9438dba66bb4738768b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 28 Sep 2023 11:46:49 +0100
Subject: netfs: Provide a writepages implementation

Provide an implementation of writepages for network filesystems to delegate
to.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 2856389f4694..86bb8cb7f8d0 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -397,6 +397,8 @@ int netfs_read_folio(struct file *, struct folio *);
 int netfs_write_begin(struct netfs_inode *, struct file *,
 		      struct address_space *, loff_t pos, unsigned int len,
 		      struct folio **, void **fsdata);
+int netfs_writepages(struct address_space *mapping,
+		     struct writeback_control *wbc);
 bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
 int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
 void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
-- 
cgit v1.2.3


From 4a79616cfb27d76947ea37f0336745ef929d56be Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 5 Oct 2023 16:52:58 +0100
Subject: netfs: Provide a launder_folio implementation

Provide a launder_folio implementation for netfslib.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 86bb8cb7f8d0..29c66acad925 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -227,6 +227,7 @@ enum netfs_io_origin {
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
+	NETFS_LAUNDER_WRITE,		/* This is triggered by ->launder_folio() */
 	NETFS_UNBUFFERED_WRITE,		/* This is an unbuffered write */
 	NETFS_DIO_READ,			/* This is a direct I/O read */
 	NETFS_DIO_WRITE,		/* This is a direct I/O write */
@@ -404,6 +405,7 @@ int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
 void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
 void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
 bool netfs_release_folio(struct folio *folio, gfp_t gfp);
+int netfs_launder_folio(struct folio *folio);
 
 /* VMA operations API. */
 vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group);
-- 
cgit v1.2.3


From 41d8e7673a7726cba57cb8112d81c89cfb6c3e35 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 12 Oct 2023 09:06:24 +0100
Subject: netfs: Implement a write-through caching option

Provide a flag whereby a filesystem may request that cifs_perform_write()
perform write-through caching.  This involves putting pages directly into
writeback rather than dirty and attaching them to a write operation as we
go.

Further, the writes being made are limited to the byte range being written
rather than whole folios being written.  This can be used by cifs, for
example, to deal with strict byte-range locking.

This can't be used with content encryption as that may require expansion of
the write RPC beyond the write being made.

This doesn't affect writes via mmap - those are written back in the normal
way; similarly failed writethrough writes are marked dirty and left to
writeback to retry.  Another option would be to simply invalidate them, but
the contents can be simultaneously accessed by read() and through mmap.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 29c66acad925..8a2dd882a781 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -139,6 +139,7 @@ struct netfs_inode {
 	unsigned long		flags;
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 #define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
+#define NETFS_ICTX_WRITETHROUGH	2		/* Write-through caching */
 };
 
 /*
@@ -227,6 +228,7 @@ enum netfs_io_origin {
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
+	NETFS_WRITETHROUGH,		/* This write was made by netfs_perform_write() */
 	NETFS_LAUNDER_WRITE,		/* This is triggered by ->launder_folio() */
 	NETFS_UNBUFFERED_WRITE,		/* This is an unbuffered write */
 	NETFS_DIO_READ,			/* This is a direct I/O read */
-- 
cgit v1.2.3


From 100ccd18bb41ea7abb4fbb419202c06079559501 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2023 13:39:02 +0000
Subject: netfs: Optimise away reads above the point at which there can be no
 data

Track the file position above which the server is not expected to have any
data (the "zero point") and preemptively assume that we can satisfy
requests by filling them with zeroes locally rather than attempting to
download them if they're over that line - even if we've written data back
to the server.  Assume that any data that was written back above that
position is held in the local cache.  Note that we have to split requests
that straddle the line.

Make use of this to optimise away some reads from the server.  We need to
set the zero point in the following circumstances:

 (1) When we see an extant remote inode and have no cache for it, we set
     the zero_point to i_size.

 (2) On local inode creation, we set zero_point to 0.

 (3) On local truncation down, we reduce zero_point to the new i_size if
     the new i_size is lower.

 (4) On local truncation up, we don't change zero_point.

 (5) On local modification, we don't change zero_point.

 (6) On remote invalidation, we set zero_point to the new i_size.

 (7) If stored data is discarded from the pagecache or culled from fscache,
     we must set zero_point above that if the data also got written to the
     server.

 (8) If dirty data is written back to the server, but not fscache, we must
     set zero_point above that.

 (9) If a direct I/O write is made, set zero_point above that.

Assuming the above, any read from the server at or above the zero_point
position will return all zeroes.

The zero_point value can be stored in the cache, provided the above rules
are applied to it by any code that culls part of the local cache.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 8a2dd882a781..852956aa3c4b 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -136,6 +136,8 @@ struct netfs_inode {
 	struct fscache_cookie	*cache;
 #endif
 	loff_t			remote_i_size;	/* Size of the remote file */
+	loff_t			zero_point;	/* Size after which we assume there's no data
+						 * on the server */
 	unsigned long		flags;
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 #define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
@@ -453,31 +455,44 @@ static inline struct netfs_inode *netfs_inode(struct inode *inode)
  * netfs_inode_init - Initialise a netfslib inode context
  * @ctx: The netfs inode to initialise
  * @ops: The netfs's operations list
+ * @use_zero_point: True to use the zero_point read optimisation
  *
  * Initialise the netfs library context struct.  This is expected to follow on
  * directly from the VFS inode struct.
  */
 static inline void netfs_inode_init(struct netfs_inode *ctx,
-				    const struct netfs_request_ops *ops)
+				    const struct netfs_request_ops *ops,
+				    bool use_zero_point)
 {
 	ctx->ops = ops;
 	ctx->remote_i_size = i_size_read(&ctx->inode);
+	ctx->zero_point = LLONG_MAX;
 	ctx->flags = 0;
 #if IS_ENABLED(CONFIG_FSCACHE)
 	ctx->cache = NULL;
 #endif
+	/* ->releasepage() drives zero_point */
+	if (use_zero_point) {
+		ctx->zero_point = ctx->remote_i_size;
+		mapping_set_release_always(ctx->inode.i_mapping);
+	}
 }
 
 /**
  * netfs_resize_file - Note that a file got resized
  * @ctx: The netfs inode being resized
  * @new_i_size: The new file size
+ * @changed_on_server: The change was applied to the server
  *
  * Inform the netfs lib that a file got resized so that it can adjust its state.
  */
-static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size)
+static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size,
+				     bool changed_on_server)
 {
-	ctx->remote_i_size = new_i_size;
+	if (changed_on_server)
+		ctx->remote_i_size = new_i_size;
+	if (new_i_size < ctx->zero_point)
+		ctx->zero_point = new_i_size;
 }
 
 /**
-- 
cgit v1.2.3


From c39e2ae3943d4ee278af4e1b1dcfd5946da1089b Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Thu, 28 Dec 2023 11:06:08 +0100
Subject: fs: fix __sb_write_started() kerneldoc formatting

When running 'make htmldocs', I see the following warning:

  Documentation/filesystems/api-summary:14: ./include/linux/fs.h:1659: WARNING: Definition list ends without a blank line; unexpected unindent.

The official guidance [1] seems to be to use lists, which will prevent
both the "unexpected unindent" warning as well as ensure that each line
is formatted on a separate line in the HTML output instead of being
all considered a single paragraph.

[1]: https://docs.kernel.org/doc-guide/kernel-doc.html#return-values

Fixes: 8802e580ee64 ("fs: create __sb_write_started() helper")
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Link: https://lore.kernel.org/r/20231228100608.3123987-1-vegard.nossum@oracle.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index db5d07e6e02e..473063f385e5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1650,9 +1650,9 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
  * @sb: the super we write to
  * @level: the freeze level
  *
- * > 0 sb freeze level is held
- *   0 sb freeze level is not held
- * < 0 !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
+ * * > 0 - sb freeze level is held
+ * *   0 - sb freeze level is not held
+ * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
  */
 static inline int __sb_write_started(const struct super_block *sb, int level)
 {
-- 
cgit v1.2.3


From 0b68ab50b8101a35b51fb9ec203cd988e47dbed3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Sat, 23 Dec 2023 14:53:47 +0100
Subject: sysctl: delete unused define SYSCTL_PERM_EMPTY_DIR
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It seems it was never used.

Fixes: 2f2665c13af4 ("sysctl: replace child with an enumeration")
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/sysctl.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 61b40ea81f4d..26a38161c28f 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -255,8 +255,6 @@ extern int unaligned_enabled;
 extern int unaligned_dump_stack;
 extern int no_unaligned_warning;
 
-#define SYSCTL_PERM_EMPTY_DIR	(1 << 0)
-
 #else /* CONFIG_SYSCTL */
 
 static inline void register_sysctl_init(const char *path, struct ctl_table *table)
-- 
cgit v1.2.3


From 561429807d50aad76f1205b0b1d7b4aacf365d4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Wed, 20 Dec 2023 22:23:35 +0100
Subject: sysctl: remove struct ctl_path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All usages of this struct have been removed from the kernel tree.

The struct is still referenced by scripts/check-sysctl-docs but that
script is broken anyways as it only supports the register_sysctl_paths()
API and not the currently used register_sysctl() one.

Fixes: 0199849acd07 ("sysctl: remove register_sysctl_paths()")
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Joel Granados <j.granados@samsung.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/sysctl.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 26a38161c28f..ee7d33b89e9e 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -210,11 +210,6 @@ struct ctl_table_root {
 	int (*permissions)(struct ctl_table_header *head, struct ctl_table *table);
 };
 
-/* struct ctl_path describes where in the hierarchy a table is added */
-struct ctl_path {
-	const char *procname;
-};
-
 #define register_sysctl(path, table)	\
 	register_sysctl_sz(path, table, ARRAY_SIZE(table))
 
-- 
cgit v1.2.3


From 4e814173a8c4f432fd068b1c796f0416328c9d99 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 18 Dec 2023 20:25:02 +0100
Subject: thermal: core: Fix thermal zone suspend-resume synchronization

There are 3 synchronization issues with thermal zone suspend-resume
during system-wide transitions:

 1. The resume code runs in a PM notifier which is invoked after user
    space has been thawed, so it can run concurrently with user space
    which can trigger a thermal zone device removal.  If that happens,
    the thermal zone resume code may use a stale pointer to the next
    list element and crash, because it does not hold thermal_list_lock
    while walking thermal_tz_list.

 2. The thermal zone resume code calls thermal_zone_device_init()
    outside the zone lock, so user space or an update triggered by
    the platform firmware may see an inconsistent state of a
    thermal zone leading to unexpected behavior.

 3. Clearing the in_suspend global variable in thermal_pm_notify()
    allows __thermal_zone_device_update() to continue for all thermal
    zones and it may as well run before the thermal_tz_list walk (or
    at any point during the list walk for that matter) and attempt to
    operate on a thermal zone that has not been resumed yet.  It may
    also race destructively with thermal_zone_device_init().

To address these issues, add thermal_list_lock locking to
thermal_pm_notify(), especially arount the thermal_tz_list,
make it call thermal_zone_device_init() back-to-back with
__thermal_zone_device_update() under the zone lock and replace
in_suspend with per-zone bool "suspend" indicators set and unset
under the given zone's lock.

Link: https://lore.kernel.org/linux-pm/20231218162348.69101-1-bo.ye@mediatek.com/
Reported-by: Bo Ye <bo.ye@mediatek.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/thermal.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 09f6eb82c191..d00622b64d50 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -152,6 +152,7 @@ struct thermal_cooling_device {
  * @node:	node in thermal_tz_list (in thermal_core.c)
  * @poll_queue:	delayed work for polling
  * @notify_event: Last notification event
+ * @suspended: thermal zone suspend indicator
  */
 struct thermal_zone_device {
 	int id;
@@ -185,6 +186,7 @@ struct thermal_zone_device {
 	struct list_head node;
 	struct delayed_work poll_queue;
 	enum thermal_notify_event notify_event;
+	bool suspended;
 };
 
 /**
-- 
cgit v1.2.3


From 753547de0daecbdbd1af3618987ddade325d9aaa Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Thu, 28 Dec 2023 11:36:03 +0100
Subject: linux/export: Ensure natural alignment of kcrctab array

The ___kcrctab section holds an array of 32-bit CRC values.
Add a .balign 4 to tell the linker the correct memory alignment.

Fixes: f3304ecd7f06 ("linux/export: use inline assembler to populate symbol CRCs")
Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/export-internal.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h
index cd253eb51d6c..d445705ac13c 100644
--- a/include/linux/export-internal.h
+++ b/include/linux/export-internal.h
@@ -64,6 +64,7 @@
 
 #define SYMBOL_CRC(sym, crc, sec)   \
 	asm(".section \"___kcrctab" sec "+" #sym "\",\"a\""	"\n" \
+	    ".balign 4"						"\n" \
 	    "__crc_" #sym ":"					"\n" \
 	    ".long " #crc					"\n" \
 	    ".previous"						"\n")
-- 
cgit v1.2.3


From 1271ca00aa7f9bb3fd94cb7ac8f654de71099580 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 19 Dec 2023 16:55:31 -0700
Subject: ethtool: reformat kerneldoc for struct ethtool_fec_stats

The kerneldoc comment for struct ethtool_fec_stats attempts to describe the
"total" and "lanes" fields of the ethtool_fec_stat substructure in a way
leading to these warnings:

  ./include/linux/ethtool.h:424: warning: Excess struct member 'lane' description in 'ethtool_fec_stats'
  ./include/linux/ethtool.h:424: warning: Excess struct member 'total' description in 'ethtool_fec_stats'

Reformat the comment to retain the information while eliminating the
warnings.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index cfcd952a1d4f..325e0778e937 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -411,8 +411,10 @@ struct ethtool_pause_stats {
  *	not entire FEC data blocks. This is a non-standard statistic.
  *	Reported to user space as %ETHTOOL_A_FEC_STAT_CORR_BITS.
  *
- * @lane: per-lane/PCS-instance counts as defined by the standard
- * @total: error counts for the entire port, for drivers incapable of reporting
+ * For each of the above fields, the two substructure members are:
+ *
+ * - @lanes: per-lane/PCS-instance counts as defined by the standard
+ * - @total: error counts for the entire port, for drivers incapable of reporting
  *	per-lane stats
  *
  * Drivers should fill in either only total or per-lane statistics, core
-- 
cgit v1.2.3


From a8c959402d4dd6823918b33828d79900ae58c700 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 20 Dec 2023 23:17:45 +0000
Subject: thermal: core: Add governor callback for thermal zone change

Add a new callback to the struct thermal_governor. It can be used for
updating governors when there is a change in the thermal zone internals,
e.g. thermal cooling device is bind to the thermal zone.

That makes possible to move some heavy operations like memory allocations
related to the number of cooling instances out of the throttle() callback.

Both callback code paths (throttle() and update_tz()) are protected with
the same thermal zone lock, which guaranties the consistency.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/thermal.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index d00622b64d50..4d96fefb2767 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -51,6 +51,8 @@ enum thermal_notify_event {
 	THERMAL_DEVICE_POWER_CAPABILITY_CHANGED, /* power capability changed */
 	THERMAL_TABLE_CHANGED, /* Thermal table(s) changed */
 	THERMAL_EVENT_KEEP_ALIVE, /* Request for user space handler to respond */
+	THERMAL_TZ_BIND_CDEV, /* Cooling dev is bind to the thermal zone */
+	THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */
 };
 
 /**
@@ -199,6 +201,8 @@ struct thermal_zone_device {
  *			thermal zone.
  * @throttle:	callback called for every trip point even if temperature is
  *		below the trip point temperature
+ * @update_tz:	callback called when thermal zone internals have changed, e.g.
+ *		thermal cooling instance was added/removed
  * @governor_list:	node in thermal_governor_list (in thermal_core.c)
  */
 struct thermal_governor {
@@ -207,6 +211,8 @@ struct thermal_governor {
 	void (*unbind_from_tz)(struct thermal_zone_device *tz);
 	int (*throttle)(struct thermal_zone_device *tz,
 			const struct thermal_trip *trip);
+	void (*update_tz)(struct thermal_zone_device *tz,
+			  enum thermal_notify_event reason);
 	struct list_head	governor_list;
 };
 
-- 
cgit v1.2.3


From bfc57bd1685981730bfe9802d9de7603a0a43bc4 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 20 Dec 2023 23:17:52 +0000
Subject: thermal/sysfs: Update governors when the 'weight' has changed

Support governors update when the thermal instance's weight has changed.
This allows to adjust internal state for the governor.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
[ rjw: Add two empty code lines aroung the locking ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/thermal.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 4d96fefb2767..9d0427da32af 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -53,6 +53,7 @@ enum thermal_notify_event {
 	THERMAL_EVENT_KEEP_ALIVE, /* Request for user space handler to respond */
 	THERMAL_TZ_BIND_CDEV, /* Cooling dev is bind to the thermal zone */
 	THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */
+	THERMAL_INSTANCE_WEIGHT_CHANGED, /* Thermal instance weight changed */
 };
 
 /**
-- 
cgit v1.2.3


From adef440691bab824e39c1b17382322d195e1fab0 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 6 Dec 2023 02:36:56 -0800
Subject: userfaultfd: UFFDIO_MOVE uABI

Implement the uABI of UFFDIO_MOVE ioctl.
UFFDIO_COPY performs ~20% better than UFFDIO_MOVE when the application
needs pages to be allocated [1]. However, with UFFDIO_MOVE, if pages are
available (in userspace) for recycling, as is usually the case in heap
compaction algorithms, then we can avoid the page allocation and memcpy
(done by UFFDIO_COPY). Also, since the pages are recycled in the
userspace, we avoid the need to release (via madvise) the pages back to
the kernel [2].

We see over 40% reduction (on a Google pixel 6 device) in the compacting
thread's completion time by using UFFDIO_MOVE vs.  UFFDIO_COPY.  This was
measured using a benchmark that emulates a heap compaction implementation
using userfaultfd (to allow concurrent accesses by application threads).
More details of the usecase are explained in [2].  Furthermore,
UFFDIO_MOVE enables moving swapped-out pages without touching them within
the same vma.  Today, it can only be done by mremap, however it forces
splitting the vma.

[1] https://lore.kernel.org/all/1425575884-2574-1-git-send-email-aarcange@redhat.com/
[2] https://lore.kernel.org/linux-mm/CA+EESO4uO84SSnBhArH4HvLNhaUQ5nZKNKXqxRCyjniNVjp0Aw@mail.gmail.com/

Update for the ioctl_userfaultfd(2)  manpage:

   UFFDIO_MOVE
       (Since Linux xxx)  Move a continuous memory chunk into the
       userfault registered range and optionally wake up the blocked
       thread. The source and destination addresses and the number of
       bytes to move are specified by the src, dst, and len fields of
       the uffdio_move structure pointed to by argp:

           struct uffdio_move {
               __u64 dst;    /* Destination of move */
               __u64 src;    /* Source of move */
               __u64 len;    /* Number of bytes to move */
               __u64 mode;   /* Flags controlling behavior of move */
               __s64 move;   /* Number of bytes moved, or negated error */
           };

       The following value may be bitwise ORed in mode to change the
       behavior of the UFFDIO_MOVE operation:

       UFFDIO_MOVE_MODE_DONTWAKE
              Do not wake up the thread that waits for page-fault
              resolution

       UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES
              Allow holes in the source virtual range that is being moved.
              When not specified, the holes will result in ENOENT error.
              When specified, the holes will be accounted as successfully
              moved memory. This is mostly useful to move hugepage aligned
              virtual regions without knowing if there are transparent
              hugepages in the regions or not, but preventing the risk of
              having to split the hugepage during the operation.

       The move field is used by the kernel to return the number of
       bytes that was actually moved, or an error (a negated errno-
       style value).  If the value returned in move doesn't match the
       value that was specified in len, the operation fails with the
       error EAGAIN.  The move field is output-only; it is not read by
       the UFFDIO_MOVE operation.

       The operation may fail for various reasons. Usually, remapping of
       pages that are not exclusive to the given process fail; once KSM
       might deduplicate pages or fork() COW-shares pages during fork()
       with child processes, they are no longer exclusive. Further, the
       kernel might only perform lightweight checks for detecting whether
       the pages are exclusive, and return -EBUSY in case that check fails.
       To make the operation more likely to succeed, KSM should be
       disabled, fork() should be avoided or MADV_DONTFORK should be
       configured for the source VMA before fork().

       This ioctl(2) operation returns 0 on success.  In this case, the
       entire area was moved.  On error, -1 is returned and errno is
       set to indicate the error.  Possible errors include:

       EAGAIN The number of bytes moved (i.e., the value returned in
              the move field) does not equal the value that was
              specified in the len field.

       EINVAL Either dst or len was not a multiple of the system page
              size, or the range specified by src and len or dst and len
              was invalid.

       EINVAL An invalid bit was specified in the mode field.

       ENOENT
              The source virtual memory range has unmapped holes and
              UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES is not set.

       EEXIST
              The destination virtual memory range is fully or partially
              mapped.

       EBUSY
              The pages in the source virtual memory range are either
              pinned or not exclusive to the process. The kernel might
              only perform lightweight checks for detecting whether the
              pages are exclusive. To make the operation more likely to
              succeed, KSM should be disabled, fork() should be avoided
              or MADV_DONTFORK should be configured for the source virtual
              memory area before fork().

       ENOMEM Allocating memory needed for the operation failed.

       ESRCH
              The target process has exited at the time of a UFFDIO_MOVE
              operation.

Link: https://lkml.kernel.org/r/20231206103702.3873743-3-surenb@google.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Nicolas Geoffray <ngeoffray@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: ZhangPeng <zhangpeng362@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h          |  5 +++++
 include/linux/userfaultfd_k.h | 11 +++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 3c2fc291b071..af6a32b6f3e7 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -121,6 +121,11 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
 	down_write(&anon_vma->root->rwsem);
 }
 
+static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
+{
+	return down_write_trylock(&anon_vma->root->rwsem);
+}
+
 static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
 {
 	up_write(&anon_vma->root->rwsem);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index f2dc19f40d05..e4056547fbe6 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -93,6 +93,17 @@ extern int mwriteprotect_range(struct mm_struct *dst_mm,
 extern long uffd_wp_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long len, bool enable_wp);
 
+/* move_pages */
+void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
+void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
+ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
+		   unsigned long dst_start, unsigned long src_start,
+		   unsigned long len, __u64 flags);
+int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
+			struct vm_area_struct *dst_vma,
+			struct vm_area_struct *src_vma,
+			unsigned long dst_addr, unsigned long src_addr);
+
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 					struct vm_userfaultfd_ctx vm_ctx)
-- 
cgit v1.2.3


From 96db66d9c8f3c1547325af01b1f328b85d6ee1b9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 11 Dec 2023 16:22:06 +0000
Subject: mm: convert ksm_might_need_to_copy() to work on folios

Patch series "Finish two folio conversions".

Most callers of page_add_new_anon_rmap() and
lru_cache_add_inactive_or_unevictable() have been converted to their folio
equivalents, but there are still a few stragglers.  There's a bit of
preparatory work in ksm and unuse_pte(), but after that it's pretty
mechanical.


This patch (of 9):

Accept a folio as an argument and return a folio result.  Removes a call
to compound_head() in do_swap_page(), and prevents folio & page from
getting out of sync in unuse_pte().

Reviewed-by: David Hildenbrand <david@redhat.com>
[willy@infradead.org: fix smatch warning]
  Link: https://lkml.kernel.org/r/ZXnPtblC6A1IkyAB@casper.infradead.org
[david@redhat.com: only adjust the page if the folio changed]
  Link: https://lkml.kernel.org/r/6a8f2110-fa91-4c10-9eae-88315309a6e3@redhat.com
Link: https://lkml.kernel.org/r/20231211162214.2146080-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231211162214.2146080-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ksm.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 4643d5244e77..401348e9f92b 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -76,7 +76,7 @@ static inline void ksm_exit(struct mm_struct *mm)
  * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
  * but what if the vma was unmerged while the page was swapped out?
  */
-struct page *ksm_might_need_to_copy(struct page *page,
+struct folio *ksm_might_need_to_copy(struct folio *folio,
 			struct vm_area_struct *vma, unsigned long addr);
 
 void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
@@ -129,10 +129,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 	return 0;
 }
 
-static inline struct page *ksm_might_need_to_copy(struct page *page,
+static inline struct folio *ksm_might_need_to_copy(struct folio *folio,
 			struct vm_area_struct *vma, unsigned long addr)
 {
-	return page;
+	return folio;
 }
 
 static inline void rmap_walk_ksm(struct folio *folio,
-- 
cgit v1.2.3


From cafa8e37a2ebd344ae0774324c21f46640bbaab3 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 11 Dec 2023 16:22:14 +0000
Subject: mm: remove page_add_new_anon_rmap and
 lru_cache_add_inactive_or_unevictable

All callers have now been converted to folio_add_new_anon_rmap() and
folio_add_lru_vma() so we can remove the wrapper.

Link: https://lkml.kernel.org/r/20231211162214.2146080-10-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 2 --
 include/linux/swap.h | 3 ---
 2 files changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index af6a32b6f3e7..0ae2bb0e77f5 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -197,8 +197,6 @@ typedef int __bitwise rmap_t;
 void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
 void page_add_anon_rmap(struct page *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
-		unsigned long address);
 void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 void page_add_file_rmap(struct page *, struct vm_area_struct *,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f6dd6575b905..3e1909087f6a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -397,9 +397,6 @@ void folio_deactivate(struct folio *folio);
 void folio_mark_lazyfree(struct folio *folio);
 extern void swap_setup(void);
 
-extern void lru_cache_add_inactive_or_unevictable(struct page *page,
-						struct vm_area_struct *vma);
-
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-- 
cgit v1.2.3


From 8ba2f844f050a82624ba3ad5146aa3c116f506f7 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Thu, 28 Dec 2023 09:45:46 +0000
Subject: mm/zswap: change per-cpu mutex and buffer to per-acomp_ctx

First of all, we need to rename acomp_ctx->dstmem field to buffer, since
we are now using for purposes other than compression.

Then we change per-cpu mutex and buffer to per-acomp_ctx, since them
belong to the acomp_ctx and are necessary parts when used in the
compress/decompress contexts.

So we can remove the old per-cpu mutex and dstmem.

Link: https://lkml.kernel.org/r/20231213-zswap-dstmem-v5-5-9382162bbf05@bytedance.com
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Chris Li <chrisl@kernel.org> (Google)
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cpuhotplug.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index efc0c0b07efb..c3e06e21766a 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -124,7 +124,6 @@ enum cpuhp_state {
 	CPUHP_ARM_BL_PREPARE,
 	CPUHP_TRACE_RB_PREPARE,
 	CPUHP_MM_ZS_PREPARE,
-	CPUHP_MM_ZSWP_MEM_PREPARE,
 	CPUHP_MM_ZSWP_POOL_PREPARE,
 	CPUHP_KVM_PPC_BOOK3S_PREPARE,
 	CPUHP_ZCOMP_PREPARE,
-- 
cgit v1.2.3


From 96c7b0b42239e7b8987b2664b458dc74e825f760 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 13 Dec 2023 21:58:30 +0000
Subject: mm: return the folio from __read_swap_cache_async()

Patch series "More swap folio conversions".

These all seem like fairly straightforward conversions to me.  A lot of
compound_head() calls get removed.  And page_swap_info(), which is nice.


This patch (of 13):

Move the folio->page conversion into the callers that actually want that.
Most of the callers are happier with the folio anyway.  If the
page_allocated boolean is set, the folio allocated is of order-0, so it is
safe to pass the page directly to swap_readpage().

Link: https://lkml.kernel.org/r/20231213215842.671461-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231213215842.671461-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/zswap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 08c240e16a01..e88572d4c720 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -34,7 +34,7 @@ void zswap_swapon(int type);
 void zswap_swapoff(int type);
 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
 void zswap_lruvec_state_init(struct lruvec *lruvec);
-void zswap_page_swapin(struct page *page);
+void zswap_folio_swapin(struct folio *folio);
 #else
 
 struct zswap_lruvec_state {};
@@ -54,7 +54,7 @@ static inline void zswap_swapon(int type) {}
 static inline void zswap_swapoff(int type) {}
 static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
 static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
-static inline void zswap_page_swapin(struct page *page) {}
+static inline void zswap_folio_swapin(struct folio *folio) {}
 #endif
 
 #endif /* _LINUX_ZSWAP_H */
-- 
cgit v1.2.3


From 3a61e6f668120ee2c7840b91891c858d575d07e2 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 13 Dec 2023 21:58:38 +0000
Subject: mm: convert swap_page_sector() to swap_folio_sector()

All callers have a folio, so pass it in.  Saves a couple of calls to
compound_head().

Link: https://lkml.kernel.org/r/20231213215842.671461-10-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3e1909087f6a..2d09e9b7ee70 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -493,7 +493,7 @@ struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
 extern void exit_swap_address_space(unsigned int type);
 extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
-sector_t swap_page_sector(struct page *page);
+sector_t swap_folio_sector(struct folio *folio);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
-- 
cgit v1.2.3


From 69fe7d67cb0c6eeab3d4c9a3bf950f9d12af4719 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 13 Dec 2023 21:58:40 +0000
Subject: mm: remove page_swap_info()

It's more efficient to get the swap_info_struct by calling
swp_swap_info() directly.

Link: https://lkml.kernel.org/r/20231213215842.671461-12-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2d09e9b7ee70..4db00ddad261 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -487,8 +487,7 @@ extern sector_t swapdev_block(int, pgoff_t);
 extern int __swap_count(swp_entry_t entry);
 extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry);
 extern int swp_swapcount(swp_entry_t entry);
-extern struct swap_info_struct *page_swap_info(struct page *);
-extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
+struct swap_info_struct *swp_swap_info(swp_entry_t entry);
 struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
 extern void exit_swap_address_space(unsigned int type);
-- 
cgit v1.2.3


From f099c961f4998ad7107b1c6a7d6efb225e9a4614 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Dec 2023 20:02:32 +0000
Subject: fs: remove clean_page_buffers()

Patch series "Clean up the writeback paths".

Most of these patches verge on the trivial, converting filesystems that
just use block_write_full_page() to use mpage_writepages().  But as we saw
with Christoph's earlier patchset, there can be some "interesting"
gotchas, and I clearly haven't tested the majority of filesystems I've
touched here.

Patches 3 & 4 get rid of a lot of stack usage on architectures with larger
page sizes; 1024 bytes on 64-bit systems with 64KiB pages.  It starts to
open the door to larger folio sizes on all architectures, but it's
certainly not enough yet.

Patch 14 is kind of trivial, but it's nice to get that simplification in.


This patch (of 14):

This function has been unused since the removal of bdev_write_page().

Link: https://lkml.kernel.org/r/20231215200245.748418-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231215200245.748418-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 5f23ee599889..94f6161eb45e 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -270,7 +270,6 @@ int generic_write_end(struct file *, struct address_space *,
 				loff_t, unsigned, unsigned,
 				struct page *, void *);
 void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to);
-void clean_page_buffers(struct page *page);
 int cont_write_begin(struct file *, struct address_space *, loff_t,
 			unsigned, struct page **, void **,
 			get_block_t *, loff_t *);
-- 
cgit v1.2.3


From 17bf23a981be9c6629198a76940c777eb5c8c521 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Dec 2023 20:02:44 +0000
Subject: fs: convert block_write_full_page to block_write_full_folio

Convert the function to be compatible with writepage_t so that it can be
passed to write_cache_pages() by blkdev.  This removes a call to
compound_head().  We can also remove the function export as both callers
are built-in.

Link: https://lkml.kernel.org/r/20231215200245.748418-14-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 94f6161eb45e..396b2adf24bf 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -252,8 +252,8 @@ void __bh_read_batch(int nr, struct buffer_head *bhs[],
  * address_spaces.
  */
 void block_invalidate_folio(struct folio *folio, size_t offset, size_t length);
-int block_write_full_page(struct page *page, get_block_t *get_block,
-				struct writeback_control *wbc);
+int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
+		void *get_block);
 int __block_write_full_folio(struct inode *inode, struct folio *folio,
 			get_block_t *get_block, struct writeback_control *wbc,
 			bh_end_io_t *handler);
-- 
cgit v1.2.3


From 14059f66a959c760467ea2041e165f412845bcb8 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Dec 2023 20:02:45 +0000
Subject: fs: remove the bh_end_io argument from __block_write_full_folio

All callers are passing end_buffer_async_write as this argument, so we can
hardcode references to it within __block_write_full_folio().  That lets us
make end_buffer_async_write() static.

Link: https://lkml.kernel.org/r/20231215200245.748418-15-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 396b2adf24bf..d78454a4dd1f 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -205,7 +205,6 @@ struct buffer_head *create_empty_buffers(struct folio *folio,
 		unsigned long blocksize, unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
-void end_buffer_async_write(struct buffer_head *bh, int uptodate);
 
 /* Things to do with buffers at mapping->private_list */
 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
@@ -255,8 +254,7 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length);
 int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
 		void *get_block);
 int __block_write_full_folio(struct inode *inode, struct folio *folio,
-			get_block_t *get_block, struct writeback_control *wbc,
-			bh_end_io_t *handler);
+		get_block_t *get_block, struct writeback_control *wbc);
 int block_read_full_folio(struct folio *, get_block_t *);
 bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
-- 
cgit v1.2.3


From 280ec6ccb6422aa4a04f9ac4216ddcf055acc95d Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:45 +0100
Subject: kasan: rename kasan_slab_free_mempool to kasan_mempool_poison_object

Patch series "kasan: save mempool stack traces".

This series updates KASAN to save alloc and free stack traces for
secondary-level allocators that cache and reuse allocations internally
instead of giving them back to the underlying allocator (e.g.  mempool).

As a part of this change, introduce and document a set of KASAN hooks:

bool kasan_mempool_poison_pages(struct page *page, unsigned int order);
void kasan_mempool_unpoison_pages(struct page *page, unsigned int order);
bool kasan_mempool_poison_object(void *ptr);
void kasan_mempool_unpoison_object(void *ptr, size_t size);

and use them in the mempool code.

Besides mempool, skbuff and io_uring also cache allocations and already
use KASAN hooks to poison those.  Their code is updated to use the new
mempool hooks.

The new hooks save alloc and free stack traces (for normal kmalloc and
slab objects; stack traces for large kmalloc objects and page_alloc are
not supported by KASAN yet), improve the readability of the users' code,
and also allow the users to prevent double-free and invalid-free bugs; see
the patches for the details.


This patch (of 21):

Rename kasan_slab_free_mempool to kasan_mempool_poison_object.

kasan_slab_free_mempool is a slightly confusing name: it is unclear
whether this function poisons the object when it is freed into mempool or
does something when the object is freed from mempool to the underlying
allocator.

The new name also aligns with other mempool-related KASAN hooks added in
the following patches in this series.

Link: https://lkml.kernel.org/r/cover.1703024586.git.andreyknvl@google.com
Link: https://lkml.kernel.org/r/c5618685abb7cdbf9fb4897f565e7759f601da84.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 72cb693b075b..6310435f528b 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -172,11 +172,11 @@ static __always_inline void kasan_kfree_large(void *ptr)
 		__kasan_kfree_large(ptr, _RET_IP_);
 }
 
-void __kasan_slab_free_mempool(void *ptr, unsigned long ip);
-static __always_inline void kasan_slab_free_mempool(void *ptr)
+void __kasan_mempool_poison_object(void *ptr, unsigned long ip);
+static __always_inline void kasan_mempool_poison_object(void *ptr)
 {
 	if (kasan_enabled())
-		__kasan_slab_free_mempool(ptr, _RET_IP_);
+		__kasan_mempool_poison_object(ptr, _RET_IP_);
 }
 
 void * __must_check __kasan_slab_alloc(struct kmem_cache *s,
@@ -256,7 +256,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init
 	return false;
 }
 static inline void kasan_kfree_large(void *ptr) {}
-static inline void kasan_slab_free_mempool(void *ptr) {}
+static inline void kasan_mempool_poison_object(void *ptr) {}
 static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
 				   gfp_t flags, bool init)
 {
-- 
cgit v1.2.3


From 9b94fe91099cbf05606151ef05bea9632666f5d5 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:46 +0100
Subject: kasan: move kasan_mempool_poison_object

Move kasan_mempool_poison_object after all slab-related KASAN hooks.

This is a preparatory change for the following patches in this series.

No functional changes.

Link: https://lkml.kernel.org/r/23ea215409f43c13cdf9ecc454501a264c107d67.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 6310435f528b..0d1f925c136d 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -172,13 +172,6 @@ static __always_inline void kasan_kfree_large(void *ptr)
 		__kasan_kfree_large(ptr, _RET_IP_);
 }
 
-void __kasan_mempool_poison_object(void *ptr, unsigned long ip);
-static __always_inline void kasan_mempool_poison_object(void *ptr)
-{
-	if (kasan_enabled())
-		__kasan_mempool_poison_object(ptr, _RET_IP_);
-}
-
 void * __must_check __kasan_slab_alloc(struct kmem_cache *s,
 				       void *object, gfp_t flags, bool init);
 static __always_inline void * __must_check kasan_slab_alloc(
@@ -219,6 +212,13 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
 	return (void *)object;
 }
 
+void __kasan_mempool_poison_object(void *ptr, unsigned long ip);
+static __always_inline void kasan_mempool_poison_object(void *ptr)
+{
+	if (kasan_enabled())
+		__kasan_mempool_poison_object(ptr, _RET_IP_);
+}
+
 /*
  * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for
  * the hardware tag-based mode that doesn't rely on compiler instrumentation.
@@ -256,7 +256,6 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init
 	return false;
 }
 static inline void kasan_kfree_large(void *ptr) {}
-static inline void kasan_mempool_poison_object(void *ptr) {}
 static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
 				   gfp_t flags, bool init)
 {
@@ -276,6 +275,7 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
 {
 	return (void *)object;
 }
+static inline void kasan_mempool_poison_object(void *ptr) {}
 static inline bool kasan_check_byte(const void *address)
 {
 	return true;
-- 
cgit v1.2.3


From 1bb843048d00050678c392dab87a15c8b756df6f Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:47 +0100
Subject: kasan: document kasan_mempool_poison_object

Add documentation comment for kasan_mempool_poison_object.

Link: https://lkml.kernel.org/r/af33ba8cabfa1ad731fe23a3f874bfc8d3b7fed4.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 0d1f925c136d..bbf6e2fa4ffd 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -213,6 +213,24 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
 }
 
 void __kasan_mempool_poison_object(void *ptr, unsigned long ip);
+/**
+ * kasan_mempool_poison_object - Check and poison a mempool slab allocation.
+ * @ptr: Pointer to the slab allocation.
+ *
+ * This function is intended for kernel subsystems that cache slab allocations
+ * to reuse them instead of freeing them back to the slab allocator (e.g.
+ * mempool).
+ *
+ * This function poisons a slab allocation without initializing its memory and
+ * without putting it into the quarantine (for the Generic mode).
+ *
+ * This function also performs checks to detect double-free and invalid-free
+ * bugs and reports them.
+ *
+ * This function operates on all slab allocations including large kmalloc
+ * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
+ * size > KMALLOC_MAX_SIZE).
+ */
 static __always_inline void kasan_mempool_poison_object(void *ptr)
 {
 	if (kasan_enabled())
-- 
cgit v1.2.3


From 2e7c954c11af96aa1e0566a706f22152ef91d759 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:48 +0100
Subject: kasan: add return value for kasan_mempool_poison_object

Add a return value for kasan_mempool_poison_object that lets the caller
know whether the allocation is affected by a double-free or an
invalid-free bug.  The caller can use this return value to stop operating
on the object.

Also introduce a check_page_allocation helper function to improve the code
readability.

Link: https://lkml.kernel.org/r/618af65273875fb9f56954285443279b15f1fcd9.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index bbf6e2fa4ffd..33387e254caa 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -212,7 +212,7 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
 	return (void *)object;
 }
 
-void __kasan_mempool_poison_object(void *ptr, unsigned long ip);
+bool __kasan_mempool_poison_object(void *ptr, unsigned long ip);
 /**
  * kasan_mempool_poison_object - Check and poison a mempool slab allocation.
  * @ptr: Pointer to the slab allocation.
@@ -225,16 +225,20 @@ void __kasan_mempool_poison_object(void *ptr, unsigned long ip);
  * without putting it into the quarantine (for the Generic mode).
  *
  * This function also performs checks to detect double-free and invalid-free
- * bugs and reports them.
+ * bugs and reports them. The caller can use the return value of this function
+ * to find out if the allocation is buggy.
  *
  * This function operates on all slab allocations including large kmalloc
  * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
  * size > KMALLOC_MAX_SIZE).
+ *
+ * Return: true if the allocation can be safely reused; false otherwise.
  */
-static __always_inline void kasan_mempool_poison_object(void *ptr)
+static __always_inline bool kasan_mempool_poison_object(void *ptr)
 {
 	if (kasan_enabled())
-		__kasan_mempool_poison_object(ptr, _RET_IP_);
+		return __kasan_mempool_poison_object(ptr, _RET_IP_);
+	return true;
 }
 
 /*
@@ -293,7 +297,10 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
 {
 	return (void *)object;
 }
-static inline void kasan_mempool_poison_object(void *ptr) {}
+static inline bool kasan_mempool_poison_object(void *ptr)
+{
+	return true;
+}
 static inline bool kasan_check_byte(const void *address)
 {
 	return true;
-- 
cgit v1.2.3


From 1956832753735b1c399b86b2c66cb7c317dc9f31 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:49 +0100
Subject: kasan: introduce kasan_mempool_unpoison_object

Introduce and document a kasan_mempool_unpoison_object hook.

This hook serves as a replacement for the generic kasan_unpoison_range
that the mempool code relies on right now.  mempool will be updated to use
the new hook in one of the following patches.

For now, define the new hook to be identical to kasan_unpoison_range.  One
of the following patches will update it to add stack trace collection.

Link: https://lkml.kernel.org/r/dae25f0e18ed8fd50efe509c5b71a0592de5c18d.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 33387e254caa..c5fe303bc1c2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -228,6 +228,9 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip);
  * bugs and reports them. The caller can use the return value of this function
  * to find out if the allocation is buggy.
  *
+ * Before the poisoned allocation can be reused, it must be unpoisoned via
+ * kasan_mempool_unpoison_object().
+ *
  * This function operates on all slab allocations including large kmalloc
  * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
  * size > KMALLOC_MAX_SIZE).
@@ -241,6 +244,32 @@ static __always_inline bool kasan_mempool_poison_object(void *ptr)
 	return true;
 }
 
+void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip);
+/**
+ * kasan_mempool_unpoison_object - Unpoison a mempool slab allocation.
+ * @ptr: Pointer to the slab allocation.
+ * @size: Size to be unpoisoned.
+ *
+ * This function is intended for kernel subsystems that cache slab allocations
+ * to reuse them instead of freeing them back to the slab allocator (e.g.
+ * mempool).
+ *
+ * This function unpoisons a slab allocation that was previously poisoned via
+ * kasan_mempool_poison_object() without initializing its memory. For the
+ * tag-based modes, this function does not assign a new tag to the allocation
+ * and instead restores the original tags based on the pointer value.
+ *
+ * This function operates on all slab allocations including large kmalloc
+ * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
+ * size > KMALLOC_MAX_SIZE).
+ */
+static __always_inline void kasan_mempool_unpoison_object(void *ptr,
+							  size_t size)
+{
+	if (kasan_enabled())
+		__kasan_mempool_unpoison_object(ptr, size, _RET_IP_);
+}
+
 /*
  * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for
  * the hardware tag-based mode that doesn't rely on compiler instrumentation.
@@ -301,6 +330,8 @@ static inline bool kasan_mempool_poison_object(void *ptr)
 {
 	return true;
 }
+static inline void kasan_mempool_unpoison_object(void *ptr, size_t size) {}
+
 static inline bool kasan_check_byte(const void *address)
 {
 	return true;
-- 
cgit v1.2.3


From f129c31039283df884913142b0f3797d64d3a9d6 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:50 +0100
Subject: kasan: introduce kasan_mempool_poison_pages

Introduce and document a kasan_mempool_poison_pages hook to be used by the
mempool code instead of kasan_poison_pages.

Compated to kasan_poison_pages, the new hook:

1. For the tag-based modes, skips checking and poisoning allocations that
   were not tagged due to sampling.

2. Checks for double-free and invalid-free bugs.

In the future, kasan_poison_pages can also be updated to handle #2, but
this is out-of-scope of this series.

Link: https://lkml.kernel.org/r/88dc7340cce28249abf789f6e0c792c317df9ba5.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index c5fe303bc1c2..de2a695ad34d 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -212,6 +212,29 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
 	return (void *)object;
 }
 
+bool __kasan_mempool_poison_pages(struct page *page, unsigned int order,
+				  unsigned long ip);
+/**
+ * kasan_mempool_poison_pages - Check and poison a mempool page allocation.
+ * @page: Pointer to the page allocation.
+ * @order: Order of the allocation.
+ *
+ * This function is intended for kernel subsystems that cache page allocations
+ * to reuse them instead of freeing them back to page_alloc (e.g. mempool).
+ *
+ * This function is similar to kasan_mempool_poison_object() but operates on
+ * page allocations.
+ *
+ * Return: true if the allocation can be safely reused; false otherwise.
+ */
+static __always_inline bool kasan_mempool_poison_pages(struct page *page,
+						       unsigned int order)
+{
+	if (kasan_enabled())
+		return __kasan_mempool_poison_pages(page, order, _RET_IP_);
+	return true;
+}
+
 bool __kasan_mempool_poison_object(void *ptr, unsigned long ip);
 /**
  * kasan_mempool_poison_object - Check and poison a mempool slab allocation.
@@ -326,6 +349,10 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
 {
 	return (void *)object;
 }
+static inline bool kasan_mempool_poison_pages(struct page *page, unsigned int order)
+{
+	return true;
+}
 static inline bool kasan_mempool_poison_object(void *ptr)
 {
 	return true;
-- 
cgit v1.2.3


From 9f41c59ae3163690868a32bd77e9e33c3bab555e Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:51 +0100
Subject: kasan: introduce kasan_mempool_unpoison_pages

Introduce and document a new kasan_mempool_unpoison_pages hook to be used
by the mempool code instead of kasan_unpoison_pages.

This hook is not functionally different from kasan_unpoison_pages, but
using it improves the mempool code readability.

Link: https://lkml.kernel.org/r/239bd9af6176f2cc59f5c25893eb36143184daff.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index de2a695ad34d..f8ebde384bd7 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -225,6 +225,9 @@ bool __kasan_mempool_poison_pages(struct page *page, unsigned int order,
  * This function is similar to kasan_mempool_poison_object() but operates on
  * page allocations.
  *
+ * Before the poisoned allocation can be reused, it must be unpoisoned via
+ * kasan_mempool_unpoison_pages().
+ *
  * Return: true if the allocation can be safely reused; false otherwise.
  */
 static __always_inline bool kasan_mempool_poison_pages(struct page *page,
@@ -235,6 +238,27 @@ static __always_inline bool kasan_mempool_poison_pages(struct page *page,
 	return true;
 }
 
+void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order,
+				    unsigned long ip);
+/**
+ * kasan_mempool_unpoison_pages - Unpoison a mempool page allocation.
+ * @page: Pointer to the page allocation.
+ * @order: Order of the allocation.
+ *
+ * This function is intended for kernel subsystems that cache page allocations
+ * to reuse them instead of freeing them back to page_alloc (e.g. mempool).
+ *
+ * This function unpoisons a page allocation that was previously poisoned by
+ * kasan_mempool_poison_pages() without zeroing the allocation's memory. For
+ * the tag-based modes, this function assigns a new tag to the allocation.
+ */
+static __always_inline void kasan_mempool_unpoison_pages(struct page *page,
+							 unsigned int order)
+{
+	if (kasan_enabled())
+		__kasan_mempool_unpoison_pages(page, order, _RET_IP_);
+}
+
 bool __kasan_mempool_poison_object(void *ptr, unsigned long ip);
 /**
  * kasan_mempool_poison_object - Check and poison a mempool slab allocation.
@@ -353,6 +377,7 @@ static inline bool kasan_mempool_poison_pages(struct page *page, unsigned int or
 {
 	return true;
 }
+static inline void kasan_mempool_unpoison_pages(struct page *page, unsigned int order) {}
 static inline bool kasan_mempool_poison_object(void *ptr)
 {
 	return true;
-- 
cgit v1.2.3


From b556a462eb8df6b6836c318d23f43409c40a7c7e Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:53 +0100
Subject: kasan: save free stack traces for slab mempools

Make kasan_mempool_poison_object save free stack traces for slab and
kmalloc mempools when the object is freed into the mempool.

Also simplify and rename ____kasan_slab_free to poison_slab_object and do
a few other reability changes.

Link: https://lkml.kernel.org/r/413a7c7c3344fb56809853339ffaabc9e4905e94.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index f8ebde384bd7..e636a00e26ba 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -268,8 +268,9 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip);
  * to reuse them instead of freeing them back to the slab allocator (e.g.
  * mempool).
  *
- * This function poisons a slab allocation without initializing its memory and
- * without putting it into the quarantine (for the Generic mode).
+ * This function poisons a slab allocation and saves a free stack trace for it
+ * without initializing the allocation's memory and without putting it into the
+ * quarantine (for the Generic mode).
  *
  * This function also performs checks to detect double-free and invalid-free
  * bugs and reports them. The caller can use the return value of this function
-- 
cgit v1.2.3


From 29d7355a9d05de9a6e38cc4d1146fb96c43853fb Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:56 +0100
Subject: kasan: save alloc stack traces for mempool

Update kasan_mempool_unpoison_object to properly poison the redzone and
save alloc strack traces for kmalloc and slab pools.

As a part of this change, split out and use a unpoison_slab_object helper
function from __kasan_slab_alloc.

[nathan@kernel.org: mark unpoison_slab_object() as static]
  Link: https://lkml.kernel.org/r/20231221180042.104694-1-andrey.konovalov@linux.dev
Link: https://lkml.kernel.org/r/05ad235da8347cfe14d496d01b2aaf074b4f607c.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index e636a00e26ba..7392c5d89b92 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -303,9 +303,10 @@ void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip);
  * mempool).
  *
  * This function unpoisons a slab allocation that was previously poisoned via
- * kasan_mempool_poison_object() without initializing its memory. For the
- * tag-based modes, this function does not assign a new tag to the allocation
- * and instead restores the original tags based on the pointer value.
+ * kasan_mempool_poison_object() and saves an alloc stack trace for it without
+ * initializing the allocation's memory. For the tag-based modes, this function
+ * does not assign a new tag to the allocation and instead restores the
+ * original tags based on the pointer value.
  *
  * This function operates on all slab allocations including large kmalloc
  * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
-- 
cgit v1.2.3


From 37dcc69ad17a008d2b720bdc39f070ef2a959430 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:59 +0100
Subject: mempool: introduce mempool_use_prealloc_only

Introduce a new mempool_alloc_preallocated API that asks the mempool to
only use the elements preallocated during the mempool's creation when
allocating and to not attempt allocating new ones from the underlying
allocator.

This API is required to test the KASAN poisoning/unpoisoning functionality
in KASAN tests, but it might be also useful on its own.

Link: https://lkml.kernel.org/r/a14d809dbdfd04cc33bcacc632fee2abd6b83c00.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>

Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempool.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 4aae6c06c5f2..7be1e32e6d42 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -51,6 +51,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
 extern void mempool_destroy(mempool_t *pool);
 extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
+extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc;
 extern void mempool_free(void *element, mempool_t *pool);
 
 /*
-- 
cgit v1.2.3


From 1ce9a0523938f87dd8505233cc3445f8e2d8dcee Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:29:03 +0100
Subject: kasan: rename and document kasan_(un)poison_object_data

Rename kasan_unpoison_object_data to kasan_unpoison_new_object and add a
documentation comment.  Do the same for kasan_poison_object_data.

The new names and the comments should suggest the users that these hooks
are intended for internal use by the slab allocator.

The following patch will remove non-slab-internal uses of these hooks.

No functional changes.

[andreyknvl@google.com: update references to renamed functions in comments]
  Link: https://lkml.kernel.org/r/20231221180637.105098-1-andrey.konovalov@linux.dev
Link: https://lkml.kernel.org/r/eab156ebbd635f9635ef67d1a4271f716994e628.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 7392c5d89b92..d49e3d4c099e 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -129,20 +129,39 @@ static __always_inline void kasan_poison_slab(struct slab *slab)
 		__kasan_poison_slab(slab);
 }
 
-void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
-static __always_inline void kasan_unpoison_object_data(struct kmem_cache *cache,
+void __kasan_unpoison_new_object(struct kmem_cache *cache, void *object);
+/**
+ * kasan_unpoison_new_object - Temporarily unpoison a new slab object.
+ * @cache: Cache the object belong to.
+ * @object: Pointer to the object.
+ *
+ * This function is intended for the slab allocator's internal use. It
+ * temporarily unpoisons an object from a newly allocated slab without doing
+ * anything else. The object must later be repoisoned by
+ * kasan_poison_new_object().
+ */
+static __always_inline void kasan_unpoison_new_object(struct kmem_cache *cache,
 							void *object)
 {
 	if (kasan_enabled())
-		__kasan_unpoison_object_data(cache, object);
+		__kasan_unpoison_new_object(cache, object);
 }
 
-void __kasan_poison_object_data(struct kmem_cache *cache, void *object);
-static __always_inline void kasan_poison_object_data(struct kmem_cache *cache,
+void __kasan_poison_new_object(struct kmem_cache *cache, void *object);
+/**
+ * kasan_unpoison_new_object - Repoison a new slab object.
+ * @cache: Cache the object belong to.
+ * @object: Pointer to the object.
+ *
+ * This function is intended for the slab allocator's internal use. It
+ * repoisons an object that was previously unpoisoned by
+ * kasan_unpoison_new_object() without doing anything else.
+ */
+static __always_inline void kasan_poison_new_object(struct kmem_cache *cache,
 							void *object)
 {
 	if (kasan_enabled())
-		__kasan_poison_object_data(cache, object);
+		__kasan_poison_new_object(cache, object);
 }
 
 void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
@@ -342,9 +361,9 @@ static inline bool kasan_unpoison_pages(struct page *page, unsigned int order,
 	return false;
 }
 static inline void kasan_poison_slab(struct slab *slab) {}
-static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
+static inline void kasan_unpoison_new_object(struct kmem_cache *cache,
 					void *object) {}
-static inline void kasan_poison_object_data(struct kmem_cache *cache,
+static inline void kasan_poison_new_object(struct kmem_cache *cache,
 					void *object) {}
 static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
 				const void *object)
-- 
cgit v1.2.3


From 91349f541e7daa6cce15e01e7ffe4fd63731ead9 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 22:19:53 +0100
Subject: lib/stackdepot: fix comment in include/linux/stackdepot.h

As stack traces can now be evicted from the stack depot, remove the
comment saying that they are never removed.

Link: https://lkml.kernel.org/r/0ebe712d91f8d302a8947d3c9e9123bc2b1b8440.1703020707.git.andreyknvl@google.com
Fixes: 108be8def46e ("lib/stackdepot: allow users to evict stack traces")
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/stackdepot.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index a6796f178913..adcbb8f23600 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -11,8 +11,6 @@
  * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free
  * stack traces often repeat, using stack depot allows to save about 100x space.
  *
- * Stack traces are never removed from the stack depot.
- *
  * Author: Alexander Potapenko <glider@google.com>
  * Copyright (C) 2016 Google, Inc.
  *
-- 
cgit v1.2.3


From 7fbb5e188248c50f737720825da1864ce42536d1 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Tue, 19 Dec 2023 21:41:23 -0800
Subject: mm: remove VM_EXEC requirement for THP eligibility

Commit e6be37b2e7bd ("mm/huge_memory.c: add missing read-only THP checking
in transparent_hugepage_enabled()") introduced the VM_EXEC requirement,
which is not strictly needed.

lld's default --rosegment option and GNU ld's -z separate-code option
(default on Linux/x86 since binutils 2.31) create a read-only PT_LOAD
segment without the PF_X flag, which should be eligible for THP.

Certain architectures support medium and large code models, where .lrodata
may be placed in a separate read-only PT_LOAD segment, which should be
eligible for THP as well.

Link: https://lkml.kernel.org/r/20231220054123.1266001-1-maskray@google.com
Signed-off-by: Fangrui Song <maskray@google.com>
Acked-by: Yang Shi <shy828301@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fa7a38a30fc6..5adb86af35fc 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -206,7 +206,6 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 	inode = vma->vm_file->f_inode;
 
 	return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) &&
-	       (vma->vm_flags & VM_EXEC) &&
 	       !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
 }
 
-- 
cgit v1.2.3


From 5ec8e8ea8b7783fab150cf86404fc38cb4db8800 Mon Sep 17 00:00:00 2001
From: Charan Teja Kalla <quic_charante@quicinc.com>
Date: Fri, 13 Oct 2023 18:34:27 +0530
Subject: mm/sparsemem: fix race in accessing memory_section->usage

The below race is observed on a PFN which falls into the device memory
region with the system memory configuration where PFN's are such that
[ZONE_NORMAL ZONE_DEVICE ZONE_NORMAL].  Since normal zone start and end
pfn contains the device memory PFN's as well, the compaction triggered
will try on the device memory PFN's too though they end up in NOP(because
pfn_to_online_page() returns NULL for ZONE_DEVICE memory sections).  When
from other core, the section mappings are being removed for the
ZONE_DEVICE region, that the PFN in question belongs to, on which
compaction is currently being operated is resulting into the kernel crash
with CONFIG_SPASEMEM_VMEMAP enabled.  The crash logs can be seen at [1].

compact_zone()			memunmap_pages
-------------			---------------
__pageblock_pfn_to_page
   ......
 (a)pfn_valid():
     valid_section()//return true
			      (b)__remove_pages()->
				  sparse_remove_section()->
				    section_deactivate():
				    [Free the array ms->usage and set
				     ms->usage = NULL]
     pfn_section_valid()
     [Access ms->usage which
     is NULL]

NOTE: From the above it can be said that the race is reduced to between
the pfn_valid()/pfn_section_valid() and the section deactivate with
SPASEMEM_VMEMAP enabled.

The commit b943f045a9af("mm/sparse: fix kernel crash with
pfn_section_valid check") tried to address the same problem by clearing
the SECTION_HAS_MEM_MAP with the expectation of valid_section() returns
false thus ms->usage is not accessed.

Fix this issue by the below steps:

a) Clear SECTION_HAS_MEM_MAP before freeing the ->usage.

b) RCU protected read side critical section will either return NULL
   when SECTION_HAS_MEM_MAP is cleared or can successfully access ->usage.

c) Free the ->usage with kfree_rcu() and set ms->usage = NULL.  No
   attempt will be made to access ->usage after this as the
   SECTION_HAS_MEM_MAP is cleared thus valid_section() return false.

Thanks to David/Pavan for their inputs on this patch.

[1] https://lore.kernel.org/linux-mm/994410bb-89aa-d987-1f50-f514903c55aa@quicinc.com/

On Snapdragon SoC, with the mentioned memory configuration of PFN's as
[ZONE_NORMAL ZONE_DEVICE ZONE_NORMAL], we are able to see bunch of
issues daily while testing on a device farm.

For this particular issue below is the log.  Though the below log is
not directly pointing to the pfn_section_valid(){ ms->usage;}, when we
loaded this dump on T32 lauterbach tool, it is pointing.

[  540.578056] Unable to handle kernel NULL pointer dereference at
virtual address 0000000000000000
[  540.578068] Mem abort info:
[  540.578070]   ESR = 0x0000000096000005
[  540.578073]   EC = 0x25: DABT (current EL), IL = 32 bits
[  540.578077]   SET = 0, FnV = 0
[  540.578080]   EA = 0, S1PTW = 0
[  540.578082]   FSC = 0x05: level 1 translation fault
[  540.578085] Data abort info:
[  540.578086]   ISV = 0, ISS = 0x00000005
[  540.578088]   CM = 0, WnR = 0
[  540.579431] pstate: 82400005 (Nzcv daif +PAN -UAO +TCO -DIT -SSBSBTYPE=--)
[  540.579436] pc : __pageblock_pfn_to_page+0x6c/0x14c
[  540.579454] lr : compact_zone+0x994/0x1058
[  540.579460] sp : ffffffc03579b510
[  540.579463] x29: ffffffc03579b510 x28: 0000000000235800 x27:000000000000000c
[  540.579470] x26: 0000000000235c00 x25: 0000000000000068 x24:ffffffc03579b640
[  540.579477] x23: 0000000000000001 x22: ffffffc03579b660 x21:0000000000000000
[  540.579483] x20: 0000000000235bff x19: ffffffdebf7e3940 x18:ffffffdebf66d140
[  540.579489] x17: 00000000739ba063 x16: 00000000739ba063 x15:00000000009f4bff
[  540.579495] x14: 0000008000000000 x13: 0000000000000000 x12:0000000000000001
[  540.579501] x11: 0000000000000000 x10: 0000000000000000 x9 :ffffff897d2cd440
[  540.579507] x8 : 0000000000000000 x7 : 0000000000000000 x6 :ffffffc03579b5b4
[  540.579512] x5 : 0000000000027f25 x4 : ffffffc03579b5b8 x3 :0000000000000001
[  540.579518] x2 : ffffffdebf7e3940 x1 : 0000000000235c00 x0 :0000000000235800
[  540.579524] Call trace:
[  540.579527]  __pageblock_pfn_to_page+0x6c/0x14c
[  540.579533]  compact_zone+0x994/0x1058
[  540.579536]  try_to_compact_pages+0x128/0x378
[  540.579540]  __alloc_pages_direct_compact+0x80/0x2b0
[  540.579544]  __alloc_pages_slowpath+0x5c0/0xe10
[  540.579547]  __alloc_pages+0x250/0x2d0
[  540.579550]  __iommu_dma_alloc_noncontiguous+0x13c/0x3fc
[  540.579561]  iommu_dma_alloc+0xa0/0x320
[  540.579565]  dma_alloc_attrs+0xd4/0x108

[quic_charante@quicinc.com: use kfree_rcu() in place of synchronize_rcu(), per David]
  Link: https://lkml.kernel.org/r/1698403778-20938-1-git-send-email-quic_charante@quicinc.com
Link: https://lkml.kernel.org/r/1697202267-23600-1-git-send-email-quic_charante@quicinc.com
Fixes: f46edbd1b151 ("mm/sparsemem: add helpers track active portions of a section at boot")
Signed-off-by: Charan Teja Kalla <quic_charante@quicinc.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ec73582e7d27..2efd3be484fd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1799,6 +1799,7 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
 #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
 
 struct mem_section_usage {
+	struct rcu_head rcu;
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
 #endif
@@ -1992,7 +1993,7 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
 {
 	int idx = subsection_map_index(pfn);
 
-	return test_bit(idx, ms->usage->subsection_map);
+	return test_bit(idx, READ_ONCE(ms->usage)->subsection_map);
 }
 #else
 static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
@@ -2016,6 +2017,7 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
 static inline int pfn_valid(unsigned long pfn)
 {
 	struct mem_section *ms;
+	int ret;
 
 	/*
 	 * Ensure the upper PAGE_SHIFT bits are clear in the
@@ -2029,13 +2031,19 @@ static inline int pfn_valid(unsigned long pfn)
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
 		return 0;
 	ms = __pfn_to_section(pfn);
-	if (!valid_section(ms))
+	rcu_read_lock();
+	if (!valid_section(ms)) {
+		rcu_read_unlock();
 		return 0;
+	}
 	/*
 	 * Traditionally early sections always returned pfn_valid() for
 	 * the entire section-sized span.
 	 */
-	return early_section(ms) || pfn_section_valid(ms, pfn);
+	ret = early_section(ms) || pfn_section_valid(ms, pfn);
+	rcu_read_unlock();
+
+	return ret;
 }
 #endif
 
-- 
cgit v1.2.3


From 5cb6674b694b84803cbee8bfccaa2bfdfeb6eae4 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 21 Dec 2023 21:04:44 +0100
Subject: mm, kasan: use KASAN_TAG_KERNEL instead of 0xff

Use the KASAN_TAG_KERNEL marco instead of open-coding 0xff in the mm code.
This macro is provided by include/linux/kasan-tags.h, which does not
include any other headers, so it's safe to include it into mm.h without
causing circular include dependencies.

Link: https://lkml.kernel.org/r/71db9087b0aebb6c4dccbc609cc0cd50621533c7.1703188911.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 1 +
 include/linux/mm.h    | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index d49e3d4c099e..dbb06d789e74 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -4,6 +4,7 @@
 
 #include <linux/bug.h>
 #include <linux/kasan-enabled.h>
+#include <linux/kasan-tags.h>
 #include <linux/kernel.h>
 #include <linux/static_key.h>
 #include <linux/types.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b72bf25a45cf..2563ffdb51bc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1815,7 +1815,7 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 
 static inline u8 page_kasan_tag(const struct page *page)
 {
-	u8 tag = 0xff;
+	u8 tag = KASAN_TAG_KERNEL;
 
 	if (kasan_enabled()) {
 		tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
@@ -1844,7 +1844,7 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag)
 static inline void page_kasan_tag_reset(struct page *page)
 {
 	if (kasan_enabled())
-		page_kasan_tag_set(page, 0xff);
+		page_kasan_tag_set(page, KASAN_TAG_KERNEL);
 }
 
 #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
-- 
cgit v1.2.3


From 9d5fafd5d882446999366f673ab06edba453f862 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:25 +0100
Subject: mm/rmap: rename hugepage_add* to hugetlb_add*

Patch series "mm/rmap: interface overhaul", v2.

This series overhauls the rmap interface, to get rid of the "bool
compound" / RMAP_COMPOUND parameter with the goal of making the interface
less error prone, more future proof, and more natural to extend to
"batching".  Also, this converts the interface to always consume
folio+subpage, which speeds up operations on large folios.

Further, this series adds PTE-batching variants for 4 rmap functions,
whereby only folio_add_anon_rmap_ptes() is used for batching in this
series when PTE-remapping a PMD-mapped THP.  folio_remove_rmap_ptes(),
folio_try_dup_anon_rmap_ptes() and folio_dup_file_rmap_ptes() will soon
come in handy[1,2].

This series performs a lot of folio conversion along the way.  Most of the
added LOC in the diff are only due to documentation.

As we're moving to a pte/pmd interface where we clearly express the
mapping granularity we are dealing with, we first get the remainder of
hugetlb out of the way, as it is special and expected to remain special:
it treats everything as a "single logical PTE" and only currently allows
entire mappings.

Even if we'd ever support partial mappings, I strongly assume the
interface and implementation will still differ heavily: hopefull we can
avoid working on subpages/subpage mapcounts completely and only add a
"count" parameter for them to enable batching.

New (extended) hugetlb interface that operates on entire folio:
 * hugetlb_add_new_anon_rmap() -> Already existed
 * hugetlb_add_anon_rmap() -> Already existed
 * hugetlb_try_dup_anon_rmap()
 * hugetlb_try_share_anon_rmap()
 * hugetlb_add_file_rmap()
 * hugetlb_remove_rmap()

New "ordinary" interface for small folios / THP::
 * folio_add_new_anon_rmap() -> Already existed
 * folio_add_anon_rmap_[pte|ptes|pmd]()
 * folio_try_dup_anon_rmap_[pte|ptes|pmd]()
 * folio_try_share_anon_rmap_[pte|pmd]()
 * folio_add_file_rmap_[pte|ptes|pmd]()
 * folio_dup_file_rmap_[pte|ptes|pmd]()
 * folio_remove_rmap_[pte|ptes|pmd]()

folio_add_new_anon_rmap() will always map at the largest granularity
possible (currently, a single PMD to cover a PMD-sized THP).  Could be
extended if ever required.

In the future, we might want "_pud" variants and eventually "_pmds"
variants for batching.

I ran some simple microbenchmarks on an Intel(R) Xeon(R) Silver 4210R:
measuring munmap(), fork(), cow, MADV_DONTNEED on each PTE ...  and PTE
remapping PMD-mapped THPs on 1 GiB of memory.

For small folios, there is barely a change (< 1% improvement for me).

For PTE-mapped THP:
* PTE-remapping a PMD-mapped THP is more than 10% faster.
* fork() is more than 4% faster.
* MADV_DONTNEED is 2% faster
* COW when writing only a single byte on a COW-shared PTE is 1% faster
* munmap() barely changes (< 1%).

[1] https://lkml.kernel.org/r/20230810103332.3062143-1-ryan.roberts@arm.com
[2] https://lkml.kernel.org/r/20231204105440.61448-1-ryan.roberts@arm.com


This patch (of 40):

Let's just call it "hugetlb_".

Yes, it's all already inconsistent and confusing because we have a lot of
"hugepage_" functions for legacy reasons.  But "hugetlb" cannot possibly
be confused with transparent huge pages, and it matches "hugetlb.c" and
"folio_test_hugetlb()".  So let's minimize confusion in rmap code.

Link: https://lkml.kernel.org/r/20231220224504.646757-1-david@redhat.com
Link: https://lkml.kernel.org/r/20231220224504.646757-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 0ae2bb0e77f5..36096ba69bdc 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -206,9 +206,9 @@ void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
 void page_remove_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
 
-void hugepage_add_anon_rmap(struct folio *, struct vm_area_struct *,
+void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
-void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
+void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 
 static inline void __page_dup_rmap(struct page *page, bool compound)
-- 
cgit v1.2.3


From e135826b2da0cf25305086dc9ac1e91718a148e1 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:26 +0100
Subject: mm/rmap: introduce and use hugetlb_remove_rmap()

hugetlb rmap handling differs quite a lot from "ordinary" rmap code.  For
example, hugetlb currently only supports entire mappings, and treats any
mapping as mapped using a single "logical PTE".  Let's move it out of the
way so we can overhaul our "ordinary" rmap.  implementation/interface.

Let's introduce and use hugetlb_remove_rmap() and remove the hugetlb code
from page_remove_rmap().  This effectively removes one check on the
small-folio path as well.

Add sanity checks that we end up with the right folios in the right
functions.

Note: all possible candidates that need care are page_remove_rmap() that
      pass compound=true.

Link: https://lkml.kernel.org/r/20231220224504.646757-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 36096ba69bdc..64ae6c4d7272 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -211,6 +211,13 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 
+static inline void hugetlb_remove_rmap(struct folio *folio)
+{
+	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+
+	atomic_dec(&folio->_entire_mapcount);
+}
+
 static inline void __page_dup_rmap(struct page *page, bool compound)
 {
 	if (compound) {
-- 
cgit v1.2.3


From 44887f39945519fa8405133b1acd098fda9c9746 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:27 +0100
Subject: mm/rmap: introduce and use hugetlb_add_file_rmap()

hugetlb rmap handling differs quite a lot from "ordinary" rmap code.  For
example, hugetlb currently only supports entire mappings, and treats any
mapping as mapped using a single "logical PTE".  Let's move it out of the
way so we can overhaul our "ordinary" rmap.  implementation/interface.

Right now we're using page_dup_file_rmap() in some cases where "ordinary"
rmap code would have used page_add_file_rmap().  So let's introduce and
use hugetlb_add_file_rmap() instead.  We won't be adding a
"hugetlb_dup_file_rmap()" functon for the fork() case, as it would be
doing the same: "dup" is just an optimization for "add".

What remains is a single page_dup_file_rmap() call in fork() code.

Add sanity checks that we end up with the right folios in the right
functions.

Link: https://lkml.kernel.org/r/20231220224504.646757-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 64ae6c4d7272..56900a16f41a 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -211,6 +211,14 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 
+static inline void hugetlb_add_file_rmap(struct folio *folio)
+{
+	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
+
+	atomic_inc(&folio->_entire_mapcount);
+}
+
 static inline void hugetlb_remove_rmap(struct folio *folio)
 {
 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
-- 
cgit v1.2.3


From ebe2e35ec0f256372c158a18de459fb60070b313 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:28 +0100
Subject: mm/rmap: introduce and use hugetlb_try_dup_anon_rmap()

hugetlb rmap handling differs quite a lot from "ordinary" rmap code.  For
example, hugetlb currently only supports entire mappings, and treats any
mapping as mapped using a single "logical PTE".  Let's move it out of the
way so we can overhaul our "ordinary" rmap.  implementation/interface.

So let's introduce and use hugetlb_try_dup_anon_rmap() to make all hugetlb
handling use dedicated hugetlb_* rmap functions.

Add sanity checks that we end up with the right folios in the right
functions.

Note that is_device_private_page() does not apply to hugetlb.

Link: https://lkml.kernel.org/r/20231220224504.646757-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h   | 12 +++++++++---
 include/linux/rmap.h | 18 ++++++++++++++++++
 2 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2563ffdb51bc..75bba6102825 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1964,15 +1964,21 @@ static inline bool page_maybe_dma_pinned(struct page *page)
  *
  * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq.
  */
-static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
-					  struct page *page)
+static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
+					  struct folio *folio)
 {
 	VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));
 
 	if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
 		return false;
 
-	return page_maybe_dma_pinned(page);
+	return folio_maybe_dma_pinned(folio);
+}
+
+static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
+					  struct page *page)
+{
+	return folio_needs_cow_for_dma(vma, page_folio(page));
 }
 
 /**
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 56900a16f41a..5f26752de945 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -211,6 +211,22 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 
+/* See page_try_dup_anon_rmap() */
+static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
+		struct vm_area_struct *vma)
+{
+	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+
+	if (PageAnonExclusive(&folio->page)) {
+		if (unlikely(folio_needs_cow_for_dma(vma, folio)))
+			return -EBUSY;
+		ClearPageAnonExclusive(&folio->page);
+	}
+	atomic_inc(&folio->_entire_mapcount);
+	return 0;
+}
+
 static inline void hugetlb_add_file_rmap(struct folio *folio)
 {
 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
@@ -228,6 +244,8 @@ static inline void hugetlb_remove_rmap(struct folio *folio)
 
 static inline void __page_dup_rmap(struct page *page, bool compound)
 {
+	VM_WARN_ON(folio_test_hugetlb(page_folio(page)));
+
 	if (compound) {
 		struct folio *folio = (struct folio *)page;
 
-- 
cgit v1.2.3


From 0c2ec32bf0b2f0d7ccb98c53ee5d255d68e73595 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:29 +0100
Subject: mm/rmap: introduce and use hugetlb_try_share_anon_rmap()

hugetlb rmap handling differs quite a lot from "ordinary" rmap code.  For
example, hugetlb currently only supports entire mappings, and treats any
mapping as mapped using a single "logical PTE".  Let's move it out of the
way so we can overhaul our "ordinary" rmap.  implementation/interface.

So let's introduce and use hugetlb_try_dup_anon_rmap() to make all hugetlb
handling use dedicated hugetlb_* rmap functions.

Add sanity checks that we end up with the right folios in the right
functions.

Note that try_to_unmap_one() does not need care.  Easy to spot because
among all that nasty hugetlb special-casing in that function, we're not
using set_huge_pte_at() on the anon path -- well, and that code assumes
that we would want to swapout.

Link: https://lkml.kernel.org/r/20231220224504.646757-6-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 5f26752de945..d6fefa0f0410 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -227,6 +227,30 @@ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
 	return 0;
 }
 
+/* See page_try_share_anon_rmap() */
+static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
+{
+	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);
+
+	/* Paired with the memory barrier in try_grab_folio(). */
+	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+		smp_mb();
+
+	if (unlikely(folio_maybe_dma_pinned(folio)))
+		return -EBUSY;
+	ClearPageAnonExclusive(&folio->page);
+
+	/*
+	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
+	 * gup_must_unshare().
+	 */
+	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+		smp_mb__after_atomic();
+	return 0;
+}
+
 static inline void hugetlb_add_file_rmap(struct folio *folio)
 {
 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
@@ -331,6 +355,7 @@ dup:
  */
 static inline int page_try_share_anon_rmap(struct page *page)
 {
+	VM_WARN_ON(folio_test_hugetlb(page_folio(page)));
 	VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
 
 	/* device private pages cannot get pinned via GUP. */
-- 
cgit v1.2.3


From 68f0320824fa59c5429cbc811e6c46e7a30ea32c Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:31 +0100
Subject: mm/rmap: convert folio_add_file_rmap_range() into
 folio_add_file_rmap_[pte|ptes|pmd]()

Let's get rid of the compound parameter and instead define explicitly
which mappings we're adding.  That is more future proof, easier to read
and harder to mess up.

Use an enum to express the granularity internally.  Make the compiler
always special-case on the granularity by using __always_inline.  Replace
the "compound" check by a switch-case that will be removed by the compiler
completely.

Add plenty of sanity checks with CONFIG_DEBUG_VM.  Replace the
folio_test_pmd_mappable() check by a config check in the caller and sanity
checks.  Convert the single user of folio_add_file_rmap_range().

While at it, consistently use "int" instead of "unisgned int" in rmap code
when dealing with mapcounts and the number of pages.

This function design can later easily be extended to PUDs and to batch
PMDs.  Note that for now we don't support anything bigger than PMD-sized
folios (as we cleanly separated hugetlb handling).  Sanity checks will
catch if that ever changes.

Next up is removing page_remove_rmap() along with its "compound" parameter
and smilarly converting all other rmap functions.

Link: https://lkml.kernel.org/r/20231220224504.646757-8-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d6fefa0f0410..3d86a76b2836 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -191,6 +191,44 @@ typedef int __bitwise rmap_t;
  */
 #define RMAP_COMPOUND		((__force rmap_t)BIT(1))
 
+/*
+ * Internally, we're using an enum to specify the granularity. We make the
+ * compiler emit specialized code for each granularity.
+ */
+enum rmap_level {
+	RMAP_LEVEL_PTE = 0,
+	RMAP_LEVEL_PMD,
+};
+
+static inline void __folio_rmap_sanity_checks(struct folio *folio,
+		struct page *page, int nr_pages, enum rmap_level level)
+{
+	/* hugetlb folios are handled separately. */
+	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
+	VM_WARN_ON_FOLIO(folio_test_large(folio) &&
+			 !folio_test_large_rmappable(folio), folio);
+
+	VM_WARN_ON_ONCE(nr_pages <= 0);
+	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
+	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);
+
+	switch (level) {
+	case RMAP_LEVEL_PTE:
+		break;
+	case RMAP_LEVEL_PMD:
+		/*
+		 * We don't support folios larger than a single PMD yet. So
+		 * when RMAP_LEVEL_PMD is set, we assume that we are creating
+		 * a single "entire" mapping of the folio.
+		 */
+		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
+		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
+		break;
+	default:
+		VM_WARN_ON_ONCE(true);
+	}
+}
+
 /*
  * rmap interfaces called when adding or removing pte of page
  */
@@ -201,8 +239,12 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 void page_add_file_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
-void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
-		struct vm_area_struct *, bool compound);
+void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
+		struct vm_area_struct *);
+#define folio_add_file_rmap_pte(folio, page, vma) \
+	folio_add_file_rmap_ptes(folio, page, 1, vma)
+void folio_add_file_rmap_pmd(struct folio *, struct page *,
+		struct vm_area_struct *);
 void page_remove_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
 
-- 
cgit v1.2.3


From be6e57cfabe99a5d3b3869103c4ea0ed4a9692d4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:36 +0100
Subject: mm/rmap: remove page_add_file_rmap()

All users are gone, let's remove it.

Link: https://lkml.kernel.org/r/20231220224504.646757-13-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 3d86a76b2836..6a4db6933e7d 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -237,8 +237,6 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
 void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
-void page_add_file_rmap(struct page *, struct vm_area_struct *,
-		bool compound);
 void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
 		struct vm_area_struct *);
 #define folio_add_file_rmap_pte(folio, page, vma) \
-- 
cgit v1.2.3


From 8bd5130070fbf2247a97c5361427a810522ac98a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:38 +0100
Subject: mm/rmap: introduce folio_add_anon_rmap_[pte|ptes|pmd]()

Let's mimic what we did with folio_add_file_rmap_*() so we can similarly
replace page_add_anon_rmap() next.

Make the compiler always special-case on the granularity by using
__always_inline.

For the PageAnonExclusive sanity checks, when adding a PMD mapping, we're
now also checking each individual subpage covered by that PMD, instead of
only the head page.

Note that the new functions ignore the RMAP_COMPOUND flag, which we will
remove as soon as page_add_anon_rmap() is gone.

Link: https://lkml.kernel.org/r/20231220224504.646757-15-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6a4db6933e7d..b5da3d86200e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -233,6 +233,12 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio,
  * rmap interfaces called when adding or removing pte of page
  */
 void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
+void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
+		struct vm_area_struct *, unsigned long address, rmap_t flags);
+#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
+	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
+void folio_add_anon_rmap_pmd(struct folio *, struct page *,
+		struct vm_area_struct *, unsigned long address, rmap_t flags);
 void page_add_anon_rmap(struct page *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
 void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
-- 
cgit v1.2.3


From 84f0169e6c8a613012722e0d63302f9da4a72099 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:45 +0100
Subject: mm/rmap: remove page_add_anon_rmap()

All users are gone, remove it and all traces.

Link: https://lkml.kernel.org/r/20231220224504.646757-22-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b5da3d86200e..fe7b5a8b0e75 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -239,8 +239,6 @@ void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
 	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
 void folio_add_anon_rmap_pmd(struct folio *, struct page *,
 		struct vm_area_struct *, unsigned long address, rmap_t flags);
-void page_add_anon_rmap(struct page *, struct vm_area_struct *,
-		unsigned long address, rmap_t flags);
 void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
-- 
cgit v1.2.3


From 0cae959e3abf19ba62805f6e6a8b42b6cd9ed3e3 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:46 +0100
Subject: mm/rmap: remove RMAP_COMPOUND

No longer used, let's remove it and clarify RMAP_NONE/RMAP_EXCLUSIVE a
bit.

Link: https://lkml.kernel.org/r/20231220224504.646757-23-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fe7b5a8b0e75..bf6cb79aa7a0 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -177,20 +177,14 @@ struct anon_vma *folio_get_anon_vma(struct folio *folio);
 typedef int __bitwise rmap_t;
 
 /*
- * No special request: if the page is a subpage of a compound page, it is
- * mapped via a PTE. The mapped (sub)page is possibly shared between processes.
+ * No special request: A mapped anonymous (sub)page is possibly shared between
+ * processes.
  */
 #define RMAP_NONE		((__force rmap_t)0)
 
-/* The (sub)page is exclusive to a single process. */
+/* The anonymous (sub)page is exclusive to a single process. */
 #define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))
 
-/*
- * The compound page is not mapped via PTEs, but instead via a single PMD and
- * should be accounted accordingly.
- */
-#define RMAP_COMPOUND		((__force rmap_t)BIT(1))
-
 /*
  * Internally, we're using an enum to specify the granularity. We make the
  * compiler emit specialized code for each granularity.
-- 
cgit v1.2.3


From b06dc281aa9901076898d4d0a7bde588f11bc204 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:47 +0100
Subject: mm/rmap: introduce folio_remove_rmap_[pte|ptes|pmd]()

Let's mimic what we did with folio_add_file_rmap_*() and
folio_add_anon_rmap_*() so we can similarly replace page_remove_rmap()
next.

Make the compiler always special-case on the granularity by using
__always_inline.

We're adding folio_remove_rmap_ptes() handling right away, as we want to
use that soon for batching rmap operations when unmapping PTE-mapped large
folios.

Link: https://lkml.kernel.org/r/20231220224504.646757-24-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bf6cb79aa7a0..57e045093f04 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -243,6 +243,12 @@ void folio_add_file_rmap_pmd(struct folio *, struct page *,
 		struct vm_area_struct *);
 void page_remove_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
+void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
+		struct vm_area_struct *);
+#define folio_remove_rmap_pte(folio, page, vma) \
+	folio_remove_rmap_ptes(folio, page, 1, vma)
+void folio_remove_rmap_pmd(struct folio *, struct page *,
+		struct vm_area_struct *);
 
 void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
-- 
cgit v1.2.3


From 4d8f7418e8ba36036c8486d92d9591c368ab9b85 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:56 +0100
Subject: mm/rmap: remove page_remove_rmap()

All callers are gone, let's remove it and some leftover traces.

Link: https://lkml.kernel.org/r/20231220224504.646757-33-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 57e045093f04..fef369e37039 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -241,8 +241,6 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
 	folio_add_file_rmap_ptes(folio, page, 1, vma)
 void folio_add_file_rmap_pmd(struct folio *, struct page *,
 		struct vm_area_struct *);
-void page_remove_rmap(struct page *, struct vm_area_struct *,
-		bool compound);
 void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
 		struct vm_area_struct *);
 #define folio_remove_rmap_pte(folio, page, vma) \
@@ -389,7 +387,7 @@ dup:
  *
  * This is similar to page_try_dup_anon_rmap(), however, not used during fork()
  * to duplicate a mapping, but instead to prepare for KSM or temporarily
- * unmapping a page (swap, migration) via page_remove_rmap().
+ * unmapping a page (swap, migration) via folio_remove_rmap_*().
  *
  * Marking the page shared can only fail if the page may be pinned; device
  * private pages cannot get pinned and consequently this function cannot fail.
-- 
cgit v1.2.3


From d8ef5e311d7bfde54b60ab45026f206eff31b2d2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:57 +0100
Subject: mm/rmap: convert page_dup_file_rmap() to
 folio_dup_file_rmap_[pte|ptes|pmd]()

Let's convert page_dup_file_rmap() like the other rmap functions.  As
there is only a single caller, convert that single caller right away and
remove page_dup_file_rmap().

Add folio_dup_file_rmap_ptes() right away, we want to perform rmap baching
during fork() soon.

Link: https://lkml.kernel.org/r/20231220224504.646757-34-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fef369e37039..7607f862e795 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -308,6 +308,60 @@ static inline void hugetlb_remove_rmap(struct folio *folio)
 	atomic_dec(&folio->_entire_mapcount);
 }
 
+static __always_inline void __folio_dup_file_rmap(struct folio *folio,
+		struct page *page, int nr_pages, enum rmap_level level)
+{
+	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
+
+	switch (level) {
+	case RMAP_LEVEL_PTE:
+		do {
+			atomic_inc(&page->_mapcount);
+		} while (page++, --nr_pages > 0);
+		break;
+	case RMAP_LEVEL_PMD:
+		atomic_inc(&folio->_entire_mapcount);
+		break;
+	}
+}
+
+/**
+ * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
+ * @folio:	The folio to duplicate the mappings of
+ * @page:	The first page to duplicate the mappings of
+ * @nr_pages:	The number of pages of which the mapping will be duplicated
+ *
+ * The page range of the folio is defined by [page, page + nr_pages)
+ *
+ * The caller needs to hold the page table lock.
+ */
+static inline void folio_dup_file_rmap_ptes(struct folio *folio,
+		struct page *page, int nr_pages)
+{
+	__folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE);
+}
+#define folio_dup_file_rmap_pte(folio, page) \
+	folio_dup_file_rmap_ptes(folio, page, 1)
+
+/**
+ * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
+ * @folio:	The folio to duplicate the mapping of
+ * @page:	The first page to duplicate the mapping of
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+static inline void folio_dup_file_rmap_pmd(struct folio *folio,
+		struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	__folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PTE);
+#else
+	WARN_ON_ONCE(true);
+#endif
+}
+
 static inline void __page_dup_rmap(struct page *page, bool compound)
 {
 	VM_WARN_ON(folio_test_hugetlb(page_folio(page)));
@@ -322,11 +376,6 @@ static inline void __page_dup_rmap(struct page *page, bool compound)
 	}
 }
 
-static inline void page_dup_file_rmap(struct page *page, bool compound)
-{
-	__page_dup_rmap(page, compound);
-}
-
 /**
  * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped
  *			    anonymous page
-- 
cgit v1.2.3


From 61d90309b7156d54c5d358cb5d8bf55b33d233d2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:58 +0100
Subject: mm/rmap: introduce folio_try_dup_anon_rmap_[pte|ptes|pmd]()

The last user of page_needs_cow_for_dma() and __page_dup_rmap() are gone,
remove them.

Add folio_try_dup_anon_rmap_ptes() right away, we want to perform rmap
baching during fork() soon.

Link: https://lkml.kernel.org/r/20231220224504.646757-35-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h   |   6 ---
 include/linux/rmap.h | 150 ++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 106 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 75bba6102825..896c0079f64f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1975,12 +1975,6 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
 	return folio_maybe_dma_pinned(folio);
 }
 
-static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
-					  struct page *page)
-{
-	return folio_needs_cow_for_dma(vma, page_folio(page));
-}
-
 /**
  * is_zero_page - Query if a page is a zero page
  * @page: The page to query
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 7607f862e795..850aa74b6724 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -362,68 +362,130 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio,
 #endif
 }
 
-static inline void __page_dup_rmap(struct page *page, bool compound)
+static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
+		struct page *page, int nr_pages, struct vm_area_struct *src_vma,
+		enum rmap_level level)
 {
-	VM_WARN_ON(folio_test_hugetlb(page_folio(page)));
+	bool maybe_pinned;
+	int i;
+
+	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
 
-	if (compound) {
-		struct folio *folio = (struct folio *)page;
+	/*
+	 * If this folio may have been pinned by the parent process,
+	 * don't allow to duplicate the mappings but instead require to e.g.,
+	 * copy the subpage immediately for the child so that we'll always
+	 * guarantee the pinned folio won't be randomly replaced in the
+	 * future on write faults.
+	 */
+	maybe_pinned = likely(!folio_is_device_private(folio)) &&
+		       unlikely(folio_needs_cow_for_dma(src_vma, folio));
 
-		VM_BUG_ON_PAGE(compound && !PageHead(page), page);
+	/*
+	 * No need to check+clear for already shared PTEs/PMDs of the
+	 * folio. But if any page is PageAnonExclusive, we must fallback to
+	 * copying if the folio maybe pinned.
+	 */
+	switch (level) {
+	case RMAP_LEVEL_PTE:
+		if (unlikely(maybe_pinned)) {
+			for (i = 0; i < nr_pages; i++)
+				if (PageAnonExclusive(page + i))
+					return -EBUSY;
+		}
+		do {
+			if (PageAnonExclusive(page))
+				ClearPageAnonExclusive(page);
+			atomic_inc(&page->_mapcount);
+		} while (page++, --nr_pages > 0);
+		break;
+	case RMAP_LEVEL_PMD:
+		if (PageAnonExclusive(page)) {
+			if (unlikely(maybe_pinned))
+				return -EBUSY;
+			ClearPageAnonExclusive(page);
+		}
 		atomic_inc(&folio->_entire_mapcount);
-	} else {
-		atomic_inc(&page->_mapcount);
+		break;
 	}
+	return 0;
 }
 
 /**
- * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped
- *			    anonymous page
- * @page: the page to duplicate the mapping for
- * @compound: the page is mapped as compound or as a small page
- * @vma: the source vma
+ * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
+ *				  of a folio
+ * @folio:	The folio to duplicate the mappings of
+ * @page:	The first page to duplicate the mappings of
+ * @nr_pages:	The number of pages of which the mapping will be duplicated
+ * @src_vma:	The vm area from which the mappings are duplicated
  *
- * The caller needs to hold the PT lock and the vma->vma_mm->write_protect_seq.
+ * The page range of the folio is defined by [page, page + nr_pages)
  *
- * Duplicating the mapping can only fail if the page may be pinned; device
- * private pages cannot get pinned and consequently this function cannot fail.
+ * The caller needs to hold the page table lock and the
+ * vma->vma_mm->write_protect_seq.
+ *
+ * Duplicating the mappings can only fail if the folio may be pinned; device
+ * private folios cannot get pinned and consequently this function cannot fail
+ * for them.
+ *
+ * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
+ * the parent and the child. They must *not* be writable after this call
+ * succeeded.
+ *
+ * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
+ */
+static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
+		struct page *page, int nr_pages, struct vm_area_struct *src_vma)
+{
+	return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma,
+					 RMAP_LEVEL_PTE);
+}
+#define folio_try_dup_anon_rmap_pte(folio, page, vma) \
+	folio_try_dup_anon_rmap_ptes(folio, page, 1, vma)
+
+/**
+ * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
+ *				 of a folio
+ * @folio:	The folio to duplicate the mapping of
+ * @page:	The first page to duplicate the mapping of
+ * @src_vma:	The vm area from which the mapping is duplicated
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
  *
- * If duplicating the mapping succeeds, the page has to be mapped R/O into
- * the parent and the child. It must *not* get mapped writable after this call.
+ * The caller needs to hold the page table lock and the
+ * vma->vma_mm->write_protect_seq.
+ *
+ * Duplicating the mapping can only fail if the folio may be pinned; device
+ * private folios cannot get pinned and consequently this function cannot fail
+ * for them.
+ *
+ * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
+ * the parent and the child. They must *not* be writable after this call
+ * succeeded.
  *
  * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
  */
+static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
+		struct page *page, struct vm_area_struct *src_vma)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma,
+					 RMAP_LEVEL_PMD);
+#else
+	WARN_ON_ONCE(true);
+	return -EBUSY;
+#endif
+}
+
 static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
 					 struct vm_area_struct *vma)
 {
-	VM_BUG_ON_PAGE(!PageAnon(page), page);
-
-	/*
-	 * No need to check+clear for already shared pages, including KSM
-	 * pages.
-	 */
-	if (!PageAnonExclusive(page))
-		goto dup;
-
-	/*
-	 * If this page may have been pinned by the parent process,
-	 * don't allow to duplicate the mapping but instead require to e.g.,
-	 * copy the page immediately for the child so that we'll always
-	 * guarantee the pinned page won't be randomly replaced in the
-	 * future on write faults.
-	 */
-	if (likely(!is_device_private_page(page)) &&
-	    unlikely(page_needs_cow_for_dma(vma, page)))
-		return -EBUSY;
+	struct folio *folio = page_folio(page);
 
-	ClearPageAnonExclusive(page);
-	/*
-	 * It's okay to share the anon page between both processes, mapping
-	 * the page R/O into both processes.
-	 */
-dup:
-	__page_dup_rmap(page, compound);
-	return 0;
+	if (likely(!compound))
+		return folio_try_dup_anon_rmap_pte(folio, page, vma);
+	return folio_try_dup_anon_rmap_pmd(folio, page, vma);
 }
 
 /**
-- 
cgit v1.2.3


From a13d096471ec0ac5c6fc90fbcd57e8430024046a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:45:01 +0100
Subject: mm/rmap: remove page_try_dup_anon_rmap()

All users are gone, remove page_try_dup_anon_rmap() and any remaining
traces.

Link: https://lkml.kernel.org/r/20231220224504.646757-38-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 850aa74b6724..0ad2ea2734e4 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -253,7 +253,7 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 
-/* See page_try_dup_anon_rmap() */
+/* See folio_try_dup_anon_rmap_*() */
 static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
 		struct vm_area_struct *vma)
 {
@@ -478,16 +478,6 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
 #endif
 }
 
-static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
-					 struct vm_area_struct *vma)
-{
-	struct folio *folio = page_folio(page);
-
-	if (likely(!compound))
-		return folio_try_dup_anon_rmap_pte(folio, page, vma);
-	return folio_try_dup_anon_rmap_pmd(folio, page, vma);
-}
-
 /**
  * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly
  *			      shared to prepare for KSM or temporary unmapping
@@ -496,8 +486,8 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
  * The caller needs to hold the PT lock and has to have the page table entry
  * cleared/invalidated.
  *
- * This is similar to page_try_dup_anon_rmap(), however, not used during fork()
- * to duplicate a mapping, but instead to prepare for KSM or temporarily
+ * This is similar to folio_try_dup_anon_rmap_*(), however, not used during
+ * fork() to duplicate a mapping, but instead to prepare for KSM or temporarily
  * unmapping a page (swap, migration) via folio_remove_rmap_*().
  *
  * Marking the page shared can only fail if the page may be pinned; device
-- 
cgit v1.2.3


From e3b4b1374f87c71e9309efc6149f113cdd17af72 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:45:02 +0100
Subject: mm: convert page_try_share_anon_rmap() to
 folio_try_share_anon_rmap_[pte|pmd]()

Let's convert it like we converted all the other rmap functions.  Don't
introduce folio_try_share_anon_rmap_ptes() for now, as we don't have a
user that wants rmap batching in sight.  Pretty easy to add later.

All users are easy to convert -- only ksm.c doesn't use folios yet but
that is left for future work -- so let's just do it in a single shot.

While at it, turn the BUG_ON into a WARN_ON_ONCE.

Note that page_try_share_anon_rmap() so far didn't care about pte/pmd
mappings (no compound parameter).  We're changing that so we can perform
better sanity checks and make the code actually more readable/consistent.
For example, __folio_rmap_sanity_checks() will make sure that a PMD range
actually falls completely into the folio.

Link: https://lkml.kernel.org/r/20231220224504.646757-39-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 96 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 71 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 0ad2ea2734e4..fd6fe16fa358 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -269,7 +269,7 @@ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
 	return 0;
 }
 
-/* See page_try_share_anon_rmap() */
+/* See folio_try_share_anon_rmap_*() */
 static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
 {
 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
@@ -478,31 +478,15 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
 #endif
 }
 
-/**
- * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly
- *			      shared to prepare for KSM or temporary unmapping
- * @page: the exclusive anonymous page to try marking possibly shared
- *
- * The caller needs to hold the PT lock and has to have the page table entry
- * cleared/invalidated.
- *
- * This is similar to folio_try_dup_anon_rmap_*(), however, not used during
- * fork() to duplicate a mapping, but instead to prepare for KSM or temporarily
- * unmapping a page (swap, migration) via folio_remove_rmap_*().
- *
- * Marking the page shared can only fail if the page may be pinned; device
- * private pages cannot get pinned and consequently this function cannot fail.
- *
- * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY
- * otherwise.
- */
-static inline int page_try_share_anon_rmap(struct page *page)
+static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
+		struct page *page, int nr_pages, enum rmap_level level)
 {
-	VM_WARN_ON(folio_test_hugetlb(page_folio(page)));
-	VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
+	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+	VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
+	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
 
-	/* device private pages cannot get pinned via GUP. */
-	if (unlikely(is_device_private_page(page))) {
+	/* device private folios cannot get pinned via GUP. */
+	if (unlikely(folio_is_device_private(folio))) {
 		ClearPageAnonExclusive(page);
 		return 0;
 	}
@@ -553,7 +537,7 @@ static inline int page_try_share_anon_rmap(struct page *page)
 	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
 		smp_mb();
 
-	if (unlikely(page_maybe_dma_pinned(page)))
+	if (unlikely(folio_maybe_dma_pinned(folio)))
 		return -EBUSY;
 	ClearPageAnonExclusive(page);
 
@@ -566,6 +550,68 @@ static inline int page_try_share_anon_rmap(struct page *page)
 	return 0;
 }
 
+/**
+ * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
+ *				   mapped by a PTE possibly shared to prepare
+ *				   for KSM or temporary unmapping
+ * @folio:	The folio to share a mapping of
+ * @page:	The mapped exclusive page
+ *
+ * The caller needs to hold the page table lock and has to have the page table
+ * entries cleared/invalidated.
+ *
+ * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
+ * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
+ * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
+ *
+ * Marking the mapped page shared can only fail if the folio maybe pinned;
+ * device private folios cannot get pinned and consequently this function cannot
+ * fail.
+ *
+ * Returns 0 if marking the mapped page possibly shared succeeded. Returns
+ * -EBUSY otherwise.
+ */
+static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
+		struct page *page)
+{
+	return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
+}
+
+/**
+ * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
+ *				   range mapped by a PMD possibly shared to
+ *				   prepare for temporary unmapping
+ * @folio:	The folio to share the mapping of
+ * @page:	The first page to share the mapping of
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
+ *
+ * The caller needs to hold the page table lock and has to have the page table
+ * entries cleared/invalidated.
+ *
+ * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
+ * fork() to duplicate a mapping, but instead to prepare for temporarily
+ * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
+ *
+ * Marking the mapped pages shared can only fail if the folio maybe pinned;
+ * device private folios cannot get pinned and consequently this function cannot
+ * fail.
+ *
+ * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
+ * -EBUSY otherwise.
+ */
+static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
+		struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
+					   RMAP_LEVEL_PMD);
+#else
+	WARN_ON_ONCE(true);
+	return -EBUSY;
+#endif
+}
+
 /*
  * Called from mm/vmscan.c to handle paging out
  */
-- 
cgit v1.2.3


From 90ca22513ed5d7cf546c7c8d35a03ec2a2f5c87e Mon Sep 17 00:00:00 2001
From: Mathis Marion <mathis.marion@silabs.com>
Date: Tue, 19 Dec 2023 14:11:54 +0100
Subject: lib: crc_ccitt_false() is identical to crc_itu_t()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

crc_ccitt_false() was introduced in commit 0d85adb5fbd33 ("lib/crc-ccitt:
Add CCITT-FALSE CRC16 variant"), but it is redundant with crc_itu_t().
Since the latter is more used, it is the one being kept.

Link: https://lkml.kernel.org/r/20231219131154.748577-1-Mathis.Marion@silabs.com
Signed-off-by: Mathis Marion <mathis.marion@silabs.com>
Cc: Andrey Smirnov <andrew.smirnov@gmail.com>
Cc: Andrey Vostrikov <andrey.vostrikov@cogentembedded.com>
Cc: Jérôme Pouiller <jerome.pouiller@silabs.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crc-ccitt.h                     | 7 -------
 include/linux/surface_aggregator/serial_hub.h | 4 ++--
 2 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crc-ccitt.h b/include/linux/crc-ccitt.h
index 72c92c396bb8..cd4f420231ba 100644
--- a/include/linux/crc-ccitt.h
+++ b/include/linux/crc-ccitt.h
@@ -5,19 +5,12 @@
 #include <linux/types.h>
 
 extern u16 const crc_ccitt_table[256];
-extern u16 const crc_ccitt_false_table[256];
 
 extern u16 crc_ccitt(u16 crc, const u8 *buffer, size_t len);
-extern u16 crc_ccitt_false(u16 crc, const u8 *buffer, size_t len);
 
 static inline u16 crc_ccitt_byte(u16 crc, const u8 c)
 {
 	return (crc >> 8) ^ crc_ccitt_table[(crc ^ c) & 0xff];
 }
 
-static inline u16 crc_ccitt_false_byte(u16 crc, const u8 c)
-{
-    return (crc << 8) ^ crc_ccitt_false_table[(crc >> 8) ^ c];
-}
-
 #endif /* _LINUX_CRC_CCITT_H */
diff --git a/include/linux/surface_aggregator/serial_hub.h b/include/linux/surface_aggregator/serial_hub.h
index 5c4ae1a26183..d8dbef6b7fc2 100644
--- a/include/linux/surface_aggregator/serial_hub.h
+++ b/include/linux/surface_aggregator/serial_hub.h
@@ -12,7 +12,7 @@
 #ifndef _LINUX_SURFACE_AGGREGATOR_SERIAL_HUB_H
 #define _LINUX_SURFACE_AGGREGATOR_SERIAL_HUB_H
 
-#include <linux/crc-ccitt.h>
+#include <linux/crc-itu-t.h>
 #include <linux/kref.h>
 #include <linux/ktime.h>
 #include <linux/list.h>
@@ -188,7 +188,7 @@ static_assert(sizeof(struct ssh_command) == 8);
  */
 static inline u16 ssh_crc(const u8 *buf, size_t len)
 {
-	return crc_ccitt_false(0xffff, buf, len);
+	return crc_itu_t(0xffff, buf, len);
 }
 
 /*
-- 
cgit v1.2.3


From 29166371ef6780429e4cb84f1827fafbdd4005ab Mon Sep 17 00:00:00 2001
From: Youling Tang <tangyouling@kylinos.cn>
Date: Wed, 27 Dec 2023 07:46:25 +0800
Subject: kdump: remove redundant DEFAULT_CRASH_KERNEL_LOW_SIZE

Remove duplicate definitions, no functional changes.

Link: https://lkml.kernel.org/r/MW4PR84MB3145459ADC7EB38BBB36955B8198A@MW4PR84MB3145.NAMPRD84.PROD.OUTLOOK.COM
Signed-off-by: Youling Tang <tangyouling@kylinos.cn>
Reported-by: Huacai Chen <chenhuacai@loongson.cn>
Acked-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 5126a4fecb44..9eaeaafe0cad 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -87,12 +87,6 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
 void final_note(Elf_Word *buf);
 
-#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
-#define DEFAULT_CRASH_KERNEL_LOW_SIZE  (128UL << 20)
-#endif
-#endif
-
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base,
 		unsigned long long *low_size, bool *high);
-- 
cgit v1.2.3


From 501a06fe8e4c185bbda371b8cedbdf1b23a633d8 Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Thu, 7 Dec 2023 11:24:06 -0800
Subject: zswap: memcontrol: implement zswap writeback disabling

During our experiment with zswap, we sometimes observe swap IOs due to
occasional zswap store failures and writebacks-to-swap.  These swapping
IOs prevent many users who cannot tolerate swapping from adopting zswap to
save memory and improve performance where possible.

This patch adds the option to disable this behavior entirely: do not
writeback to backing swapping device when a zswap store attempt fail, and
do not write pages in the zswap pool back to the backing swap device (both
when the pool is full, and when the new zswap shrinker is called).

This new behavior can be opted-in/out on a per-cgroup basis via a new
cgroup file.  By default, writebacks to swap device is enabled, which is
the previous behavior.  Initially, writeback is enabled for the root
cgroup, and a newly created cgroup will inherit the current setting of its
parent.

Note that this is subtly different from setting memory.swap.max to 0, as
it still allows for pages to be stored in the zswap pool (which itself
consumes swap space in its current form).

This patch should be applied on top of the zswap shrinker series:

https://lore.kernel.org/linux-mm/20231130194023.4102148-1-nphamcs@gmail.com/

as it also disables the zswap shrinker, a major source of zswap
writebacks.

For the most part, this feature is motivated by internal parties who
have already established their opinions regarding swapping - the
workloads that are highly sensitive to IO, and especially those who are
using servers with really slow disk performance (for instance, massive
but slow HDDs).  For these folks, it's impossible to convince them to
even entertain zswap if swapping also comes as a packaged deal.
Writeback disabling is quite a useful feature in these situations - on
a mixed workloads deployment, they can disable writeback for the more
IO-sensitive workloads, and enable writeback for other background
workloads.

For instance, on a server with HDD, I allocate memories and populate
them with random values (so that zswap store will always fail), and
specify memory.high low enough to trigger reclaim.  The time it takes
to allocate the memories and just read through it a couple of times
(doing silly things like computing the values' average etc.):

zswap.writeback disabled:
real 0m30.537s
user 0m23.687s
sys 0m6.637s
0 pages swapped in
0 pages swapped out

zswap.writeback enabled:
real 0m45.061s
user 0m24.310s
sys 0m8.892s
712686 pages swapped in
461093 pages swapped out

(the last two lines are from vmstat -s).

[nphamcs@gmail.com: add a comment about recurring zswap store failures leading to reclaim inefficiency]
  Link: https://lkml.kernel.org/r/20231221005725.3446672-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231207192406.3809579-1-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: David Heidelberg <david@ixit.cz>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 12 ++++++++++++
 include/linux/zswap.h      |  7 +++++++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 43b77363ab8e..5de775e6cdd9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -219,6 +219,12 @@ struct mem_cgroup {
 
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
 	unsigned long zswap_max;
+
+	/*
+	 * Prevent pages from this memcg from being written back from zswap to
+	 * swap, and from being swapped out on zswap store failures.
+	 */
+	bool zswap_writeback;
 #endif
 
 	unsigned long soft_limit;
@@ -1941,6 +1947,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
 bool obj_cgroup_may_zswap(struct obj_cgroup *objcg);
 void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size);
 void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size);
+bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg);
 #else
 static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
 {
@@ -1954,6 +1961,11 @@ static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg,
 					     size_t size)
 {
 }
+static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
+{
+	/* if zswap is disabled, do not block pages going to the swapping device */
+	return true;
+}
 #endif
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index e88572d4c720..0b709f5bc65f 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -35,6 +35,7 @@ void zswap_swapoff(int type);
 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
 void zswap_lruvec_state_init(struct lruvec *lruvec);
 void zswap_folio_swapin(struct folio *folio);
+bool is_zswap_enabled(void);
 #else
 
 struct zswap_lruvec_state {};
@@ -55,6 +56,12 @@ static inline void zswap_swapoff(int type) {}
 static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
 static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
 static inline void zswap_folio_swapin(struct folio *folio) {}
+
+static inline bool is_zswap_enabled(void)
+{
+	return false;
+}
+
 #endif
 
 #endif /* _LINUX_ZSWAP_H */
-- 
cgit v1.2.3


From 7c223098212957a1ecd8768e8e747ae2cf88e880 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Fri, 29 Dec 2023 20:53:49 +0000
Subject: locking/osq_lock: Move the definition of optimistic_spin_node into
 osq_lock.c

struct optimistic_spin_node is private to the implementation.
Move it into the C file to ensure nothing is accessing it.

Signed-off-by: David Laight <david.laight@aculab.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/osq_lock.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index 5581dbd3bd34..ea8fb31379e3 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -6,11 +6,6 @@
  * An MCS like lock especially tailored for optimistic spinning for sleeping
  * lock implementations (mutex, rwsem, etc).
  */
-struct optimistic_spin_node {
-	struct optimistic_spin_node *next, *prev;
-	int locked; /* 1 if lock acquired */
-	int cpu; /* encoded CPU # + 1 value */
-};
 
 struct optimistic_spin_queue {
 	/*
-- 
cgit v1.2.3


From cff9c565e65f3622e8dc1dcc21c1520a083dff35 Mon Sep 17 00:00:00 2001
From: Luiz Angelo Daros de Luca <luizluca@gmail.com>
Date: Wed, 20 Dec 2023 01:52:29 -0300
Subject: net: mdio: get/put device node during (un)registration

The __of_mdiobus_register() function was storing the device node in
dev.of_node without increasing its reference count. It implicitly relied
on the caller to maintain the allocated node until the mdiobus was
unregistered.

Now, __of_mdiobus_register() will acquire the node before assigning it,
and of_mdiobus_unregister_callback() will be called at the end of
mdio_unregister().

Drivers can now release the node immediately after MDIO registration.
Some of them are already doing that even before this patch.

Signed-off-by: Luiz Angelo Daros de Luca <luizluca@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index e9e85d347587..ede891776d8b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -434,6 +434,9 @@ struct mii_bus {
 
 	/** @shared: shared state across different PHYs */
 	struct phy_package_shared *shared[PHY_MAX_ADDR];
+
+	/** @__unregister_callback: called at the last step of unregistration */
+	void (*__unregister_callback)(struct mii_bus *bus);
 };
 #define to_mii_bus(d) container_of(d, struct mii_bus, dev)
 
-- 
cgit v1.2.3


From 02018c544ef113e980a2349eba89003d6f399d22 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 21 Dec 2023 19:00:34 +0100
Subject: net: phy: Introduce ethernet link topology representation

Link topologies containing multiple network PHYs attached to the same
net_device can be found when using a PHY as a media converter for use
with an SFP connector, on which an SFP transceiver containing a PHY can
be used.

With the current model, the transceiver's PHY can't be used for
operations such as cable testing, timestamping, macsec offload, etc.

The reason being that most of the logic for these configuration, coming
from either ethtool netlink or ioctls tend to use netdev->phydev, which
in multi-phy systems will reference the PHY closest to the MAC.

Introduce a numbering scheme allowing to enumerate PHY devices that
belong to any netdev, which can in turn allow userspace to take more
precise decisions with regard to each PHY's configuration.

The numbering is maintained per-netdev, in a phy_device_list.
The numbering works similarly to a netdevice's ifindex, with
identifiers that are only recycled once INT_MAX has been reached.

This prevents races that could occur between PHY listing and SFP
transceiver removal/insertion.

The identifiers are assigned at phy_attach time, as the numbering
depends on the netdevice the phy is attached to.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h              |  4 +-
 include/linux/phy.h                    |  4 ++
 include/linux/phy_link_topology.h      | 67 ++++++++++++++++++++++++++++++++++
 include/linux/phy_link_topology_core.h | 19 ++++++++++
 4 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/phy_link_topology.h
 create mode 100644 include/linux/phy_link_topology_core.h

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 75c7725e5e4f..5baa5517f533 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -40,7 +40,6 @@
 #include <net/dcbnl.h>
 #endif
 #include <net/netprio_cgroup.h>
-
 #include <linux/netdev_features.h>
 #include <linux/neighbour.h>
 #include <uapi/linux/netdevice.h>
@@ -52,6 +51,7 @@
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
 #include <net/dropreason-core.h>
+#include <linux/phy_link_topology_core.h>
 
 struct netpoll_info;
 struct device;
@@ -2047,6 +2047,7 @@ enum netdev_stat_type {
  *	@fcoe_ddp_xid:	Max exchange id for FCoE LRO by ddp
  *
  *	@priomap:	XXX: need comments on this one
+ *	@link_topo:	Physical link topology tracking attached PHYs
  *	@phydev:	Physical device may attach itself
  *			for hardware timestamping
  *	@sfp_bus:	attached &struct sfp_bus structure.
@@ -2441,6 +2442,7 @@ struct net_device {
 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 	struct netprio_map __rcu *priomap;
 #endif
+	struct phy_link_topology	link_topo;
 	struct phy_device	*phydev;
 	struct sfp_bus		*sfp_bus;
 	struct lock_class_key	*qdisc_tx_busylock;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index ede891776d8b..ea9416797b89 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -547,6 +547,9 @@ struct macsec_ops;
  * @drv: Pointer to the driver for this PHY instance
  * @devlink: Create a link between phy dev and mac dev, if the external phy
  *           used by current mac interface is managed by another mac interface.
+ * @phyindex: Unique id across the phy's parent tree of phys to address the PHY
+ *	      from userspace, similar to ifindex. A zero index means the PHY
+ *	      wasn't assigned an id yet.
  * @phy_id: UID for this device found during discovery
  * @c45_ids: 802.3-c45 Device Identifiers if is_c45.
  * @is_c45:  Set to true if this PHY uses clause 45 addressing.
@@ -646,6 +649,7 @@ struct phy_device {
 
 	struct device_link *devlink;
 
+	u32 phyindex;
 	u32 phy_id;
 
 	struct phy_c45_device_ids c45_ids;
diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h
new file mode 100644
index 000000000000..91902263ec0e
--- /dev/null
+++ b/include/linux/phy_link_topology.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PHY device list allow maintaining a list of PHY devices that are
+ * part of a netdevice's link topology. PHYs can for example be chained,
+ * as is the case when using a PHY that exposes an SFP module, on which an
+ * SFP transceiver that embeds a PHY is connected.
+ *
+ * This list can then be used by userspace to leverage individual PHY
+ * capabilities.
+ */
+#ifndef __PHY_LINK_TOPOLOGY_H
+#define __PHY_LINK_TOPOLOGY_H
+
+#include <linux/ethtool.h>
+#include <linux/phy_link_topology_core.h>
+
+struct xarray;
+struct phy_device;
+struct net_device;
+struct sfp_bus;
+
+struct phy_device_node {
+	enum phy_upstream upstream_type;
+
+	union {
+		struct net_device	*netdev;
+		struct phy_device	*phydev;
+	} upstream;
+
+	struct sfp_bus *parent_sfp_bus;
+
+	struct phy_device *phy;
+};
+
+static inline struct phy_device *
+phy_link_topo_get_phy(struct phy_link_topology *topo, u32 phyindex)
+{
+	struct phy_device_node *pdn = xa_load(&topo->phys, phyindex);
+
+	if (pdn)
+		return pdn->phy;
+
+	return NULL;
+}
+
+#if IS_ENABLED(CONFIG_PHYLIB)
+int phy_link_topo_add_phy(struct phy_link_topology *topo,
+			  struct phy_device *phy,
+			  enum phy_upstream upt, void *upstream);
+
+void phy_link_topo_del_phy(struct phy_link_topology *lt, struct phy_device *phy);
+
+#else
+static inline int phy_link_topo_add_phy(struct phy_link_topology *topo,
+					struct phy_device *phy,
+					enum phy_upstream upt, void *upstream)
+{
+	return 0;
+}
+
+static inline void phy_link_topo_del_phy(struct phy_link_topology *topo,
+					 struct phy_device *phy)
+{
+}
+#endif
+
+#endif /* __PHY_LINK_TOPOLOGY_H */
diff --git a/include/linux/phy_link_topology_core.h b/include/linux/phy_link_topology_core.h
new file mode 100644
index 000000000000..78c75f909489
--- /dev/null
+++ b/include/linux/phy_link_topology_core.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PHY_LINK_TOPOLOGY_CORE_H
+#define __PHY_LINK_TOPOLOGY_CORE_H
+
+struct xarray;
+
+struct phy_link_topology {
+	struct xarray phys;
+
+	u32 next_phy_index;
+};
+
+static inline void phy_link_topo_init(struct phy_link_topology *topo)
+{
+	xa_init_flags(&topo->phys, XA_FLAGS_ALLOC1);
+	topo->next_phy_index = 1;
+}
+
+#endif /* __PHY_LINK_TOPOLOGY_CORE_H */
-- 
cgit v1.2.3


From 9c5625f559ad6fe9f6f733c11475bf470e637d34 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 21 Dec 2023 19:00:35 +0100
Subject: net: sfp: pass the phy_device when disconnecting an sfp module's PHY

Pass the phy_device as a parameter to the sfp upstream .disconnect_phy
operation. This is preparatory work to help track phy devices across
a net_device's link.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sfp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 9346cd44814d..0573e53b0c11 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -544,7 +544,7 @@ struct sfp_upstream_ops {
 	void (*link_down)(void *priv);
 	void (*link_up)(void *priv);
 	int (*connect_phy)(void *priv, struct phy_device *);
-	void (*disconnect_phy)(void *priv);
+	void (*disconnect_phy)(void *priv, struct phy_device *);
 };
 
 #if IS_ENABLED(CONFIG_SFP)
-- 
cgit v1.2.3


From 034fcc210349b873ece7356905be5c6ca11eef2a Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 21 Dec 2023 19:00:36 +0100
Subject: net: phy: add helpers to handle sfp phy connect/disconnect

There are a few PHY drivers that can handle SFP modules through their
sfp_upstream_ops. Introduce Phylib helpers to keep track of connected
SFP PHYs in a netdevice's namespace, by adding the SFP PHY to the
upstream PHY's netdev's namespace.

By doing so, these SFP PHYs can be enumerated and exposed to users,
which will be able to use their capabilities.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index ea9416797b89..ac22b8e28a85 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1729,6 +1729,8 @@ int phy_suspend(struct phy_device *phydev);
 int phy_resume(struct phy_device *phydev);
 int __phy_resume(struct phy_device *phydev);
 int phy_loopback(struct phy_device *phydev, bool enable);
+int phy_sfp_connect_phy(void *upstream, struct phy_device *phy);
+void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy);
 void phy_sfp_attach(void *upstream, struct sfp_bus *bus);
 void phy_sfp_detach(void *upstream, struct sfp_bus *bus);
 int phy_sfp_probe(struct phy_device *phydev,
-- 
cgit v1.2.3


From dedd702a35793ab462fce4c737eeba0badf9718e Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 21 Dec 2023 19:00:37 +0100
Subject: net: sfp: Add helper to return the SFP bus name

Knowing the bus name is helpful when we want to expose the link topology
to userspace, add a helper to return the SFP bus name.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sfp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 0573e53b0c11..55c0ab17c9e2 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -570,6 +570,7 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode);
 int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 			 const struct sfp_upstream_ops *ops);
 void sfp_bus_del_upstream(struct sfp_bus *bus);
+const char *sfp_get_name(struct sfp_bus *bus);
 #else
 static inline int sfp_parse_port(struct sfp_bus *bus,
 				 const struct sfp_eeprom_id *id,
@@ -648,6 +649,11 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 static inline void sfp_bus_del_upstream(struct sfp_bus *bus)
 {
 }
+
+static inline const char *sfp_get_name(struct sfp_bus *bus)
+{
+	return NULL;
+}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 5a0e241003b80247de59727c945bc94c848f893d Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@denx.de>
Date: Wed, 29 Nov 2023 09:43:28 -0300
Subject: thermal/core: Prepare for introduction of thermal reboot

Add some helper functions to make it easier introducing the support
for thermal reboot.

No functional change.

Signed-off-by: Fabio Estevam <festevam@denx.de>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20231129124330.519423-2-festevam@gmail.com
---
 include/linux/reboot.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index c4cc3b89ced1..4586c663884e 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -177,7 +177,12 @@ void ctrl_alt_del(void);
 
 extern void orderly_poweroff(bool force);
 extern void orderly_reboot(void);
-void hw_protection_shutdown(const char *reason, int ms_until_forced);
+void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown);
+
+static inline void hw_protection_shutdown(const char *reason, int ms_until_forced)
+{
+	__hw_protection_shutdown(reason, ms_until_forced, true);
+}
 
 /*
  * Emergency restart, callable from an interrupt handler.
-- 
cgit v1.2.3


From 79fa723ba84c2b1b3124c72df8a3b07b851a5477 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@denx.de>
Date: Wed, 29 Nov 2023 09:43:29 -0300
Subject: reboot: Introduce thermal_zone_device_critical_reboot()

Introduce thermal_zone_device_critical_reboot() to trigger an
emergency reboot.

It is a counterpart of thermal_zone_device_critical() with the
difference that it will force a reboot instead of shutdown.

The motivation for doing this is to allow the thermal subystem
to trigger a reboot when the temperature reaches the critical
temperature.

Signed-off-by: Fabio Estevam <festevam@denx.de>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20231129124330.519423-3-festevam@gmail.com
---
 include/linux/reboot.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 4586c663884e..abcdde4df697 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -179,6 +179,11 @@ extern void orderly_poweroff(bool force);
 extern void orderly_reboot(void);
 void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown);
 
+static inline void hw_protection_reboot(const char *reason, int ms_until_forced)
+{
+	__hw_protection_shutdown(reason, ms_until_forced, false);
+}
+
 static inline void hw_protection_shutdown(const char *reason, int ms_until_forced)
 {
 	__hw_protection_shutdown(reason, ms_until_forced, true);
-- 
cgit v1.2.3


From 37d158d0b05144f696323ae5bbfe1e137f7c06d3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 20 Dec 2023 08:38:46 +0100
Subject: HID: make hid_bus_type const

Now that the driver core can properly handle constant struct bus_type,
move the hid_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: Jiri Kosina <jikos@kernel.org>
Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Cc: linux-input@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 include/linux/hid.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index bf43f3ff6664..7c26db874ff0 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -912,7 +912,7 @@ extern bool hid_ignore(struct hid_device *);
 extern int hid_add_device(struct hid_device *);
 extern void hid_destroy_device(struct hid_device *);
 
-extern struct bus_type hid_bus_type;
+extern const struct bus_type hid_bus_type;
 
 extern int __must_check __hid_register_driver(struct hid_driver *,
 		struct module *, const char *mod_name);
-- 
cgit v1.2.3


From 9b0a3839e8d29663cd9ee2c43d38b06c3b91619e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 20 Dec 2023 08:38:48 +0100
Subject: HID: bpf: make bus_type const in struct hid_bpf_ops

The struct bus_type pointer in hid_bpf_ops just passes the pointer to
the driver core, and the driver core can handle, and expects, a constant
pointer, so also make the pointer constant in hid_bpf_ops.

Part of the process of moving all usages of struct bus_type to be
constant to move them all to read-only memory.

Cc: Jiri Kosina <jikos@kernel.org>
Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Cc: linux-input@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 include/linux/hid_bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h
index e9afb61e6ee0..840cd254172d 100644
--- a/include/linux/hid_bpf.h
+++ b/include/linux/hid_bpf.h
@@ -115,7 +115,7 @@ struct hid_bpf_ops {
 				  size_t len, enum hid_report_type rtype,
 				  enum hid_class_request reqtype);
 	struct module *owner;
-	struct bus_type *bus_type;
+	const struct bus_type *bus_type;
 };
 
 extern struct hid_bpf_ops *hid_bpf_ops;
-- 
cgit v1.2.3


From 87d8f1ee1d4070571095859d310f6951440f288c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 23 Dec 2023 11:43:19 -0800
Subject: platform/x86: wmi: linux/wmi.h: fix Excess kernel-doc description
 warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the "private:" comment to prevent the kernel-doc warning:

include/linux/wmi.h:27: warning: Excess struct member 'setable' description in 'wmi_device'

Either a struct member is documented (via kernel-doc) or it's private,
but not both.

Fixes: b4cc979588ee ("platform/x86: wmi: Add kernel doc comments")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Armin Wolf <W_Armin@gmx.de>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: platform-driver-x86@vger.kernel.org
Reviewed-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20231223194321.23084-1-rdunlap@infradead.org
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/wmi.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 50f7f1e4fd4f..686291b87852 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -21,8 +21,6 @@
  */
 struct wmi_device {
 	struct device dev;
-
-	/* private: used by the WMI driver core */
 	bool setable;
 };
 
-- 
cgit v1.2.3


From 993498e537af9260e697219ce41b41b22b6199cc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 21 Dec 2023 14:07:47 +0000
Subject: net-device: move gso_partial_features to net_device_read_tx

dev->gso_partial_features is read from tx fast path for GSO packets.

Move it to appropriate section to avoid a cache line miss.

Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Coco Li <lixiaoyan@google.com>
Cc: David Ahern <dsahern@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5baa5517f533..d59db9adcc96 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2115,6 +2115,7 @@ struct net_device {
 	const struct net_device_ops *netdev_ops;
 	const struct header_ops *header_ops;
 	struct netdev_queue	*_tx;
+	netdev_features_t	gso_partial_features;
 	unsigned int		real_num_tx_queues;
 	unsigned int		gso_max_size;
 	unsigned int		gso_ipv4_max_size;
@@ -2211,7 +2212,6 @@ struct net_device {
 	netdev_features_t	vlan_features;
 	netdev_features_t	hw_enc_features;
 	netdev_features_t	mpls_features;
-	netdev_features_t	gso_partial_features;
 
 	unsigned int		min_mtu;
 	unsigned int		max_mtu;
-- 
cgit v1.2.3


From 059d37b718d38d26087121c754691df77acfc66b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:06:13 -0800
Subject: net: phy: linux/phy.h: fix Excess kernel-doc description warning

Remove the @phy_timer: line to prevent the kernel-doc warning:

include/linux/phy.h:768: warning: Excess struct member 'phy_timer' description in 'phy_device'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Heiner Kallweit <hkallweit1@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: netdev@vger.kernel.org
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 3cc52826f18e..bd285950972c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -568,7 +568,6 @@ struct macsec_ops;
  *      - Bits [31:24] are reserved for defining generic
  *        PHY driver behavior.
  * @irq: IRQ number of the PHY's interrupt (-1 if none)
- * @phy_timer: The timer for handling the state machine
  * @phylink: Pointer to phylink instance for this PHY
  * @sfp_bus_attached: Flag indicating whether the SFP bus has been attached
  * @sfp_bus: SFP bus attached to this PHY's fiber port
-- 
cgit v1.2.3


From e4df56ad0bf3506c5189abb9be83f3bea05a4c4f Mon Sep 17 00:00:00 2001
From: Lin Gui <lin.gui@mediatek.com>
Date: Tue, 19 Dec 2023 07:05:32 +0800
Subject: mmc: core: Add wp_grp_size sysfs node

The eMMC card can be set into write-protected mode to prevent data from
being accidentally modified or deleted. Wp_grp_size (Write Protect Group
Size) refers to an attribute of the eMMC card, used to manage write
protection and is the CSD register [36:32] of the eMMC device. Wp_grp_size
(Write Protect Group Size) indicates how many eMMC blocks are contained in
each write protection group on the eMMC card.

To allow userspace easy access of the CSD register bits, let's add sysfs
node "wp_grp_size".

Signed-off-by: Lin Gui <lin.gui@mediatek.com>
Signed-off-by: Bo Ye <bo.ye@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20231218230532.82427-1-bo.ye@mediatek.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/mmc/card.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 47eeb122524c..f34407cc2788 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -32,6 +32,7 @@ struct mmc_csd {
 	unsigned int		r2w_factor;
 	unsigned int		max_dtr;
 	unsigned int		erase_size;		/* In sectors */
+	unsigned int		wp_grp_size;
 	unsigned int		read_blkbits;
 	unsigned int		write_blkbits;
 	unsigned int		capacity;
@@ -303,6 +304,7 @@ struct mmc_card {
 	unsigned int		eg_boundary;	/* don't cross erase-group boundaries */
 	unsigned int		erase_arg;	/* erase / trim / discard */
  	u8			erased_byte;	/* value of erased bytes */
+	unsigned int		wp_grp_size;	/* write group size in sectors */
 
 	u32			raw_cid[4];	/* raw card CID */
 	u32			raw_csd[4];	/* raw card CSD */
-- 
cgit v1.2.3


From 3e64db35bc37edbe9e37aaa987df92cde12ddb6c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 2 Jan 2024 14:23:34 -0800
Subject: Revert "net: mdio: get/put device node during (un)registration"

This reverts commit cff9c565e65f3622e8dc1dcc21c1520a083dff35.

Revert based on feedback from Russell.

Link: https://lore.kernel.org/all/ZZPtUIRerqTI2%2Fyh@shell.armlinux.org.uk/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index ac22b8e28a85..6cb9d843aee9 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -434,9 +434,6 @@ struct mii_bus {
 
 	/** @shared: shared state across different PHYs */
 	struct phy_package_shared *shared[PHY_MAX_ADDR];
-
-	/** @__unregister_callback: called at the last step of unregistration */
-	void (*__unregister_callback)(struct mii_bus *bus);
 };
 #define to_mii_bus(d) container_of(d, struct mii_bus, dev)
 
-- 
cgit v1.2.3


From db02e176f597a14eb696141ffa008c2429453a15 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 6 Dec 2023 16:42:31 -0600
Subject: PCI/AER: Use explicit register sizes for struct members

aer_irq() reads the AER Root Error Status and Error Source Identification
(PCI_ERR_ROOT_STATUS and PCI_ERR_ROOT_ERR_SRC) registers directly into
struct aer_err_source.  Both registers are 32 bits, so declare the members
explicitly as "u32" instead of "unsigned int".

Similarly, aer_get_device_error_info() reads the AER Header Log
(PCI_ERR_HEADER_LOG) registers, which are also 32 bits, into struct
aer_header_log_regs.  Declare those members as "u32" as well.

No functional changes intended.

Link: https://lore.kernel.org/r/20231206224231.732765-4-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/aer.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/aer.h b/include/linux/aer.h
index f6ea2f57d808..ae0fae70d4bd 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -19,10 +19,10 @@
 struct pci_dev;
 
 struct aer_header_log_regs {
-	unsigned int dw0;
-	unsigned int dw1;
-	unsigned int dw2;
-	unsigned int dw3;
+	u32 dw0;
+	u32 dw1;
+	u32 dw2;
+	u32 dw3;
 };
 
 struct aer_capability_regs {
-- 
cgit v1.2.3


From 7d4b5d7a37bdd63a5a3371b988744b060d5bb86f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 27 Dec 2023 21:38:23 +0100
Subject: async: Introduce async_schedule_dev_nocall()

In preparation for subsequent changes, introduce a specialized variant
of async_schedule_dev() that will not invoke the argument function
synchronously when it cannot be scheduled for asynchronous execution.

The new function, async_schedule_dev_nocall(), will be used for fixing
possible deadlocks in the system-wide power management core code.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com> for the series.
Tested-by: Youngmin Nam <youngmin.nam@samsung.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/async.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/async.h b/include/linux/async.h
index cce4ad31e8fc..33c9ff4afb49 100644
--- a/include/linux/async.h
+++ b/include/linux/async.h
@@ -90,6 +90,8 @@ async_schedule_dev(async_func_t func, struct device *dev)
 	return async_schedule_node(func, dev, dev_to_node(dev));
 }
 
+bool async_schedule_dev_nocall(async_func_t func, struct device *dev);
+
 /**
  * async_schedule_dev_domain - A device specific version of async_schedule_domain
  * @func: function to execute asynchronously
-- 
cgit v1.2.3


From 3b82024c5ba93e7a0db2d0b9635ca6b28338efd7 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 28 Dec 2023 13:04:41 +0530
Subject: OPP: Move dev_pm_opp_icc_bw to internal opp.h

It isn't used by any driver or API, privatize it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/pm_opp.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 74768c47d790..76dcb7f37bcd 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -45,18 +45,6 @@ struct dev_pm_opp_supply {
 	unsigned long u_watt;
 };
 
-/**
- * struct dev_pm_opp_icc_bw - Interconnect bandwidth values
- * @avg:	Average bandwidth corresponding to this OPP (in icc units)
- * @peak:	Peak bandwidth corresponding to this OPP (in icc units)
- *
- * This structure stores the bandwidth values for a single interconnect path.
- */
-struct dev_pm_opp_icc_bw {
-	u32 avg;
-	u32 peak;
-};
-
 typedef int (*config_regulators_t)(struct device *dev,
 			struct dev_pm_opp *old_opp, struct dev_pm_opp *new_opp,
 			struct regulator **regulators, unsigned int count);
-- 
cgit v1.2.3


From a15f2d48c6f84ae0dd2000288592c79d5d1acd0e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 16:47:41 +0100
Subject: nubus: Make nubus_bus_type static and constant

Now that the driver core can properly handle constant struct bus_type,
move the nubus_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

It's also never used outside of drivers/nubus/bus.c so make it static
and don't export it as no one is using it.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Finn Thain <fthain@linux-m68k.org>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/2023121940-enlarged-editor-c9a8@gregkh
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 include/linux/nubus.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nubus.h b/include/linux/nubus.h
index bdcd85e622d8..4d103ac8f5c7 100644
--- a/include/linux/nubus.h
+++ b/include/linux/nubus.h
@@ -89,8 +89,6 @@ struct nubus_driver {
 	void (*remove)(struct nubus_board *board);
 };
 
-extern struct bus_type nubus_bus_type;
-
 /* Generic NuBus interface functions, modelled after the PCI interface */
 #ifdef CONFIG_PROC_FS
 extern bool nubus_populate_procfs;
-- 
cgit v1.2.3


From 2ad28ce9b98f8b22feaecc0966c706a8ef59cbf0 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <mgurtovoy@nvidia.com>
Date: Mon, 1 Jan 2024 12:35:27 +0200
Subject: nvme: remove unused definition

There is no users for NVMF_AUTH_HASH_LEN macro.

Reviewed-by: Israel Rukshin <israelr@nvidia.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 44325c068b6a..462c21e0e417 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -20,7 +20,6 @@
 #define NVMF_TRSVCID_SIZE	32
 #define NVMF_TRADDR_SIZE	256
 #define NVMF_TSAS_SIZE		256
-#define NVMF_AUTH_HASH_LEN	64
 
 #define NVME_DISC_SUBSYS_NAME	"nqn.2014-08.org.nvmexpress.discovery"
 
-- 
cgit v1.2.3


From 7865dfb1eb941ddd25802a9e13b6ff5f3f4dc02f Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 21 Dec 2023 15:23:24 -0800
Subject: bpf: sockmap, added comments describing update proto rules

Add a comment describing that the psock update proto callbback can be
called multiple times and this must be safe.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/r/20231221232327.43678-3-john.fastabend@gmail.com
---
 include/linux/skmsg.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index c953b8c0d2f4..888a4b217829 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -100,6 +100,11 @@ struct sk_psock {
 	void (*saved_close)(struct sock *sk, long timeout);
 	void (*saved_write_space)(struct sock *sk);
 	void (*saved_data_ready)(struct sock *sk);
+	/* psock_update_sk_prot may be called with restore=false many times
+	 * so the handler must be safe for this case. It will be called
+	 * exactly once with restore=true when the psock is being destroyed
+	 * and psock refcnt is zero, but before an RCU grace period.
+	 */
 	int  (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock,
 				     bool restore);
 	struct proto			*sk_proto;
-- 
cgit v1.2.3


From d3d344a1ca69d8fb2413e29e6400f3ad58a05c06 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 2 Jan 2024 16:22:20 +0000
Subject: net-device: move xdp_prog to net_device_read_rx

xdp_prog is used in receive path, both from XDP enabled drivers
and from netif_elide_gro().

This patch also removes two 4-bytes holes.

Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Coco Li <lixiaoyan@google.com>
Cc: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240102162220.750823-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d59db9adcc96..e265aa1f2169 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2150,6 +2150,7 @@ struct net_device {
 
 	/* RX read-mostly hotpath */
 	__cacheline_group_begin(net_device_read_rx);
+	struct bpf_prog __rcu	*xdp_prog;
 	struct list_head	ptype_specific;
 	int			ifindex;
 	unsigned int		real_num_rx_queues;
@@ -2325,7 +2326,6 @@ struct net_device {
 	const unsigned char	*dev_addr;
 
 	unsigned int		num_rx_queues;
-	struct bpf_prog __rcu	*xdp_prog;
 #define GRO_LEGACY_MAX_SIZE	65536u
 /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
  * and shinfo->gso_segs is a 16bit field.
-- 
cgit v1.2.3


From 9fc8e802048ad150e8032c4f3dbf40112160cfe9 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 21 Dec 2023 19:17:39 -0800
Subject: bpf: Add objcg to bpf_mem_alloc

The objcg is a bpf_mem_alloc level property since all bpf_mem_cache's
are with the same objcg. This patch made such a property explicit.
The next patch will use this property to save and restore objcg
for percpu unit allocator.

Acked-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231222031739.1288590-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index bb1223b21308..acef8c808599 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -11,6 +11,7 @@ struct bpf_mem_caches;
 struct bpf_mem_alloc {
 	struct bpf_mem_caches __percpu *caches;
 	struct bpf_mem_cache __percpu *cache;
+	struct obj_cgroup *objcg;
 	bool percpu;
 	struct work_struct work;
 };
-- 
cgit v1.2.3


From c39aa3b289e9c10d0d246cd919b06809f13b72b8 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 21 Dec 2023 19:17:45 -0800
Subject: bpf: Allow per unit prefill for non-fix-size percpu memory allocator

Commit 41a5db8d8161 ("Add support for non-fix-size percpu mem allocation")
added support for non-fix-size percpu memory allocation.
Such allocation will allocate percpu memory for all buckets on all
cpus and the memory consumption is in the order to quadratic.
For example, let us say, 4 cpus, unit size 16 bytes, so each
cpu has 16 * 4 = 64 bytes, with 4 cpus, total will be 64 * 4 = 256 bytes.
Then let us say, 8 cpus with the same unit size, each cpu
has 16 * 8 = 128 bytes, with 8 cpus, total will be 128 * 8 = 1024 bytes.
So if the number of cpus doubles, the number of memory consumption
will be 4 times. So for a system with large number of cpus, the
memory consumption goes up quickly with quadratic order.
For example, for 4KB percpu allocation, 128 cpus. The total memory
consumption will 4KB * 128 * 128 = 64MB. Things will become
worse if the number of cpus is bigger (e.g., 512, 1024, etc.)

In Commit 41a5db8d8161, the non-fix-size percpu memory allocation is
done in boot time, so for system with large number of cpus, the initial
percpu memory consumption is very visible. For example, for 128 cpu
system, the total percpu memory allocation will be at least
(16 + 32 + 64 + 96 + 128 + 196 + 256 + 512 + 1024 + 2048 + 4096)
  * 128 * 128 = ~138MB.
which is pretty big. It will be even bigger for larger number of cpus.

Note that the current prefill also allocates 4 entries if the unit size
is less than 256. So on top of 138MB memory consumption, this will
add more consumption with
3 * (16 + 32 + 64 + 96 + 128 + 196 + 256) * 128 * 128 = ~38MB.
Next patch will try to reduce this memory consumption.

Later on, Commit 1fda5bb66ad8 ("bpf: Do not allocate percpu memory
at init stage") moved the non-fix-size percpu memory allocation
to bpf verificaiton stage. Once a particular bpf_percpu_obj_new()
is called by bpf program, the memory allocator will try to fill in
the cache with all sizes, causing the same amount of percpu memory
consumption as in the boot stage.

To reduce the initial percpu memory consumption for non-fix-size
percpu memory allocation, instead of filling the cache with all
supported allocation sizes, this patch intends to fill the cache
only for the requested size. As typically users will not use large
percpu data structure, this can save memory significantly.
For example, the allocation size is 64 bytes with 128 cpus.
Then total percpu memory amount will be 64 * 128 * 128 = 1MB,
much less than previous 138MB.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231222031745.1289082-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index acef8c808599..aaf004d94322 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -22,8 +22,15 @@ struct bpf_mem_alloc {
  * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects.
  * Alloc and free are done with bpf_mem_{alloc,free}() and the size of
  * the returned object is given by the size argument of bpf_mem_alloc().
+ * If percpu equals true, error will be returned in order to avoid
+ * large memory consumption and the below bpf_mem_alloc_percpu_unit_init()
+ * should be used to do on-demand per-cpu allocation for each size.
  */
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
+/* Initialize a non-fix-size percpu memory allocator */
+int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg);
+/* The percpu allocation with a specific unit size. */
+int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 
 /* kmalloc/kfree equivalent: */
-- 
cgit v1.2.3


From 5e5401d6612ef599ad45785b941eebda7effc90f Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 4 Jan 2024 09:47:36 +0000
Subject: net: phylink: move phylink_pcs_neg_mode() into phylink.c

Move phylink_pcs_neg_mode() from the header file into the .c file since
nothing should be using it.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 66 -------------------------------------------------
 1 file changed, 66 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 875439ab45de..d589f89c612c 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -98,72 +98,6 @@ static inline bool phylink_autoneg_inband(unsigned int mode)
 	return mode == MLO_AN_INBAND;
 }
 
-/**
- * phylink_pcs_neg_mode() - helper to determine PCS inband mode
- * @mode: one of %MLO_AN_FIXED, %MLO_AN_PHY, %MLO_AN_INBAND.
- * @interface: interface mode to be used
- * @advertising: adertisement ethtool link mode mask
- *
- * Determines the negotiation mode to be used by the PCS, and returns
- * one of:
- *
- * - %PHYLINK_PCS_NEG_NONE: interface mode does not support inband
- * - %PHYLINK_PCS_NEG_OUTBAND: an out of band mode (e.g. reading the PHY)
- *   will be used.
- * - %PHYLINK_PCS_NEG_INBAND_DISABLED: inband mode selected but autoneg
- *   disabled
- * - %PHYLINK_PCS_NEG_INBAND_ENABLED: inband mode selected and autoneg enabled
- *
- * Note: this is for cases where the PCS itself is involved in negotiation
- * (e.g. Clause 37, SGMII and similar) not Clause 73.
- */
-static inline unsigned int phylink_pcs_neg_mode(unsigned int mode,
-						phy_interface_t interface,
-						const unsigned long *advertising)
-{
-	unsigned int neg_mode;
-
-	switch (interface) {
-	case PHY_INTERFACE_MODE_SGMII:
-	case PHY_INTERFACE_MODE_QSGMII:
-	case PHY_INTERFACE_MODE_QUSGMII:
-	case PHY_INTERFACE_MODE_USXGMII:
-		/* These protocols are designed for use with a PHY which
-		 * communicates its negotiation result back to the MAC via
-		 * inband communication. Note: there exist PHYs that run
-		 * with SGMII but do not send the inband data.
-		 */
-		if (!phylink_autoneg_inband(mode))
-			neg_mode = PHYLINK_PCS_NEG_OUTBAND;
-		else
-			neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED;
-		break;
-
-	case PHY_INTERFACE_MODE_1000BASEX:
-	case PHY_INTERFACE_MODE_2500BASEX:
-		/* 1000base-X is designed for use media-side for Fibre
-		 * connections, and thus the Autoneg bit needs to be
-		 * taken into account. We also do this for 2500base-X
-		 * as well, but drivers may not support this, so may
-		 * need to override this.
-		 */
-		if (!phylink_autoneg_inband(mode))
-			neg_mode = PHYLINK_PCS_NEG_OUTBAND;
-		else if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
-					   advertising))
-			neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED;
-		else
-			neg_mode = PHYLINK_PCS_NEG_INBAND_DISABLED;
-		break;
-
-	default:
-		neg_mode = PHYLINK_PCS_NEG_NONE;
-		break;
-	}
-
-	return neg_mode;
-}
-
 /**
  * struct phylink_link_state - link state structure
  * @advertising: ethtool bitmask containing advertised link modes
-- 
cgit v1.2.3


From 040a82be54c09a72162a3db2f5cd2ba289c0f224 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 13 Oct 2023 12:26:31 +0100
Subject: netfs: Rearrange netfs_io_subrequest to put request pointer first

Rearrange the netfs_io_subrequest struct to put the netfs_io_request
pointer (rreq) first.  This then allows netfs_io_subrequest to be put in a
union with a pointer to a wrapper around netfs_io_request.  This will be
useful in the future for cifs and maybe ceph.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Steve French <sfrench@samba.org>
cc: Shyam Prasad N <nspmangalore@gmail.com>
cc: Rohith Surabattula <rohiths.msft@gmail.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cifs@vger.kernel.org
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 852956aa3c4b..d3bac60fcd6f 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -204,8 +204,8 @@ struct netfs_cache_resources {
  * the pages it points to can be relied on to exist for the duration.
  */
 struct netfs_io_subrequest {
-	struct work_struct	work;
 	struct netfs_io_request *rreq;		/* Supervising I/O request */
+	struct work_struct	work;
 	struct list_head	rreq_link;	/* Link in rreq->subrequests */
 	struct iov_iter		io_iter;	/* Iterator for this subrequest */
 	loff_t			start;		/* Where to start the I/O */
-- 
cgit v1.2.3


From f36be9ce8146faabdbbf74ee0499edb2039c53a5 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 14:13:10 +0100
Subject: EDAC: constantify the struct bus_type usage

In many places in the edac code, struct bus_type pointers are passed
around and then eventually sent to the driver core, which can handle a
constant pointer.  So constantify all of the edac usage of these as well
because the data in them is never modified by the edac code either.

Cc: Borislav Petkov <bp@alien8.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: James Morse <james.morse@arm.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Robert Richter <rric@kernel.org>
Cc:  <linux-edac@vger.kernel.org>
Link: https://lore.kernel.org/r/2023121909-tribute-punctuate-4b22@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/edac.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/edac.h b/include/linux/edac.h
index fa4bda2a70f6..ccaf2ae0801d 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -30,7 +30,7 @@ struct device;
 
 extern int edac_op_state;
 
-struct bus_type *edac_get_sysfs_subsys(void);
+const struct bus_type *edac_get_sysfs_subsys(void);
 
 static inline void opstate_init(void)
 {
@@ -492,7 +492,7 @@ struct edac_raw_error_desc {
  */
 struct mem_ctl_info {
 	struct device			dev;
-	struct bus_type			*bus;
+	const struct bus_type		*bus;
 
 	struct list_head link;	/* for global list of mem_ctl_info structs */
 
-- 
cgit v1.2.3


From db2292b01b799e926abfdbd6fafa1f27f0d0e457 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 15:07:23 +0100
Subject: PM: clk: make pm_clk_add_notifier() take a const pointer

The driver core wants to work with const struct bus_type, so there's no
reason that pm_clk_add_notifier() should not also do the same thing,
considering that it just passes the pointer off to the driver core which
is expecting a const *.

Cc: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/2023121922-triumph-exploit-f545@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/pm_clock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h
index ada3a0ab10bf..68669ce18720 100644
--- a/include/linux/pm_clock.h
+++ b/include/linux/pm_clock.h
@@ -91,10 +91,10 @@ static inline int devm_pm_clk_create(struct device *dev)
 #endif
 
 #ifdef CONFIG_HAVE_CLK
-extern void pm_clk_add_notifier(struct bus_type *bus,
+extern void pm_clk_add_notifier(const struct bus_type *bus,
 					struct pm_clk_notifier_block *clknb);
 #else
-static inline void pm_clk_add_notifier(struct bus_type *bus,
+static inline void pm_clk_add_notifier(const struct bus_type *bus,
 					struct pm_clk_notifier_block *clknb)
 {
 }
-- 
cgit v1.2.3


From e76933a9bfa9b7f28a387f2e13cb3e689adc200d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 15:06:19 +0100
Subject: maple: make maple_bus_type static and const

There is no need to export maple_bus_type as no one uses it outside of
maple.c, so make it static, AND make it const as it can be read-only as
no one modifies it.

Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Rich Felker <dalias@libc.org>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: <linux-sh@vger.kernel.org>
Link: https://lore.kernel.org/r/2023121918-rejoicing-frostlike-d976@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/maple.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/maple.h b/include/linux/maple.h
index 9b140272ee16..9aae44efcfd4 100644
--- a/include/linux/maple.h
+++ b/include/linux/maple.h
@@ -5,7 +5,6 @@
 #include <mach/maple.h>
 
 struct device;
-extern struct bus_type maple_bus_type;
 
 /* Maple Bus command and response codes */
 enum maple_code {
-- 
cgit v1.2.3


From 86438841e48f6361f0a6a04805b7d7813738761f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 14:41:42 +0100
Subject: dma-debug: make dma_debug_add_bus take a const pointer

The driver core now can handle a const struct bus_type pointer, and the
dma_debug_add_bus() call just passes on the pointer give to it to the
driver core, so make this pointer const as well to allow everyone to use
read-only struct bus_type pointers going forward.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc:  <iommu@lists.linux.dev>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/2023121941-dejected-nugget-681e@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/dma-map-ops.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index f2fc203fb8a1..e401f824a007 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -443,10 +443,10 @@ static inline void arch_teardown_dma_ops(struct device *dev)
 #endif /* CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS */
 
 #ifdef CONFIG_DMA_API_DEBUG
-void dma_debug_add_bus(struct bus_type *bus);
+void dma_debug_add_bus(const struct bus_type *bus);
 void debug_dma_dump_mappings(struct device *dev);
 #else
-static inline void dma_debug_add_bus(struct bus_type *bus)
+static inline void dma_debug_add_bus(const struct bus_type *bus)
 {
 }
 static inline void debug_dma_dump_mappings(struct device *dev)
-- 
cgit v1.2.3


From b5b0774d53bb81bddbf8c609b3f183d4af6e91da Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Date: Wed, 20 Dec 2023 12:30:41 +0530
Subject: HID: amd_sfh: Add a new interface for exporting HPD data

AMDSFH has information about the User presence information via the Human
Presence Detection (HPD) sensor which is part of the AMD sensor fusion hub.

Add a new interface to export this information, where other drivers like
PMF can use this information to enhance user experiences.

Link: https://lore.kernel.org/all/ad064333-48a4-4cfa-9428-69e8a7c44667@redhat.com/
Co-developed-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Signed-off-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 include/linux/amd-pmf-io.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 include/linux/amd-pmf-io.h

(limited to 'include/linux')

diff --git a/include/linux/amd-pmf-io.h b/include/linux/amd-pmf-io.h
new file mode 100644
index 000000000000..5b6d29d36922
--- /dev/null
+++ b/include/linux/amd-pmf-io.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD Platform Management Framework Interface
+ *
+ * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Authors: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
+ *          Basavaraj Natikar <Basavaraj.Natikar@amd.com>
+ */
+
+#ifndef AMD_PMF_IO_H
+#define AMD_PMF_IO_H
+
+#include <linux/types.h>
+
+/**
+ * enum sfh_message_type - Query the SFH message type
+ * @MT_HPD: Message ID to know the Human presence info from MP2 FW
+ */
+enum sfh_message_type {
+	MT_HPD,
+};
+
+/**
+ * enum sfh_hpd_info - Query the Human presence information
+ * @SFH_NOT_DETECTED: Check the HPD connection information from MP2 FW
+ * @SFH_USER_PRESENT: Check if the user is present from HPD sensor
+ * @SFH_USER_AWAY: Check if the user is away from HPD sensor
+ */
+enum sfh_hpd_info {
+	SFH_NOT_DETECTED,
+	SFH_USER_PRESENT,
+	SFH_USER_AWAY,
+};
+
+/**
+ * struct amd_sfh_info - get HPD sensor info from MP2 FW
+ * @user_present: Populates the user presence information
+ */
+struct amd_sfh_info {
+	u8 user_present;
+};
+
+int amd_get_sfh_info(struct amd_sfh_info *sfh_info, enum sfh_message_type op);
+#endif
-- 
cgit v1.2.3


From 584f35a3647d42980af495fc0bc5c51eb174aa35 Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Date: Wed, 20 Dec 2023 12:30:42 +0530
Subject: HID: amd_sfh: Add a new interface for exporting ALS data

AMDSFH has information about the Ambient light via the Ambient
Light Sensor (ALS) which is part of the AMD sensor fusion hub.

Add a new interface to export this information, where other drivers like
PMF can use this information to enhance user experiences.

Link: https://lore.kernel.org/all/ad064333-48a4-4cfa-9428-69e8a7c44667@redhat.com/
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Co-developed-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Signed-off-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 include/linux/amd-pmf-io.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/amd-pmf-io.h b/include/linux/amd-pmf-io.h
index 5b6d29d36922..b4f818205216 100644
--- a/include/linux/amd-pmf-io.h
+++ b/include/linux/amd-pmf-io.h
@@ -17,9 +17,11 @@
 /**
  * enum sfh_message_type - Query the SFH message type
  * @MT_HPD: Message ID to know the Human presence info from MP2 FW
+ * @MT_ALS: Message ID to know the Ambient light info from MP2 FW
  */
 enum sfh_message_type {
 	MT_HPD,
+	MT_ALS,
 };
 
 /**
@@ -36,9 +38,11 @@ enum sfh_hpd_info {
 
 /**
  * struct amd_sfh_info - get HPD sensor info from MP2 FW
+ * @ambient_light: Populates the ambient light information
  * @user_present: Populates the user presence information
  */
 struct amd_sfh_info {
+	u32 ambient_light;
 	u8 user_present;
 };
 
-- 
cgit v1.2.3


From 398aa9a7e77cf23c2a6f882ddd3dcd96f21771dc Mon Sep 17 00:00:00 2001
From: Manan Aurora <maurora@google.com>
Date: Tue, 31 Oct 2023 03:46:41 +0000
Subject: usb: dwc3: Support EBC feature of DWC_usb31

Support configuration and use of bulk endpoints in the so-called EBC
mode described in the DBC_usb31 databook (appendix E)

Added a bit fifo_mode to usb_ep to indicate to the UDC driver that a
specific endpoint is to operate in the EBC (or equivalent) mode when
enabled

Added macros for bits 15 and 14 of DEPCFG parameter 1 to indicate EBC
mode and write back behaviour. These bits will be set to 1 when
configuring an EBC endpoint as described in the programming guide

Signed-off-by: Manan Aurora <maurora@google.com>
Link: https://lore.kernel.org/r/20231031034641.660606-1-maurora@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/gadget.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 6532beb587b1..a771ccc038ac 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -236,6 +236,7 @@ struct usb_ep {
 	unsigned		max_streams:16;
 	unsigned		mult:2;
 	unsigned		maxburst:5;
+	unsigned		fifo_mode:1;
 	u8			address;
 	const struct usb_endpoint_descriptor	*desc;
 	const struct usb_ss_ep_comp_descriptor	*comp_desc;
-- 
cgit v1.2.3


From 49a78b05d5ca1e23fd737747a8757b8bdc319b30 Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Thu, 4 Jan 2024 11:28:22 +0800
Subject: USB: core: Use device_driver directly in struct usb_driver and
 usb_device_driver

There is usbdrv_wrap in struct usb_driver and usb_device_driver, it
contains device_driver and for_devices. for_devices is used to
distinguish between device drivers and interface drivers.

Like the is_usb_device(), it tests the type of the device. We can test
that if the probe of device_driver is equal to usb_probe_device in
is_usb_device_driver(), and then the struct usbdrv_wrap is no longer
needed.

Clean up struct usbdrv_wrap, use device_driver directly in struct
usb_driver and usb_device_driver. This makes the code cleaner.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/20240104032822.1896596-1-yajun.deng@linux.dev
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 07556341ba2b..9e52179872a5 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1143,16 +1143,6 @@ extern ssize_t usb_store_new_id(struct usb_dynids *dynids,
 
 extern ssize_t usb_show_dynids(struct usb_dynids *dynids, char *buf);
 
-/**
- * struct usbdrv_wrap - wrapper for driver-model structure
- * @driver: The driver-model core driver structure.
- * @for_devices: Non-zero for device drivers, 0 for interface drivers.
- */
-struct usbdrv_wrap {
-	struct device_driver driver;
-	int for_devices;
-};
-
 /**
  * struct usb_driver - identifies USB interface driver to usbcore
  * @name: The driver name should be unique among USB drivers,
@@ -1193,7 +1183,7 @@ struct usbdrv_wrap {
  *	is bound to the driver.
  * @dynids: used internally to hold the list of dynamically added device
  *	ids for this driver.
- * @drvwrap: Driver-model core structure wrapper.
+ * @driver: The driver-model core driver structure.
  * @no_dynamic_id: if set to 1, the USB core will not allow dynamic ids to be
  *	added to this driver by preventing the sysfs file from being created.
  * @supports_autosuspend: if set to 0, the USB core will not allow autosuspend
@@ -1241,13 +1231,13 @@ struct usb_driver {
 	const struct attribute_group **dev_groups;
 
 	struct usb_dynids dynids;
-	struct usbdrv_wrap drvwrap;
+	struct device_driver driver;
 	unsigned int no_dynamic_id:1;
 	unsigned int supports_autosuspend:1;
 	unsigned int disable_hub_initiated_lpm:1;
 	unsigned int soft_unbind:1;
 };
-#define	to_usb_driver(d) container_of(d, struct usb_driver, drvwrap.driver)
+#define	to_usb_driver(d) container_of(d, struct usb_driver, driver)
 
 /**
  * struct usb_device_driver - identifies USB device driver to usbcore
@@ -1268,7 +1258,7 @@ struct usb_driver {
  *	on to call the normal usb_choose_configuration().
  * @dev_groups: Attributes attached to the device that will be created once it
  *	is bound to the driver.
- * @drvwrap: Driver-model core structure wrapper.
+ * @driver: The driver-model core driver structure.
  * @id_table: used with @match() to select better matching driver at
  * 	probe() time.
  * @supports_autosuspend: if set to 0, the USB core will not allow autosuspend
@@ -1277,7 +1267,7 @@ struct usb_driver {
  *	resume and suspend functions will be called in addition to the driver's
  *	own, so this part of the setup does not need to be replicated.
  *
- * USB drivers must provide all the fields listed above except drvwrap,
+ * USB drivers must provide all the fields listed above except driver,
  * match, and id_table.
  */
 struct usb_device_driver {
@@ -1293,13 +1283,13 @@ struct usb_device_driver {
 	int (*choose_configuration) (struct usb_device *udev);
 
 	const struct attribute_group **dev_groups;
-	struct usbdrv_wrap drvwrap;
+	struct device_driver driver;
 	const struct usb_device_id *id_table;
 	unsigned int supports_autosuspend:1;
 	unsigned int generic_subclass:1;
 };
 #define	to_usb_device_driver(d) container_of(d, struct usb_device_driver, \
-		drvwrap.driver)
+		driver)
 
 /**
  * struct usb_class_driver - identifies a USB driver that wants to use the USB major number
-- 
cgit v1.2.3


From cb86a3383aa7b9bb891daca691e596f6bfe52d82 Mon Sep 17 00:00:00 2001
From: Vamshi Gajjela <vamshigajjela@google.com>
Date: Thu, 9 Nov 2023 12:04:16 +0530
Subject: serial: core: Update uart_poll_timeout() function to return unsigned
 long

The function uart_fifo_timeout() returns an unsigned long value, which
is the number of jiffies. Therefore, change the variable timeout in the
function uart_poll_timeout() from int to unsigned long.
Change the return type of the function uart_poll_timeout() from int to
unsigned long to be consistent with the type of timeout values.

Signed-off-by: Vamshi Gajjela <vamshigajjela@google.com>
Link: https://lore.kernel.org/r/20231109063417.3971005-2-vamshigajjela@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_core.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 89f7b6c63598..536b2581d3e2 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -852,9 +852,9 @@ static inline unsigned long uart_fifo_timeout(struct uart_port *port)
 }
 
 /* Base timer interval for polling */
-static inline int uart_poll_timeout(struct uart_port *port)
+static inline unsigned long uart_poll_timeout(struct uart_port *port)
 {
-	int timeout = uart_fifo_timeout(port);
+	unsigned long timeout = uart_fifo_timeout(port);
 
 	return timeout > 6 ? (timeout / 2 - 2) : 1;
 }
-- 
cgit v1.2.3


From 5f1e77b2285b47c216b44e513071549cf006a309 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Thu, 30 Nov 2023 13:30:15 -0500
Subject: SUNRPC: Remove unused function rpc_clnt_xprt_switch_put()

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/clnt.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index e9d4377d03c6..5e9d1469c6fa 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -252,7 +252,6 @@ void		rpc_clnt_probe_trunked_xprts(struct rpc_clnt *,
 
 const char *rpc_proc_name(const struct rpc_task *task);
 
-void rpc_clnt_xprt_switch_put(struct rpc_clnt *);
 void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *);
 void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *);
 bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
-- 
cgit v1.2.3


From 12fc0a963128b54b82e98b9909f463e784b90b07 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Dec 2023 20:47:07 +0000
Subject: nfs: Remove writepage

NFS already has writepages and migrate_folio, so it does not need to
implement writepage.  The writepage operation is deprecated as it leads
to worse performance under high memory pressure due to folios being
written out in LRU order rather than sequentially within a file.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/nfs_fs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 279262057a92..f5ce7b101146 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -595,7 +595,6 @@ extern void nfs_complete_unlink(struct dentry *dentry, struct inode *);
  * linux/fs/nfs/write.c
  */
 extern int  nfs_congestion_kb;
-extern int  nfs_writepage(struct page *page, struct writeback_control *wbc);
 extern int  nfs_writepages(struct address_space *, struct writeback_control *);
 extern int  nfs_flush_incompatible(struct file *file, struct folio *folio);
 extern int  nfs_update_folio(struct file *file, struct folio *folio,
-- 
cgit v1.2.3


From 401df0d4f4098ecc9c5278da2f50756d62e5b37d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 19 Dec 2023 13:01:03 +0100
Subject: nvmem: layouts: refactor .add_cells() callback arguments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Simply pass whole "struct nvmem_layout" instead of single variables.
There is nothing in "struct nvmem_layout" that we have to hide from
layout drivers. They also access it during .probe() and .remove().

Thanks to this change:

1. API gets more consistent
   All layouts drivers callbacks get the same argument

2. Layouts get correct device
   Before this change NVMEM core code was passing NVMEM device instead
   of layout device. That resulted in:
   * Confusing prints
   * Calling devm_*() helpers on wrong device
   * Helpers like of_device_get_match_data() dereferencing NULLs

3. It gets possible to get match data
   First of all nvmem_layout_get_match_data() requires passing "struct
   nvmem_layout" which .add_cells() callback didn't have before this. It
   doesn't matter much as it's rather useless now anyway (and will be
   dropped).
   What's more important however is that of_device_get_match_data() can
   be used now thanks to owning a proper device pointer.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Michael Walle <michael@walle.cc>
Link: https://lore.kernel.org/r/20231219120104.3422-1-zajec5@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/nvmem-provider.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 6fe65b35ea97..81a67642ac55 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -173,7 +173,7 @@ struct nvmem_cell_table {
 struct nvmem_layout {
 	struct device dev;
 	struct nvmem_device *nvmem;
-	int (*add_cells)(struct device *dev, struct nvmem_device *nvmem);
+	int (*add_cells)(struct nvmem_layout *layout);
 };
 
 struct nvmem_layout_driver {
-- 
cgit v1.2.3


From 43f60e3fb62edc7bd8891de8779fb422f4ae23ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 19 Dec 2023 13:01:04 +0100
Subject: nvmem: drop nvmem_layout_get_match_data()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thanks for layouts refactoring we now have "struct device" associated
with layout. Also its OF pointer points directly to the "nvmem-layout"
DT node.

All it takes to get match data is a generic of_device_get_match_data().

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Michael Walle <michael@walle.cc>
Link: https://lore.kernel.org/r/20231219120104.3422-2-zajec5@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/nvmem-provider.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 81a67642ac55..f0ba0e03218f 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -205,9 +205,6 @@ void nvmem_layout_driver_unregister(struct nvmem_layout_driver *drv);
 	module_driver(__nvmem_layout_driver, nvmem_layout_driver_register, \
 		      nvmem_layout_driver_unregister)
 
-const void *nvmem_layout_get_match_data(struct nvmem_device *nvmem,
-					struct nvmem_layout *layout);
-
 #else
 
 static inline struct nvmem_device *nvmem_register(const struct nvmem_config *c)
@@ -238,13 +235,6 @@ static inline int nvmem_layout_register(struct nvmem_layout *layout)
 
 static inline void nvmem_layout_unregister(struct nvmem_layout *layout) {}
 
-static inline const void *
-nvmem_layout_get_match_data(struct nvmem_device *nvmem,
-			    struct nvmem_layout *layout)
-{
-	return NULL;
-}
-
 #endif /* CONFIG_NVMEM */
 
 #if IS_ENABLED(CONFIG_NVMEM) && IS_ENABLED(CONFIG_OF)
-- 
cgit v1.2.3


From 33cf42e68efc8ff529a7eee08a4f0ba8c8d0a207 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Thu, 21 Dec 2023 18:34:17 +0100
Subject: nvmem: core: add nvmem_dev_size() helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is required by layouts that need to read whole NVMEM content. It's
especially useful for NVMEM devices without hardcoded layout (like
U-Boot environment data block).

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20231221173421.13737-2-zajec5@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/nvmem-consumer.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h
index 2d306fa13b1a..34c0e58dfa26 100644
--- a/include/linux/nvmem-consumer.h
+++ b/include/linux/nvmem-consumer.h
@@ -81,6 +81,7 @@ int nvmem_device_cell_write(struct nvmem_device *nvmem,
 			    struct nvmem_cell_info *info, void *buf);
 
 const char *nvmem_dev_name(struct nvmem_device *nvmem);
+size_t nvmem_dev_size(struct nvmem_device *nvmem);
 
 void nvmem_add_cell_lookups(struct nvmem_cell_lookup *entries,
 			    size_t nentries);
-- 
cgit v1.2.3


From aeda33ab8160c7a2e24ba4f44492ad1e974ddc7d Mon Sep 17 00:00:00 2001
From: Abhijit Gangurde <abhijit.gangurde@amd.com>
Date: Fri, 22 Dec 2023 12:16:26 +0530
Subject: cdx: create sysfs bin files for cdx resources

Resource binary file contains the content of the memory regions.
These resources<x> devices can be used to mmap the MMIO regions in
the user-space.

Co-developed-by: Puneet Gupta <puneet.gupta@amd.com>
Signed-off-by: Puneet Gupta <puneet.gupta@amd.com>
Signed-off-by: Abhijit Gangurde <abhijit.gangurde@amd.com>
Link: https://lore.kernel.org/r/20231222064627.2828960-1-abhijit.gangurde@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/cdx/cdx_bus.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index 94ad2c9017c9..fab9e62c7e7d 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -135,6 +135,7 @@ struct cdx_device {
 	u8 bus_num;
 	u8 dev_num;
 	struct resource res[MAX_CDX_DEV_RESOURCES];
+	struct bin_attribute *res_attr[MAX_CDX_DEV_RESOURCES];
 	u8 res_count;
 	u64 dma_mask;
 	u16 flags;
@@ -147,6 +148,15 @@ struct cdx_device {
 #define to_cdx_device(_dev) \
 	container_of(_dev, struct cdx_device, dev)
 
+#define cdx_resource_start(dev, num)	((dev)->res[(num)].start)
+#define cdx_resource_end(dev, num)	((dev)->res[(num)].end)
+#define cdx_resource_flags(dev, num)	((dev)->res[(num)].flags)
+#define cdx_resource_len(dev, num) \
+	((cdx_resource_start((dev), (num)) == 0 &&	\
+	  cdx_resource_end((dev), (num)) ==		\
+	  cdx_resource_start((dev), (num))) ? 0 :	\
+	 (cdx_resource_end((dev), (num)) -		\
+	  cdx_resource_start((dev), (num)) + 1))
 /**
  * struct cdx_driver - CDX device driver
  * @driver: Generic device driver
-- 
cgit v1.2.3


From cf60af04edfe51fca488246c9959904adb2750fa Mon Sep 17 00:00:00 2001
From: Abhijit Gangurde <abhijit.gangurde@amd.com>
Date: Fri, 22 Dec 2023 12:16:27 +0530
Subject: cdx: Create resource debugfs file for cdx device

resource debugfs file contains host addresses of CDX device resources.
Each line of the resource file describe type of resource, a region
with start-end and flag fields.

Signed-off-by: Abhijit Gangurde <abhijit.gangurde@amd.com>
Link: https://lore.kernel.org/r/20231222064627.2828960-2-abhijit.gangurde@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/cdx/cdx_bus.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index fab9e62c7e7d..6355a36a3f81 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -113,6 +113,7 @@ struct cdx_controller {
  * @dev_num: Device number for this device
  * @res: array of MMIO region entries
  * @res_attr: resource binary attribute
+ * @debugfs_dir: debugfs directory for this device
  * @res_count: number of valid MMIO regions
  * @dma_mask: Default DMA mask
  * @flags: CDX device flags
@@ -136,6 +137,7 @@ struct cdx_device {
 	u8 dev_num;
 	struct resource res[MAX_CDX_DEV_RESOURCES];
 	struct bin_attribute *res_attr[MAX_CDX_DEV_RESOURCES];
+	struct dentry *debugfs_dir;
 	u8 res_count;
 	u64 dma_mask;
 	u16 flags;
-- 
cgit v1.2.3


From a87e55bfa25c195b3aaa25369175905ba9527fff Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 15:26:38 +0100
Subject: moxtet: remove unused moxtet_type declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For some reason, moxtet_type was defined in moxtet.h, but never actually
used.  Looks like a left-over from the original commit that was
exporting the moxtet bus type, but that wasn't needed, and it was a
different variable name, so no one noticed this one dangling around.

Cc: Marek Behún <kabel@kernel.org>
Link: https://lore.kernel.org/r/2023121937-pants-heroics-17c1@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/moxtet.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moxtet.h b/include/linux/moxtet.h
index 79184948fab4..ac577699edfd 100644
--- a/include/linux/moxtet.h
+++ b/include/linux/moxtet.h
@@ -35,8 +35,6 @@ enum turris_mox_module_id {
 
 #define MOXTET_NIRQS	16
 
-extern struct bus_type moxtet_type;
-
 struct moxtet {
 	struct device			*dev;
 	struct mutex			lock;
-- 
cgit v1.2.3


From e1be24b2e1190a7662462e8e398189ac795339cd Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 18:18:58 +0100
Subject: platform/surface: aggregator: make ssam_bus_type constant and static
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that the driver core can properly handle constant struct bus_type,
move the ssam_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

It's also never used outside of
drivers/platform/surface/aggregator/bus.c so make it static and don't
export it as no one is using it.

Cc: Maximilian Luz <luzmaximilian@gmail.com>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc:  <platform-driver-x86@vger.kernel.org>
Reviewed-by: Maximilian Luz <luzmaximilian@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/2023121957-tapered-upswing-8326@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/surface_aggregator/device.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index 42b249b4c24b..8cd8c38cf3f3 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -193,7 +193,6 @@ struct ssam_device_driver {
 
 #ifdef CONFIG_SURFACE_AGGREGATOR_BUS
 
-extern struct bus_type ssam_bus_type;
 extern const struct device_type ssam_device_type;
 
 /**
-- 
cgit v1.2.3


From 0c4b2255b7afbcc80f4efcc8f67425162f49c263 Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Mon, 18 Dec 2023 21:50:24 -0800
Subject: firmware: xilinx: Export function to use in other module

Export zynqmp_pm_get_family_info() to access and find family information
in other module.

Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20231219055025.27570-2-jay.buddhabhatti@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 1478f691cc10..06f4e6eaf13e 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -524,6 +524,7 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...);
 #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE)
 int zynqmp_pm_get_api_version(u32 *version);
 int zynqmp_pm_get_chipid(u32 *idcode, u32 *version);
+int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily);
 int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out);
 int zynqmp_pm_clock_enable(u32 clock_id);
 int zynqmp_pm_clock_disable(u32 clock_id);
@@ -602,6 +603,11 @@ static inline int zynqmp_pm_get_chipid(u32 *idcode, u32 *version)
 	return -ENODEV;
 }
 
+static inline int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily)
+{
+	return -ENODEV;
+}
+
 static inline int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata,
 				       u32 *out)
 {
-- 
cgit v1.2.3


From 97d62760e441af9ed393e127a46172f9534b5808 Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Mon, 18 Dec 2023 21:50:25 -0800
Subject: drivers: soc: xilinx: add check for platform

Some error event IDs for Versal and Versal NET are different.
Both the platforms should access their respective error event
IDs so use sub_family_code to check for platform and check
error IDs for respective platforms. The family code is passed
via platform data to avoid platform detection again.
Platform data is setup when even driver is registered.

Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20231219055025.27570-3-jay.buddhabhatti@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 06f4e6eaf13e..9a7e52739251 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -95,10 +95,18 @@
 /*
  * Node IDs for the Error Events.
  */
-#define EVENT_ERROR_PMC_ERR1	(0x28100000U)
-#define EVENT_ERROR_PMC_ERR2	(0x28104000U)
-#define EVENT_ERROR_PSM_ERR1	(0x28108000U)
-#define EVENT_ERROR_PSM_ERR2	(0x2810C000U)
+#define VERSAL_EVENT_ERROR_PMC_ERR1	(0x28100000U)
+#define VERSAL_EVENT_ERROR_PMC_ERR2	(0x28104000U)
+#define VERSAL_EVENT_ERROR_PSM_ERR1	(0x28108000U)
+#define VERSAL_EVENT_ERROR_PSM_ERR2	(0x2810C000U)
+
+#define VERSAL_NET_EVENT_ERROR_PMC_ERR1	(0x28100000U)
+#define VERSAL_NET_EVENT_ERROR_PMC_ERR2	(0x28104000U)
+#define VERSAL_NET_EVENT_ERROR_PMC_ERR3	(0x28108000U)
+#define VERSAL_NET_EVENT_ERROR_PSM_ERR1	(0x2810C000U)
+#define VERSAL_NET_EVENT_ERROR_PSM_ERR2	(0x28110000U)
+#define VERSAL_NET_EVENT_ERROR_PSM_ERR3	(0x28114000U)
+#define VERSAL_NET_EVENT_ERROR_PSM_ERR4	(0x28118000U)
 
 /* ZynqMP SD tap delay tuning */
 #define SD_ITAPDLY	0xFF180314
-- 
cgit v1.2.3


From 98e20e5e13d2811898921f999288be7151a11954 Mon Sep 17 00:00:00 2001
From: Quentin Deslandes <qde@naccy.de>
Date: Tue, 26 Dec 2023 14:07:42 +0100
Subject: bpfilter: remove bpfilter

bpfilter was supposed to convert iptables filtering rules into
BPF programs on the fly, from the kernel, through a usermode
helper. The base code for the UMH was introduced in 2018, and
couple of attempts (2, 3) tried to introduce the BPF program
generate features but were abandoned.

bpfilter now sits in a kernel tree unused and unusable, occasionally
causing confusion amongst Linux users (4, 5).

As bpfilter is now developed in a dedicated repository on GitHub (6),
it was suggested a couple of times this year (LSFMM/BPF 2023,
LPC 2023) to remove the deprecated kernel part of the project. This
is the purpose of this patch.

[1]: https://lore.kernel.org/lkml/20180522022230.2492505-1-ast@kernel.org/
[2]: https://lore.kernel.org/bpf/20210829183608.2297877-1-me@ubique.spb.ru/#t
[3]: https://lore.kernel.org/lkml/20221224000402.476079-1-qde@naccy.de/
[4]: https://dxuuu.xyz/bpfilter.html
[5]: https://github.com/linuxkit/linuxkit/pull/3904
[6]: https://github.com/facebook/bpfilter

Signed-off-by: Quentin Deslandes <qde@naccy.de>
Link: https://lore.kernel.org/r/20231226130745.465988-1-qde@naccy.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpfilter.h | 24 ------------------------
 1 file changed, 24 deletions(-)
 delete mode 100644 include/linux/bpfilter.h

(limited to 'include/linux')

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
deleted file mode 100644
index 736ded4905e0..000000000000
--- a/include/linux/bpfilter.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_BPFILTER_H
-#define _LINUX_BPFILTER_H
-
-#include <uapi/linux/bpfilter.h>
-#include <linux/usermode_driver.h>
-#include <linux/sockptr.h>
-
-struct sock;
-int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval,
-			    unsigned int optlen);
-int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
-			    int __user *optlen);
-
-struct bpfilter_umh_ops {
-	struct umd_info info;
-	/* since ip_getsockopt() can run in parallel, serialize access to umh */
-	struct mutex lock;
-	int (*sockopt)(struct sock *sk, int optname, sockptr_t optval,
-		       unsigned int optlen, bool is_set);
-	int (*start)(void);
-};
-extern struct bpfilter_umh_ops bpfilter_ops;
-#endif
-- 
cgit v1.2.3


From 57331a59ac0d680f606403eb24edd3c35aecba31 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 4 Jan 2024 09:58:46 -0500
Subject: NFSv4.1: Use the nfs_client's rpc timeouts for backchannel

For backchannel requests that lookup the appropriate nfs_client, use the
state-management rpc_clnt's rpc_timeout parameters for the backchannel's
response.  When the nfs_client cannot be found, fall back to using the
xprt's default timeout parameters.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/bc_xprt.h |  3 ++-
 include/linux/sunrpc/sched.h   | 14 +++++++++++++-
 include/linux/sunrpc/svc.h     |  2 ++
 include/linux/sunrpc/xprt.h    | 11 -----------
 4 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index db30a159f9d5..f22bf915dcf6 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -20,7 +20,8 @@
 #ifdef CONFIG_SUNRPC_BACKCHANNEL
 struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid);
 void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied);
-void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task);
+void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task,
+		const struct rpc_timeout *to);
 void xprt_free_bc_request(struct rpc_rqst *req);
 int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
 void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 8ada7dc802d3..2d61987b3545 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -37,6 +37,17 @@ struct rpc_wait {
 	struct list_head	timer_list;	/* Timer list */
 };
 
+/*
+ * This describes a timeout strategy
+ */
+struct rpc_timeout {
+	unsigned long		to_initval,		/* initial timeout */
+				to_maxval,		/* max timeout */
+				to_increment;		/* if !exponential */
+	unsigned int		to_retries;		/* max # of retries */
+	unsigned char		to_exponential;
+};
+
 /*
  * This is the RPC task struct
  */
@@ -205,7 +216,8 @@ struct rpc_wait_queue {
  */
 struct rpc_task *rpc_new_task(const struct rpc_task_setup *);
 struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req);
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
+		struct rpc_timeout *timeout);
 void		rpc_put_task(struct rpc_task *);
 void		rpc_put_task_async(struct rpc_task *);
 bool		rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status);
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index b10f987509cc..3331a1c2b47e 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -250,6 +250,8 @@ struct svc_rqst {
 	struct net		*rq_bc_net;	/* pointer to backchannel's
 						 * net namespace
 						 */
+	unsigned long	bc_to_initval;
+	unsigned int	bc_to_retries;
 	void **			rq_lease_breaker; /* The v4 client breaking a lease */
 	unsigned int		rq_status_counter; /* RPC processing counter */
 };
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index f85d3a0daca2..464f6a9492ab 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -30,17 +30,6 @@
 #define RPC_MAXCWND(xprt)	((xprt)->max_reqs << RPC_CWNDSHIFT)
 #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd)
 
-/*
- * This describes a timeout strategy
- */
-struct rpc_timeout {
-	unsigned long		to_initval,		/* initial timeout */
-				to_maxval,		/* max timeout */
-				to_increment;		/* if !exponential */
-	unsigned int		to_retries;		/* max # of retries */
-	unsigned char		to_exponential;
-};
-
 enum rpc_display_format_t {
 	RPC_DISPLAY_ADDR = 0,
 	RPC_DISPLAY_PORT,
-- 
cgit v1.2.3


From fe1eb24bd5ade085914248c527044e942f75e06a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 4 Jan 2024 16:04:35 -0800
Subject: Revert "Introduce PHY listing and link_topology tracking"

This reverts commit 32bb4515e34469975abc936deb0a116c4a445817.
This reverts commit d078d480639a4f3b5fc2d56247afa38e0956483a.
This reverts commit fcc4b105caa4b844bf043375bf799c20a9c99db1.
This reverts commit 345237dbc1bdbb274c9fb9ec38976261ff4a40b8.
This reverts commit 7db69ec9cfb8b4ab50420262631fb2d1908b25bf.
This reverts commit 95132a018f00f5dad38bdcfd4180d1af955d46f6.
This reverts commit 63d5eaf35ac36cad00cfb3809d794ef0078c822b.
This reverts commit c29451aefcb42359905d18678de38e52eccb3bb5.
This reverts commit 2ab0edb505faa9ac90dee1732571390f074e8113.
This reverts commit dedd702a35793ab462fce4c737eeba0badf9718e.
This reverts commit 034fcc210349b873ece7356905be5c6ca11eef2a.
This reverts commit 9c5625f559ad6fe9f6f733c11475bf470e637d34.
This reverts commit 02018c544ef113e980a2349eba89003d6f399d22.

Looks like we need more time for reviews, and incremental
changes will be hard to make sense of. So revert.

Link: https://lore.kernel.org/all/ZZP6FV5sXEf+xd58@shell.armlinux.org.uk/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h              |  4 +-
 include/linux/phy.h                    |  6 ---
 include/linux/phy_link_topology.h      | 67 ----------------------------------
 include/linux/phy_link_topology_core.h | 19 ----------
 include/linux/sfp.h                    |  8 +---
 5 files changed, 2 insertions(+), 102 deletions(-)
 delete mode 100644 include/linux/phy_link_topology.h
 delete mode 100644 include/linux/phy_link_topology_core.h

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e265aa1f2169..118c40258d07 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -40,6 +40,7 @@
 #include <net/dcbnl.h>
 #endif
 #include <net/netprio_cgroup.h>
+
 #include <linux/netdev_features.h>
 #include <linux/neighbour.h>
 #include <uapi/linux/netdevice.h>
@@ -51,7 +52,6 @@
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
 #include <net/dropreason-core.h>
-#include <linux/phy_link_topology_core.h>
 
 struct netpoll_info;
 struct device;
@@ -2047,7 +2047,6 @@ enum netdev_stat_type {
  *	@fcoe_ddp_xid:	Max exchange id for FCoE LRO by ddp
  *
  *	@priomap:	XXX: need comments on this one
- *	@link_topo:	Physical link topology tracking attached PHYs
  *	@phydev:	Physical device may attach itself
  *			for hardware timestamping
  *	@sfp_bus:	attached &struct sfp_bus structure.
@@ -2442,7 +2441,6 @@ struct net_device {
 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 	struct netprio_map __rcu *priomap;
 #endif
-	struct phy_link_topology	link_topo;
 	struct phy_device	*phydev;
 	struct sfp_bus		*sfp_bus;
 	struct lock_class_key	*qdisc_tx_busylock;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 6cb9d843aee9..e9e85d347587 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -544,9 +544,6 @@ struct macsec_ops;
  * @drv: Pointer to the driver for this PHY instance
  * @devlink: Create a link between phy dev and mac dev, if the external phy
  *           used by current mac interface is managed by another mac interface.
- * @phyindex: Unique id across the phy's parent tree of phys to address the PHY
- *	      from userspace, similar to ifindex. A zero index means the PHY
- *	      wasn't assigned an id yet.
  * @phy_id: UID for this device found during discovery
  * @c45_ids: 802.3-c45 Device Identifiers if is_c45.
  * @is_c45:  Set to true if this PHY uses clause 45 addressing.
@@ -646,7 +643,6 @@ struct phy_device {
 
 	struct device_link *devlink;
 
-	u32 phyindex;
 	u32 phy_id;
 
 	struct phy_c45_device_ids c45_ids;
@@ -1726,8 +1722,6 @@ int phy_suspend(struct phy_device *phydev);
 int phy_resume(struct phy_device *phydev);
 int __phy_resume(struct phy_device *phydev);
 int phy_loopback(struct phy_device *phydev, bool enable);
-int phy_sfp_connect_phy(void *upstream, struct phy_device *phy);
-void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy);
 void phy_sfp_attach(void *upstream, struct sfp_bus *bus);
 void phy_sfp_detach(void *upstream, struct sfp_bus *bus);
 int phy_sfp_probe(struct phy_device *phydev,
diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h
deleted file mode 100644
index 91902263ec0e..000000000000
--- a/include/linux/phy_link_topology.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PHY device list allow maintaining a list of PHY devices that are
- * part of a netdevice's link topology. PHYs can for example be chained,
- * as is the case when using a PHY that exposes an SFP module, on which an
- * SFP transceiver that embeds a PHY is connected.
- *
- * This list can then be used by userspace to leverage individual PHY
- * capabilities.
- */
-#ifndef __PHY_LINK_TOPOLOGY_H
-#define __PHY_LINK_TOPOLOGY_H
-
-#include <linux/ethtool.h>
-#include <linux/phy_link_topology_core.h>
-
-struct xarray;
-struct phy_device;
-struct net_device;
-struct sfp_bus;
-
-struct phy_device_node {
-	enum phy_upstream upstream_type;
-
-	union {
-		struct net_device	*netdev;
-		struct phy_device	*phydev;
-	} upstream;
-
-	struct sfp_bus *parent_sfp_bus;
-
-	struct phy_device *phy;
-};
-
-static inline struct phy_device *
-phy_link_topo_get_phy(struct phy_link_topology *topo, u32 phyindex)
-{
-	struct phy_device_node *pdn = xa_load(&topo->phys, phyindex);
-
-	if (pdn)
-		return pdn->phy;
-
-	return NULL;
-}
-
-#if IS_ENABLED(CONFIG_PHYLIB)
-int phy_link_topo_add_phy(struct phy_link_topology *topo,
-			  struct phy_device *phy,
-			  enum phy_upstream upt, void *upstream);
-
-void phy_link_topo_del_phy(struct phy_link_topology *lt, struct phy_device *phy);
-
-#else
-static inline int phy_link_topo_add_phy(struct phy_link_topology *topo,
-					struct phy_device *phy,
-					enum phy_upstream upt, void *upstream)
-{
-	return 0;
-}
-
-static inline void phy_link_topo_del_phy(struct phy_link_topology *topo,
-					 struct phy_device *phy)
-{
-}
-#endif
-
-#endif /* __PHY_LINK_TOPOLOGY_H */
diff --git a/include/linux/phy_link_topology_core.h b/include/linux/phy_link_topology_core.h
deleted file mode 100644
index 78c75f909489..000000000000
--- a/include/linux/phy_link_topology_core.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __PHY_LINK_TOPOLOGY_CORE_H
-#define __PHY_LINK_TOPOLOGY_CORE_H
-
-struct xarray;
-
-struct phy_link_topology {
-	struct xarray phys;
-
-	u32 next_phy_index;
-};
-
-static inline void phy_link_topo_init(struct phy_link_topology *topo)
-{
-	xa_init_flags(&topo->phys, XA_FLAGS_ALLOC1);
-	topo->next_phy_index = 1;
-}
-
-#endif /* __PHY_LINK_TOPOLOGY_CORE_H */
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 55c0ab17c9e2..9346cd44814d 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -544,7 +544,7 @@ struct sfp_upstream_ops {
 	void (*link_down)(void *priv);
 	void (*link_up)(void *priv);
 	int (*connect_phy)(void *priv, struct phy_device *);
-	void (*disconnect_phy)(void *priv, struct phy_device *);
+	void (*disconnect_phy)(void *priv);
 };
 
 #if IS_ENABLED(CONFIG_SFP)
@@ -570,7 +570,6 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode);
 int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 			 const struct sfp_upstream_ops *ops);
 void sfp_bus_del_upstream(struct sfp_bus *bus);
-const char *sfp_get_name(struct sfp_bus *bus);
 #else
 static inline int sfp_parse_port(struct sfp_bus *bus,
 				 const struct sfp_eeprom_id *id,
@@ -649,11 +648,6 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 static inline void sfp_bus_del_upstream(struct sfp_bus *bus)
 {
 }
-
-static inline const char *sfp_get_name(struct sfp_bus *bus)
-{
-	return NULL;
-}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 19bfcdf9498aa968ea293417fbbc39e523527ca8 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Wed, 3 Jan 2024 20:05:44 +0100
Subject: bpf: Relax tracing prog recursive attach rules

Currently, it's not allowed to attach an fentry/fexit prog to another
one fentry/fexit. At the same time it's not uncommon to see a tracing
program with lots of logic in use, and the attachment limitation
prevents usage of fentry/fexit for performance analysis (e.g. with
"bpftool prog profile" command) in this case. An example could be
falcosecurity libs project that uses tp_btf tracing programs.

Following the corresponding discussion [1], the reason for that is to
avoid tracing progs call cycles without introducing more complex
solutions. But currently it seems impossible to load and attach tracing
programs in a way that will form such a cycle. The limitation is coming
from the fact that attach_prog_fd is specified at the prog load (thus
making it impossible to attach to a program loaded after it in this
way), as well as tracing progs not implementing link_detach.

Replace "no same type" requirement with verification that no more than
one level of attachment nesting is allowed. In this way only one
fentry/fexit program could be attached to another fentry/fexit to cover
profiling use case, and still no cycle could be formed. To implement,
add a new field into bpf_prog_aux to track nested attachment for tracing
programs.

[1]: https://lore.kernel.org/bpf/20191108064039.2041889-16-ast@kernel.org/

Acked-by: Jiri Olsa <olsajiri@gmail.com>
Acked-by: Song Liu <song@kernel.org>
Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Link: https://lore.kernel.org/r/20240103190559.14750-2-9erthalion6@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7671530d6e4e..e30100597d0a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1449,6 +1449,7 @@ struct bpf_prog_aux {
 	bool dev_bound; /* Program is bound to the netdev. */
 	bool offload_requested; /* Program is bound and offloaded to the netdev. */
 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
+	bool attach_tracing_prog; /* true if tracing another tracing program */
 	bool func_proto_unreliable;
 	bool sleepable;
 	bool tail_call_reachable;
-- 
cgit v1.2.3


From 990b6b5b13b7993b7f44740c0add3119d407ccbf Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 13 Dec 2023 09:32:20 +0800
Subject: jbd2: add errseq to detect client fs's bdev writeback error

Add errseq in journal, so that JBD2 can detect whether metadata is
successfully written to fs bdev. This patch adds detection in recovery
process to replace original solution(using local variable wb_err).

Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20231213013224.2100050-2-chengzhihao1@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/jbd2.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index beb30719ee16..cea1aa70ae36 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -998,6 +998,13 @@ struct journal_s
 	 */
 	struct block_device	*j_fs_dev;
 
+	/**
+	 * @j_fs_dev_wb_err:
+	 *
+	 * Records the errseq of the client fs's backing block device.
+	 */
+	errseq_t		j_fs_dev_wb_err;
+
 	/**
 	 * @j_total_len: Total maximum capacity of the journal region on disk.
 	 */
@@ -1698,6 +1705,25 @@ static inline void jbd2_journal_abort_handle(handle_t *handle)
 	handle->h_aborted = 1;
 }
 
+static inline void jbd2_init_fs_dev_write_error(journal_t *journal)
+{
+	struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping;
+
+	/*
+	 * Save the original wb_err value of client fs's bdev mapping which
+	 * could be used to detect the client fs's metadata async write error.
+	 */
+	errseq_check_and_advance(&mapping->wb_err, &journal->j_fs_dev_wb_err);
+}
+
+static inline int jbd2_check_fs_dev_write_error(journal_t *journal)
+{
+	struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping;
+
+	return errseq_check(&mapping->wb_err,
+			    READ_ONCE(journal->j_fs_dev_wb_err));
+}
+
 #endif /* __KERNEL__   */
 
 /* Comparison functions for transaction IDs: perform comparisons using
-- 
cgit v1.2.3


From 8a4fd33d879fb303b207f06ea6340d73f698c4ed Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 13 Dec 2023 09:32:22 +0800
Subject: jbd2: remove unused 'JBD2_CHECKPOINT_IO_ERROR' and 'j_atomic_flags'

Since 'JBD2_CHECKPOINT_IO_ERROR' and j_atomic_flags' are not useful
anymore after fs dev's errseq is imported into jbd2, just remove them.

Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20231213013224.2100050-4-chengzhihao1@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/jbd2.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index cea1aa70ae36..971f3e826e15 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -755,11 +755,6 @@ struct journal_s
 	 */
 	unsigned long		j_flags;
 
-	/**
-	 * @j_atomic_flags: Atomic journaling state flags.
-	 */
-	unsigned long		j_atomic_flags;
-
 	/**
 	 * @j_errno:
 	 *
@@ -1406,12 +1401,6 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
 #define JBD2_JOURNAL_FLUSH_VALID	(JBD2_JOURNAL_FLUSH_DISCARD | \
 					JBD2_JOURNAL_FLUSH_ZEROOUT)
 
-/*
- * Journal atomic flag definitions
- */
-#define JBD2_CHECKPOINT_IO_ERROR	0x001	/* Detect io error while writing
-						 * buffer back to disk */
-
 /*
  * Function declarations for the journaling transaction and buffer
  * management
-- 
cgit v1.2.3


From c2e64baac4f36f7e0365218c7523bb9ba4639250 Mon Sep 17 00:00:00 2001
From: Thierry Reding <thierry.reding@gmail.com>
Date: Thu, 21 Dec 2023 11:08:02 +0100
Subject: pwm: Add pwm_apply_state() compatibility stub
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to make the transition to the new pwm_apply_might_sleep() a bit
smoother, add a compatibility stub. This will prevent new calls to the
old function introduced via other subsystems from breaking builds. Once
the next merge window has closed we can take another stab at removing
the stub.

Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 5dd665d8c909..2a5e1154652e 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -537,6 +537,13 @@ static inline void pwm_apply_args(struct pwm_device *pwm)
 	pwm_apply_might_sleep(pwm, &state);
 }
 
+/* only for backwards-compatibility, new code should not use this */
+static inline int pwm_apply_state(struct pwm_device *pwm,
+				  const struct pwm_state *state)
+{
+	return pwm_apply_might_sleep(pwm, state);
+}
+
 struct pwm_lookup {
 	struct list_head list;
 	const char *provider;
-- 
cgit v1.2.3


From d73f444d06fb8a42a5c0623453f3ea1fe9880229 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:06:20 -0800
Subject: pwm: linux/pwm.h: fix Excess kernel-doc description warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the @pwm: line to prevent the kernel-doc warning:

include/linux/pwm.h:87: warning: Excess struct member 'pwm' description in 'pwm_device'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Thierry Reding <thierry.reding@gmail.com>
Cc: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Cc:  <linux-pwm@vger.kernel.org>
Fixes: f3e25e68ceb2 ("pwm: Drop unused member "pwm" from struct pwm_device")
Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 2a5e1154652e..fcc2c4496f73 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -69,7 +69,6 @@ struct pwm_state {
  * @label: name of the PWM device
  * @flags: flags associated with the PWM device
  * @hwpwm: per-chip relative index of the PWM device
- * @pwm: global index of the PWM device
  * @chip: PWM chip providing this PWM device
  * @args: PWM arguments
  * @state: last applied state
-- 
cgit v1.2.3


From 92a714d727ec9e7ccfcc7432d348aba730145914 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 4 Jan 2024 15:52:11 +0000
Subject: netfs: Fix interaction between write-streaming and cachefiles culling

An issue can occur between write-streaming (storing dirty data in partial
non-uptodate pages) and a cachefiles object being culled to make space.
The problem occurs because the cache object is only marked in use while
there are files open using it.  Once it has been released, it can be culled
and the cookie marked disabled.

At this point, a streaming write is permitted to occur (if the cache is
active, we require pages to be prefetched and cached), but the cache can
become active again before this gets flushed out - and then two effects can
occur:

 (1) The cache may be asked to write out a region that's less than its DIO
     block size (assumed by cachefiles to be PAGE_SIZE) - and this causes
     one of two debugging statements to be emitted.

 (2) netfs_how_to_modify() gets confused because it sees a page that isn't
     allowed to be non-uptodate being uptodate and tries to prefetch it -
     leading to a warning that PG_fscache is set twice.

Fix this by the following means:

 (1) Add a netfs_inode flag to disallow write-streaming to an inode and set
     it if we ever do local caching of that inode.  It remains set for the
     lifetime of that inode - even if the cookie becomes disabled.

 (2) If the no-write-streaming flag is set, then make netfs_how_to_modify()
     always want to prefetch instead.

 (3) If netfs_how_to_modify() decides it wants to prefetch a folio, but
     that folio has write-streamed data in it, then it requires the folio
     be flushed first.

 (4) Export a counter of the number of times we wanted to prefetch a
     non-uptodate page, but found it had write-streamed data in it.

 (5) Export a counter of the number of times we cancelled a write to the
     cache because it didn't DIO align and remove the debug statements.

Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-erofs@lists.ozlabs.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/fscache-cache.h | 3 +++
 include/linux/netfs.h         | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index a174cedf4d90..bdf7f3eddf0a 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -189,17 +189,20 @@ extern atomic_t fscache_n_write;
 extern atomic_t fscache_n_no_write_space;
 extern atomic_t fscache_n_no_create_space;
 extern atomic_t fscache_n_culled;
+extern atomic_t fscache_n_dio_misfit;
 #define fscache_count_read() atomic_inc(&fscache_n_read)
 #define fscache_count_write() atomic_inc(&fscache_n_write)
 #define fscache_count_no_write_space() atomic_inc(&fscache_n_no_write_space)
 #define fscache_count_no_create_space() atomic_inc(&fscache_n_no_create_space)
 #define fscache_count_culled() atomic_inc(&fscache_n_culled)
+#define fscache_count_dio_misfit() atomic_inc(&fscache_n_dio_misfit)
 #else
 #define fscache_count_read() do {} while(0)
 #define fscache_count_write() do {} while(0)
 #define fscache_count_no_write_space() do {} while(0)
 #define fscache_count_no_create_space() do {} while(0)
 #define fscache_count_culled() do {} while(0)
+#define fscache_count_dio_misfit() do {} while(0)
 #endif
 
 #endif /* _LINUX_FSCACHE_CACHE_H */
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index d3bac60fcd6f..100cbb261269 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -142,6 +142,7 @@ struct netfs_inode {
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 #define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
 #define NETFS_ICTX_WRITETHROUGH	2		/* Write-through caching */
+#define NETFS_ICTX_NO_WRITE_STREAMING	3	/* Don't engage in write-streaming */
 };
 
 /*
-- 
cgit v1.2.3


From 8a6286c1804e2c7144aef3154a0357c4b496e10b Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 3 Jan 2024 14:28:36 +0100
Subject: dpll: expose fractional frequency offset value to user

Add a new netlink attribute to expose fractional frequency offset value
for a pin. Add an op to get the value from the driver.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Acked-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Link: https://lore.kernel.org/r/20240103132838.1501801-2-jiri@resnulli.us
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dpll.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index b1a5f9ca8ee5..9cf896ea1d41 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -77,6 +77,9 @@ struct dpll_pin_ops {
 				const struct dpll_device *dpll, void *dpll_priv,
 				const s32 phase_adjust,
 				struct netlink_ext_ack *extack);
+	int (*ffo_get)(const struct dpll_pin *pin, void *pin_priv,
+		       const struct dpll_device *dpll, void *dpll_priv,
+		       s64 *ffo, struct netlink_ext_ack *extack);
 };
 
 struct dpll_pin_frequency {
-- 
cgit v1.2.3


From 9c5938694cd0e9e00bdfb7e60900673263daf4d5 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 5 Jan 2024 16:57:29 +0100
Subject: mm/rmap: silence VM_WARN_ON_FOLIO() in __folio_rmap_sanity_checks()

Unfortunately, vm_insert_page() and friends and up passing
driver-allocated folios into folio_add_file_rmap_pte() using
insert_page_into_pte_locked().

While these driver-allocated folios can be compound pages (large folios),
they are not proper "rmappable" folios.

In these VM_MIXEDMAP VMAs, there isn't really the concept of a reverse
mapping, so long-term, we should clean that up and not call into rmap
code.

For the time being, document how we can end up in rmap code with large
folios that are not marked rmappable.

Link: https://lkml.kernel.org/r/793c5cee-d5fc-4eb1-86a2-39e05686233d@redhat.com
Fixes: 68f0320824fa ("mm/rmap: convert folio_add_file_rmap_range() into folio_add_file_rmap_[pte|ptes|pmd]()")
Reported-by: syzbot+50ef73537bbc393a25bb@syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/000000000000014174060e09316e@google.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fd6fe16fa358..b7944a833668 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -199,8 +199,15 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio,
 {
 	/* hugetlb folios are handled separately. */
 	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
-	VM_WARN_ON_FOLIO(folio_test_large(folio) &&
-			 !folio_test_large_rmappable(folio), folio);
+
+	/*
+	 * TODO: we get driver-allocated folios that have nothing to do with
+	 * the rmap using vm_insert_page(); therefore, we cannot assume that
+	 * folio_test_large_rmappable() holds for large folios. We should
+	 * handle any desired mapcount+stats accounting for these folios in
+	 * VM_MIXEDMAP VMAs separately, and then sanity-check here that
+	 * we really only get rmappable folios.
+	 */
 
 	VM_WARN_ON_ONCE(nr_pages <= 0);
 	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
-- 
cgit v1.2.3


From 71ce1ab54a505736786d9c5921e6c2718c7ec535 Mon Sep 17 00:00:00 2001
From: Kinsey Ho <kinseyho@google.com>
Date: Wed, 27 Dec 2023 14:12:01 +0000
Subject: mm/mglru: add CONFIG_ARCH_HAS_HW_PTE_YOUNG

Patch series "mm/mglru: Kconfig cleanup", v4.

This series is the result of the following discussion:
https://lore.kernel.org/47066176-bd93-55dd-c2fa-002299d9e034@linux.ibm.com/

It mainly avoids building the code that walks page tables on CPUs that
use it, i.e., those don't support hardware accessed bit. Specifically,
it introduces a new Kconfig to guard some of functions added by
commit bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
on CPUs like POWER9, on which the series was tested.


This patch (of 5):

Some architectures are able to set the accessed bit in PTEs when PTEs
are used as part of linear address translations.

Add CONFIG_ARCH_HAS_HW_PTE_YOUNG for such architectures to be able to
override arch_has_hw_pte_young().

Link: https://lkml.kernel.org/r/20231227141205.2200125-1-kinseyho@google.com
Link: https://lkml.kernel.org/r/20231227141205.2200125-2-kinseyho@google.com
Signed-off-by: Kinsey Ho <kinseyho@google.com>
Co-developed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Tested-by: Donet Tom <donettom@linux.vnet.ibm.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index af7639c3b0a3..9ecc20fa6269 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -375,7 +375,7 @@ static inline bool arch_has_hw_nonleaf_pmd_young(void)
  */
 static inline bool arch_has_hw_pte_young(void)
 {
-	return false;
+	return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG);
 }
 #endif
 
-- 
cgit v1.2.3


From 61dd3f246b3adaabff3241c586f2210ac91b05a4 Mon Sep 17 00:00:00 2001
From: Kinsey Ho <kinseyho@google.com>
Date: Wed, 27 Dec 2023 14:12:02 +0000
Subject: mm/mglru: add CONFIG_LRU_GEN_WALKS_MMU

Add CONFIG_LRU_GEN_WALKS_MMU such that if disabled, the code that
walks page tables to promote pages into the youngest generation will
not be built.

Also improves code readability by adding two helper functions
get_mm_state() and get_next_mm().

Link: https://lkml.kernel.org/r/20231227141205.2200125-3-kinseyho@google.com
Signed-off-by: Kinsey Ho <kinseyho@google.com>
Co-developed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Tested-by: Donet Tom <donettom@linux.vnet.ibm.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  2 +-
 include/linux/mm_types.h   | 12 ++++++++----
 include/linux/mmzone.h     |  2 ++
 3 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5de775e6cdd9..20ff87f8e001 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -330,7 +330,7 @@ struct mem_cgroup {
 	struct deferred_split deferred_split_queue;
 #endif
 
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
 	/* per-memcg mm_struct list */
 	struct lru_gen_mm_list mm_list;
 #endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a66534c78c4d..552fa2d11c57 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -958,7 +958,7 @@ struct mm_struct {
 		 */
 		unsigned long ksm_zero_pages;
 #endif /* CONFIG_KSM */
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
 		struct {
 			/* this mm_struct is on lru_gen_mm_list */
 			struct list_head list;
@@ -973,7 +973,7 @@ struct mm_struct {
 			struct mem_cgroup *memcg;
 #endif
 		} lru_gen;
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_GEN_WALKS_MMU */
 	} __randomize_layout;
 
 	/*
@@ -1011,6 +1011,10 @@ struct lru_gen_mm_list {
 	spinlock_t lock;
 };
 
+#endif /* CONFIG_LRU_GEN */
+
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
+
 void lru_gen_add_mm(struct mm_struct *mm);
 void lru_gen_del_mm(struct mm_struct *mm);
 #ifdef CONFIG_MEMCG
@@ -1036,7 +1040,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm)
 	WRITE_ONCE(mm->lru_gen.bitmap, -1);
 }
 
-#else /* !CONFIG_LRU_GEN */
+#else /* !CONFIG_LRU_GEN_WALKS_MMU */
 
 static inline void lru_gen_add_mm(struct mm_struct *mm)
 {
@@ -1060,7 +1064,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm)
 {
 }
 
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_GEN_WALKS_MMU */
 
 struct vma_iterator {
 	struct ma_state mas;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2efd3be484fd..bc3f63ec4291 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -640,9 +640,11 @@ struct lruvec {
 #ifdef CONFIG_LRU_GEN
 	/* evictable pages divided into generations */
 	struct lru_gen_folio		lrugen;
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
 	/* to concurrently iterate lru_gen_mm_list */
 	struct lru_gen_mm_state		mm_state;
 #endif
+#endif /* CONFIG_LRU_GEN */
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
 #endif
-- 
cgit v1.2.3


From 745b13e647cd119e70d16b57698e12b7c86ca264 Mon Sep 17 00:00:00 2001
From: Kinsey Ho <kinseyho@google.com>
Date: Wed, 27 Dec 2023 14:12:03 +0000
Subject: mm/mglru: remove CONFIG_MEMCG

Remove CONFIG_MEMCG in a refactoring to improve code readability at
the cost of a few bytes in struct lru_gen_folio per node when
CONFIG_MEMCG=n.

Link: https://lkml.kernel.org/r/20231227141205.2200125-4-kinseyho@google.com
Signed-off-by: Kinsey Ho <kinseyho@google.com>
Co-developed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Tested-by: Donet Tom <donettom@linux.vnet.ibm.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h |  4 ----
 include/linux/mmzone.h   | 26 ++------------------------
 2 files changed, 2 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 552fa2d11c57..55b7121809ff 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1017,9 +1017,7 @@ struct lru_gen_mm_list {
 
 void lru_gen_add_mm(struct mm_struct *mm);
 void lru_gen_del_mm(struct mm_struct *mm);
-#ifdef CONFIG_MEMCG
 void lru_gen_migrate_mm(struct mm_struct *mm);
-#endif
 
 static inline void lru_gen_init_mm(struct mm_struct *mm)
 {
@@ -1050,11 +1048,9 @@ static inline void lru_gen_del_mm(struct mm_struct *mm)
 {
 }
 
-#ifdef CONFIG_MEMCG
 static inline void lru_gen_migrate_mm(struct mm_struct *mm)
 {
 }
-#endif
 
 static inline void lru_gen_init_mm(struct mm_struct *mm)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bc3f63ec4291..28665e1b8475 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -440,14 +440,12 @@ struct lru_gen_folio {
 	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 	/* whether the multi-gen LRU is enabled */
 	bool enabled;
-#ifdef CONFIG_MEMCG
 	/* the memcg generation this lru_gen_folio belongs to */
 	u8 gen;
 	/* the list segment this lru_gen_folio belongs to */
 	u8 seg;
 	/* per-node lru_gen_folio list for global reclaim */
 	struct hlist_nulls_node list;
-#endif
 };
 
 enum {
@@ -493,11 +491,6 @@ struct lru_gen_mm_walk {
 	bool force_scan;
 };
 
-void lru_gen_init_lruvec(struct lruvec *lruvec);
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
-
-#ifdef CONFIG_MEMCG
-
 /*
  * For each node, memcgs are divided into two generations: the old and the
  * young. For each generation, memcgs are randomly sharded into multiple bins
@@ -555,6 +548,8 @@ struct lru_gen_memcg {
 };
 
 void lru_gen_init_pgdat(struct pglist_data *pgdat);
+void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
 
 void lru_gen_init_memcg(struct mem_cgroup *memcg);
 void lru_gen_exit_memcg(struct mem_cgroup *memcg);
@@ -563,19 +558,6 @@ void lru_gen_offline_memcg(struct mem_cgroup *memcg);
 void lru_gen_release_memcg(struct mem_cgroup *memcg);
 void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid);
 
-#else /* !CONFIG_MEMCG */
-
-#define MEMCG_NR_GENS	1
-
-struct lru_gen_memcg {
-};
-
-static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
-{
-}
-
-#endif /* CONFIG_MEMCG */
-
 #else /* !CONFIG_LRU_GEN */
 
 static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
@@ -590,8 +572,6 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
 }
 
-#ifdef CONFIG_MEMCG
-
 static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
 {
 }
@@ -616,8 +596,6 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
 {
 }
 
-#endif /* CONFIG_MEMCG */
-
 #endif /* CONFIG_LRU_GEN */
 
 struct lruvec {
-- 
cgit v1.2.3


From 533c67e6358406727145efae32882c4dc355d6c5 Mon Sep 17 00:00:00 2001
From: Kinsey Ho <kinseyho@google.com>
Date: Wed, 27 Dec 2023 14:12:04 +0000
Subject: mm/mglru: add dummy pmd_dirty()

Add dummy pmd_dirty() for architectures that don't provide it.
This is similar to commit 6617da8fb565 ("mm: add dummy pmd_young()
for architectures not having it").

Link: https://lkml.kernel.org/r/20231227141205.2200125-5-kinseyho@google.com
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202312210606.1Etqz3M4-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202312210042.xQEiqlEh-lkp@intel.com/
Signed-off-by: Kinsey Ho <kinseyho@google.com>
Suggested-by: Yu Zhao <yuzhao@google.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Donet Tom <donettom@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 9ecc20fa6269..466cf477551a 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -184,6 +184,13 @@ static inline int pmd_young(pmd_t pmd)
 }
 #endif
 
+#ifndef pmd_dirty
+static inline int pmd_dirty(pmd_t pmd)
+{
+	return 0;
+}
+#endif
+
 /*
  * A facility to provide lazy MMU batching.  This allows PTE updates and
  * page invalidations to be delayed until a call to leave lazy MMU mode
-- 
cgit v1.2.3


From e435ca87882167dda78776ce4bd6eb2094eb864b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 28 Dec 2023 08:57:43 +0000
Subject: mm: remove inc/dec lruvec page state functions

Patch series "Remove some lruvec page accounting functions", v2.

Some functions are now unused; remove them.  Make
__mod_lruvec_page_state() unused and then remove it.


This patch (of 6):

All callers of these have been converted to their folio equivalents.

Link: https://lkml.kernel.org/r/20231228085748.1083901-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231228085748.1083901-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vmstat.h | 24 ------------------------
 1 file changed, 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index fed855bae6d8..147ae73e0ee7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -597,18 +597,6 @@ static inline void mod_lruvec_page_state(struct page *page,
 
 #endif /* CONFIG_MEMCG */
 
-static inline void __inc_lruvec_page_state(struct page *page,
-					   enum node_stat_item idx)
-{
-	__mod_lruvec_page_state(page, idx, 1);
-}
-
-static inline void __dec_lruvec_page_state(struct page *page,
-					   enum node_stat_item idx)
-{
-	__mod_lruvec_page_state(page, idx, -1);
-}
-
 static inline void __lruvec_stat_mod_folio(struct folio *folio,
 					   enum node_stat_item idx, int val)
 {
@@ -627,18 +615,6 @@ static inline void __lruvec_stat_sub_folio(struct folio *folio,
 	__lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
 }
 
-static inline void inc_lruvec_page_state(struct page *page,
-					 enum node_stat_item idx)
-{
-	mod_lruvec_page_state(page, idx, 1);
-}
-
-static inline void dec_lruvec_page_state(struct page *page,
-					 enum node_stat_item idx)
-{
-	mod_lruvec_page_state(page, idx, -1);
-}
-
 static inline void lruvec_stat_mod_folio(struct folio *folio,
 					 enum node_stat_item idx, int val)
 {
-- 
cgit v1.2.3


From c701123bd68bf1cc3bc167b4f597cb1f4995c39c Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 28 Dec 2023 08:57:48 +0000
Subject: mm/memcontrol: remove __mod_lruvec_page_state()

There are no more callers of __mod_lruvec_page_state(), so convert the
implementation to __lruvec_stat_mod_folio(), removing two calls to
compound_head() (one explicit, one hidden inside page_memcg()).

Link: https://lkml.kernel.org/r/20231228085748.1083901-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vmstat.h | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 147ae73e0ee7..343906a98d6e 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -556,19 +556,25 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
 	local_irq_restore(flags);
 }
 
-void __mod_lruvec_page_state(struct page *page,
+void __lruvec_stat_mod_folio(struct folio *folio,
 			     enum node_stat_item idx, int val);
 
-static inline void mod_lruvec_page_state(struct page *page,
+static inline void lruvec_stat_mod_folio(struct folio *folio,
 					 enum node_stat_item idx, int val)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	__mod_lruvec_page_state(page, idx, val);
+	__lruvec_stat_mod_folio(folio, idx, val);
 	local_irq_restore(flags);
 }
 
+static inline void mod_lruvec_page_state(struct page *page,
+					 enum node_stat_item idx, int val)
+{
+	lruvec_stat_mod_folio(page_folio(page), idx, val);
+}
+
 #else
 
 static inline void __mod_lruvec_state(struct lruvec *lruvec,
@@ -583,10 +589,16 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
 	mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
 }
 
-static inline void __mod_lruvec_page_state(struct page *page,
-					   enum node_stat_item idx, int val)
+static inline void __lruvec_stat_mod_folio(struct folio *folio,
+					 enum node_stat_item idx, int val)
 {
-	__mod_node_page_state(page_pgdat(page), idx, val);
+	__mod_node_page_state(folio_pgdat(folio), idx, val);
+}
+
+static inline void lruvec_stat_mod_folio(struct folio *folio,
+					 enum node_stat_item idx, int val)
+{
+	mod_node_page_state(folio_pgdat(folio), idx, val);
 }
 
 static inline void mod_lruvec_page_state(struct page *page,
@@ -597,12 +609,6 @@ static inline void mod_lruvec_page_state(struct page *page,
 
 #endif /* CONFIG_MEMCG */
 
-static inline void __lruvec_stat_mod_folio(struct folio *folio,
-					   enum node_stat_item idx, int val)
-{
-	__mod_lruvec_page_state(&folio->page, idx, val);
-}
-
 static inline void __lruvec_stat_add_folio(struct folio *folio,
 					   enum node_stat_item idx)
 {
@@ -615,12 +621,6 @@ static inline void __lruvec_stat_sub_folio(struct folio *folio,
 	__lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
 }
 
-static inline void lruvec_stat_mod_folio(struct folio *folio,
-					 enum node_stat_item idx, int val)
-{
-	mod_lruvec_page_state(&folio->page, idx, val);
-}
-
 static inline void lruvec_stat_add_folio(struct folio *folio,
 					 enum node_stat_item idx)
 {
-- 
cgit v1.2.3


From b805ab3c6935d14654ccc28f16ffce7a13c2c528 Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Fri, 29 Dec 2023 10:26:51 +0800
Subject: mm/vmstat: move pgdemote_* out of CONFIG_NUMA_BALANCING

Demotion can work well without CONFIG_NUMA_BALANCING.  But the commit
23e9f0138963 ("mm/vmstat: move pgdemote_* to per-node stats") wrongly hid
it behind CONFIG_NUMA_BALANCING.

Fix it by moving them out of CONFIG_NUMA_BALANCING.

Link: https://lkml.kernel.org/r/20231229022651.3229174-1-lizhijian@fujitsu.com
Fixes: 23e9f0138963 ("mm/vmstat: move pgdemote_* to per-node stats")
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 28665e1b8475..c18c53353b50 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -207,11 +207,11 @@ enum node_stat_item {
 #ifdef CONFIG_NUMA_BALANCING
 	PGPROMOTE_SUCCESS,	/* promote successfully */
 	PGPROMOTE_CANDIDATE,	/* candidate pages to promote */
+#endif
 	/* PGDEMOTE_*: pages demoted */
 	PGDEMOTE_KSWAPD,
 	PGDEMOTE_DIRECT,
 	PGDEMOTE_KHUGEPAGED,
-#endif
 	NR_VM_NODE_STAT_ITEMS
 };
 
-- 
cgit v1.2.3


From 26a1a86dd093a10d0653429bf013dae6e95dccbf Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 20 Dec 2023 16:17:29 -0800
Subject: cxl/events: Promote CXL event structures to a core header

UEFI code can process CXL events through CPER records.  Those records
use almost the same format as the CXL events.

Lift the CXL event structures to a core header to be shared in later
patches.

[jic123: drop "CXL rev 3.0" mention]

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-2-1bb8a4ca2c7a@intel.com
[djbw: add F: entry to maintainers for include/linux/cxl-event.h]
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/cxl-event.h | 95 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 include/linux/cxl-event.h

(limited to 'include/linux')

diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
new file mode 100644
index 000000000000..0fc068123f8e
--- /dev/null
+++ b/include/linux/cxl-event.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Intel Corporation. */
+#ifndef _LINUX_CXL_EVENT_H
+#define _LINUX_CXL_EVENT_H
+
+/*
+ * Common Event Record Format
+ * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
+ */
+struct cxl_event_record_hdr {
+	uuid_t id;
+	u8 length;
+	u8 flags[3];
+	__le16 handle;
+	__le16 related_handle;
+	__le64 timestamp;
+	u8 maint_op_class;
+	u8 reserved[15];
+} __packed;
+
+#define CXL_EVENT_RECORD_DATA_LENGTH 0x50
+struct cxl_event_record_raw {
+	struct cxl_event_record_hdr hdr;
+	u8 data[CXL_EVENT_RECORD_DATA_LENGTH];
+} __packed;
+
+/*
+ * General Media Event Record
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ */
+#define CXL_EVENT_GEN_MED_COMP_ID_SIZE	0x10
+struct cxl_event_gen_media {
+	struct cxl_event_record_hdr hdr;
+	__le64 phys_addr;
+	u8 descriptor;
+	u8 type;
+	u8 transaction_type;
+	u8 validity_flags[2];
+	u8 channel;
+	u8 rank;
+	u8 device[3];
+	u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE];
+	u8 reserved[46];
+} __packed;
+
+/*
+ * DRAM Event Record - DER
+ * CXL rev 3.0 section 8.2.9.2.1.2; Table 3-44
+ */
+#define CXL_EVENT_DER_CORRECTION_MASK_SIZE	0x20
+struct cxl_event_dram {
+	struct cxl_event_record_hdr hdr;
+	__le64 phys_addr;
+	u8 descriptor;
+	u8 type;
+	u8 transaction_type;
+	u8 validity_flags[2];
+	u8 channel;
+	u8 rank;
+	u8 nibble_mask[3];
+	u8 bank_group;
+	u8 bank;
+	u8 row[3];
+	u8 column[2];
+	u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE];
+	u8 reserved[0x17];
+} __packed;
+
+/*
+ * Get Health Info Record
+ * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100
+ */
+struct cxl_get_health_info {
+	u8 health_status;
+	u8 media_status;
+	u8 add_status;
+	u8 life_used;
+	u8 device_temp[2];
+	u8 dirty_shutdown_cnt[4];
+	u8 cor_vol_err_cnt[4];
+	u8 cor_per_err_cnt[4];
+} __packed;
+
+/*
+ * Memory Module Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
+ */
+struct cxl_event_mem_module {
+	struct cxl_event_record_hdr hdr;
+	u8 event_type;
+	struct cxl_get_health_info info;
+	u8 reserved[0x3d];
+} __packed;
+
+#endif /* _LINUX_CXL_EVENT_H */
-- 
cgit v1.2.3


From 477bd4beb93bf9ace9bda71f1437b191befa9cf4 Mon Sep 17 00:00:00 2001
From: Swee Leong Ching <leong.ching.swee@intel.com>
Date: Fri, 5 Jan 2024 15:09:23 +0800
Subject: net: stmmac: Make MSI interrupt routine generic

There is no support for per DMA channel interrupt for non-MSI platform,
where the MAC's per channel interrupt hooks up to interrupt controller(GIC)
through shared peripheral interrupt(SPI) to handle interrupt from TX/RX
transmit channel.

This patch generalize the existing MSI ISR to also support non-MSI
platform.

Signed-off-by: Teoh Ji Sheng <ji.sheng.teoh@intel.com>
Signed-off-by: Swee Leong Ching <leong.ching.swee@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/stmmac.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index dee5ad6e48c5..b950e6f9761d 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -98,7 +98,7 @@ struct stmmac_dma_cfg {
 	int mixed_burst;
 	bool aal;
 	bool eame;
-	bool multi_msi_en;
+	bool multi_irq_en;
 	bool dche;
 };
 
@@ -215,7 +215,7 @@ struct dwmac4_addrs {
 #define STMMAC_FLAG_TSO_EN			BIT(4)
 #define STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP	BIT(5)
 #define STMMAC_FLAG_VLAN_FAIL_Q_EN		BIT(6)
-#define STMMAC_FLAG_MULTI_MSI_EN		BIT(7)
+#define STMMAC_FLAG_MULTI_IRQ_EN		BIT(7)
 #define STMMAC_FLAG_EXT_SNAPSHOT_EN		BIT(8)
 #define STMMAC_FLAG_INT_SNAPSHOT_EN		BIT(9)
 #define STMMAC_FLAG_RX_CLK_RUNS_IN_LPI		BIT(10)
-- 
cgit v1.2.3


From deb704281f076097b0347116a82edeba96697db1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 17 Nov 2023 17:14:27 -0500
Subject: SUNRPC: Add a server-side API for retrieving an RPC's pseudoflavor

NFSD will use this new API to determine whether nfsd_splice_read is
safe to use. This avoids the need to add a dependency to NFSD for
CONFIG_SUNRPC_GSS.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svcauth.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index 6f90203edbf8..61c455f1e1f5 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -131,8 +131,11 @@ enum svc_auth_status {
  *   This call releases a domain.
  *
  * set_client()
- *   Givens a pending request (struct svc_rqst), finds and assigns
+ *   Given a pending request (struct svc_rqst), finds and assigns
  *   an appropriate 'auth_domain' as the client.
+ *
+ * pseudoflavor()
+ *   Returns RPC_AUTH pseudoflavor in use by @rqstp.
  */
 struct auth_ops {
 	char *	name;
@@ -143,11 +146,13 @@ struct auth_ops {
 	int			(*release)(struct svc_rqst *rqstp);
 	void			(*domain_release)(struct auth_domain *dom);
 	enum svc_auth_status	(*set_client)(struct svc_rqst *rqstp);
+	rpc_authflavor_t	(*pseudoflavor)(struct svc_rqst *rqstp);
 };
 
 struct svc_xprt;
 
 extern enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp);
+extern rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp);
 extern int	svc_authorise(struct svc_rqst *rqstp);
 extern enum svc_auth_status svc_set_client(struct svc_rqst *rqstp);
 extern int	svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops);
-- 
cgit v1.2.3


From 3587b5c75376fd0b6ca8c4a8de54954e410f4e0e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 17 Nov 2023 17:14:46 -0500
Subject: SUNRPC: Remove RQ_SPLICE_OK

This flag is no longer used.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index b10f987509cc..544fcfe07479 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -260,8 +260,6 @@ enum {
 	RQ_LOCAL,		/* local request */
 	RQ_USEDEFERRAL,		/* use deferral */
 	RQ_DROPME,		/* drop current reply */
-	RQ_SPLICE_OK,		/* turned off in gss privacy to prevent
-				 * encrypting page cache pages */
 	RQ_VICTIM,		/* Have agreed to shut down */
 	RQ_DATA,		/* request has data */
 };
-- 
cgit v1.2.3


From 52e89100754b2e888cb63bf2d19e65d809497cd6 Mon Sep 17 00:00:00 2001
From: ChenXiaoSong <chenxiaosong@kylinos.cn>
Date: Sat, 2 Dec 2023 21:07:25 +0000
Subject: NFSv4, NFSD: move enum nfs_cb_opnum4 to include/linux/nfs4.h

Callback operations enum is defined in client and server, move it to
common header file.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Anna Schumaker <Anna.Schumaker@netapp.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/nfs4.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index c11c4db34639..ef8d2d618d5b 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -869,4 +869,26 @@ enum {
 	RCA4_TYPE_MASK_OTHER_LAYOUT_MAX	= 15,
 };
 
+enum nfs_cb_opnum4 {
+	OP_CB_GETATTR = 3,
+	OP_CB_RECALL  = 4,
+
+	/* Callback operations new to NFSv4.1 */
+	OP_CB_LAYOUTRECALL  = 5,
+	OP_CB_NOTIFY        = 6,
+	OP_CB_PUSH_DELEG    = 7,
+	OP_CB_RECALL_ANY    = 8,
+	OP_CB_RECALLABLE_OBJ_AVAIL = 9,
+	OP_CB_RECALL_SLOT   = 10,
+	OP_CB_SEQUENCE      = 11,
+	OP_CB_WANTS_CANCELLED = 12,
+	OP_CB_NOTIFY_LOCK   = 13,
+	OP_CB_NOTIFY_DEVICEID = 14,
+
+	/* Callback operations new to NFSv4.2 */
+	OP_CB_OFFLOAD = 15,
+
+	OP_CB_ILLEGAL = 10044,
+};
+
 #endif
-- 
cgit v1.2.3


From b541dd554bc0442f7ff8c6cab6c5460c044913c8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 21 Nov 2023 11:40:13 -0500
Subject: svcrdma: Eliminate allocation of recv_ctxt objects in backchannel

The svc_rdma_recv_ctxt free list uses a lockless list to avoid the
need for a spin lock in the fast path. llist_del_first(), which is
used by svc_rdma_recv_ctxt_get(), requires serialization, however,
when there are multiple list producers that are unserialized.

I mistakenly thought there was only one caller of
svc_rdma_recv_ctxt_get() (svc_rdma_refresh_recvs()), thus explicit
serialization would not be necessary. But there is another caller:
svc_rdma_bc_sendto(), and these two are not serialized against each
other. I haven't seen ill effects that I could directly ascribe to
a lack of serialization. It's just an observation based on code
audit.

When DMA-mapping before sending a Reply, the passed-in struct
svc_rdma_recv_ctxt is used only for its write and reply PCLs. These
are currently always empty in the backchannel case. So, instead of
passing a full svc_rdma_recv_ctxt object to
svc_rdma_map_reply_msg(), let's pass in just the Write and Reply
PCLs.

This change makes it unnecessary for the backchannel to acquire a
dummy svc_rdma_recv_ctxt object when sending an RPC Call. The need
for svc_rdma_recv_ctxt free list serialization is now completely
avoided.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index a5ee0af2a310..4ac32895a058 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -200,7 +200,8 @@ extern int svc_rdma_send(struct svcxprt_rdma *rdma,
 			 struct svc_rdma_send_ctxt *ctxt);
 extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 				  struct svc_rdma_send_ctxt *sctxt,
-				  const struct svc_rdma_recv_ctxt *rctxt,
+				  const struct svc_rdma_pcl *write_pcl,
+				  const struct svc_rdma_pcl *reply_pcl,
 				  const struct xdr_buf *xdr);
 extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
 				    struct svc_rdma_send_ctxt *sctxt,
-- 
cgit v1.2.3


From 9c7e1a06588ee6962afe0dfe5a398e1d23212005 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 21 Nov 2023 11:40:26 -0500
Subject: svcrdma: Add a utility workqueue to svcrdma

To handle work in the background, set up an UNBOUND workqueue for
svcrdma. Subsequent patches will make use of it.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 4ac32895a058..e18c94e816b3 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -65,6 +65,7 @@ extern unsigned int svcrdma_ord;
 extern unsigned int svcrdma_max_requests;
 extern unsigned int svcrdma_max_bc_requests;
 extern unsigned int svcrdma_max_req_size;
+extern struct workqueue_struct *svcrdma_wq;
 
 extern struct percpu_counter svcrdma_stat_read;
 extern struct percpu_counter svcrdma_stat_recv;
-- 
cgit v1.2.3


From ae225fe27b931de89b6b1e1bbe6de4de23000850 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 21 Nov 2023 11:40:33 -0500
Subject: svcrdma: Add an async version of svc_rdma_send_ctxt_put()

DMA unmapping can take quite some time, so it should not be handled
in a single-threaded completion handler. Defer releasing send_ctxts
to the recently-added workqueue.

With this patch, DMA unmapping can be handled in parallel, and it
does not cause head-of-queue blocking of Send completions.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index e18c94e816b3..ab250017b99f 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -152,7 +152,9 @@ struct svc_rdma_recv_ctxt {
 struct svc_rdma_send_ctxt {
 	struct llist_node	sc_node;
 	struct rpc_rdma_cid	sc_cid;
+	struct work_struct	sc_work;
 
+	struct svcxprt_rdma	*sc_rdma;
 	struct ib_send_wr	sc_send_wr;
 	struct ib_cqe		sc_cqe;
 	struct xdr_buf		sc_hdrbuf;
-- 
cgit v1.2.3


From 6a04a4349330c5476adf465159a7f49411091bbe Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2023 09:57:09 -0500
Subject: svcrdma: Move struct svc_rdma_chunk_ctxt to svc_rdma.h

Prepare for nestling these into the send and recv ctxts so they
no longer have to be allocated dynamically.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index ab250017b99f..50c4f18a9b7f 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -127,6 +127,21 @@ enum {
 
 #define RPCSVC_MAXPAYLOAD_RDMA	RPCSVC_MAXPAYLOAD
 
+/*
+ * A chunk context tracks all I/O for moving one Read or Write
+ * chunk. This is a set of rdma_rw's that handle data movement
+ * for all segments of one chunk.
+ */
+struct svc_rdma_chunk_ctxt {
+	struct rpc_rdma_cid	cc_cid;
+	struct ib_cqe		cc_cqe;
+	struct list_head	cc_rwctxts;
+	ktime_t			cc_posttime;
+	int			cc_sqecount;
+	enum ib_wc_status	cc_status;
+	struct completion	cc_done;
+};
+
 struct svc_rdma_recv_ctxt {
 	struct llist_node	rc_node;
 	struct list_head	rc_list;
-- 
cgit v1.2.3


From b1818412d06fc03605d02dbdd4a7c53dc9e2d5ba Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2023 09:57:16 -0500
Subject: svcrdma: Start moving fields out of struct svc_rdma_read_info

Since the request's svc_rdma_recv_ctxt will stay around for the
duration of the RDMA Read operation, the contents of struct
svc_rdma_read_info can reside in the request's svc_rdma_recv_ctxt
rather than being allocated separately. This will eventually save a
call to kmalloc() in a hot path.

Start this clean-up by moving the Read chunk's svc_rdma_chunk_ctxt.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 50c4f18a9b7f..6c7501ae4e29 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -156,6 +156,10 @@ struct svc_rdma_recv_ctxt {
 	u32			rc_inv_rkey;
 	__be32			rc_msgtype;
 
+	/* State for pulling a Read chunk */
+	unsigned int		rc_readbytes;
+	struct svc_rdma_chunk_ctxt	rc_cc;
+
 	struct svc_rdma_pcl	rc_call_pcl;
 
 	struct svc_rdma_pcl	rc_read_pcl;
-- 
cgit v1.2.3


From 8e122582680c6f8acd686a5a2af9c0e46fe90f2d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2023 09:57:22 -0500
Subject: svcrdma: Move svc_rdma_read_info::ri_pageno to struct
 svc_rdma_recv_ctxt

Further clean up: move the page index field into svc_rdma_recv_ctxt.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 6c7501ae4e29..0ea66f73bec2 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -157,6 +157,7 @@ struct svc_rdma_recv_ctxt {
 	__be32			rc_msgtype;
 
 	/* State for pulling a Read chunk */
+	unsigned int		rc_curpage;
 	unsigned int		rc_readbytes;
 	struct svc_rdma_chunk_ctxt	rc_cc;
 
-- 
cgit v1.2.3


From 919f6e790ab6cca772fa60c6006162c0a7ebbfc5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2023 09:57:28 -0500
Subject: svcrdma: Move read_info::ri_pageoff into struct svc_rdma_recv_ctxt

Further clean up: move the starting byte offset field into
svc_rdma_recv_ctxt.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 0ea66f73bec2..44a14eaf8c40 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -157,6 +157,7 @@ struct svc_rdma_recv_ctxt {
 	__be32			rc_msgtype;
 
 	/* State for pulling a Read chunk */
+	unsigned int		rc_pageoff;
 	unsigned int		rc_curpage;
 	unsigned int		rc_readbytes;
 	struct svc_rdma_chunk_ctxt	rc_cc;
-- 
cgit v1.2.3


From fc20f19b4df4a46d1003d15d84148a117e8bdf5d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2023 09:57:35 -0500
Subject: svcrdma: Update synopsis of svc_rdma_build_read_segment()

Since the RDMA Read I/O state is now contained in the recv_ctxt,
svc_rdma_build_read_segment() can use the recv_ctxt to derive that
information rather than the other way around. This removes one usage
of the ri_readctxt field, enabling its removal in a subsequent
patch.

At the same time, the use of ri_rqst can similarly be replaced with
a passed-in function parameter.

Start with build_read_segment() because it is a common utility
function at the bottom of the Read chunk path.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 44a14eaf8c40..f03f9909fb97 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -116,6 +116,13 @@ struct svcxprt_rdma {
 /* sc_flags */
 #define RDMAXPRT_CONN_PENDING	3
 
+static inline struct svcxprt_rdma *svc_rdma_rqst_rdma(struct svc_rqst *rqstp)
+{
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+
+	return container_of(xprt, struct svcxprt_rdma, sc_xprt);
+}
+
 /*
  * Default connection parameters
  */
-- 
cgit v1.2.3


From 018f34051bc9f4908336b3fe9e52931bb8410ced Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2023 09:58:33 -0500
Subject: svcrdma: Move the svc_rdma_cc_init() call

Now that the chunk_ctxt for Reads is no longer dynamically allocated
it can be initialized once for the life of the object that contains
it (struct svc_rdma_recv_ctxt).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index f03f9909fb97..051fefde8d51 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -211,6 +211,8 @@ extern int svc_rdma_recvfrom(struct svc_rqst *);
 
 /* svc_rdma_rw.c */
 extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma);
+extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
+			     struct svc_rdma_chunk_ctxt *cc);
 extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
 				     const struct svc_rdma_chunk *chunk,
 				     const struct xdr_buf *xdr);
-- 
cgit v1.2.3


From 28ee0ec8948ac235327a1f5472fc032b308284a3 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Dec 2023 10:24:08 -0500
Subject: svcrdma: De-duplicate completion ID initialization helpers

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 051fefde8d51..46f2ce9f810b 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -134,6 +134,30 @@ enum {
 
 #define RPCSVC_MAXPAYLOAD_RDMA	RPCSVC_MAXPAYLOAD
 
+/**
+ * svc_rdma_send_cid_init - Initialize a Receive Queue completion ID
+ * @rdma: controlling transport
+ * @cid: completion ID to initialize
+ */
+static inline void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
+					  struct rpc_rdma_cid *cid)
+{
+	cid->ci_queue_id = rdma->sc_rq_cq->res.id;
+	cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
+/**
+ * svc_rdma_send_cid_init - Initialize a Send Queue completion ID
+ * @rdma: controlling transport
+ * @cid: completion ID to initialize
+ */
+static inline void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma,
+					  struct rpc_rdma_cid *cid)
+{
+	cid->ci_queue_id = rdma->sc_sq_cq->res.id;
+	cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
 /*
  * A chunk context tracks all I/O for moving one Read or Write
  * chunk. This is a set of rdma_rw's that handle data movement
-- 
cgit v1.2.3


From 4d9d69db898d05bd063548eee65d16a020676fec Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Dec 2023 17:31:48 -0500
Subject: svcrdma: Add back svc_rdma_recv_ctxt::rc_pages

Having an nfsd thread waiting for an RDMA Read completion is
problematic if the Read responder (the client) stops responding. We
need to go back to handling RDMA Reads by allowing the nfsd thread
to return to the svc scheduler, then waking a second thread finish
the RPC message once the Read completion fires.

To start with, restore the rc_pages field so that RDMA Read pages
can be managed across calls to svc_rdma_recvfrom().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 46f2ce9f810b..0f2d7f68ef5d 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -183,7 +183,6 @@ struct svc_rdma_recv_ctxt {
 	void			*rc_recv_buf;
 	struct xdr_stream	rc_stream;
 	u32			rc_byte_len;
-	unsigned int		rc_page_count;
 	u32			rc_inv_rkey;
 	__be32			rc_msgtype;
 
@@ -199,6 +198,9 @@ struct svc_rdma_recv_ctxt {
 	struct svc_rdma_chunk	*rc_cur_result_payload;
 	struct svc_rdma_pcl	rc_write_pcl;
 	struct svc_rdma_pcl	rc_reply_pcl;
+
+	unsigned int		rc_page_count;
+	struct page		*rc_pages[RPCSVC_MAXPAGES];
 };
 
 struct svc_rdma_send_ctxt {
-- 
cgit v1.2.3


From a937693a82fd2211c5e52b638959d1486a77d16a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Dec 2023 17:31:54 -0500
Subject: svcrdma: Add back svcxprt_rdma::sc_read_complete_q

Having an nfsd thread waiting for an RDMA Read completion is
problematic if the Read responder (ie, the client) stops responding.
We need to go back to handling RDMA Reads by allowing the nfsd
thread to return to the svc scheduler, then waking a second thread
finish the RPC message once the Read completion fires.

As a next step, add a list_head upon which completed Reads are queued.
A subsequent patch will make use of this queue.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 0f2d7f68ef5d..c98d29e51b9c 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -98,6 +98,7 @@ struct svcxprt_rdma {
 	u32		     sc_pending_recvs;
 	u32		     sc_recv_batch;
 	struct list_head     sc_rq_dto_q;
+	struct list_head     sc_read_complete_q;
 	spinlock_t	     sc_rq_dto_lock;
 	struct ib_qp         *sc_qp;
 	struct ib_cq         *sc_rq_cq;
-- 
cgit v1.2.3


From d3dba534100d4e9eb7a5204be97cd6f9ada2066e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Dec 2023 17:32:07 -0500
Subject: svcrdma: Implement multi-stage Read completion again

Having an nfsd thread waiting for an RDMA Read completion is
problematic if the Read responder (ie, the client) stops responding.
We need to go back to handling RDMA Reads by getting the svc scheduler
to call svc_rdma_recvfrom() a second time to finish building an RPC
message after a Read completion.

This is the final patch, and makes several changes that have to
happen concurrently:

1. svc_rdma_process_read_list no longer waits for a completion, but
   simply builds and posts the Read WRs.

2. svc_rdma_read_done() now queues a completed Read on
   sc_read_complete_q for later processing rather than calling
   complete().

3. The completed RPC message is no longer built in the
   svc_rdma_process_read_list() path. Finishing the message is now
   done in svc_rdma_recvfrom() when it notices work on the
   sc_read_complete_q. The "finish building this RPC message" code
   is removed from the svc_rdma_process_read_list() path.

This arrangement avoids the need for an nfsd thread to wait for an
RDMA Read non-interruptibly without a timeout. It's basically the
same code structure that Tom Tucker used for Read chunks along with
some clean-up and modernization.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index c98d29e51b9c..e7595ae62fe2 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -170,8 +170,6 @@ struct svc_rdma_chunk_ctxt {
 	struct list_head	cc_rwctxts;
 	ktime_t			cc_posttime;
 	int			cc_sqecount;
-	enum ib_wc_status	cc_status;
-	struct completion	cc_done;
 };
 
 struct svc_rdma_recv_ctxt {
@@ -191,6 +189,7 @@ struct svc_rdma_recv_ctxt {
 	unsigned int		rc_pageoff;
 	unsigned int		rc_curpage;
 	unsigned int		rc_readbytes;
+	struct xdr_buf		rc_saved_arg;
 	struct svc_rdma_chunk_ctxt	rc_cc;
 
 	struct svc_rdma_pcl	rc_call_pcl;
@@ -240,6 +239,9 @@ extern int svc_rdma_recvfrom(struct svc_rqst *);
 extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma);
 extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
 			     struct svc_rdma_chunk_ctxt *cc);
+extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
+				struct svc_rdma_chunk_ctxt *cc,
+				enum dma_data_direction dir);
 extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
 				     const struct svc_rdma_chunk *chunk,
 				     const struct xdr_buf *xdr);
-- 
cgit v1.2.3


From 7b207ccd983350a5dedd132b57c666186dd02a7c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 15 Dec 2023 11:56:32 +1100
Subject: svc: don't hold reference for poolstats, only mutex.

A future patch will remove refcounting on svc_serv as it is of little
use.
It is currently used to keep the svc around while the pool_stats file is
open.
Change this to get the pointer, protected by the mutex, only in
seq_start, and the release the mutex in seq_stop.
This means that if the nfsd server is stopped and restarted while the
pool_stats file it open, then some pool stats info could be from the
first instance and some from the second.  This might appear odd, but is
unlikely to be a problem in practice.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 544fcfe07479..3bea2840272d 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -97,6 +97,12 @@ struct svc_serv {
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 };
 
+/* This is used by pool_stats to find and lock an svc */
+struct svc_info {
+	struct svc_serv		*serv;
+	struct mutex		*mutex;
+};
+
 /**
  * svc_get() - increment reference count on a SUNRPC serv
  * @serv:  the svc_serv to have count incremented
@@ -431,7 +437,7 @@ void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 				     int (*threadfn)(void *data));
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
-int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
+int		   svc_pool_stats_open(struct svc_info *si, struct file *file);
 void		   svc_process(struct svc_rqst *rqstp);
 void		   svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp);
 int		   svc_register(const struct svc_serv *, struct net *, const int,
-- 
cgit v1.2.3


From 1e3577a4521ef33199eea05ce7b9099825848c49 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 15 Dec 2023 11:56:34 +1100
Subject: SUNRPC: discard sv_refcnt, and svc_get/svc_put

sv_refcnt is no longer useful.
lockd and nfs-cb only ever have the svc active when there are a non-zero
number of threads, so sv_refcnt mirrors sv_nrthreads.

nfsd also keeps the svc active between when a socket is added and when
the first thread is started, but we don't really need a refcount for
that.  We can simply not destroy the svc while there are any permanent
sockets attached.

So remove sv_refcnt and the get/put functions.
Instead of a final call to svc_put(), call svc_destroy() instead.
This is changed to also store NULL in the passed-in pointer to make it
easier to avoid use-after-free situations.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 27 +--------------------------
 1 file changed, 1 insertion(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 3bea2840272d..8d7888234e9e 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -69,7 +69,6 @@ struct svc_serv {
 	struct svc_program *	sv_program;	/* RPC program */
 	struct svc_stat *	sv_stats;	/* RPC statistics */
 	spinlock_t		sv_lock;
-	struct kref		sv_refcnt;
 	unsigned int		sv_nrthreads;	/* # of server threads */
 	unsigned int		sv_maxconn;	/* max connections allowed or
 						 * '0' causing max to be based
@@ -103,31 +102,7 @@ struct svc_info {
 	struct mutex		*mutex;
 };
 
-/**
- * svc_get() - increment reference count on a SUNRPC serv
- * @serv:  the svc_serv to have count incremented
- *
- * Returns: the svc_serv that was passed in.
- */
-static inline struct svc_serv *svc_get(struct svc_serv *serv)
-{
-	kref_get(&serv->sv_refcnt);
-	return serv;
-}
-
-void svc_destroy(struct kref *);
-
-/**
- * svc_put - decrement reference count on a SUNRPC serv
- * @serv:  the svc_serv to have count decremented
- *
- * When the reference count reaches zero, svc_destroy()
- * is called to clean up and free the serv.
- */
-static inline void svc_put(struct svc_serv *serv)
-{
-	kref_put(&serv->sv_refcnt, svc_destroy);
-}
+void svc_destroy(struct svc_serv **svcp);
 
 /*
  * Maximum payload size supported by a kernel RPC server.
-- 
cgit v1.2.3


From 317bacf960a4879af22d12175f47d284930b3273 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Fri, 1 Dec 2023 17:25:27 -0500
Subject: i3c: master: add enable(disable) hot join in sys entry

Add hotjoin entry in sys file system allow user enable/disable hotjoin
feature.

Add (*enable(disable)_hotjoin)() to i3c_master_controller_ops.
Add api i3c_master_enable(disable)_hotjoin();

Signed-off-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20231201222532.2431484-2-Frank.Li@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/master.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 24c1863b86e2..3b5bd8e3257c 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -460,6 +460,8 @@ struct i3c_master_controller_ops {
 	int (*disable_ibi)(struct i3c_dev_desc *dev);
 	void (*recycle_ibi_slot)(struct i3c_dev_desc *dev,
 				 struct i3c_ibi_slot *slot);
+	int (*enable_hotjoin)(struct i3c_master_controller *master);
+	int (*disable_hotjoin)(struct i3c_master_controller *master);
 };
 
 /**
@@ -495,6 +497,7 @@ struct i3c_master_controller {
 	const struct i3c_master_controller_ops *ops;
 	unsigned int secondary : 1;
 	unsigned int init_done : 1;
+	unsigned int hotjoin: 1;
 	struct {
 		struct list_head i3c;
 		struct list_head i2c;
@@ -551,6 +554,8 @@ int i3c_master_register(struct i3c_master_controller *master,
 			const struct i3c_master_controller_ops *ops,
 			bool secondary);
 void i3c_master_unregister(struct i3c_master_controller *master);
+int i3c_master_enable_hotjoin(struct i3c_master_controller *master);
+int i3c_master_disable_hotjoin(struct i3c_master_controller *master);
 
 /**
  * i3c_dev_get_master_data() - get master private data attached to an I3C
-- 
cgit v1.2.3


From e5e3df06ac98d15cfb10bb5c12356709365e91b2 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Fri, 1 Dec 2023 17:25:29 -0500
Subject: i3c: add actual_len in i3c_priv_xfer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In MIPI I3C Specification:

"Ninth Bit of SDR Target Returned (Read) Data as End-of-Data: In I2C, the
ninth Data bit from Target to Controller is an ACK by the Controller. By
contrast, in I3C this bit allows the Target to end a Read, and allows the
Controller to Abort a Read. In SDR terms, the ninth bit of Read data is
referred to as the T-Bit (for ‘Transition’)"

I3C allow devices early terminate data transfer. So need "actual_len" field
to indicate how much get by i3c_priv_xfer.

Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20231201222532.2431484-4-Frank.Li@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/device.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h
index 84ed77c04940..e119f11948ef 100644
--- a/include/linux/i3c/device.h
+++ b/include/linux/i3c/device.h
@@ -54,6 +54,7 @@ enum i3c_hdr_mode {
  * struct i3c_priv_xfer - I3C SDR private transfer
  * @rnw: encodes the transfer direction. true for a read, false for a write
  * @len: transfer length in bytes of the transfer
+ * @actual_len: actual length in bytes are transferred by the controller
  * @data: input/output buffer
  * @data.in: input buffer. Must point to a DMA-able buffer
  * @data.out: output buffer. Must point to a DMA-able buffer
@@ -62,6 +63,7 @@ enum i3c_hdr_mode {
 struct i3c_priv_xfer {
 	u8 rnw;
 	u16 len;
+	u16 actual_len;
 	union {
 		void *in;
 		const void *out;
-- 
cgit v1.2.3


From 18e5794879905a788e06fb2bc40b6f5b58eae5c2 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:05:42 -0800
Subject: i3c: master: fix Excess kernel-doc description warning

Remove the @boardinfo: line to prevent the kernel-doc warning:

include/linux/i3c/master.h:98: warning: Excess struct member 'boardinfo' description in 'i2c_dev_desc'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc:  <linux-i3c@lists.infradead.org>
Link: https://lore.kernel.org/r/20231223050542.13930-1-rdunlap@infradead.org
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/master.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 3b5bd8e3257c..1ecd73b17ff5 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -76,7 +76,6 @@ struct i2c_dev_boardinfo {
 /**
  * struct i2c_dev_desc - I2C device descriptor
  * @common: common part of the I2C device descriptor
- * @boardinfo: pointer to the boardinfo attached to this I2C device
  * @dev: I2C device object registered to the I2C framework
  * @addr: I2C device address
  * @lvr: LVR (Legacy Virtual Register) needed by the I3C core to know about
-- 
cgit v1.2.3


From 10416a3578ba5f76d0b161d2d36a1d8a4c46a69d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 15:59:32 +0100
Subject: firewire: make fw_bus_type const

Now that the driver core can properly handle constant struct bus_type,
move the fw_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Cc: linux1394-devel@lists.sourceforge.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/2023121931-skydiver-dodgy-d1bd@gregkh
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 include/linux/firewire.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index bd3fc75d4f14..dd9f2d765e68 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -75,7 +75,7 @@ void fw_csr_iterator_init(struct fw_csr_iterator *ci, const u32 *p);
 int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value);
 int fw_csr_string(const u32 *directory, int key, char *buf, size_t size);
 
-extern struct bus_type fw_bus_type;
+extern const struct bus_type fw_bus_type;
 
 struct fw_card_driver;
 struct fw_node;
-- 
cgit v1.2.3


From e9ee910218ffd420454b52a052d6f1087354905b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sun, 7 Jan 2024 17:11:38 -0800
Subject: Revert "net: stmmac: Enable Per DMA Channel interrupt"

Revert "net: stmmac: Use interrupt mode INTM=1 for per channel irq"
This reverts commit 36af9f25ddfd311da82628f194c794786467cb12.
Revert "net: stmmac: Add support for TX/RX channel interrupt"
This reverts commit 9072e03d32088137a435ddf3aa95fd6e038d69d8.
Revert "net: stmmac: Make MSI interrupt routine generic"
This reverts commit 477bd4beb93bf9ace9bda71f1437b191befa9cf4.
Revert "dt-bindings: net: snps,dwmac: per channel irq"
This reverts commit 67d47c8ada0f8795bfcdb85cc8f2ad3ce556674b.

Device tree bindings need to be reviewed.

Link: https://lore.kernel.org/all/2df9fe3e-7971-4aa2-89a9-0e085b3b00d7@linaro.org/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/stmmac.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index b950e6f9761d..dee5ad6e48c5 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -98,7 +98,7 @@ struct stmmac_dma_cfg {
 	int mixed_burst;
 	bool aal;
 	bool eame;
-	bool multi_irq_en;
+	bool multi_msi_en;
 	bool dche;
 };
 
@@ -215,7 +215,7 @@ struct dwmac4_addrs {
 #define STMMAC_FLAG_TSO_EN			BIT(4)
 #define STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP	BIT(5)
 #define STMMAC_FLAG_VLAN_FAIL_Q_EN		BIT(6)
-#define STMMAC_FLAG_MULTI_IRQ_EN		BIT(7)
+#define STMMAC_FLAG_MULTI_MSI_EN		BIT(7)
 #define STMMAC_FLAG_EXT_SNAPSHOT_EN		BIT(8)
 #define STMMAC_FLAG_INT_SNAPSHOT_EN		BIT(9)
 #define STMMAC_FLAG_RX_CLK_RUNS_IN_LPI		BIT(10)
-- 
cgit v1.2.3


From 3fbf61207c66ff7ac9b60ab76d4bfd239f97e973 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sun, 7 Jan 2024 17:14:51 -0800
Subject: Revert "mlx5 updates 2023-12-20"

Revert "net/mlx5: Implement management PF Ethernet profile"
This reverts commit 22c4640698a1d47606b5a4264a584e8046641784.
Revert "net/mlx5: Enable SD feature"
This reverts commit c88c49ac9c18fb7c3fa431126de1d8f8f555e912.
Revert "net/mlx5e: Block TLS device offload on combined SD netdev"
This reverts commit 83a59ce0057b7753d7fbece194b89622c663b2a6.
Revert "net/mlx5e: Support per-mdev queue counter"
This reverts commit d72baceb92539a178d2610b0e9ceb75706a75b55.
Revert "net/mlx5e: Support cross-vhca RSS"
This reverts commit c73a3ab8fa6e93a783bd563938d7cf00d62d5d34.
Revert "net/mlx5e: Let channels be SD-aware"
This reverts commit e4f9686bdee7b4dd89e0ed63cd03606e4bda4ced.
Revert "net/mlx5e: Create EN core HW resources for all secondary devices"
This reverts commit c4fb94aa822d6c9d05fc3c5aee35c7e339061dc1.
Revert "net/mlx5e: Create single netdev per SD group"
This reverts commit e2578b4f983cfcd47837bbe3bcdbf5920e50b2ad.
Revert "net/mlx5: SD, Add informative prints in kernel log"
This reverts commit c82d360325112ccc512fc11a3b68cdcdf04a1478.
Revert "net/mlx5: SD, Implement steering for primary and secondaries"
This reverts commit 605fcce33b2d1beb0139b6e5913fa0b2062116b2.
Revert "net/mlx5: SD, Implement devcom communication and primary election"
This reverts commit a45af9a96740873db9a4b5bb493ce2ad81ccb4d5.
Revert "net/mlx5: SD, Implement basic query and instantiation"
This reverts commit 63b9ce944c0e26c44c42cdd5095c2e9851c1a8ff.
Revert "net/mlx5: SD, Introduce SD lib"
This reverts commit 4a04a31f49320d078b8078e1da4b0e2faca5dfa3.
Revert "net/mlx5: Fix query of sd_group field"
This reverts commit e04984a37398b3f4f5a79c993b94c6b1224184cc.
Revert "net/mlx5e: Use the correct lag ports number when creating TISes"
This reverts commit a7e7b40c4bc115dbf2a2bb453d7bbb2e0ea99703.

There are some unanswered questions on the list, and we don't
have any docs. Given the lack of replies so far and the fact
that v6.8 merge window has started - let's revert this and
revisit for v6.9.

Link: https://lore.kernel.org/all/20231221005721.186607-1-saeed@kernel.org/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mlx5/driver.h   | 10 ----------
 include/linux/mlx5/mlx5_ifc.h | 24 ++++++------------------
 include/linux/mlx5/vport.h    |  1 -
 3 files changed, 6 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2bba88c67f58..7ee5b79ff3d6 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -681,7 +681,6 @@ struct mlx5e_resources {
 		struct mlx5_sq_bfreg       bfreg;
 #define MLX5_MAX_NUM_TC 8
 		u32                        tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC];
-		bool			   tisn_valid;
 	} hw_objs;
 	struct net_device *uplink_netdev;
 	struct mutex uplink_netdev_lock;
@@ -822,7 +821,6 @@ struct mlx5_core_dev {
 	struct blocking_notifier_head macsec_nh;
 #endif
 	u64 num_ipsec_offloads;
-	struct mlx5_sd          *sd;
 };
 
 struct mlx5_db {
@@ -1224,14 +1222,6 @@ static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev)
 	return dev->caps.embedded_cpu;
 }
 
-static inline bool mlx5_core_is_mgmt_pf(const struct mlx5_core_dev *dev)
-{
-	if (!MLX5_CAP_GEN_2(dev, local_mng_port_valid))
-		return false;
-
-	return MLX5_CAP_GEN_2(dev, local_mng_port);
-}
-
 static inline bool
 mlx5_core_is_ecpf_esw_manager(const struct mlx5_core_dev *dev)
 {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 586569209254..fee20fc010c2 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1954,10 +1954,8 @@ enum {
 struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_0[0x80];
 
-	u8	   migratable[0x1];
-	u8	   reserved_at_81[0x19];
-	u8	   local_mng_port[0x1];
-	u8	   reserved_at_9b[0x5];
+	u8         migratable[0x1];
+	u8         reserved_at_81[0x1f];
 
 	u8	   max_reformat_insert_size[0x8];
 	u8	   max_reformat_insert_offset[0x8];
@@ -1975,13 +1973,7 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 
 	u8	   allowed_object_for_other_vhca_access[0x40];
 
-	u8	   reserved_at_140[0x20];
-
-	u8	   reserved_at_160[0xa];
-	u8	   local_mng_port_valid[0x1];
-	u8	   reserved_at_16b[0x15];
-
-	u8	   reserved_at_180[0x20];
+	u8	   reserved_at_140[0x60];
 
 	u8	   flow_table_type_2_type[0x8];
 	u8	   reserved_at_1a8[0x3];
@@ -4038,13 +4030,8 @@ struct mlx5_ifc_nic_vport_context_bits {
 	u8	   affiliation_criteria[0x4];
 	u8	   affiliated_vhca_id[0x10];
 
-	u8	   reserved_at_60[0xa0];
+	u8	   reserved_at_60[0xd0];
 
-	u8	   reserved_at_100[0x1];
-	u8         sd_group[0x3];
-	u8	   reserved_at_104[0x1c];
-
-	u8	   reserved_at_120[0x10];
 	u8         mtu[0x10];
 
 	u8         system_image_guid[0x40];
@@ -10129,7 +10116,8 @@ struct mlx5_ifc_mpir_reg_bits {
 	u8         reserved_at_20[0x20];
 
 	u8         local_port[0x8];
-	u8         reserved_at_28[0x18];
+	u8         reserved_at_28[0x15];
+	u8         sd_group[0x3];
 
 	u8         reserved_at_60[0x20];
 };
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index c36cc6d82926..fbb9bf447889 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -72,7 +72,6 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu);
 int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu);
 int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
 					   u64 *system_image_guid);
-int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group);
 int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid);
 int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev,
 				    u16 vport, u64 node_guid);
-- 
cgit v1.2.3


From 4e33b071bb8e8415fb9847249ffcf300fa7d8cac Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 28 Dec 2023 07:51:41 +0000
Subject: block: remove disk_clear_zoned

disk_clear_zoned is unused now that the last warts of the host-aware
model support in sd are gone.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Link: https://lore.kernel.org/r/20231228075141.362560-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 94701a63ad8a..e1e705aef51e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -318,7 +318,6 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 			       void *data);
 
 void disk_set_zoned(struct gendisk *disk);
-void disk_clear_zoned(struct gendisk *disk);
 
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
-- 
cgit v1.2.3


From 3b7cb745473aec7255d66e3854abaa9c3f46f952 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 8 Jan 2024 11:50:16 -0700
Subject: block: move __get_task_ioprio() into header file

We call this once per IO, which can be millions of times per second.
Since nobody really uses io priorities, or at least it isn't very
common, this is all wasted time and can amount to as much as 3% of
the total kernel time.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/ioprio.h | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 7578d4f6a969..d6a9b5b7ed16 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -47,7 +47,30 @@ static inline int task_nice_ioclass(struct task_struct *task)
 }
 
 #ifdef CONFIG_BLOCK
-int __get_task_ioprio(struct task_struct *p);
+/*
+ * If the task has set an I/O priority, use that. Otherwise, return
+ * the default I/O priority.
+ *
+ * Expected to be called for current task or with task_lock() held to keep
+ * io_context stable.
+ */
+static inline int __get_task_ioprio(struct task_struct *p)
+{
+	struct io_context *ioc = p->io_context;
+	int prio;
+
+	if (p != current)
+		lockdep_assert_held(&p->alloc_lock);
+	if (ioc)
+		prio = ioc->ioprio;
+	else
+		prio = IOPRIO_DEFAULT;
+
+	if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
+		prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
+					 task_nice_ioprio(p));
+	return prio;
+}
 #else
 static inline int __get_task_ioprio(struct task_struct *p)
 {
-- 
cgit v1.2.3


From 53889bcaf536b3abedeaf104019877cee37dd08b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 8 Jan 2024 11:51:57 -0700
Subject: block: make __get_task_ioprio() easier to read

We don't need to do any gymnastics if we don't have an io_context
assigned at all, so just return early with our default priority.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/ioprio.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index d6a9b5b7ed16..db1249cd9692 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -59,13 +59,13 @@ static inline int __get_task_ioprio(struct task_struct *p)
 	struct io_context *ioc = p->io_context;
 	int prio;
 
+	if (!ioc)
+		return IOPRIO_DEFAULT;
+
 	if (p != current)
 		lockdep_assert_held(&p->alloc_lock);
-	if (ioc)
-		prio = ioc->ioprio;
-	else
-		prio = IOPRIO_DEFAULT;
 
+	prio = ioc->ioprio;
 	if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
 		prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
 					 task_nice_ioprio(p));
-- 
cgit v1.2.3


From fd37721803c6e73619108f76ad2e12a9aa5fafaf Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 28 Dec 2023 17:47:03 +0300
Subject: mm, treewide: introduce NR_PAGE_ORDERS

NR_PAGE_ORDERS defines the number of page orders supported by the page
allocator, ranging from 0 to MAX_ORDER, MAX_ORDER + 1 in total.

NR_PAGE_ORDERS assists in defining arrays of page orders and allows for
more natural iteration over them.

[kirill.shutemov@linux.intel.com: fixup for kerneldoc warning]
  Link: https://lkml.kernel.org/r/20240101111512.7empzyifq7kxtzk3@box
Link: https://lkml.kernel.org/r/20231228144704.14033-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c18c53353b50..1ea7636dfb76 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -35,6 +35,8 @@
 
 #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
 
+#define NR_PAGE_ORDERS (MAX_ORDER + 1)
+
 /*
  * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
  * costly to service.  That is between allocation orders which should
@@ -96,7 +98,7 @@ static inline bool migratetype_is_mergeable(int mt)
 }
 
 #define for_each_migratetype_order(order, type) \
-	for (order = 0; order <= MAX_ORDER; order++) \
+	for (order = 0; order < NR_PAGE_ORDERS; order++) \
 		for (type = 0; type < MIGRATE_TYPES; type++)
 
 extern int page_group_by_mobility_disabled;
@@ -933,7 +935,7 @@ struct zone {
 	CACHELINE_PADDING(_pad1_);
 
 	/* free areas of different sizes */
-	struct free_area	free_area[MAX_ORDER + 1];
+	struct free_area	free_area[NR_PAGE_ORDERS];
 
 #ifdef CONFIG_UNACCEPTED_MEMORY
 	/* Pages to be accepted. All pages on the list are MAX_ORDER */
-- 
cgit v1.2.3


From 5e0a760b44417f7cadd79de2204d6247109558a0 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 28 Dec 2023 17:47:04 +0300
Subject: mm, treewide: rename MAX_ORDER to MAX_PAGE_ORDER

commit 23baf831a32c ("mm, treewide: redefine MAX_ORDER sanely") has
changed the definition of MAX_ORDER to be inclusive.  This has caused
issues with code that was not yet upstream and depended on the previous
definition.

To draw attention to the altered meaning of the define, rename MAX_ORDER
to MAX_PAGE_ORDER.

Link: https://lkml.kernel.org/r/20231228144704.14033-2-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h         |  2 +-
 include/linux/mmzone.h          | 14 +++++++-------
 include/linux/pageblock-flags.h |  4 ++--
 include/linux/slab.h            |  4 ++--
 4 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 236ec7b63c54..c1ee640d87b1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -829,7 +829,7 @@ static inline unsigned huge_page_shift(struct hstate *h)
 
 static inline bool hstate_is_gigantic(struct hstate *h)
 {
-	return huge_page_order(h) > MAX_ORDER;
+	return huge_page_order(h) > MAX_PAGE_ORDER;
 }
 
 static inline unsigned int pages_per_huge_page(const struct hstate *h)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1ea7636dfb76..4ed33b127821 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -27,15 +27,15 @@
 
 /* Free memory management - zoned buddy allocator.  */
 #ifndef CONFIG_ARCH_FORCE_MAX_ORDER
-#define MAX_ORDER 10
+#define MAX_PAGE_ORDER 10
 #else
-#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
+#define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
 #endif
-#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
+#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER)
 
 #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
 
-#define NR_PAGE_ORDERS (MAX_ORDER + 1)
+#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1)
 
 /*
  * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
@@ -938,7 +938,7 @@ struct zone {
 	struct free_area	free_area[NR_PAGE_ORDERS];
 
 #ifdef CONFIG_UNACCEPTED_MEMORY
-	/* Pages to be accepted. All pages on the list are MAX_ORDER */
+	/* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */
 	struct list_head	unaccepted_pages;
 #endif
 
@@ -1748,8 +1748,8 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
 #define SECTION_BLOCKFLAGS_BITS \
 	((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
 
-#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
-#error Allocator MAX_ORDER exceeds SECTION_SIZE
+#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
+#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE
 #endif
 
 static inline unsigned long pfn_to_section_nr(unsigned long pfn)
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index e83c4c095041..3f2409b968ec 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -41,14 +41,14 @@ extern unsigned int pageblock_order;
  * Huge pages are a constant size, but don't exceed the maximum allocation
  * granularity.
  */
-#define pageblock_order		min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER)
+#define pageblock_order		min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
 #else /* CONFIG_HUGETLB_PAGE */
 
 /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
-#define pageblock_order		MAX_ORDER
+#define pageblock_order		MAX_PAGE_ORDER
 
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
index d6d6ffeeb9a2..d63823e518c0 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -308,7 +308,7 @@ static inline unsigned int arch_slab_minalign(void)
  * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
  */
 #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
+#define KMALLOC_SHIFT_MAX	(MAX_PAGE_ORDER + PAGE_SHIFT)
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	5
 #endif
@@ -316,7 +316,7 @@ static inline unsigned int arch_slab_minalign(void)
 
 #ifdef CONFIG_SLUB
 #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
+#define KMALLOC_SHIFT_MAX	(MAX_PAGE_ORDER + PAGE_SHIFT)
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	3
 #endif
-- 
cgit v1.2.3


From 2fb7e4dd35c52933b18ff127bf92d703c8e2e897 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 5 Jan 2024 13:51:21 +0100
Subject: PNP: make pnp_bus_type const

Now that the driver core can properly handle constant struct bus_type,
move the pnp_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pnp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index 267fb8a4fb6e..ddbe7c3ca4ce 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -435,7 +435,7 @@ struct pnp_protocol {
 #define protocol_for_each_dev(protocol, dev)	\
 	list_for_each_entry(dev, &(protocol)->devices, protocol_list)
 
-extern struct bus_type pnp_bus_type;
+extern const struct bus_type pnp_bus_type;
 
 #if defined(CONFIG_PNP)
 
-- 
cgit v1.2.3


From 6eade110754c085cee9e46f4d87d2c3ea4e59e8c Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 20 Dec 2023 16:17:32 -0800
Subject: cxl/events: Separate UUID from event structures

The UEFI CXL CPER structure does not include the UUID.  Now that the
UUID is passed separately to the trace event there is no need to have
the UUID in those structures.

Move UUID from the event record header to the raw structures.  Adjust
cxl-test to Create dummy structures for creating test records.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-5-1bb8a4ca2c7a@intel.com
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/cxl-event.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 0fc068123f8e..3d9b5954d0c1 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -8,7 +8,6 @@
  * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
  */
 struct cxl_event_record_hdr {
-	uuid_t id;
 	u8 length;
 	u8 flags[3];
 	__le16 handle;
@@ -18,8 +17,13 @@ struct cxl_event_record_hdr {
 	u8 reserved[15];
 } __packed;
 
+/*
+ * Common Event Record Format
+ * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
+ */
 #define CXL_EVENT_RECORD_DATA_LENGTH 0x50
 struct cxl_event_record_raw {
+	uuid_t id;
 	struct cxl_event_record_hdr hdr;
 	u8 data[CXL_EVENT_RECORD_DATA_LENGTH];
 } __packed;
-- 
cgit v1.2.3


From f9c683386f5bc0364615138ce2b14be50848dbcf Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 20 Dec 2023 16:17:33 -0800
Subject: cxl/events: Create a CXL event union

The CXL CPER and event log records share everything but a UUID/GUID in
their structures.

Define a cxl_event union without the UUID/GUID to be shared between the
CPER and event log record formats.  Adjust the code to use this union.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-6-1bb8a4ca2c7a@intel.com
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/cxl-event.h | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 3d9b5954d0c1..4d6c05f535f8 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -17,13 +17,8 @@ struct cxl_event_record_hdr {
 	u8 reserved[15];
 } __packed;
 
-/*
- * Common Event Record Format
- * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
- */
 #define CXL_EVENT_RECORD_DATA_LENGTH 0x50
-struct cxl_event_record_raw {
-	uuid_t id;
+struct cxl_event_generic {
 	struct cxl_event_record_hdr hdr;
 	u8 data[CXL_EVENT_RECORD_DATA_LENGTH];
 } __packed;
@@ -96,4 +91,20 @@ struct cxl_event_mem_module {
 	u8 reserved[0x3d];
 } __packed;
 
+union cxl_event {
+	struct cxl_event_generic generic;
+	struct cxl_event_gen_media gen_media;
+	struct cxl_event_dram dram;
+	struct cxl_event_mem_module mem_module;
+} __packed;
+
+/*
+ * Common Event Record Format; in event logs
+ * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
+ */
+struct cxl_event_record_raw {
+	uuid_t id;
+	union cxl_event event;
+} __packed;
+
 #endif /* _LINUX_CXL_EVENT_H */
-- 
cgit v1.2.3


From 671a794c33c6e048ca5cedd5ad6af44d52d5d7e5 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 20 Dec 2023 16:17:34 -0800
Subject: acpi/ghes: Process CXL Component Events

BIOS can configure memory devices as firmware first.  This will send CXL
events to the firmware instead of the OS.  The firmware can then send
these events to the OS via UEFI.

UEFI v2.10 section N.2.14 defines a Common Platform Error Record (CPER)
format for CXL Component Events.  The format is mostly the same as the
CXL Common Event Record Format.  The difference is the use of a GUID in
the Section Type rather than a UUID as part of the event itself.

Add GHES support to detect CXL CPER records and call a registered
callback with the event.

A notifier chain was considered for the callback but the complexity did
not justify the use case as only the CXL subsystem requires this event.
Enforce that only one callback can be registered at any time.

Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-7-1bb8a4ca2c7a@intel.com
[djbw: fixup checkpatch errors]
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/cxl-event.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 4d6c05f535f8..95841750a383 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -107,4 +107,54 @@ struct cxl_event_record_raw {
 	union cxl_event event;
 } __packed;
 
+enum cxl_event_type {
+	CXL_CPER_EVENT_GEN_MEDIA,
+	CXL_CPER_EVENT_DRAM,
+	CXL_CPER_EVENT_MEM_MODULE,
+};
+
+#define CPER_CXL_DEVICE_ID_VALID		BIT(0)
+#define CPER_CXL_DEVICE_SN_VALID		BIT(1)
+#define CPER_CXL_COMP_EVENT_LOG_VALID		BIT(2)
+struct cxl_cper_event_rec {
+	struct {
+		u32 length;
+		u64 validation_bits;
+		struct cper_cxl_event_devid {
+			u16 vendor_id;
+			u16 device_id;
+			u8 func_num;
+			u8 device_num;
+			u8 bus_num;
+			u16 segment_num;
+			u16 slot_num; /* bits 2:0 reserved */
+			u8 reserved;
+		} __packed device_id;
+		struct cper_cxl_event_sn {
+			u32 lower_dw;
+			u32 upper_dw;
+		} __packed dev_serial_num;
+	} __packed hdr;
+
+	union cxl_event event;
+} __packed;
+
+typedef void (*cxl_cper_callback)(enum cxl_event_type type,
+				  struct cxl_cper_event_rec *rec);
+
+#ifdef CONFIG_ACPI_APEI_GHES
+int cxl_cper_register_callback(cxl_cper_callback callback);
+int cxl_cper_unregister_callback(cxl_cper_callback callback);
+#else
+static inline int cxl_cper_register_callback(cxl_cper_callback callback)
+{
+	return 0;
+}
+
+static inline int cxl_cper_unregister_callback(cxl_cper_callback callback)
+{
+	return 0;
+}
+#endif
+
 #endif /* _LINUX_CXL_EVENT_H */
-- 
cgit v1.2.3


From ced085ef369af7a2b6da962ec2fbd01339f60693 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 20 Dec 2023 16:17:35 -0800
Subject: PCI: Introduce cleanup helpers for device reference counts and locks

The "goto error" pattern is notorious for introducing subtle resource
leaks. Use the new cleanup.h helpers for PCI device reference counts and
locks.

Similar to the new put_device() and device_lock() cleanup helpers,
__free(put_device) and guard(device), define the same for PCI devices,
__free(pci_dev_put) and guard(pci_dev).  These helpers eliminate the
need for "goto free;" and "goto unlock;" patterns. For example, A
'struct pci_dev *' instance declared as:

    struct pci_dev *pdev __free(pci_dev_put) = NULL;

...will automatically call pci_dev_put() if @pdev is non-NULL when @pdev
goes out of scope (automatic variable scope). If a function wants to
invoke pci_dev_put() on error, but return @pdev on success, it can do:

    return no_free_ptr(pdev);

...or:

    return_ptr(pdev);

For potential cleanup opportunity there are 587 open-coded calls to
pci_dev_put() in the kernel with 65 instances within 10 lines of a goto
statement with the CXL driver threatening to add another one.

The guard() helper holds the associated lock for the remainder of the
current scope in which it was invoked. So, for example:

    func(...)
    {
        if (...) {
            ...
            guard(pci_dev); /* pci_dev_lock() invoked here */
            ...
        } /* <- implied pci_dev_unlock() triggered here */
    }

There are 15 invocations of pci_dev_unlock() in the kernel with 5
instances within 10 lines of a goto statement. Again, the CXL driver is
threatening to add another.

Introduce these helpers to preclude the addition of new more error prone
goto put; / goto unlock; sequences. For now, these helpers are used in
drivers/cxl/pci.c to allow ACPI error reports to be fed back into the
CXL driver associated with the PCI device identified in the report.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-8-1bb8a4ca2c7a@intel.com
[djbw: rewrite changelog]
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/pci.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index dea043bc1e38..0d23d2e0eb1a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1170,6 +1170,7 @@ int pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge);
 u8 pci_common_swizzle(struct pci_dev *dev, u8 *pinp);
 struct pci_dev *pci_dev_get(struct pci_dev *dev);
 void pci_dev_put(struct pci_dev *dev);
+DEFINE_FREE(pci_dev_put, struct pci_dev *, if (_T) pci_dev_put(_T))
 void pci_remove_bus(struct pci_bus *b);
 void pci_stop_and_remove_bus_device(struct pci_dev *dev);
 void pci_stop_and_remove_bus_device_locked(struct pci_dev *dev);
@@ -1874,6 +1875,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev);
 void pci_dev_lock(struct pci_dev *dev);
 int pci_dev_trylock(struct pci_dev *dev);
 void pci_dev_unlock(struct pci_dev *dev);
+DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T))
 
 /*
  * PCI domain support.  Sometimes called PCI segment (eg by ACPI),
-- 
cgit v1.2.3


From dc97f6344f205b0dfa144e1b3e16d6dc05383d57 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 20 Dec 2023 16:17:36 -0800
Subject: cxl/pci: Register for and process CPER events

If the firmware has configured CXL event support to be firmware first
the OS can process those events through CPER records.  The CXL layer has
unique DPA to HPA knowledge and standard event trace parsing in place.

CPER records contain Bus, Device, Function information which can be used
to identify the PCI device which is sending the event.

Change the PCI driver registration to include registration of a CXL
CPER callback to process events through the trace subsystem.

Use new scoped based management to simplify the handling of the PCI
device object.

Tested-by: Smita-Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Smita-Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-9-1bb8a4ca2c7a@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
[djbw: use new pci_dev guard, flip init order]
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/cxl-event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 95841750a383..91125eca4c8a 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -108,6 +108,7 @@ struct cxl_event_record_raw {
 } __packed;
 
 enum cxl_event_type {
+	CXL_CPER_EVENT_GENERIC,
 	CXL_CPER_EVENT_GEN_MEDIA,
 	CXL_CPER_EVENT_DRAM,
 	CXL_CPER_EVENT_MEM_MODULE,
-- 
cgit v1.2.3


From ef067191f73cce3ee192e991ce486d95524655d5 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Mon, 25 Dec 2023 17:11:56 +0200
Subject: vdpa/mlx5: Expose resumable vq capability
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Necessary for checking if resumable vqs are supported by the hardware.
Actual support will be added in a downstream patch.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Message-Id: <20231225151203.152687-2-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/mlx5/mlx5_ifc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6f3631425f38..9eaceaf6bcb0 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1236,7 +1236,8 @@ struct mlx5_ifc_virtio_emulation_cap_bits {
 
 	u8	   reserved_at_c0[0x13];
 	u8         desc_group_mkey_supported[0x1];
-	u8         reserved_at_d4[0xc];
+	u8         freeze_to_rdy_supported[0x1];
+	u8         reserved_at_d5[0xb];
 
 	u8         reserved_at_e0[0x20];
 
-- 
cgit v1.2.3


From 9b23417825df470e4c9e98e7ed4b2c37465bfa1e Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Mon, 25 Dec 2023 17:11:59 +0200
Subject: vdpa/mlx5: Mark vq addrs for modification in hw vq

Addresses get set by .set_vq_address. hw vq addresses will be updated on
next modify_virtqueue.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Message-Id: <20231225151203.152687-5-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/mlx5/mlx5_ifc_vdpa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h
index b86d51a855f6..9594ac405740 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -145,6 +145,7 @@ enum {
 	MLX5_VIRTQ_MODIFY_MASK_STATE                    = (u64)1 << 0,
 	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS      = (u64)1 << 3,
 	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4,
+	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS           = (u64)1 << 6,
 	MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY          = (u64)1 << 14,
 };
 
-- 
cgit v1.2.3


From 60c43b3f6b4eb5a3d672952a0d65991f414ea258 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Mon, 25 Dec 2023 17:12:00 +0200
Subject: vdpa/mlx5: Mark vq state for modification in hw vq

.set_vq_state will set the indices and mark the fields to be modified in
the hw vq.

Advertise that the device supports changing the vq state when the device
is in DRIVER_OK state and suspended.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231225151203.152687-6-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/mlx5/mlx5_ifc_vdpa.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h
index 9594ac405740..32e712106e68 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -146,6 +146,8 @@ enum {
 	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS      = (u64)1 << 3,
 	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4,
 	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS           = (u64)1 << 6,
+	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX       = (u64)1 << 7,
+	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX        = (u64)1 << 8,
 	MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY          = (u64)1 << 14,
 };
 
-- 
cgit v1.2.3


From f756dd3e2a4c704c0ab5ecb143ab71f1249af497 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Mon, 25 Dec 2023 17:12:01 +0200
Subject: vdpa/mlx5: Use vq suspend/resume during .set_map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of tearing down and setting up vq resources, use vq
suspend/resume during .set_map to speed things up a bit.

The vq mr is updated with the new mapping while the vqs are suspended.

If the device doesn't support resumable vqs, do the old teardown and
setup dance.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Message-Id: <20231225151203.152687-7-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/mlx5/mlx5_ifc_vdpa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h
index 32e712106e68..40371c916cf9 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -148,6 +148,7 @@ enum {
 	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS           = (u64)1 << 6,
 	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX       = (u64)1 << 7,
 	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX        = (u64)1 << 8,
+	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY            = (u64)1 << 11,
 	MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY          = (u64)1 << 14,
 };
 
-- 
cgit v1.2.3


From 3e999770ac1c7c31a70685dd5b88e89473509e9c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 9 Jan 2024 17:59:22 +0100
Subject: PM: sleep: Restore asynchronous device resume optimization

Before commit 7839d0078e0d ("PM: sleep: Fix possible deadlocks in core
system-wide PM code"), the resume of devices that were allowed to resume
asynchronously was scheduled before starting the resume of the other
devices, so the former did not have to wait for the latter unless
functional dependencies were present.

Commit 7839d0078e0d removed that optimization in order to address a
correctness issue, but it can be restored with the help of a new device
power management flag, so do that now.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>
---
 include/linux/pm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 92a4f69de0e8..a2f3e53a8196 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -681,6 +681,7 @@ struct dev_pm_info {
 	bool			wakeup_path:1;
 	bool			syscore:1;
 	bool			no_pm_callbacks:1;	/* Owned by the PM core */
+	bool			async_in_progress:1;	/* Owned by the PM core */
 	unsigned int		must_resume:1;	/* Owned by the PM core */
 	unsigned int		may_skip_resume:1;	/* Set by subsystems */
 #else
-- 
cgit v1.2.3


From f35b88b66fbb5c90298ce3aa483b8a2cf1f39ad0 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Wed, 10 Jan 2024 20:10:08 -0800
Subject: iommu: Add cache_invalidate_user op

The updates of the PTEs in the nested page table will be propagated to the
hardware caches.

Add a new domain op cache_invalidate_user() for the userspace to flush the
hardware caches for a nested domain through iommufd. No wrapper for it, as
it's only supposed to be used by iommufd. Then, pass in invalidation
requests in form of a user data array containing a number of invalidation
data entries.

Link: https://lore.kernel.org/r/20240111041015.47920-2-yi.l.liu@intel.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6291aa7b079b..93c0d12dd047 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -284,6 +284,23 @@ struct iommu_user_data {
 	size_t len;
 };
 
+/**
+ * struct iommu_user_data_array - iommu driver specific user space data array
+ * @type: The data type of all the entries in the user buffer array
+ * @uptr: Pointer to the user buffer array
+ * @entry_len: The fixed-width length of an entry in the array, in bytes
+ * @entry_num: The number of total entries in the array
+ *
+ * The user buffer includes an array of requests with format defined in
+ * include/uapi/linux/iommufd.h
+ */
+struct iommu_user_data_array {
+	unsigned int type;
+	void __user *uptr;
+	size_t entry_len;
+	u32 entry_num;
+};
+
 /**
  * __iommu_copy_struct_from_user - Copy iommu driver specific user space data
  * @dst_data: Pointer to an iommu driver specific user data that is defined in
@@ -440,6 +457,13 @@ struct iommu_ops {
  * @iotlb_sync_map: Sync mappings created recently using @map to the hardware
  * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush
  *            queue
+ * @cache_invalidate_user: Flush hardware cache for user space IO page table.
+ *                         The @domain must be IOMMU_DOMAIN_NESTED. The @array
+ *                         passes in the cache invalidation requests, in form
+ *                         of a driver data structure. The driver must update
+ *                         array->entry_num to report the number of handled
+ *                         invalidation requests. The driver data structure
+ *                         must be defined in include/uapi/linux/iommufd.h
  * @iova_to_phys: translate iova to physical address
  * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE,
  *                           including no-snoop TLPs on PCIe or other platform
@@ -465,6 +489,8 @@ struct iommu_domain_ops {
 			      size_t size);
 	void (*iotlb_sync)(struct iommu_domain *domain,
 			   struct iommu_iotlb_gather *iotlb_gather);
+	int (*cache_invalidate_user)(struct iommu_domain *domain,
+				     struct iommu_user_data_array *array);
 
 	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
 				    dma_addr_t iova);
-- 
cgit v1.2.3


From 77785117f9c73fd71a440a5ac86dd80752967adc Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 10 Jan 2024 20:10:10 -0800
Subject: iommu: Add iommu_copy_struct_from_user_array helper

Wrap up the data pointer/num sanity and __iommu_copy_struct_from_user()
call for iommu drivers to copy driver specific data at a specific location
in the struct iommu_user_data_array.

And expect it to be used in cache_invalidate_user ops for example.

Link: https://lore.kernel.org/r/20240111041015.47920-4-yi.l.liu@intel.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Co-developed-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 93c0d12dd047..9dbadf74a3a1 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -341,6 +341,57 @@ static inline int __iommu_copy_struct_from_user(
 				      sizeof(*kdst),                      \
 				      offsetofend(typeof(*kdst), min_last))
 
+/**
+ * __iommu_copy_struct_from_user_array - Copy iommu driver specific user space
+ *                                       data from an iommu_user_data_array
+ * @dst_data: Pointer to an iommu driver specific user data that is defined in
+ *            include/uapi/linux/iommufd.h
+ * @src_array: Pointer to a struct iommu_user_data_array for a user space array
+ * @data_type: The data type of the @dst_data. Must match with @src_array.type
+ * @index: Index to the location in the array to copy user data from
+ * @data_len: Length of current user data structure, i.e. sizeof(struct _dst)
+ * @min_len: Initial length of user data structure for backward compatibility.
+ *           This should be offsetofend using the last member in the user data
+ *           struct that was initially added to include/uapi/linux/iommufd.h
+ */
+static inline int __iommu_copy_struct_from_user_array(
+	void *dst_data, const struct iommu_user_data_array *src_array,
+	unsigned int data_type, unsigned int index, size_t data_len,
+	size_t min_len)
+{
+	struct iommu_user_data src_data;
+
+	if (WARN_ON(!src_array || index >= src_array->entry_num))
+		return -EINVAL;
+	if (!src_array->entry_num)
+		return -EINVAL;
+	src_data.uptr = src_array->uptr + src_array->entry_len * index;
+	src_data.len = src_array->entry_len;
+	src_data.type = src_array->type;
+
+	return __iommu_copy_struct_from_user(dst_data, &src_data, data_type,
+					     data_len, min_len);
+}
+
+/**
+ * iommu_copy_struct_from_user_array - Copy iommu driver specific user space
+ *                                     data from an iommu_user_data_array
+ * @kdst: Pointer to an iommu driver specific user data that is defined in
+ *        include/uapi/linux/iommufd.h
+ * @user_array: Pointer to a struct iommu_user_data_array for a user space
+ *              array
+ * @data_type: The data type of the @kdst. Must match with @user_array->type
+ * @index: Index to the location in the array to copy user data from
+ * @min_last: The last member of the data structure @kdst points in the
+ *            initial version.
+ * Return 0 for success, otherwise -error.
+ */
+#define iommu_copy_struct_from_user_array(kdst, user_array, data_type, index, \
+					  min_last)                           \
+	__iommu_copy_struct_from_user_array(                                  \
+		kdst, user_array, data_type, index, sizeof(*(kdst)),          \
+		offsetofend(typeof(*(kdst)), min_last))
+
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
-- 
cgit v1.2.3


From dee56ccb468a832074397fdbf22bbd9bf6d710aa Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 12 Dec 2023 15:18:02 +0100
Subject: fbdev: amba-clcd: Delete the old CLCD driver

We have managed to ascertain that all users of the old FBDEV
code that are out of tree are now gone.

The new DRM driver can be found in drivers/gpu/drm/pl111/.

The remaining out of tree user was the ARM FVP emulation
platform, running Android. Thanks to changes in Android
versions 13 and 14, Android can now use the DRM driver when
being emulated under FVP. Some final patches are being put
in place to make it fully featured.

This is essentially a revert of the partial revert in
commit 112c35237c72 ("Partially revert "video: fbdev: amba-clcd: Retire elder CLCD driver"")

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 include/linux/amba/clcd-regs.h |  87 -------------
 include/linux/amba/clcd.h      | 290 -----------------------------------------
 2 files changed, 377 deletions(-)
 delete mode 100644 include/linux/amba/clcd-regs.h
 delete mode 100644 include/linux/amba/clcd.h

(limited to 'include/linux')

diff --git a/include/linux/amba/clcd-regs.h b/include/linux/amba/clcd-regs.h
deleted file mode 100644
index 421b0fa90d6a..000000000000
--- a/include/linux/amba/clcd-regs.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * David A Rusling
- *
- * Copyright (C) 2001 ARM Limited
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file COPYING in the main directory of this archive
- * for more details.
- */
-
-#ifndef AMBA_CLCD_REGS_H
-#define AMBA_CLCD_REGS_H
-
-/*
- * CLCD Controller Internal Register addresses
- */
-#define CLCD_TIM0		0x00000000
-#define CLCD_TIM1 		0x00000004
-#define CLCD_TIM2 		0x00000008
-#define CLCD_TIM3 		0x0000000c
-#define CLCD_UBAS 		0x00000010
-#define CLCD_LBAS 		0x00000014
-
-#define CLCD_PL110_IENB		0x00000018
-#define CLCD_PL110_CNTL		0x0000001c
-#define CLCD_PL110_STAT		0x00000020
-#define CLCD_PL110_INTR 	0x00000024
-#define CLCD_PL110_UCUR		0x00000028
-#define CLCD_PL110_LCUR		0x0000002C
-
-#define CLCD_PL111_CNTL		0x00000018
-#define CLCD_PL111_IENB		0x0000001c
-#define CLCD_PL111_RIS		0x00000020
-#define CLCD_PL111_MIS		0x00000024
-#define CLCD_PL111_ICR		0x00000028
-#define CLCD_PL111_UCUR		0x0000002c
-#define CLCD_PL111_LCUR		0x00000030
-
-#define CLCD_PALL 		0x00000200
-#define CLCD_PALETTE		0x00000200
-
-#define TIM2_PCD_LO_MASK	GENMASK(4, 0)
-#define TIM2_PCD_LO_BITS	5
-#define TIM2_CLKSEL		(1 << 5)
-#define TIM2_ACB_MASK		GENMASK(10, 6)
-#define TIM2_IVS		(1 << 11)
-#define TIM2_IHS		(1 << 12)
-#define TIM2_IPC		(1 << 13)
-#define TIM2_IOE		(1 << 14)
-#define TIM2_BCD		(1 << 26)
-#define TIM2_PCD_HI_MASK	GENMASK(31, 27)
-#define TIM2_PCD_HI_BITS	5
-#define TIM2_PCD_HI_SHIFT	27
-
-#define CNTL_LCDEN		(1 << 0)
-#define CNTL_LCDBPP1		(0 << 1)
-#define CNTL_LCDBPP2		(1 << 1)
-#define CNTL_LCDBPP4		(2 << 1)
-#define CNTL_LCDBPP8		(3 << 1)
-#define CNTL_LCDBPP16		(4 << 1)
-#define CNTL_LCDBPP16_565	(6 << 1)
-#define CNTL_LCDBPP16_444	(7 << 1)
-#define CNTL_LCDBPP24		(5 << 1)
-#define CNTL_LCDBW		(1 << 4)
-#define CNTL_LCDTFT		(1 << 5)
-#define CNTL_LCDMONO8		(1 << 6)
-#define CNTL_LCDDUAL		(1 << 7)
-#define CNTL_BGR		(1 << 8)
-#define CNTL_BEBO		(1 << 9)
-#define CNTL_BEPO		(1 << 10)
-#define CNTL_LCDPWR		(1 << 11)
-#define CNTL_LCDVCOMP(x)	((x) << 12)
-#define CNTL_LDMAFIFOTIME	(1 << 15)
-#define CNTL_WATERMARK		(1 << 16)
-
-/* ST Microelectronics variant bits */
-#define CNTL_ST_1XBPP_444	0x0
-#define CNTL_ST_1XBPP_5551	(1 << 17)
-#define CNTL_ST_1XBPP_565	(1 << 18)
-#define CNTL_ST_CDWID_12	0x0
-#define CNTL_ST_CDWID_16	(1 << 19)
-#define CNTL_ST_CDWID_18	(1 << 20)
-#define CNTL_ST_CDWID_24	((1 << 19)|(1 << 20))
-#define CNTL_ST_CEAEN		(1 << 21)
-#define CNTL_ST_LCDBPP24_PACKED	(6 << 1)
-
-#endif /* AMBA_CLCD_REGS_H */
diff --git a/include/linux/amba/clcd.h b/include/linux/amba/clcd.h
deleted file mode 100644
index b6e0cbeaf533..000000000000
--- a/include/linux/amba/clcd.h
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * linux/include/asm-arm/hardware/amba_clcd.h -- Integrator LCD panel.
- *
- * David A Rusling
- *
- * Copyright (C) 2001 ARM Limited
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file COPYING in the main directory of this archive
- * for more details.
- */
-#include <linux/fb.h>
-#include <linux/amba/clcd-regs.h>
-
-enum {
-	/* individual formats */
-	CLCD_CAP_RGB444		= (1 << 0),
-	CLCD_CAP_RGB5551	= (1 << 1),
-	CLCD_CAP_RGB565		= (1 << 2),
-	CLCD_CAP_RGB888		= (1 << 3),
-	CLCD_CAP_BGR444		= (1 << 4),
-	CLCD_CAP_BGR5551	= (1 << 5),
-	CLCD_CAP_BGR565		= (1 << 6),
-	CLCD_CAP_BGR888		= (1 << 7),
-
-	/* connection layouts */
-	CLCD_CAP_444		= CLCD_CAP_RGB444 | CLCD_CAP_BGR444,
-	CLCD_CAP_5551		= CLCD_CAP_RGB5551 | CLCD_CAP_BGR5551,
-	CLCD_CAP_565		= CLCD_CAP_RGB565 | CLCD_CAP_BGR565,
-	CLCD_CAP_888		= CLCD_CAP_RGB888 | CLCD_CAP_BGR888,
-
-	/* red/blue ordering */
-	CLCD_CAP_RGB		= CLCD_CAP_RGB444 | CLCD_CAP_RGB5551 |
-				  CLCD_CAP_RGB565 | CLCD_CAP_RGB888,
-	CLCD_CAP_BGR		= CLCD_CAP_BGR444 | CLCD_CAP_BGR5551 |
-				  CLCD_CAP_BGR565 | CLCD_CAP_BGR888,
-
-	CLCD_CAP_ALL		= CLCD_CAP_BGR | CLCD_CAP_RGB,
-};
-
-struct backlight_device;
-
-struct clcd_panel {
-	struct fb_videomode	mode;
-	signed short		width;	/* width in mm */
-	signed short		height;	/* height in mm */
-	u32			tim2;
-	u32			tim3;
-	u32			cntl;
-	u32			caps;
-	unsigned int		bpp:8,
-				fixedtimings:1,
-				grayscale:1;
-	unsigned int		connector;
-	struct backlight_device	*backlight;
-	/*
-	 * If the B/R lines are switched between the CLCD
-	 * and the panel we need to know this and not try to
-	 * compensate with the BGR bit in the control register.
-	 */
-	bool			bgr_connection;
-};
-
-struct clcd_regs {
-	u32			tim0;
-	u32			tim1;
-	u32			tim2;
-	u32			tim3;
-	u32			cntl;
-	unsigned long		pixclock;
-};
-
-struct clcd_fb;
-
-/*
- * the board-type specific routines
- */
-struct clcd_board {
-	const char *name;
-
-	/*
-	 * Optional.  Hardware capability flags.
-	 */
-	u32	caps;
-
-	/*
-	 * Optional.  Check whether the var structure is acceptable
-	 * for this display.
-	 */
-	int	(*check)(struct clcd_fb *fb, struct fb_var_screeninfo *var);
-
-	/*
-	 * Compulsory.  Decode fb->fb.var into regs->*.  In the case of
-	 * fixed timing, set regs->* to the register values required.
-	 */
-	void	(*decode)(struct clcd_fb *fb, struct clcd_regs *regs);
-
-	/*
-	 * Optional.  Disable any extra display hardware.
-	 */
-	void	(*disable)(struct clcd_fb *);
-
-	/*
-	 * Optional.  Enable any extra display hardware.
-	 */
-	void	(*enable)(struct clcd_fb *);
-
-	/*
-	 * Setup platform specific parts of CLCD driver
-	 */
-	int	(*setup)(struct clcd_fb *);
-
-	/*
-	 * mmap the framebuffer memory
-	 */
-	int	(*mmap)(struct clcd_fb *, struct vm_area_struct *);
-
-	/*
-	 * Remove platform specific parts of CLCD driver
-	 */
-	void	(*remove)(struct clcd_fb *);
-};
-
-struct amba_device;
-struct clk;
-
-/* this data structure describes each frame buffer device we find */
-struct clcd_fb {
-	struct fb_info		fb;
-	struct amba_device	*dev;
-	struct clk		*clk;
-	struct clcd_panel	*panel;
-	struct clcd_board	*board;
-	void			*board_data;
-	void __iomem		*regs;
-	u16			off_ienb;
-	u16			off_cntl;
-	u32			clcd_cntl;
-	u32			cmap[16];
-	bool			clk_enabled;
-};
-
-static inline void clcdfb_decode(struct clcd_fb *fb, struct clcd_regs *regs)
-{
-	struct fb_var_screeninfo *var = &fb->fb.var;
-	u32 val, cpl;
-
-	/*
-	 * Program the CLCD controller registers and start the CLCD
-	 */
-	val = ((var->xres / 16) - 1) << 2;
-	val |= (var->hsync_len - 1) << 8;
-	val |= (var->right_margin - 1) << 16;
-	val |= (var->left_margin - 1) << 24;
-	regs->tim0 = val;
-
-	val = var->yres;
-	if (fb->panel->cntl & CNTL_LCDDUAL)
-		val /= 2;
-	val -= 1;
-	val |= (var->vsync_len - 1) << 10;
-	val |= var->lower_margin << 16;
-	val |= var->upper_margin << 24;
-	regs->tim1 = val;
-
-	val = fb->panel->tim2;
-	val |= var->sync & FB_SYNC_HOR_HIGH_ACT  ? 0 : TIM2_IHS;
-	val |= var->sync & FB_SYNC_VERT_HIGH_ACT ? 0 : TIM2_IVS;
-
-	cpl = var->xres_virtual;
-	if (fb->panel->cntl & CNTL_LCDTFT)	  /* TFT */
-		/* / 1 */;
-	else if (!var->grayscale)		  /* STN color */
-		cpl = cpl * 8 / 3;
-	else if (fb->panel->cntl & CNTL_LCDMONO8) /* STN monochrome, 8bit */
-		cpl /= 8;
-	else					  /* STN monochrome, 4bit */
-		cpl /= 4;
-
-	regs->tim2 = val | ((cpl - 1) << 16);
-
-	regs->tim3 = fb->panel->tim3;
-
-	val = fb->panel->cntl;
-	if (var->grayscale)
-		val |= CNTL_LCDBW;
-
-	if (fb->panel->caps && fb->board->caps && var->bits_per_pixel >= 16) {
-		/*
-		 * if board and panel supply capabilities, we can support
-		 * changing BGR/RGB depending on supplied parameters. Here
-		 * we switch to what the framebuffer is providing if need
-		 * be, so if the framebuffer is BGR but the display connection
-		 * is RGB (first case) we switch it around. Vice versa mutatis
-		 * mutandis if the framebuffer is RGB but the display connection
-		 * is BGR, we flip it around.
-		 */
-		if (var->red.offset == 0)
-			val &= ~CNTL_BGR;
-		else
-			val |= CNTL_BGR;
-		if (fb->panel->bgr_connection)
-			val ^= CNTL_BGR;
-	}
-
-	switch (var->bits_per_pixel) {
-	case 1:
-		val |= CNTL_LCDBPP1;
-		break;
-	case 2:
-		val |= CNTL_LCDBPP2;
-		break;
-	case 4:
-		val |= CNTL_LCDBPP4;
-		break;
-	case 8:
-		val |= CNTL_LCDBPP8;
-		break;
-	case 16:
-		/*
-		 * PL110 cannot choose between 5551 and 565 modes in its
-		 * control register.  It is possible to use 565 with
-		 * custom external wiring.
-		 */
-		if (amba_part(fb->dev) == 0x110 ||
-		    var->green.length == 5)
-			val |= CNTL_LCDBPP16;
-		else if (var->green.length == 6)
-			val |= CNTL_LCDBPP16_565;
-		else
-			val |= CNTL_LCDBPP16_444;
-		break;
-	case 32:
-		val |= CNTL_LCDBPP24;
-		break;
-	}
-
-	regs->cntl = val;
-	regs->pixclock = var->pixclock;
-}
-
-static inline int clcdfb_check(struct clcd_fb *fb, struct fb_var_screeninfo *var)
-{
-	var->xres_virtual = var->xres = (var->xres + 15) & ~15;
-	var->yres_virtual = var->yres = (var->yres + 1) & ~1;
-
-#define CHECK(e,l,h) (var->e < l || var->e > h)
-	if (CHECK(right_margin, (5+1), 256) ||	/* back porch */
-	    CHECK(left_margin, (5+1), 256) ||	/* front porch */
-	    CHECK(hsync_len, (5+1), 256) ||
-	    var->xres > 4096 ||
-	    var->lower_margin > 255 ||		/* back porch */
-	    var->upper_margin > 255 ||		/* front porch */
-	    var->vsync_len > 32 ||
-	    var->yres > 1024)
-		return -EINVAL;
-#undef CHECK
-
-	/* single panel mode: PCD = max(PCD, 1) */
-	/* dual panel mode: PCD = max(PCD, 5) */
-
-	/*
-	 * You can't change the grayscale setting, and
-	 * we can only do non-interlaced video.
-	 */
-	if (var->grayscale != fb->fb.var.grayscale ||
-	    (var->vmode & FB_VMODE_MASK) != FB_VMODE_NONINTERLACED)
-		return -EINVAL;
-
-#define CHECK(e) (var->e != fb->fb.var.e)
-	if (fb->panel->fixedtimings &&
-	    (CHECK(xres)		||
-	     CHECK(yres)		||
-	     CHECK(bits_per_pixel)	||
-	     CHECK(pixclock)		||
-	     CHECK(left_margin)		||
-	     CHECK(right_margin)	||
-	     CHECK(upper_margin)	||
-	     CHECK(lower_margin)	||
-	     CHECK(hsync_len)		||
-	     CHECK(vsync_len)		||
-	     CHECK(sync)))
-		return -EINVAL;
-#undef CHECK
-
-	var->nonstd = 0;
-	var->accel_flags = 0;
-
-	return 0;
-}
-- 
cgit v1.2.3


From 7ea26f9460c6c76b1d6e36f39fce34b16cb88300 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 9 Jan 2024 20:22:45 +0200
Subject: fsnotify: compile out fsnotify permission hooks if
 !FANOTIFY_ACCESS_PERMISSIONS

The depency of FANOTIFY_ACCESS_PERMISSIONS on SECURITY made sure that
the fsnotify permission hooks were never called when SECURITY was
disabled.

Moving the fsnotify permission hook out of the secutiy hook broke that
optimisation.

Reported-and-tested-by: Jens Axboe <axboe@kernel.dk>
Closes: https://lore.kernel.org/linux-fsdevel/53682ece-f0e7-48de-9a1c-879ee34b0449@kernel.dk/
Fixes: d9e5d31084b0 ("fsnotify: optionally pass access range in file permission hooks")
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20240109182245.38884-1-amir73il@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fsnotify.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 11e6434b8e71..8300a5286988 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -100,6 +100,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
 	return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
 }
 
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 /*
  * fsnotify_file_area_perm - permission hook before access to file range
  */
@@ -145,6 +146,24 @@ static inline int fsnotify_open_perm(struct file *file)
 	return fsnotify_file(file, FS_OPEN_PERM);
 }
 
+#else
+static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
+					  const loff_t *ppos, size_t count)
+{
+	return 0;
+}
+
+static inline int fsnotify_file_perm(struct file *file, int perm_mask)
+{
+	return 0;
+}
+
+static inline int fsnotify_open_perm(struct file *file)
+{
+	return 0;
+}
+#endif
+
 /*
  * fsnotify_link_count - inode's link count changed
  */
-- 
cgit v1.2.3


From 755113d7678681a137c330f7997ceb680adb644e Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 9 Jan 2024 10:41:11 +0100
Subject: thermal/debugfs: Add thermal cooling device debugfs information

The thermal framework does not have any debug information except a
sysfs stat which is a bit controversial. This one allocates big chunks
of memory for every cooling devices with a high number of states and
could represent on some systems in production several megabytes of
memory for just a portion of it. As the sysfs is limited to a page
size, the output is not exploitable with large data array and gets
truncated.

The patch provides the same information than sysfs except the
transitions are dynamically allocated, thus they won't show more
events than the ones which actually occurred. There is no longer a
size limitation and it opens the field for more debugging information
where the debugfs is designed for, not sysfs.

The thermal debugfs directory structure tries to stay consistent with
the sysfs one but in a very simplified way:

thermal/
 -- cooling_devices
    |-- 0
    |   |-- clear
    |   |-- time_in_state_ms
    |   |-- total_trans
    |   `-- trans_table
    |-- 1
    |   |-- clear
    |   |-- time_in_state_ms
    |   |-- total_trans
    |   `-- trans_table
    |-- 2
    |   |-- clear
    |   |-- time_in_state_ms
    |   |-- total_trans
    |   `-- trans_table
    |-- 3
    |   |-- clear
    |   |-- time_in_state_ms
    |   |-- total_trans
    |   `-- trans_table
    `-- 4
        |-- clear
        |-- time_in_state_ms
        |-- total_trans
        `-- trans_table

The content of the files in the cooling devices directory is the same
as the sysfs one except for the trans_table which has the following
format:

Transition	Hits
1->0      	246
0->1      	246
2->1      	632
1->2      	632
3->2      	98
2->3      	98

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
[ rjw: White space fixups, rebase ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/thermal.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 9d0427da32af..7defea8fa223 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -32,6 +32,7 @@
 struct thermal_zone_device;
 struct thermal_cooling_device;
 struct thermal_instance;
+struct thermal_debugfs;
 struct thermal_attr;
 
 enum thermal_trend {
@@ -113,6 +114,9 @@ struct thermal_cooling_device {
 	struct mutex lock; /* protect thermal_instances list */
 	struct list_head thermal_instances;
 	struct list_head node;
+#ifdef CONFIG_THERMAL_DEBUGFS
+	struct thermal_debugfs *debugfs;
+#endif
 };
 
 /**
@@ -189,6 +193,9 @@ struct thermal_zone_device {
 	struct list_head node;
 	struct delayed_work poll_queue;
 	enum thermal_notify_event notify_event;
+#ifdef CONFIG_THERMAL_DEBUGFS
+	struct thermal_debugfs *debugfs;
+#endif
 	bool suspended;
 };
 
-- 
cgit v1.2.3


From 57a427c81c322c5f0cdfe7c46cdee553d18b1ec6 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Fri, 5 Jan 2024 14:45:11 +0100
Subject: thermal: core: Use kstrdup_const() during cooling device registration

Some *thermal_cooling_device_register() calls pass a string literal as
the 'type' parameter, so kstrdup_const() can be used instead of
kstrdup() to avoid a memory allocation in such cases.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/thermal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 7defea8fa223..3227335fb447 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -103,7 +103,7 @@ struct thermal_cooling_device_ops {
 
 struct thermal_cooling_device {
 	int id;
-	char *type;
+	const char *type;
 	unsigned long max_state;
 	struct device device;
 	struct device_node *np;
-- 
cgit v1.2.3


From ba5afb9a84df2e6b26a1b6389b98849cd16ea757 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 12 Jan 2024 09:09:14 +0100
Subject: fs: rework listmount() implementation

Linus pointed out that there's error handling and naming issues in the
that we should rewrite:

* Perform the access checks for the buffer before actually doing any
  work instead of doing it during the iteration.
* Rename the arguments to listmount() and do_listmount() to clarify what
  the arguments are used for.
* Get rid of the pointless ctr variable and overflow checking.
* Get rid of the pointless speculation check.

Link: https://lore.kernel.org/r/CAHk-=wjh6Cypo8WC-McXgSzCaou3UXccxB+7PVeSuGR8AjCphg@mail.gmail.com
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/syscalls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 5c0dbef55792..cdba4d0c6d4a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -414,7 +414,7 @@ asmlinkage long sys_statmount(const struct mnt_id_req __user *req,
 			      struct statmount __user *buf, size_t bufsize,
 			      unsigned int flags);
 asmlinkage long sys_listmount(const struct mnt_id_req __user *req,
-			      u64 __user *buf, size_t bufsize,
+			      u64 __user *mnt_ids, size_t nr_mnt_ids,
 			      unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
-- 
cgit v1.2.3


From 9181d6f8a2bb32d158de66a84164fac05e3ddd18 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 12 Jan 2024 12:28:16 +0000
Subject: net: add more sanity check in virtio_net_hdr_to_skb()

syzbot/KMSAN reports access to uninitialized data from gso_features_check() [1]

The repro use af_packet, injecting a gso packet and hdrlen == 0.

We could fix the issue making gso_features_check() more careful
while dealing with NETIF_F_TSO_MANGLEID in fast path.

Or we can make sure virtio_net_hdr_to_skb() pulls minimal network and
transport headers as intended.

Note that for GSO packets coming from untrusted sources, SKB_GSO_DODGY
bit forces a proper header validation (and pull) before the packet can
hit any device ndo_start_xmit(), thus we do not need a precise disection
at virtio_net_hdr_to_skb() stage.

[1]
BUG: KMSAN: uninit-value in skb_gso_segment include/net/gso.h:83 [inline]
BUG: KMSAN: uninit-value in validate_xmit_skb+0x10f2/0x1930 net/core/dev.c:3629
 skb_gso_segment include/net/gso.h:83 [inline]
 validate_xmit_skb+0x10f2/0x1930 net/core/dev.c:3629
 __dev_queue_xmit+0x1eac/0x5130 net/core/dev.c:4341
 dev_queue_xmit include/linux/netdevice.h:3134 [inline]
 packet_xmit+0x9c/0x6b0 net/packet/af_packet.c:276
 packet_snd net/packet/af_packet.c:3087 [inline]
 packet_sendmsg+0x8b1d/0x9f30 net/packet/af_packet.c:3119
 sock_sendmsg_nosec net/socket.c:730 [inline]
 __sock_sendmsg net/socket.c:745 [inline]
 ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584
 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638
 __sys_sendmsg net/socket.c:2667 [inline]
 __do_sys_sendmsg net/socket.c:2676 [inline]
 __se_sys_sendmsg net/socket.c:2674 [inline]
 __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x63/0x6b

Uninit was created at:
 slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768
 slab_alloc_node mm/slub.c:3478 [inline]
 kmem_cache_alloc_node+0x5e9/0xb10 mm/slub.c:3523
 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:560
 __alloc_skb+0x318/0x740 net/core/skbuff.c:651
 alloc_skb include/linux/skbuff.h:1286 [inline]
 alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6334
 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2780
 packet_alloc_skb net/packet/af_packet.c:2936 [inline]
 packet_snd net/packet/af_packet.c:3030 [inline]
 packet_sendmsg+0x70e8/0x9f30 net/packet/af_packet.c:3119
 sock_sendmsg_nosec net/socket.c:730 [inline]
 __sock_sendmsg net/socket.c:745 [inline]
 ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584
 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638
 __sys_sendmsg net/socket.c:2667 [inline]
 __do_sys_sendmsg net/socket.c:2676 [inline]
 __se_sys_sendmsg net/socket.c:2674 [inline]
 __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x63/0x6b

CPU: 0 PID: 5025 Comm: syz-executor279 Not tainted 6.7.0-rc7-syzkaller-00003-gfbafc3e621c3 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023

Reported-by: syzbot+7f4d0ea3df4d4fa9a65f@syzkaller.appspotmail.com
Link: https://lore.kernel.org/netdev/0000000000005abd7b060eb160cd@google.com/
Fixes: 9274124f023b ("net: stricter validation of untrusted gso packets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 27cc1d464321..4dfa9b69ca8d 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -3,6 +3,8 @@
 #define _LINUX_VIRTIO_NET_H
 
 #include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <linux/udp.h>
 #include <uapi/linux/tcp.h>
 #include <uapi/linux/virtio_net.h>
@@ -49,6 +51,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 					const struct virtio_net_hdr *hdr,
 					bool little_endian)
 {
+	unsigned int nh_min_len = sizeof(struct iphdr);
 	unsigned int gso_type = 0;
 	unsigned int thlen = 0;
 	unsigned int p_off = 0;
@@ -65,6 +68,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 			gso_type = SKB_GSO_TCPV6;
 			ip_proto = IPPROTO_TCP;
 			thlen = sizeof(struct tcphdr);
+			nh_min_len = sizeof(struct ipv6hdr);
 			break;
 		case VIRTIO_NET_HDR_GSO_UDP:
 			gso_type = SKB_GSO_UDP;
@@ -100,7 +104,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 		if (!skb_partial_csum_set(skb, start, off))
 			return -EINVAL;
 
-		p_off = skb_transport_offset(skb) + thlen;
+		nh_min_len = max_t(u32, nh_min_len, skb_transport_offset(skb));
+		p_off = nh_min_len + thlen;
 		if (!pskb_may_pull(skb, p_off))
 			return -EINVAL;
 	} else {
@@ -140,7 +145,7 @@ retry:
 
 			skb_set_transport_header(skb, keys.control.thoff);
 		} else if (gso_type) {
-			p_off = thlen;
+			p_off = nh_min_len + thlen;
 			if (!pskb_may_pull(skb, p_off))
 				return -EINVAL;
 		}
-- 
cgit v1.2.3


From 34d946b723b53488ab39d8ac540ddf9db255317a Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 9 Jan 2024 00:25:48 -0500
Subject: i3c: master: fix kernel-doc check warning

Fix warning found by
	'scripts/kernel-doc -v -none include/linux/i3c/master.h'

include/linux/i3c/master.h:457: warning: Function parameter or member 'enable_hotjoin' not described in 'i3c_master_controller_ops'
include/linux/i3c/master.h:457: warning: Function parameter or member 'disable_hotjoin' not described in 'i3c_master_controller_ops'
include/linux/i3c/master.h:499: warning: Function parameter or member 'hotjoin' not described in 'i3c_master_controller'

Signed-off-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20240109052548.2128133-1-Frank.Li@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/master.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 1ecd73b17ff5..0ca27dd86956 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -433,6 +433,8 @@ struct i3c_bus {
  *		      for a future IBI
  *		      This method is mandatory only if ->request_ibi is not
  *		      NULL.
+ * @enable_hotjoin: enable hot join event detect.
+ * @disable_hotjoin: disable hot join event detect.
  */
 struct i3c_master_controller_ops {
 	int (*bus_init)(struct i3c_master_controller *master);
@@ -474,6 +476,7 @@ struct i3c_master_controller_ops {
  * @ops: master operations. See &struct i3c_master_controller_ops
  * @secondary: true if the master is a secondary master
  * @init_done: true when the bus initialization is done
+ * @hotjoin: true if the master support hotjoin
  * @boardinfo.i3c: list of I3C  boardinfo objects
  * @boardinfo.i2c: list of I2C boardinfo objects
  * @boardinfo: board-level information attached to devices connected on the bus
-- 
cgit v1.2.3


From aaefabc4a5f7ae48682c4d2d5d10faaf95c08eb9 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Tue, 7 Nov 2023 10:44:41 +0800
Subject: ceph: try to allocate a smaller extent map for sparse read

In fscrypt case and for a smaller read length we can predict the
max count of the extent map. And for small read length use cases
this could save some memories.

[ idryomov: squash into a single patch to avoid build break, drop
  redundant variable in ceph_alloc_sparse_ext_map() ]

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index b8610e9d2471..fa018d5864e7 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -572,9 +572,12 @@ int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt);
  */
 #define CEPH_SPARSE_EXT_ARRAY_INITIAL  16
 
-static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op)
+static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt)
 {
-	return __ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL);
+	if (!cnt)
+		cnt = CEPH_SPARSE_EXT_ARRAY_INITIAL;
+
+	return __ceph_alloc_sparse_ext_map(op, cnt);
 }
 
 extern void ceph_osdc_get_request(struct ceph_osd_request *req);
-- 
cgit v1.2.3


From 832b371097eb928d077c827b8f117bf5b99d35c0 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Mon, 15 Jan 2024 16:05:26 +0100
Subject: gpiolib: Fix scope-based gpio_device refcounting

Commit 9e4555d1e54a ("gpiolib: add support for scope-based management to
gpio_device") sought to add scope-based gpio_device refcounting, but
erroneously forgot a negation of IS_ERR_OR_NULL().

As a result, gpio_device_put() is not called if the gpio_device pointer
is valid (meaning the ref is leaked), but only called if the pointer is
NULL or an ERR_PTR().

While at it drop a superfluous trailing semicolon.

Fixes: 9e4555d1e54a ("gpiolib: add support for scope-based management to gpio_device")
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index e846bd4e7559..9a5c6c76e653 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -635,7 +635,7 @@ struct gpio_device *gpio_device_get(struct gpio_device *gdev);
 void gpio_device_put(struct gpio_device *gdev);
 
 DEFINE_FREE(gpio_device_put, struct gpio_device *,
-	    if (IS_ERR_OR_NULL(_T)) gpio_device_put(_T));
+	    if (!IS_ERR_OR_NULL(_T)) gpio_device_put(_T))
 
 struct device *gpio_device_to_device(struct gpio_device *gdev);
 
-- 
cgit v1.2.3


From 2db6b72c989763e30fab83b186e9263fece26bc6 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 6 Dec 2023 12:17:51 -0600
Subject: PCI: Fix kernel-doc issues

Fix kernel-doc issues reported by
"find include -name \*pci\* | xargs scripts/kernel-doc -none":

  include/linux/pci.h:731: warning: Function parameter or member 'pdev' not described in 'pci_is_vga'
  include/linux/pci-epc.h:154: warning: Function parameter or member 'list_lock' not described in 'pci_epc'
  include/linux/pci-epf.h:83: warning: expecting prototype for struct pci_epf_event_ops. Prototype was for struct pci_epc_event_ops instead

Link: https://lore.kernel.org/r/20240111162850.2177655-1-helgaas@kernel.org
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Sui Jingfeng <suijingfeng@loongson.cn>
---
 include/linux/pci-epc.h | 2 +-
 include/linux/pci-epf.h | 2 +-
 include/linux/pci.h     | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 5cb694031072..bfe41b03b70c 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -122,7 +122,7 @@ struct pci_epc_mem {
  * struct pci_epc - represents the PCI EPC device
  * @dev: PCI EPC device
  * @pci_epf: list of endpoint functions present in this EPC device
- * list_lock: Mutex for protecting pci_epf list
+ * @list_lock: Mutex for protecting pci_epf list
  * @ops: function pointers for performing endpoint operations
  * @windows: array of address space of the endpoint controller
  * @mem: first window of the endpoint controller, which corresponds to
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 3f44b6aec477..92d0b71d33d7 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -68,7 +68,7 @@ struct pci_epf_ops {
 };
 
 /**
- * struct pci_epf_event_ops - Callbacks for capturing the EPC events
+ * struct pci_epc_event_ops - Callbacks for capturing the EPC events
  * @core_init: Callback for the EPC initialization complete event
  * @link_up: Callback for the EPC link up event
  * @link_down: Callback for the EPC link down event
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 1a89dc66f89a..eb45087d7e00 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -715,6 +715,7 @@ static inline bool pci_is_bridge(struct pci_dev *dev)
 
 /**
  * pci_is_vga - check if the PCI device is a VGA device
+ * @pdev: PCI device
  *
  * The PCI Code and ID Assignment spec, r1.15, secs 1.4 and 1.1, define
  * VGA Base Class and Sub-Classes:
-- 
cgit v1.2.3


From 5f4c01f1e3c7b0c8d1e5dd6f080531de7aa5e47b Mon Sep 17 00:00:00 2001
From: Leonardo Bras <leobras@redhat.com>
Date: Mon, 15 Jan 2024 17:19:34 -0300
Subject: spinlock: Fix failing build for PREEMPT_RT

Since 1d71b30e1f85 ("sched.h: Move (spin|rwlock)_needbreak() to
spinlock.h") build fails for PREEMPT_RT, since there is no definition
available of either spin_needbreak() and rwlock_needbreak().

Since it was moved on the mentioned commit, it was placed inside a
!PREEMPT_RT part of the code, making it out of reach for an RT kernel.

Fix this by moving code it a few lines down so it can be reached by an
RT build, where it can also make use of the *_is_contended() definition
added by the spinlock_rt.h.

Fixes: d1d71b30e1f85 ("sched.h: Move (spin|rwlock)_needbreak() to
spinlock.h")
Signed-off-by: Leonardo Bras <leobras@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Acked-by: Waiman Long <longman@redhat.com>
---
 include/linux/spinlock.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 0c71f06454d9..b5c59fdad160 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -449,6 +449,12 @@ static __always_inline int spin_is_contended(spinlock_t *lock)
 	return raw_spin_is_contended(&lock->rlock);
 }
 
+#define assert_spin_locked(lock)	assert_raw_spin_locked(&(lock)->rlock)
+
+#else  /* !CONFIG_PREEMPT_RT */
+# include <linux/spinlock_rt.h>
+#endif /* CONFIG_PREEMPT_RT */
+
 /*
  * Does a critical section need to be broken due to another
  * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
@@ -480,12 +486,6 @@ static inline int rwlock_needbreak(rwlock_t *lock)
 #endif
 }
 
-#define assert_spin_locked(lock)	assert_raw_spin_locked(&(lock)->rlock)
-
-#else  /* !CONFIG_PREEMPT_RT */
-# include <linux/spinlock_rt.h>
-#endif /* CONFIG_PREEMPT_RT */
-
 /*
  * Pull the atomic_t declaration:
  * (asm-mips/atomic.h needs above definitions)
-- 
cgit v1.2.3


From 7bed6f3d08b7af27b7015da8dc3acf2b9c1f21d7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 16 Jan 2024 21:29:59 +0000
Subject: block: Fix iterating over an empty bio with bio_for_each_folio_all

If the bio contains no data, bio_first_folio() calls page_folio() on a
NULL pointer and oopses.  Move the test that we've reached the end of
the bio from bio_next_folio() to bio_first_folio().

Reported-by: syzbot+8b23309d5788a79d3eea@syzkaller.appspotmail.com
Reported-by: syzbot+004c1e0fced2b4bc3dcc@syzkaller.appspotmail.com
Fixes: 640d1930bef4 ("block: Add bio_for_each_folio_all()")
Cc: stable@vger.kernel.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://lore.kernel.org/r/20240116212959.3413014-1-willy@infradead.org
[axboe: add unlikely() to error case]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index ec4db73e5f4e..875d792bffff 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -286,6 +286,11 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio,
 {
 	struct bio_vec *bvec = bio_first_bvec_all(bio) + i;
 
+	if (unlikely(i >= bio->bi_vcnt)) {
+		fi->folio = NULL;
+		return;
+	}
+
 	fi->folio = page_folio(bvec->bv_page);
 	fi->offset = bvec->bv_offset +
 			PAGE_SIZE * (bvec->bv_page - &fi->folio->page);
@@ -303,10 +308,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio)
 		fi->offset = 0;
 		fi->length = min(folio_size(fi->folio), fi->_seg_count);
 		fi->_next = folio_next(fi->folio);
-	} else if (fi->_i + 1 < bio->bi_vcnt) {
-		bio_first_folio(fi, bio, fi->_i + 1);
 	} else {
-		fi->folio = NULL;
+		bio_first_folio(fi, bio, fi->_i + 1);
 	}
 }
 
-- 
cgit v1.2.3


From a54e72197037d2c9bfcd70dddaac8c8ccb5b41ba Mon Sep 17 00:00:00 2001
From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Date: Thu, 11 Jan 2024 23:06:39 +0800
Subject: netfilter: propagate net to nf_bridge_get_physindev

This is a preparation patch for replacing physindev with physinif on
nf_bridge_info structure. We will use dev_get_by_index_rcu to resolve
device, when needed, and it requires net to be available.

Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index f980edfdd278..e927b9a15a55 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -56,7 +56,7 @@ static inline int nf_bridge_get_physoutif(const struct sk_buff *skb)
 }
 
 static inline struct net_device *
-nf_bridge_get_physindev(const struct sk_buff *skb)
+nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net)
 {
 	const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 
-- 
cgit v1.2.3


From 9874808878d9eed407e3977fd11fee49de1e1d86 Mon Sep 17 00:00:00 2001
From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Date: Thu, 11 Jan 2024 23:06:40 +0800
Subject: netfilter: bridge: replace physindev with physinif in nf_bridge_info

An skb can be added to a neigh->arp_queue while waiting for an arp
reply. Where original skb's skb->dev can be different to neigh's
neigh->dev. For instance in case of bridging dnated skb from one veth to
another, the skb would be added to a neigh->arp_queue of the bridge.

As skb->dev can be reset back to nf_bridge->physindev and used, and as
there is no explicit mechanism that prevents this physindev from been
freed under us (for instance neigh_flush_dev doesn't cleanup skbs from
different device's neigh queue) we can crash on e.g. this stack:

arp_process
  neigh_update
    skb = __skb_dequeue(&neigh->arp_queue)
      neigh_resolve_output(..., skb)
        ...
          br_nf_dev_xmit
            br_nf_pre_routing_finish_bridge_slow
              skb->dev = nf_bridge->physindev
              br_handle_frame_finish

Let's use plain ifindex instead of net_device link. To peek into the
original net_device we will use dev_get_by_index_rcu(). Thus either we
get device and are safe to use it or we don't get it and drop skb.

Fixes: c4e70a87d975 ("netfilter: bridge: rename br_netfilter.c to br_netfilter_hooks.c")
Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge.h | 4 ++--
 include/linux/skbuff.h           | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index e927b9a15a55..743475ca7e9d 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -42,7 +42,7 @@ static inline int nf_bridge_get_physinif(const struct sk_buff *skb)
 	if (!nf_bridge)
 		return 0;
 
-	return nf_bridge->physindev ? nf_bridge->physindev->ifindex : 0;
+	return nf_bridge->physinif;
 }
 
 static inline int nf_bridge_get_physoutif(const struct sk_buff *skb)
@@ -60,7 +60,7 @@ nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net)
 {
 	const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 
-	return nf_bridge ? nf_bridge->physindev : NULL;
+	return nf_bridge ? dev_get_by_index_rcu(net, nf_bridge->physinif) : NULL;
 }
 
 static inline struct net_device *
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a5ae952454c8..2dde34c29203 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -295,7 +295,7 @@ struct nf_bridge_info {
 	u8			bridged_dnat:1;
 	u8			sabotage_in_done:1;
 	__u16			frag_max_size;
-	struct net_device	*physindev;
+	int			physinif;
 
 	/* always valid & non-NULL from FORWARD on, for physdev match */
 	struct net_device	*physoutdev;
-- 
cgit v1.2.3


From 49e60333d743ae32db3bdde2f93bc818482dd741 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 17 Jan 2024 12:36:09 -0800
Subject: blk-mq: Remove the hctx 'run' debugfs attribute

Nobody uses the debugfs hctx 'run' attribute. Hence remove this
attribute and also the code that updates the corresponding member
variable.

Suggested-by: Jens Axboe <axboe@kernel.dk>
Cc: Gabriel Ryan <gabe@cs.columbia.edu>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20240117203609.4122520-1-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a676e116085f..7a8150a5f051 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -391,9 +391,6 @@ struct blk_mq_hw_ctx {
 	 */
 	struct blk_mq_tags	*sched_tags;
 
-	/** @run: Number of dispatched requests. */
-	unsigned long		run;
-
 	/** @numa_node: NUMA node the storage adapter has been connected to. */
 	unsigned int		numa_node;
 	/** @queue_num: Index of this hardware queue. */
-- 
cgit v1.2.3


From 66967a32d3b16ed447e76fed4d946bab52e43d86 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 17 Jan 2024 19:31:40 -0800
Subject: bpf: extract bpf_ctx_convert_map logic and make it more reusable

Refactor btf_get_prog_ctx_type() a bit to allow reuse of
bpf_ctx_convert_map logic in more than one places. Simplify interface by
returning btf_type instead of btf_member (field reference in BTF).

To do the above we need to touch and start untangling
btf_translate_to_vmlinux() implementation. We do the bare minimum to
not regress anything for btf_translate_to_vmlinux(), but its
implementation is very questionable for what it claims to be doing.
Mapping kfunc argument types to kernel corresponding types conceptually
is quite different from recognizing program context types. Fixing this
is out of scope for this change though.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240118033143.3384355-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 59d404e22814..cf5c6ff48981 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -512,7 +512,7 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id);
 int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
 				struct module *owner);
 struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id);
-const struct btf_member *
+const struct btf_type *
 btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
 		      const struct btf_type *t, enum bpf_prog_type prog_type,
 		      int arg);
-- 
cgit v1.2.3


From 7a8e9cdf9405819105ae7405cd91e482bf574b01 Mon Sep 17 00:00:00 2001
From: Nathan Lynch <nathanl@linux.ibm.com>
Date: Tue, 16 Jan 2024 08:09:25 -0600
Subject: seq_buf: Make DECLARE_SEQ_BUF() usable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using the address operator on the array doesn't work:

./include/linux/seq_buf.h:27:27: error: initialization of ‘char *’
  from incompatible pointer type ‘char (*)[128]’
  [-Werror=incompatible-pointer-types]
   27 |                 .buffer = &__ ## NAME ## _buffer,       \
      |                           ^

Apart from fixing that, we can improve DECLARE_SEQ_BUF() by using a
compound literal to define the buffer array without attaching a name
to it. This makes the macro a single statement, allowing constructs
such as:

  static DECLARE_SEQ_BUF(my_seq_buf, MYSB_SIZE);

to work as intended.

Link: https://lkml.kernel.org/r/20240116-declare-seq-buf-fix-v1-1-915db4692f32@linux.ibm.com

Cc: stable@vger.kernel.org
Acked-by: Kees Cook <keescook@chromium.org>
Fixes: dcc4e5728eea ("seq_buf: Introduce DECLARE_SEQ_BUF and seq_buf_str()")
Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 5fb1f12c33f9..c44f4b47b945 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -22,9 +22,8 @@ struct seq_buf {
 };
 
 #define DECLARE_SEQ_BUF(NAME, SIZE)			\
-	char __ ## NAME ## _buffer[SIZE] = "";		\
 	struct seq_buf NAME = {				\
-		.buffer = &__ ## NAME ## _buffer,	\
+		.buffer = (char[SIZE]) { 0 },		\
 		.size = SIZE,				\
 	}
 
-- 
cgit v1.2.3


From b60db383e2ba64a18e49b6bef3be1ab18aa159f1 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 23 Nov 2023 10:40:40 +0100
Subject: include/linux/i2c.h: remove I2C_CLASS_DDC support

After removal of the legacy EEPROM driver and I2C_CLASS_DDC support in
olpc_dcon there's no i2c client driver left supporting I2C_CLASS_DDC.
Class-based device auto-detection is a legacy mechanism and shouldn't
be used in new code. So we can remove this class completely now.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 include/linux/i2c.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 0dae9db27538..d029aade338f 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -850,7 +850,6 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap)
 
 /* i2c adapter classes (bitmask) */
 #define I2C_CLASS_HWMON		(1<<0)	/* lm_sensors, ... */
-#define I2C_CLASS_DDC		(1<<3)	/* DDC bus on graphics adapters */
 #define I2C_CLASS_SPD		(1<<7)	/* Memory modules */
 /* Warn users that the adapter doesn't support classes anymore */
 #define I2C_CLASS_DEPRECATED	(1<<8)
-- 
cgit v1.2.3


From 73febd775bdbdb98c81255ff85773ac410ded5c4 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Sun, 12 Nov 2023 17:54:41 -0500
Subject: i2c: create debugfs entry per adapter

Two drivers already implement custom debugfs handling for their
i2c_adapter and more will come. So, let the core create a debugfs
directory per adapter and pass that to drivers for their debugfs files.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 include/linux/i2c.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index d029aade338f..e01fb1097868 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -746,6 +746,8 @@ struct i2c_adapter {
 
 	struct irq_domain *host_notify_domain;
 	struct regulator *bus_regulator;
+
+	struct dentry *debugfs;
 };
 #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev)
 
-- 
cgit v1.2.3


From 94959c0e796e41128483588d133b9a7003b409f9 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 16:22:43 +0100
Subject: i2c: make i2c_bus_type const

Now that the driver core can properly handle constant struct bus_type,
move the i2c_bus_type variable to be a constant structure as well, placing
it into read-only memory which can not be modified at runtime.

Note, the sound/soc/rockchip/rk3399_gru_sound.c also needed tweaking as
it decided to save off a pointer to a bus type for internal stuff, and
it was using the i2c_bus_type as well.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 include/linux/i2c.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index e01fb1097868..652ecb7abeda 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -23,7 +23,7 @@
 #include <linux/swab.h>		/* for swab16 */
 #include <uapi/linux/i2c.h>
 
-extern struct bus_type i2c_bus_type;
+extern const struct bus_type i2c_bus_type;
 extern struct device_type i2c_adapter_type;
 extern struct device_type i2c_client_type;
 
-- 
cgit v1.2.3


From a8355235dbd571b32c750ee756dd6dac216d18f2 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 8 Nov 2023 07:38:07 +0100
Subject: i2c: mux: reg: Remove class-based device auto-detection support

Legacy class-based device auto-detection shouldn't be used in new code.
Therefore remove support in i2c-mux-reg as long as we don't have a
user of this feature yet.

Link: https://lore.kernel.org/linux-i2c/a22978a4-88e4-46f4-b71c-032b22321599@gmail.com/
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 include/linux/platform_data/i2c-mux-reg.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/i2c-mux-reg.h b/include/linux/platform_data/i2c-mux-reg.h
index 2543c2a1c9ae..e2e895768311 100644
--- a/include/linux/platform_data/i2c-mux-reg.h
+++ b/include/linux/platform_data/i2c-mux-reg.h
@@ -17,7 +17,6 @@
  * @n_values: Number of multiplexer channels
  * @little_endian: Indicating if the register is in little endian
  * @write_only: Reading the register is not allowed by hardware
- * @classes: Optional I2C auto-detection classes
  * @idle: Value to write to mux when idle
  * @idle_in_use: indicate if idle value is in use
  * @reg: Virtual address of the register to switch channel
@@ -30,7 +29,6 @@ struct i2c_mux_reg_platform_data {
 	int n_values;
 	bool little_endian;
 	bool write_only;
-	const unsigned int *classes;
 	u32 idle;
 	bool idle_in_use;
 	void __iomem *reg;
-- 
cgit v1.2.3


From ef175b29a242fea98f467f008237484b03c94834 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Fri, 15 Jan 2021 15:24:59 -0600
Subject: of: Stop circularly including of_device.h and of_platform.h

The DT of_device.h and of_platform.h headers date back to the separate
of_platform_bus_type before it was merged into the regular platform bus.
As part of that merge prepping Arm DT support 13 years ago, they
"temporarily" include each other. The headers also include
platform_device.h and of.h. The result was lots of drivers relied on
these implicit includes.

Now the entire tree has been fixed over the last couple of cycles to
explicitly include the necessary headers instead of relying on
of_device.h and/or of_platform.h implicit includes, so the implicit and
circular includes can finally be removed.

Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of_device.h   | 5 +----
 include/linux/of_platform.h | 4 ++--
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index a72661e47faa..9042bca5bb84 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -2,10 +2,7 @@
 #ifndef _LINUX_OF_DEVICE_H
 #define _LINUX_OF_DEVICE_H
 
-#include <linux/platform_device.h>
-#include <linux/of_platform.h> /* temporary until merge */
-
-#include <linux/of.h>
+#include <linux/device/driver.h>
 
 struct device;
 struct of_device_id;
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index fadfea575485..a2ff1ad48f7f 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -7,11 +7,11 @@
  */
 
 #include <linux/mod_devicetable.h>
-#include <linux/of_device.h>
-#include <linux/platform_device.h>
 
 struct device;
+struct device_node;
 struct of_device_id;
+struct platform_device;
 
 /**
  * struct of_dev_auxdata - lookup table entry for device names & platform_data
-- 
cgit v1.2.3


From d26270061ae66b915138af7cd73ca6f8b85e6b44 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 18 Jan 2024 12:31:55 -0800
Subject: string: Remove strlcpy()

With all the users of strlcpy() removed[1] from the kernel, remove the
API, self-tests, and other references. Leave mentions in Documentation
(about its deprecation), and in checkpatch.pl (to help migrate host-only
tools/ usage). Long live strscpy().

Link: https://github.com/KSPP/linux/issues/89 [1]
Cc: Azeem Shaikh <azeemshaikh38@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Joe Perches <joe@perches.com>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: linux-hardening@vger.kernel.org
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/fortify-string.h | 51 ------------------------------------------
 include/linux/string.h         |  3 ---
 2 files changed, 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index 79ef6ac4c021..89a6888f2f9e 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -214,51 +214,6 @@ __kernel_size_t __fortify_strlen(const char * const POS p)
 	return ret;
 }
 
-/* Defined after fortified strlen() to reuse it. */
-extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy);
-/**
- * strlcpy - Copy a string into another string buffer
- *
- * @p: pointer to destination of copy
- * @q: pointer to NUL-terminated source string to copy
- * @size: maximum number of bytes to write at @p
- *
- * If strlen(@q) >= @size, the copy of @q will be truncated at
- * @size - 1 bytes. @p will always be NUL-terminated.
- *
- * Do not use this function. While FORTIFY_SOURCE tries to avoid
- * over-reads when calculating strlen(@q), it is still possible.
- * Prefer strscpy(), though note its different return values for
- * detecting truncation.
- *
- * Returns total number of bytes written to @p, including terminating NUL.
- *
- */
-__FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, size_t size)
-{
-	const size_t p_size = __member_size(p);
-	const size_t q_size = __member_size(q);
-	size_t q_len;	/* Full count of source string length. */
-	size_t len;	/* Count of characters going into destination. */
-
-	if (p_size == SIZE_MAX && q_size == SIZE_MAX)
-		return __real_strlcpy(p, q, size);
-	q_len = strlen(q);
-	len = (q_len >= size) ? size - 1 : q_len;
-	if (__builtin_constant_p(size) && __builtin_constant_p(q_len) && size) {
-		/* Write size is always larger than destination. */
-		if (len >= p_size)
-			__write_overflow();
-	}
-	if (size) {
-		if (len >= p_size)
-			fortify_panic(__func__);
-		__underlying_memcpy(p, q, len);
-		p[len] = '\0';
-	}
-	return q_len;
-}
-
 /* Defined after fortified strnlen() to reuse it. */
 extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy);
 /**
@@ -272,12 +227,6 @@ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy);
  * @p buffer. The behavior is undefined if the string buffers overlap. The
  * destination @p buffer is always NUL terminated, unless it's zero-sized.
  *
- * Preferred to strlcpy() since the API doesn't require reading memory
- * from the source @q string beyond the specified @size bytes, and since
- * the return value is easier to error-check than strlcpy()'s.
- * In addition, the implementation is robust to the string changing out
- * from underneath it, unlike the current strlcpy() implementation.
- *
  * Preferred to strncpy() since it always returns a valid string, and
  * doesn't unnecessarily force the tail of the destination buffer to be
  * zero padded. If padding is desired please use strscpy_pad().
diff --git a/include/linux/string.h b/include/linux/string.h
index ce137830a0b9..ab148d8dbfc1 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -66,9 +66,6 @@ extern char * strcpy(char *,const char *);
 #ifndef __HAVE_ARCH_STRNCPY
 extern char * strncpy(char *,const char *, __kernel_size_t);
 #endif
-#ifndef __HAVE_ARCH_STRLCPY
-size_t strlcpy(char *, const char *, size_t);
-#endif
 #ifndef __HAVE_ARCH_STRSCPY
 ssize_t strscpy(char *, const char *, size_t);
 #endif
-- 
cgit v1.2.3


From a54d51fb2dfb846aedf3751af501e9688db447f5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 18 Jan 2024 20:17:49 +0000
Subject: udp: fix busy polling

Generic sk_busy_loop_end() only looks at sk->sk_receive_queue
for presence of packets.

Problem is that for UDP sockets after blamed commit, some packets
could be present in another queue: udp_sk(sk)->reader_queue

In some cases, a busy poller could spin until timeout expiration,
even if some packets are available in udp_sk(sk)->reader_queue.

v3: - make sk_busy_loop_end() nicer (Willem)

v2: - add a READ_ONCE(sk->sk_family) in sk_is_inet() to avoid KCSAN splats.
    - add a sk_is_inet() check in sk_is_udp() (Willem feedback)
    - add a sk_is_inet() check in sk_is_tcp().

Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skmsg.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 888a4b217829..e65ec3fd2799 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -505,12 +505,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 	return !!psock->saved_data_ready;
 }
 
-static inline bool sk_is_udp(const struct sock *sk)
-{
-	return sk->sk_type == SOCK_DGRAM &&
-	       sk->sk_protocol == IPPROTO_UDP;
-}
-
 #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
 
 #define BPF_F_STRPARSER	(1UL << 1)
-- 
cgit v1.2.3


From 73ae7e1c7644a8c33ba526302a10267cdbc249f8 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <cassel@kernel.org>
Date: Thu, 11 Jan 2024 17:57:44 +0100
Subject: ata: libata-sata: improve sysfs description for ATA_LPM_UNKNOWN

Currently, both ATA_LPM_UNKNOWN (0) and ATA_LPM_MAX_POWER (1) displays
as "max_performance" in sysfs.

This is quite misleading as they are not the same.

For ATA_LPM_UNKNOWN, ata_eh_set_lpm() will not be called at all,
leaving the configuration in unknown state.
For ATA_LPM_MAX_POWER, ata_eh_set_lpm() is called, and setting the
policy to ATA_LPM_MAX_POWER.

This also matches the description of the SATA_MOBILE_LPM_POLICY Kconfig:
0 => Keep firmware settings
1 => Maximum performance

Thus, update the sysfs description for ATA_LPM_UNKNOWN to match reality.

While at it, update libata.h to mention that the ascii descriptions
are in libata-sata.c and not in libata-scsi.c.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 1dbb14daccfa..26d68115afb8 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -471,7 +471,7 @@ enum ata_completion_errors {
 
 /*
  * Link power management policy: If you alter this, you also need to
- * alter libata-scsi.c (for the ascii descriptions)
+ * alter libata-sata.c (for the ascii descriptions)
  */
 enum ata_lpm_policy {
 	ATA_LPM_UNKNOWN,
-- 
cgit v1.2.3


From 41353fbad4f551e82c2792f7e82ac225c79cc710 Mon Sep 17 00:00:00 2001
From: Guixin Liu <kanie@linux.alibaba.com>
Date: Thu, 18 Jan 2024 20:51:45 +0800
Subject: nvmet: unify aer type enum

The host and target use two definition of aer type, unify
them into a single one.

Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 462c21e0e417..68eff8c86ce3 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -816,12 +816,6 @@ struct nvme_reservation_status_ext {
 	struct nvme_registered_ctrl_ext regctl_eds[];
 };
 
-enum nvme_async_event_type {
-	NVME_AER_TYPE_ERROR	= 0,
-	NVME_AER_TYPE_SMART	= 1,
-	NVME_AER_TYPE_NOTICE	= 2,
-};
-
 /* I/O commands */
 
 enum nvme_opcode {
-- 
cgit v1.2.3


From 25461ce8b3d28528f2c55f5e737e99d2906eda83 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@nvidia.com>
Date: Fri, 15 Dec 2023 19:31:14 -0800
Subject: net/mlx5e: Use the correct lag ports number when creating TISes

The cited commit moved the code of mlx5e_create_tises() and changed the
loop to create TISes over MLX5_MAX_PORTS constant value, instead of
getting the correct lag ports supported by the device, which can cause
FW errors on devices with less than MLX5_MAX_PORTS ports.

Change that back to mlx5e_get_num_lag_ports(mdev).

Also IPoIB interfaces create there own TISes, they don't use the eth
TISes, pass a flag to indicate that.

This fixes the following errors that might appear in kernel log:
mlx5_cmd_out_err:808:(pid 650): CREATE_TIS(0x912) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x595b5d), err(-22)
mlx5e_create_mdev_resources:174:(pid 650): alloc tises failed, -22

Fixes: b25bd37c859f ("net/mlx5: Move TISes from priv to mdev HW resources")
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 8c55ff351e5f..41f03b352401 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -681,6 +681,7 @@ struct mlx5e_resources {
 		struct mlx5_sq_bfreg       bfreg;
 #define MLX5_MAX_NUM_TC 8
 		u32                        tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC];
+		bool			   tisn_valid;
 	} hw_objs;
 	struct net_device *uplink_netdev;
 	struct mutex uplink_netdev_lock;
-- 
cgit v1.2.3


From cfbc3608a8c69b48bf238bd68f768192f0238e0d Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Tue, 19 Dec 2023 14:46:20 +0200
Subject: net/mlx5: Fix query of sd_group field

The sd_group field moved in the HW spec from the MPIR register
to the vport context.
Align the query accordingly.

Fixes: f5e956329960 ("net/mlx5: Expose Management PCIe Index Register (MPIR)")
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 10 +++++++---
 include/linux/mlx5/vport.h    |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index bf5320b28b8b..37230253f9f1 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4036,8 +4036,13 @@ struct mlx5_ifc_nic_vport_context_bits {
 	u8	   affiliation_criteria[0x4];
 	u8	   affiliated_vhca_id[0x10];
 
-	u8	   reserved_at_60[0xd0];
+	u8	   reserved_at_60[0xa0];
 
+	u8	   reserved_at_100[0x1];
+	u8         sd_group[0x3];
+	u8	   reserved_at_104[0x1c];
+
+	u8	   reserved_at_120[0x10];
 	u8         mtu[0x10];
 
 	u8         system_image_guid[0x40];
@@ -10122,8 +10127,7 @@ struct mlx5_ifc_mpir_reg_bits {
 	u8         reserved_at_20[0x20];
 
 	u8         local_port[0x8];
-	u8         reserved_at_28[0x15];
-	u8         sd_group[0x3];
+	u8         reserved_at_28[0x18];
 
 	u8         reserved_at_60[0x20];
 };
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index fbb9bf447889..c36cc6d82926 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -72,6 +72,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu);
 int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu);
 int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
 					   u64 *system_image_guid);
+int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group);
 int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid);
 int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev,
 				    u16 vport, u64 node_guid);
-- 
cgit v1.2.3


From ec7cc38ef9f83553102e84c82536971a81630739 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Sat, 30 Dec 2023 22:40:37 +0200
Subject: net/mlx5: Bridge, fix multicast packets sent to uplink

To enable multicast packets which are offloaded in bridge multicast
offload mode to be sent also to uplink, FTE bit uplink_hairpin_en should
be set. Add this bit to FTE for the bridge multicast offload rules.

Fixes: 18c2916cee12 ("net/mlx5: Bridge, snoop igmp/mld packets")
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/fs.h       | 1 +
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 6f7725238abc..3fb428ce7d1c 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -132,6 +132,7 @@ struct mlx5_flow_handle;
 
 enum {
 	FLOW_CONTEXT_HAS_TAG = BIT(0),
+	FLOW_CONTEXT_UPLINK_HAIRPIN_EN = BIT(1),
 };
 
 struct mlx5_flow_context {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 37230253f9f1..c726f90ab752 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -3576,7 +3576,7 @@ struct mlx5_ifc_flow_context_bits {
 	u8         action[0x10];
 
 	u8         extended_destination[0x1];
-	u8         reserved_at_81[0x1];
+	u8         uplink_hairpin_en[0x1];
 	u8         flow_source[0x2];
 	u8         encrypt_decrypt_type[0x4];
 	u8         destination_list_size[0x18];
-- 
cgit v1.2.3


From 56062d60f117dccfb5281869e0ab61e090baf864 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <rpalethorpe@suse.com>
Date: Wed, 10 Jan 2024 15:01:22 +0200
Subject: x86/entry/ia32: Ensure s32 is sign extended to s64

Presently ia32 registers stored in ptregs are unconditionally cast to
unsigned int by the ia32 stub. They are then cast to long when passed to
__se_sys*, but will not be sign extended.

This takes the sign of the syscall argument into account in the ia32
stub. It still casts to unsigned int to avoid implementation specific
behavior. However then casts to int or unsigned int as necessary. So that
the following cast to long sign extends the value.

This fixes the io_pgetevents02 LTP test when compiled with -m32. Presently
the systemcall io_pgetevents_time64() unexpectedly accepts -1 for the
maximum number of events.

It doesn't appear other systemcalls with signed arguments are effected
because they all have compat variants defined and wired up.

Fixes: ebeb8c82ffaf ("syscalls/x86: Use 'struct pt_regs' based syscall calling for IA32_EMULATION and x32")
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Richard Palethorpe <rpalethorpe@suse.com>
Signed-off-by: Nikolay Borisov <nik.borisov@suse.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20240110130122.3836513-1-nik.borisov@suse.com
Link: https://lore.kernel.org/ltp/20210921130127.24131-1-rpalethorpe@suse.com/
---
 include/linux/syscalls.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index cdba4d0c6d4a..77eb9b0e7685 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -128,6 +128,7 @@ struct mnt_id_req;
 #define __TYPE_IS_LL(t) (__TYPE_AS(t, 0LL) || __TYPE_AS(t, 0ULL))
 #define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a
 #define __SC_CAST(t, a)	(__force t) a
+#define __SC_TYPE(t, a)	t
 #define __SC_ARGS(t, a)	a
 #define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long))
 
-- 
cgit v1.2.3


From 2f8c7c3715f2c6fb51a4ecc0905c04dd78a3da29 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 24 Jan 2024 13:24:24 +0000
Subject: spi: Raise limit on number of chip selects

As reported by Guenter the limit we've got on the number of chip selects is
set too low for some systems, raise the limit. We should really remove the
hard coded limit but this is needed as a fix so let's do the simple thing
and raise the limit for now.

Fixes: 4d8ff6b0991d ("spi: Add multi-cs memories support in SPI core")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Suggested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://msgid.link/r/20240124-spi-multi-cs-max-v2-1-df6fc5ab1abc@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 471fe2ff9066..600fbd5daf68 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -21,7 +21,7 @@
 #include <uapi/linux/spi/spi.h>
 
 /* Max no. of CS supported per spi device */
-#define SPI_CS_CNT_MAX 4
+#define SPI_CS_CNT_MAX 16
 
 struct dma_chan;
 struct software_node;
-- 
cgit v1.2.3


From 90383cc07895183c75a0db2460301c2ffd912359 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 24 Jan 2024 11:15:33 -0800
Subject: exec: Distinguish in_execve from in_exec

Just to help distinguish the fs->in_exec flag from the current->in_execve
flag, add comments in check_unsafe_exec() and copy_fs() for more
context. Also note that in_execve is only used by TOMOYO now.

Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-mm@kvack.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cdb8ea53c365..ffe8f618ab86 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -920,7 +920,7 @@ struct task_struct {
 	unsigned			sched_rt_mutex:1;
 #endif
 
-	/* Bit to tell LSMs we're in execve(): */
+	/* Bit to tell TOMOYO we're in execve(): */
 	unsigned			in_execve:1;
 	unsigned			in_iowait:1;
 #ifndef TIF_RESTORE_SIGMASK
-- 
cgit v1.2.3


From c4608d1bf7c6536d1a3d233eb21e50678681564e Mon Sep 17 00:00:00 2001
From: Yang Shi <yang@os.amperecomputing.com>
Date: Wed, 20 Dec 2023 22:59:43 -0800
Subject: mm: mmap: map MAP_STACK to VM_NOHUGEPAGE

commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP
boundaries") incured regression for stress-ng pthread benchmark [1].  It
is because THP get allocated to pthread's stack area much more possible
than before.  Pthread's stack area is allocated by mmap without
VM_GROWSDOWN or VM_GROWSUP flag, so kernel can't tell whether it is a
stack area or not.

The MAP_STACK flag is used to mark the stack area, but it is a no-op on
Linux.  Mapping MAP_STACK to VM_NOHUGEPAGE to prevent from allocating THP
for such stack area.

With this change the stack area looks like:

fffd18e10000-fffd19610000 rw-p 00000000 00:00 0
Size:               8192 kB
KernelPageSize:        4 kB
MMUPageSize:           4 kB
Rss:                  12 kB
Pss:                  12 kB
Pss_Dirty:            12 kB
Shared_Clean:          0 kB
Shared_Dirty:          0 kB
Private_Clean:         0 kB
Private_Dirty:        12 kB
Referenced:           12 kB
Anonymous:            12 kB
KSM:                   0 kB
LazyFree:              0 kB
AnonHugePages:         0 kB
ShmemPmdMapped:        0 kB
FilePmdMapped:         0 kB
Shared_Hugetlb:        0 kB
Private_Hugetlb:       0 kB
Swap:                  0 kB
SwapPss:               0 kB
Locked:                0 kB
THPeligible:           0
VmFlags: rd wr mr mw me ac nh

The "nh" flag is set.

[1] https://lore.kernel.org/linux-mm/202312192310.56367035-oliver.sang@intel.com/

Link: https://lkml.kernel.org/r/20231221065943.2803551-2-shy828301@gmail.com
Fixes: efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries")
Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
Reported-by: kernel test robot <oliver.sang@intel.com>
Tested-by: Oliver Sang <oliver.sang@intel.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christopher Lameter <cl@linux.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: <stable@vger.kerenl.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mman.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mman.h b/include/linux/mman.h
index 40d94411d492..dc7048824be8 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -156,6 +156,7 @@ calc_vm_flag_bits(unsigned long flags)
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
 	       _calc_vm_trans(flags, MAP_SYNC,	     VM_SYNC      ) |
+	       _calc_vm_trans(flags, MAP_STACK,	     VM_NOHUGEPAGE) |
 	       arch_calc_vm_flag_bits(flags);
 }
 
-- 
cgit v1.2.3


From f6564fce256a3944aa1bc76cb3c40e792d97c1eb Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 18 Jan 2024 11:59:14 +0100
Subject: mm, kmsan: fix infinite recursion due to RCU critical section

Alexander Potapenko writes in [1]: "For every memory access in the code
instrumented by KMSAN we call kmsan_get_metadata() to obtain the metadata
for the memory being accessed.  For virtual memory the metadata pointers
are stored in the corresponding `struct page`, therefore we need to call
virt_to_page() to get them.

According to the comment in arch/x86/include/asm/page.h,
virt_to_page(kaddr) returns a valid pointer iff virt_addr_valid(kaddr) is
true, so KMSAN needs to call virt_addr_valid() as well.

To avoid recursion, kmsan_get_metadata() must not call instrumented code,
therefore ./arch/x86/include/asm/kmsan.h forks parts of
arch/x86/mm/physaddr.c to check whether a virtual address is valid or not.

But the introduction of rcu_read_lock() to pfn_valid() added instrumented
RCU API calls to virt_to_page_or_null(), which is called by
kmsan_get_metadata(), so there is an infinite recursion now.  I do not
think it is correct to stop that recursion by doing
kmsan_enter_runtime()/kmsan_exit_runtime() in kmsan_get_metadata(): that
would prevent instrumented functions called from within the runtime from
tracking the shadow values, which might introduce false positives."

Fix the issue by switching pfn_valid() to the _sched() variant of
rcu_read_lock/unlock(), which does not require calling into RCU.  Given
the critical section in pfn_valid() is very small, this is a reasonable
trade-off (with preemptible RCU).

KMSAN further needs to be careful to suppress calls into the scheduler,
which would be another source of recursion.  This can be done by wrapping
the call to pfn_valid() into preempt_disable/enable_no_resched().  The
downside is that this sacrifices breaking scheduling guarantees; however,
a kernel compiled with KMSAN has already given up any performance
guarantees due to being heavily instrumented.

Note, KMSAN code already disables tracing via Makefile, and since mmzone.h
is included, it is not necessary to use the notrace variant, which is
generally preferred in all other cases.

Link: https://lkml.kernel.org/r/20240115184430.2710652-1-glider@google.com [1]
Link: https://lkml.kernel.org/r/20240118110022.2538350-1-elver@google.com
Fixes: 5ec8e8ea8b77 ("mm/sparsemem: fix race in accessing memory_section->usage")
Signed-off-by: Marco Elver <elver@google.com>
Reported-by: Alexander Potapenko <glider@google.com>
Reported-by: syzbot+93a9e8a3dea8d6085e12@syzkaller.appspotmail.com
Reviewed-by: Alexander Potapenko <glider@google.com>
Tested-by: Alexander Potapenko <glider@google.com>
Cc: Charan Teja Kalla <quic_charante@quicinc.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4ed33b127821..a497f189d988 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -2013,9 +2013,9 @@ static inline int pfn_valid(unsigned long pfn)
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
 		return 0;
 	ms = __pfn_to_section(pfn);
-	rcu_read_lock();
+	rcu_read_lock_sched();
 	if (!valid_section(ms)) {
-		rcu_read_unlock();
+		rcu_read_unlock_sched();
 		return 0;
 	}
 	/*
@@ -2023,7 +2023,7 @@ static inline int pfn_valid(unsigned long pfn)
 	 * the entire section-sized span.
 	 */
 	ret = early_section(ms) || pfn_section_valid(ms, pfn);
-	rcu_read_unlock();
+	rcu_read_unlock_sched();
 
 	return ret;
 }
-- 
cgit v1.2.3


From a22fe1d6dec7e98535b97249fdc95c2be79120bb Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 23 Jan 2024 12:28:41 -0500
Subject: dmaengine: fix is_slave_direction() return false when DMA_DEV_TO_DEV

is_slave_direction() should return true when direction is DMA_DEV_TO_DEV.

Fixes: 49920bc66984 ("dmaengine: add new enum dma_transfer_direction")
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20240123172842.3764529-1-Frank.Li@nxp.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dmaengine.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 3df70d6131c8..752dbde4cec1 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -953,7 +953,8 @@ static inline int dmaengine_slave_config(struct dma_chan *chan,
 
 static inline bool is_slave_direction(enum dma_transfer_direction direction)
 {
-	return (direction == DMA_MEM_TO_DEV) || (direction == DMA_DEV_TO_MEM);
+	return (direction == DMA_MEM_TO_DEV) || (direction == DMA_DEV_TO_MEM) ||
+	       (direction == DMA_DEV_TO_DEV);
 }
 
 static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_single(
-- 
cgit v1.2.3


From 5a287d3d2b9de2b3e747132c615599907ba5c3c1 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Fri, 26 Jan 2024 19:45:31 +0100
Subject: lsm: fix default return value of the socket_getpeersec_*() hooks

For these hooks the true "neutral" value is -EOPNOTSUPP, which is
currently what is returned when no LSM provides this hook and what LSMs
return when there is no security context set on the socket. Correct the
value in <linux/lsm_hooks.h> and adjust the dispatch functions in
security/security.c to avoid issues when the BPF LSM is enabled.

Cc: stable@vger.kernel.org
Fixes: 98e828a0650f ("security: Refactor declaration of LSM hooks")
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
[PM: subject line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 185924c56378..76458b6d53da 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -315,9 +315,9 @@ LSM_HOOK(int, 0, socket_getsockopt, struct socket *sock, int level, int optname)
 LSM_HOOK(int, 0, socket_setsockopt, struct socket *sock, int level, int optname)
 LSM_HOOK(int, 0, socket_shutdown, struct socket *sock, int how)
 LSM_HOOK(int, 0, socket_sock_rcv_skb, struct sock *sk, struct sk_buff *skb)
-LSM_HOOK(int, 0, socket_getpeersec_stream, struct socket *sock,
+LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_stream, struct socket *sock,
 	 sockptr_t optval, sockptr_t optlen, unsigned int len)
-LSM_HOOK(int, 0, socket_getpeersec_dgram, struct socket *sock,
+LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_dgram, struct socket *sock,
 	 struct sk_buff *skb, u32 *secid)
 LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority)
 LSM_HOOK(void, LSM_RET_VOID, sk_free_security, struct sock *sk)
-- 
cgit v1.2.3


From 764ad6b02777d77dca3659ca490f0898aa593670 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Wed, 24 Jan 2024 12:26:59 +0100
Subject: HID: bpf: use __bpf_kfunc instead of noinline

Follow the docs at Documentation/bpf/kfuncs.rst:
- declare the function with `__bpf_kfunc`
- disables missing prototype warnings, which allows to remove them from
  include/linux/hid-bpf.h

Removing the prototypes is not an issue because we currently have to
redeclare them when writing the BPF program. They will eventually be
generated by bpftool directly AFAIU.

Link: https://lore.kernel.org/r/20240124-b4-hid-bpf-fixes-v2-3-052520b1e5e6@kernel.org
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hid_bpf.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h
index 840cd254172d..7118ac28d468 100644
--- a/include/linux/hid_bpf.h
+++ b/include/linux/hid_bpf.h
@@ -77,17 +77,6 @@ enum hid_bpf_attach_flags {
 int hid_bpf_device_event(struct hid_bpf_ctx *ctx);
 int hid_bpf_rdesc_fixup(struct hid_bpf_ctx *ctx);
 
-/* Following functions are kfunc that we export to BPF programs */
-/* available everywhere in HID-BPF */
-__u8 *hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const size_t __sz);
-
-/* only available in syscall */
-int hid_bpf_attach_prog(unsigned int hid_id, int prog_fd, __u32 flags);
-int hid_bpf_hw_request(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz,
-		       enum hid_report_type rtype, enum hid_class_request reqtype);
-struct hid_bpf_ctx *hid_bpf_allocate_context(unsigned int hid_id);
-void hid_bpf_release_context(struct hid_bpf_ctx *ctx);
-
 /*
  * Below is HID internal
  */
-- 
cgit v1.2.3


From 1e560864159d002b453da42bd2c13a1805515a20 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Tue, 30 Jan 2024 11:02:43 +0100
Subject: PCI/ASPM: Fix deadlock when enabling ASPM

A last minute revert in 6.7-final introduced a potential deadlock when
enabling ASPM during probe of Qualcomm PCIe controllers as reported by
lockdep:

  ============================================
  WARNING: possible recursive locking detected
  6.7.0 #40 Not tainted
  --------------------------------------------
  kworker/u16:5/90 is trying to acquire lock:
  ffffacfa78ced000 (pci_bus_sem){++++}-{3:3}, at: pcie_aspm_pm_state_change+0x58/0xdc

              but task is already holding lock:
  ffffacfa78ced000 (pci_bus_sem){++++}-{3:3}, at: pci_walk_bus+0x34/0xbc

              other info that might help us debug this:
   Possible unsafe locking scenario:

         CPU0
         ----
    lock(pci_bus_sem);
    lock(pci_bus_sem);

               *** DEADLOCK ***

  Call trace:
   print_deadlock_bug+0x25c/0x348
   __lock_acquire+0x10a4/0x2064
   lock_acquire+0x1e8/0x318
   down_read+0x60/0x184
   pcie_aspm_pm_state_change+0x58/0xdc
   pci_set_full_power_state+0xa8/0x114
   pci_set_power_state+0xc4/0x120
   qcom_pcie_enable_aspm+0x1c/0x3c [pcie_qcom]
   pci_walk_bus+0x64/0xbc
   qcom_pcie_host_post_init_2_7_0+0x28/0x34 [pcie_qcom]

The deadlock can easily be reproduced on machines like the Lenovo ThinkPad
X13s by adding a delay to increase the race window during asynchronous
probe where another thread can take a write lock.

Add a new pci_set_power_state_locked() and associated helper functions that
can be called with the PCI bus semaphore held to avoid taking the read lock
twice.

Link: https://lore.kernel.org/r/ZZu0qx2cmn7IwTyQ@hovoldconsulting.com
Link: https://lore.kernel.org/r/20240130100243.11011-1-johan+linaro@kernel.org
Fixes: f93e71aea6c6 ("Revert "PCI/ASPM: Remove pcie_aspm_pm_state_change()"")
Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: <stable@vger.kernel.org>	# 6.7
---
 include/linux/pci.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index add9368e6314..7ab0d13672da 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1422,6 +1422,7 @@ int pci_load_and_free_saved_state(struct pci_dev *dev,
 				  struct pci_saved_state **state);
 int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state);
 int pci_set_power_state(struct pci_dev *dev, pci_power_t state);
+int pci_set_power_state_locked(struct pci_dev *dev, pci_power_t state);
 pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state);
 bool pci_pme_capable(struct pci_dev *dev, pci_power_t state);
 void pci_pme_active(struct pci_dev *dev, bool enable);
@@ -1625,6 +1626,8 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max,
 
 void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
 		  void *userdata);
+void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
+			 void *userdata);
 int pci_cfg_space_size(struct pci_dev *dev);
 unsigned char pci_bus_max_busnr(struct pci_bus *bus);
 void pci_setup_bridge(struct pci_bus *bus);
@@ -2025,6 +2028,8 @@ static inline int pci_save_state(struct pci_dev *dev) { return 0; }
 static inline void pci_restore_state(struct pci_dev *dev) { }
 static inline int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 { return 0; }
+static inline int pci_set_power_state_locked(struct pci_dev *dev, pci_power_t state)
+{ return 0; }
 static inline int pci_wake_from_d3(struct pci_dev *dev, bool enable)
 { return 0; }
 static inline pci_power_t pci_choose_state(struct pci_dev *dev,
-- 
cgit v1.2.3


From 97f7cf1cd80eeed3b7c808b7c12463295c751001 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@netfilter.org>
Date: Mon, 29 Jan 2024 10:57:01 +0100
Subject: netfilter: ipset: fix performance regression in swap operation

The patch "netfilter: ipset: fix race condition between swap/destroy
and kernel side add/del/test", commit 28628fa9 fixes a race condition.
But the synchronize_rcu() added to the swap function unnecessarily slows
it down: it can safely be moved to destroy and use call_rcu() instead.

Eric Dumazet pointed out that simply calling the destroy functions as
rcu callback does not work: sets with timeout use garbage collectors
which need cancelling at destroy which can wait. Therefore the destroy
functions are split into two: cancelling garbage collectors safely at
executing the command received by netlink and moving the remaining
part only into the rcu callback.

Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
Reported-by: Ale Crismani <ale.crismani@automattic.com>
Reported-by: David Wang <00107082@163.com>
Tested-by: David Wang <00107082@163.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index e8c350a3ade1..e9f4f845d760 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -186,6 +186,8 @@ struct ip_set_type_variant {
 	/* Return true if "b" set is the same as "a"
 	 * according to the create set parameters */
 	bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
+	/* Cancel ongoing garbage collectors before destroying the set*/
+	void (*cancel_gc)(struct ip_set *set);
 	/* Region-locking is used */
 	bool region_lock;
 };
@@ -242,6 +244,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
 
 /* A generic IP set */
 struct ip_set {
+	/* For call_cru in destroy */
+	struct rcu_head rcu;
 	/* The name of the set */
 	char name[IPSET_MAXNAMELEN];
 	/* Lock protecting the set data */
-- 
cgit v1.2.3


From f9e9115d0c014dec3278d68823eaff159f98f4d6 Mon Sep 17 00:00:00 2001
From: Caleb Sander <csander@purestorage.com>
Date: Wed, 31 Jan 2024 09:43:13 -0700
Subject: nvme: take const cmd pointer in read-only helpers

nvme_is_fabrics() and nvme_is_write() only read struct nvme_command,
so take it by const pointer. This allows callers to pass a const pointer
and communicates that these functions don't modify the command.

Signed-off-by: Caleb Sander <csander@purestorage.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 68eff8c86ce3..bc605ec4a3fd 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1812,7 +1812,7 @@ struct nvme_command {
 	};
 };
 
-static inline bool nvme_is_fabrics(struct nvme_command *cmd)
+static inline bool nvme_is_fabrics(const struct nvme_command *cmd)
 {
 	return cmd->common.opcode == nvme_fabrics_command;
 }
@@ -1831,7 +1831,7 @@ struct nvme_error_slot {
 	__u8		resv2[24];
 };
 
-static inline bool nvme_is_write(struct nvme_command *cmd)
+static inline bool nvme_is_write(const struct nvme_command *cmd)
 {
 	/*
 	 * What a mess...
-- 
cgit v1.2.3


From 54ce1927eb787f7bbb7ee664841c8f5932703f39 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 31 Jan 2024 15:55:38 -0800
Subject: cxl/cper: Fix errant CPER prints for CXL events

Jonathan reports that CXL CPER events dump an extra generic error
message.

	{1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1
	{1}[Hardware Error]: event severity: recoverable
	{1}[Hardware Error]:  Error 0, type: recoverable
	{1}[Hardware Error]:   section type: unknown, fbcd0a77-c260-417f-85a9-088b1621eba6
	{1}[Hardware Error]:   section length: 0x90
	{1}[Hardware Error]:   00000000: 00000090 00000007 00000000 0d938086 ................
	{1}[Hardware Error]:   00000010: 00100000 00000000 00040000 00000000 ................
	...

CXL events were rerouted though the CXL subsystem for additional
processing.  However, when that work was done it was missed that
cper_estatus_print_section() continued with a generic error message
which is confusing.

Teach CPER print code to ignore printing details of some section types.
Assign the CXL event GUIDs to this set to prevent confusing unknown
prints.

Reported-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/cper.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cper.h b/include/linux/cper.h
index c1a7dc325121..265b0f8fc0b3 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -90,6 +90,29 @@ enum {
 	GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E,	\
 		  0x72, 0x2D, 0xEB, 0x41)
 
+/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */
+/*
+ * General Media Event Record
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ */
+#define CPER_SEC_CXL_GEN_MEDIA_GUID					\
+	GUID_INIT(0xfbcd0a77, 0xc260, 0x417f,				\
+		  0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6)
+/*
+ * DRAM Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
+ */
+#define CPER_SEC_CXL_DRAM_GUID						\
+	GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab,				\
+		  0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24)
+/*
+ * Memory Module Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
+ */
+#define CPER_SEC_CXL_MEM_MODULE_GUID					\
+	GUID_INIT(0xfe927475, 0xdd59, 0x4339,				\
+		  0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74)
+
 /*
  * Flags bits definitions for flags in struct cper_record_header
  * If set, the error has been recovered
-- 
cgit v1.2.3


From dad6a09f3148257ac1773cd90934d721d68ab595 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 29 Jan 2024 15:56:36 -0800
Subject: hrtimer: Report offline hrtimer enqueue

The hrtimers migration on CPU-down hotplug process has been moved
earlier, before the CPU actually goes to die. This leaves a small window
of opportunity to queue an hrtimer in a blind spot, leaving it ignored.

For example a practical case has been reported with RCU waking up a
SCHED_FIFO task right before the CPUHP_AP_IDLE_DEAD stage, queuing that
way a sched/rt timer to the local offline CPU.

Make sure such situations never go unnoticed and warn when that happens.

Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Reported-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20240129235646.3171983-4-boqun.feng@gmail.com
---
 include/linux/hrtimer.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 87e3bedf8eb0..641c4567cfa7 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -157,6 +157,7 @@ enum  hrtimer_base_type {
  * @max_hang_time:	Maximum time spent in hrtimer_interrupt
  * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are
  *			 expired
+ * @online:		CPU is online from an hrtimers point of view
  * @timer_waiters:	A hrtimer_cancel() invocation waits for the timer
  *			callback to finish.
  * @expires_next:	absolute time of the next event, is required for remote
@@ -179,7 +180,8 @@ struct hrtimer_cpu_base {
 	unsigned int			hres_active		: 1,
 					in_hrtirq		: 1,
 					hang_detected		: 1,
-					softirq_activated       : 1;
+					softirq_activated       : 1,
+					online			: 1;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	unsigned int			nr_events;
 	unsigned short			nr_retries;
-- 
cgit v1.2.3


From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: blk-wbt: Fix detection of dirty-throttled tasks

The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddently
writeback was slow).

Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.

CC: stable@vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/backing-dev-defs.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
 	struct delayed_work dwork;	/* work item used for writeback */
 	struct delayed_work bw_dwork;	/* work item used for bandwidth estimate */
 
-	unsigned long dirty_sleep;	/* last wait */
-
 	struct list_head bdi_node;	/* anchored at bdi->wb_list */
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
 	 * any dirty wbs, which is depended upon by bdi_has_dirty().
 	 */
 	atomic_long_t tot_write_bandwidth;
+	/*
+	 * Jiffies when last process was dirty throttled on this bdi. Used by
+	 * blk-wbt.
+	 */
+	unsigned long last_bdp_sleep;
 
 	struct bdi_writeback wb;  /* the root writeback info for this bdi */
 	struct list_head wb_list; /* list of all wbs */
-- 
cgit v1.2.3


From cd7d469c25704d414d71bf3644f163fb74e7996b Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Fri, 13 Oct 2023 13:55:44 +0800
Subject: libceph: fail sparse-read if the data length doesn't match

Once this happens that means there have bugs.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index fa018d5864e7..f66f6aac74f6 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -45,6 +45,7 @@ enum ceph_sparse_read_state {
 	CEPH_SPARSE_READ_HDR	= 0,
 	CEPH_SPARSE_READ_EXTENTS,
 	CEPH_SPARSE_READ_DATA_LEN,
+	CEPH_SPARSE_READ_DATA_PRE,
 	CEPH_SPARSE_READ_DATA,
 };
 
@@ -64,7 +65,7 @@ struct ceph_sparse_read {
 	u64				sr_req_len;  /* orig request length */
 	u64				sr_pos;      /* current pos in buffer */
 	int				sr_index;    /* current extent index */
-	__le32				sr_datalen;  /* length of actual data */
+	u32				sr_datalen;  /* length of actual data */
 	u32				sr_count;    /* extent count in reply */
 	int				sr_ext_len;  /* length of extent array */
 	struct ceph_sparse_extent	*sr_extent;  /* extent array */
-- 
cgit v1.2.3


From 8e46a2d068c92a905d01cbb018b00d66991585ab Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Thu, 14 Dec 2023 16:01:03 +0800
Subject: libceph: just wait for more data to be available on the socket

A short read may occur while reading the message footer from the
socket.  Later, when the socket is ready for another read, the
messenger invokes all read_partial_*() handlers, including
read_partial_sparse_msg_data().  The expectation is that
read_partial_sparse_msg_data() would bail, allowing the messenger to
invoke read_partial() for the footer and pick up where it left off.

However read_partial_sparse_msg_data() violates that and ends up
calling into the state machine in the OSD client.  The sparse-read
state machine assumes that it's a new op and interprets some piece of
the footer as the sparse-read header and returns bogus extents/data
length, etc.

To determine whether read_partial_sparse_msg_data() should bail, let's
reuse cursor->total_resid.  Because once it reaches to zero that means
all the extents and data have been successfully received in last read,
else it could break out when partially reading any of the extents and
data.  And then osd_sparse_read() could continue where it left off.

[ idryomov: changelog ]

Link: https://tracker.ceph.com/issues/63586
Fixes: d396f89db39a ("libceph: add sparse read support to msgr1")
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 2eaaabbe98cb..1717cc57cdac 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -283,7 +283,7 @@ struct ceph_msg {
 	struct kref kref;
 	bool more_to_follow;
 	bool needs_out_seq;
-	bool sparse_read;
+	u64 sparse_read_total;
 	int front_alloc_len;
 
 	struct ceph_msgpool *pool;
-- 
cgit v1.2.3


From 4356e9f841f7fbb945521cef3577ba394c65f3fc Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 9 Feb 2024 12:39:31 -0800
Subject: work around gcc bugs with 'asm goto' with outputs

We've had issues with gcc and 'asm goto' before, and we created a
'asm_volatile_goto()' macro for that in the past: see commits
3f0116c3238a ("compiler/gcc4: Add quirk for 'asm goto' miscompilation
bug") and a9f180345f53 ("compiler/gcc4: Make quirk for
asm_volatile_goto() unconditional").

Then, much later, we ended up removing the workaround in commit
43c249ea0b1e ("compiler-gcc.h: remove ancient workaround for gcc PR
58670") because we no longer supported building the kernel with the
affected gcc versions, but we left the macro uses around.

Now, Sean Christopherson reports a new version of a very similar
problem, which is fixed by re-applying that ancient workaround.  But the
problem in question is limited to only the 'asm goto with outputs'
cases, so instead of re-introducing the old workaround as-is, let's
rename and limit the workaround to just that much less common case.

It looks like there are at least two separate issues that all hit in
this area:

 (a) some versions of gcc don't mark the asm goto as 'volatile' when it
     has outputs:

        https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619
        https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420

     which is easy to work around by just adding the 'volatile' by hand.

 (b) Internal compiler errors:

        https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422

     which are worked around by adding the extra empty 'asm' as a
     barrier, as in the original workaround.

but the problem Sean sees may be a third thing since it involves bad
code generation (not an ICE) even with the manually added 'volatile'.

but the same old workaround works for this case, even if this feels a
bit like voodoo programming and may only be hiding the issue.

Reported-and-tested-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/all/20240208220604.140859-1-seanjc@google.com/
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Uros Bizjak <ubizjak@gmail.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Cc: Andrew Pinski <quic_apinski@quicinc.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h   | 19 +++++++++++++++++++
 include/linux/compiler_types.h |  4 ++--
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index aebb65bf95a7..c1a963be7d28 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -64,6 +64,25 @@
 		__builtin_unreachable();	\
 	} while (0)
 
+/*
+ * GCC 'asm goto' with outputs miscompiles certain code sequences:
+ *
+ *   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420
+ *   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422
+ *
+ * Work it around via the same compiler barrier quirk that we used
+ * to use for the old 'asm goto' workaround.
+ *
+ * Also, always mark such 'asm goto' statements as volatile: all
+ * asm goto statements are supposed to be volatile as per the
+ * documentation, but some versions of gcc didn't actually do
+ * that for asms with outputs:
+ *
+ *    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619
+ */
+#define asm_goto_output(x...) \
+	do { asm volatile goto(x); asm (""); } while (0)
+
 #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
 #define __HAVE_BUILTIN_BSWAP32__
 #define __HAVE_BUILTIN_BSWAP64__
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 6f1ca49306d2..663d8791c871 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -362,8 +362,8 @@ struct ftrace_likely_data {
 #define __member_size(p)	__builtin_object_size(p, 1)
 #endif
 
-#ifndef asm_volatile_goto
-#define asm_volatile_goto(x...) asm goto(x)
+#ifndef asm_goto_output
+#define asm_goto_output(x...) asm goto(x)
 #endif
 
 #ifdef CONFIG_CC_HAS_ASM_INLINE
-- 
cgit v1.2.3