From 51333bfbf18f730a78bbb85895f9c9e4c994d883 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 18 Jan 2025 22:10:17 +0100 Subject: usb: musb: Constify struct musb_fifo_cfg 'struct musb_fifo_cfg' are not modified in these drivers. Constifying these structures moves some data to a read-only section, so increase overall security. On a x86_64, with allmodconfig, as an example: Before: ====== text data bss dec hex filename 64381 5537 202 70120 111e8 drivers/usb/musb/musb_core.o After: ===== text data bss dec hex filename 64957 4929 202 70088 111c8 drivers/usb/musb/musb_core.o Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/26e6f3e25dc8e785d0034dd7e59918e455563e60.1737234596.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/musb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/usb/musb.h b/include/linux/usb/musb.h index 3963e55e88a3..fbdef950f06c 100644 --- a/include/linux/usb/musb.h +++ b/include/linux/usb/musb.h @@ -61,7 +61,7 @@ struct musb_hdrc_eps_bits { }; struct musb_hdrc_config { - struct musb_fifo_cfg *fifo_cfg; /* board fifo configuration */ + const struct musb_fifo_cfg *fifo_cfg; /* board fifo configuration */ unsigned fifo_cfg_size; /* size of the fifo configuration */ /* MUSB configuration-specific details */ -- cgit v1.2.3 From 470cb490d1b75cf25f3139dcf0226967bcc6e217 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 13 Jan 2025 15:43:18 -0600 Subject: iio: adc: ad7173: move fwnode_irq_get_byname() call site Move the call to fwnode_irq_get_byname() from the driver-specific ad7173_fw_parse_device_config() to the shared ad_sd_init() function. The main reason for this is that we want struct ad_sigma_delta_info to be static const data that describes the actual ADC chip, not the application-specific configuration or any runtime state. Previously, this struct was being used to pass the IRQ number to the shared ad_sd_init() function. Now, this is replaced by a boolean flag that is set at compile time and the ad_sd_init() function handles looking up the IRQ number instead. This also has the added benefit that if any other drivers need to make use of this in the future, they just have to set the flag and the shared code will take care of the rest rather than duplicating the code in each driver. Signed-off-by: David Lechner Link: https://patch.msgid.link/20250113-iio-adc-ad7313-fix-non-const-info-struct-v4-1-b63be3ecac4a@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 417073c52380..f242b285081b 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -46,6 +46,7 @@ struct iio_dev; * modify or drop the sample data, it, may be NULL. * @has_registers: true if the device has writable and readable registers, false * if there is just one read-only sample data shift register. + * @has_named_irqs: Set to true if there is more than one IRQ line. * @addr_shift: Shift of the register address in the communications register. * @read_mask: Mask for the communications register having the read bit set. * @status_ch_mask: Mask for the channel number stored in status register. @@ -53,7 +54,6 @@ struct iio_dev; * be used. 
* @irq_flags: flags for the interrupt used by the triggered buffer * @num_slots: Number of sequencer slots - * @irq_line: IRQ for reading conversions. If 0, spi->irq will be used * @num_resetclks: Number of SPI clk cycles with MOSI=1 to reset the chip. */ struct ad_sigma_delta_info { @@ -64,13 +64,13 @@ struct ad_sigma_delta_info { int (*disable_one)(struct ad_sigma_delta *, unsigned int chan); int (*postprocess_sample)(struct ad_sigma_delta *, unsigned int raw_sample); bool has_registers; + bool has_named_irqs; unsigned int addr_shift; unsigned int read_mask; unsigned int status_ch_mask; unsigned int data_reg; unsigned long irq_flags; unsigned int num_slots; - int irq_line; unsigned int num_resetclks; }; -- cgit v1.2.3 From 42e128c9d5c5b6632c3b182afb7fab0957b6c337 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 22 Jan 2025 01:25:59 +0000 Subject: tty/ldsem: Remove unused ldsem_down_write_trylock ldsem_down_write_trylock() was added in 2013 by commit 4898e640caf0 ("tty: Add timed, writer-prioritized rw semaphore") but hasn't been used. Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/r/20250122012559.441006-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_ldisc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index af01e89074b2..c5cccc3fc1e8 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -39,7 +39,6 @@ do { \ int ldsem_down_read(struct ld_semaphore *sem, long timeout); int ldsem_down_read_trylock(struct ld_semaphore *sem); int ldsem_down_write(struct ld_semaphore *sem, long timeout); -int ldsem_down_write_trylock(struct ld_semaphore *sem); void ldsem_up_read(struct ld_semaphore *sem); void ldsem_up_write(struct ld_semaphore *sem); -- cgit v1.2.3 From a1cd339599a8cff197805c9c71c9cab83cec59c2 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 10 Jan 2025 18:22:05 -0600 Subject: counter: add direction change event Add COUNTER_EVENT_DIRECTION_CHANGE to be used by drivers to emit events when a counter detects a change in direction. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20250110-counter-ti-eqep-add-direction-support-v2-2-c6b6f96d2db9@baylibre.com Signed-off-by: William Breathitt Gray --- include/uapi/linux/counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/counter.h b/include/uapi/linux/counter.h index 008a691c254b..350b45d616bb 100644 --- a/include/uapi/linux/counter.h +++ b/include/uapi/linux/counter.h @@ -65,6 +65,8 @@ enum counter_event_type { COUNTER_EVENT_CHANGE_OF_STATE, /* Count value captured */ COUNTER_EVENT_CAPTURE, + /* Direction change detected */ + COUNTER_EVENT_DIRECTION_CHANGE, }; /** -- cgit v1.2.3 From 958c3a6706863f9f77b21c90d2428473441cd8a1 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 23 Jan 2025 08:44:17 +0000 Subject: efi/cper, cxl: Make definitions and structures global In preparation to add tracepoint support, move protocol error UUID definition to a common location, Also, make struct CXL RAS capability, cxl_cper_sec_prot_err and CPER validation flags global for use across different modules. 
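For illustration only (not part of this commit): a minimal sketch of how a consumer such as a GHES-side handler might decode the agent address once these definitions are shared. The function name and the pr_info() reporting are assumptions; only the field layout and the PROT_ERR_VALID_* flags come from this series.

    #include <linux/printk.h>
    #include <linux/cper.h>
    #include <cxl/event.h>

    static void foo_decode_prot_err_agent(const struct cxl_cper_sec_prot_err *prot_err)
    {
        if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE))
            return;

        if (prot_err->agent_type == RCH_DP) {
            /* an RCH Downstream Port is identified by its RCRB base address */
            pr_info("CXL agent RCRB base: %#llx\n",
                (unsigned long long)prot_err->agent_addr.rcrb_base_addr);
        } else if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS) {
            /* all other agent types carry a PCIe-compatible SBDF number */
            pr_info("CXL agent %04x:%02x:%02x.%x\n",
                prot_err->agent_addr.segment, prot_err->agent_addr.bus,
                prot_err->agent_addr.device, prot_err->agent_addr.function);
        }
    }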
Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Fan Ni Reviewed-by: Gregory Price Reviewed-by: Dan Williams Link: https://patch.msgid.link/20250123084421.127697-3-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- include/cxl/event.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/cper.h | 4 +++ 2 files changed, 84 insertions(+) (limited to 'include') diff --git a/include/cxl/event.h b/include/cxl/event.h index 04edd44bd26f..7a6c14c05215 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -164,6 +164,86 @@ struct cxl_cper_work_data { struct cxl_cper_event_rec rec; }; +#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0) +#define PROT_ERR_VALID_AGENT_ADDRESS BIT_ULL(1) +#define PROT_ERR_VALID_DEVICE_ID BIT_ULL(2) +#define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3) +#define PROT_ERR_VALID_CAPABILITY BIT_ULL(4) +#define PROT_ERR_VALID_DVSEC BIT_ULL(5) +#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6) + +/* + * The layout of the enumeration and the values matches CXL Agent Type + * field in the UEFI 2.10 Section N.2.13, + */ +enum { + RCD, /* Restricted CXL Device */ + RCH_DP, /* Restricted CXL Host Downstream Port */ + DEVICE, /* CXL Device */ + LD, /* CXL Logical Device */ + FMLD, /* CXL Fabric Manager managed Logical Device */ + RP, /* CXL Root Port */ + DSP, /* CXL Downstream Switch Port */ + USP, /* CXL Upstream Switch Port */ +}; + +#pragma pack(1) + +/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */ +struct cxl_cper_sec_prot_err { + u64 valid_bits; + u8 agent_type; + u8 reserved[7]; + + /* + * Except for RCH Downstream Port, all the remaining CXL Agent + * types are uniquely identified by the PCIe compatible SBDF number. + */ + union { + u64 rcrb_base_addr; + struct { + u8 function; + u8 device; + u8 bus; + u16 segment; + u8 reserved_1[3]; + }; + } agent_addr; + + struct { + u16 vendor_id; + u16 device_id; + u16 subsystem_vendor_id; + u16 subsystem_id; + u8 class_code[2]; + u16 slot; + u8 reserved_1[4]; + } device_id; + + struct { + u32 lower_dw; + u32 upper_dw; + } dev_serial_num; + + u8 capability[60]; + u16 dvsec_len; + u16 err_len; + u8 reserved_2[4]; +}; + +#pragma pack() + +/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */ +struct cxl_ras_capability_regs { + u32 uncor_status; + u32 uncor_mask; + u32 uncor_severity; + u32 cor_status; + u32 cor_mask; + u32 cap_control; + u32 header_log[16]; +}; + #ifdef CONFIG_ACPI_APEI_GHES int cxl_cper_register_work(struct work_struct *work); int cxl_cper_unregister_work(struct work_struct *work); diff --git a/include/linux/cper.h b/include/linux/cper.h index 265b0f8fc0b3..5c6d4d5b9975 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -89,6 +89,10 @@ enum { #define CPER_NOTIFY_DMAR \ GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ 0x72, 0x2D, 0xEB, 0x41) +/* CXL Protocol Error Section */ +#define CPER_SEC_CXL_PROT_ERR \ + GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \ + 0x4B, 0x77, 0x10, 0x48) /* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ /* -- cgit v1.2.3 From 61eac5f7f6439e8fe99b5fb29406acb0fd7b83c6 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 23 Jan 2025 08:44:18 +0000 Subject: efi/cper, cxl: Remove cper_cxl.h Move the declaration of cxl_cper_print_prot_err() to include/linux/cper.h to avoid maintaining a separate header file just for this function declaration. 
Remove drivers/firmware/efi/cper_cxl.h as its contents have been reorganized. No functional changes. Signed-off-by: Smita Koralahalli Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Fan Ni Reviewed-by: Gregory Price Reviewed-by: Dan Williams Link: https://patch.msgid.link/20250123084421.127697-4-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- include/linux/cper.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/cper.h b/include/linux/cper.h index 5c6d4d5b9975..0ed60a91eca9 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -605,4 +605,8 @@ void cper_estatus_print(const char *pfx, int cper_estatus_check_header(const struct acpi_hest_generic_status *estatus); int cper_estatus_check(const struct acpi_hest_generic_status *estatus); +struct cxl_cper_sec_prot_err; +void cxl_cper_print_prot_err(const char *pfx, + const struct cxl_cper_sec_prot_err *prot_err); + #endif -- cgit v1.2.3 From 315c2f0b53ba2645062627443a12cea73f3dad9c Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 23 Jan 2025 08:44:19 +0000 Subject: acpi/ghes, cper: Recognize and cache CXL Protocol errors Add support in GHES to detect and process CXL CPER Protocol errors, as defined in UEFI v2.10, section N.2.13. Define struct cxl_cper_prot_err_work_data to cache CXL protocol error information, including RAS capabilities and severity, for further handling. These cached CXL CPER records will later be processed by workqueues within the CXL subsystem. Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Ira Weiny Reviewed-by: Tony Luck Reviewed-by: Gregory Price Reviewed-by: Dan Williams Link: https://patch.msgid.link/20250123084421.127697-5-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- include/cxl/event.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/cxl/event.h b/include/cxl/event.h index 7a6c14c05215..8381a07052d0 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -244,6 +244,12 @@ struct cxl_ras_capability_regs { u32 header_log[16]; }; +struct cxl_cper_prot_err_work_data { + struct cxl_cper_sec_prot_err prot_err; + struct cxl_ras_capability_regs ras_cap; + int severity; +}; + #ifdef CONFIG_ACPI_APEI_GHES int cxl_cper_register_work(struct work_struct *work); int cxl_cper_unregister_work(struct work_struct *work); -- cgit v1.2.3 From c8006fbd0f4fd15fa990786ff9a097b45eef616c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 27 Jan 2025 21:58:59 +0000 Subject: bus: mhi: host: Remove unused functions mhi_device_get() and mhi_queue_dma() haven't been used since 2020's commit 189ff97cca53 ("bus: mhi: core: Add support for data transfer") added them. Remove them. Note that mhi_queue_dma_sync() is used and has been left. Signed-off-by: Dr. 
David Alan Gilbert Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20250127215859.261105-1-linux@treblig.org Signed-off-by: Manivannan Sadhasivam --- include/linux/mhi.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 059dc94d20bb..dd372b0123a6 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -720,12 +720,6 @@ enum mhi_state mhi_get_mhi_state(struct mhi_controller *mhi_cntrl); */ void mhi_soc_reset(struct mhi_controller *mhi_cntrl); -/** - * mhi_device_get - Disable device low power mode - * @mhi_dev: Device associated with the channel - */ -void mhi_device_get(struct mhi_device *mhi_dev); - /** * mhi_device_get_sync - Disable device low power mode. Synchronously * take the controller out of suspended state @@ -776,18 +770,6 @@ int mhi_prepare_for_transfer_autoqueue(struct mhi_device *mhi_dev); */ void mhi_unprepare_from_transfer(struct mhi_device *mhi_dev); -/** - * mhi_queue_dma - Send or receive DMA mapped buffers from client device - * over MHI channel - * @mhi_dev: Device associated with the channels - * @dir: DMA direction for the channel - * @mhi_buf: Buffer for holding the DMA mapped data - * @len: Buffer length - * @mflags: MHI transfer flags used for the transfer - */ -int mhi_queue_dma(struct mhi_device *mhi_dev, enum dma_data_direction dir, - struct mhi_buf *mhi_buf, size_t len, enum mhi_flags mflags); - /** * mhi_queue_buf - Send or receive raw buffers from client device over MHI * channel -- cgit v1.2.3 From dbd2e08ff09fd6bd51215b44474899cc1b7b7a16 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 27 Jan 2025 20:30:22 +0100 Subject: iio: gts-helper: export iio_gts_get_total_gain() Export this function in preparation for the fix in veml6030.c, where the total gain can be used to ease the calculation of the processed value of the IIO_LIGHT channel compared to acquiring the scale in NANO. Suggested-by: Matti Vaittinen Signed-off-by: Javier Carrasco Reviewed-by: Matti Vaittinen Link: https://patch.msgid.link/20250127-veml6030-scale-v3-1-4f32ba03df94@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio-gts-helper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/iio/iio-gts-helper.h b/include/linux/iio/iio-gts-helper.h index e5de7a124bad..66f830ab9b49 100644 --- a/include/linux/iio/iio-gts-helper.h +++ b/include/linux/iio/iio-gts-helper.h @@ -208,5 +208,6 @@ int iio_gts_all_avail_scales(struct iio_gts *gts, const int **vals, int *type, int *length); int iio_gts_avail_scales_for_time(struct iio_gts *gts, int time, const int **vals, int *type, int *length); +int iio_gts_get_total_gain(struct iio_gts *gts, int gain, int time); #endif -- cgit v1.2.3 From 34934d79965518548d33c0513a9572ae936d8c29 Mon Sep 17 00:00:00 2001 From: Guillaume Ranquet Date: Mon, 27 Jan 2025 14:59:32 +0100 Subject: iio: introduce the FAULT event type Add a new event type to describe an hardware failure. 
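For illustration only (not part of this commit): a minimal sketch of how a driver that detects an open-wire condition might report it with the new event type and direction. The function name and the choice of an IIO_VOLTAGE channel are assumptions.

    #include <linux/iio/iio.h>
    #include <linux/iio/events.h>

    static void foo_report_open_wire(struct iio_dev *indio_dev, unsigned int chan,
                     s64 timestamp)
    {
        /* signal a hardware fault: the wire on this input is open */
        iio_push_event(indio_dev,
                   IIO_UNMOD_EVENT_CODE(IIO_VOLTAGE, chan,
                            IIO_EV_TYPE_FAULT,
                            IIO_EV_DIR_FAULT_OPENWIRE),
                   timestamp);
    }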
Reviewed-by: Nuno Sa Signed-off-by: Guillaume Ranquet Reviewed-by: David Lechner Link: https://patch.msgid.link/20250127-ad4111_openwire-v5-1-ef2db05c384f@baylibre.com Signed-off-by: Jonathan Cameron --- include/uapi/linux/iio/types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 12886d4465e4..3eb0821af7a4 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -119,6 +119,7 @@ enum iio_event_type { IIO_EV_TYPE_CHANGE, IIO_EV_TYPE_MAG_REFERENCED, IIO_EV_TYPE_GESTURE, + IIO_EV_TYPE_FAULT, }; enum iio_event_direction { @@ -128,6 +129,7 @@ enum iio_event_direction { IIO_EV_DIR_NONE, IIO_EV_DIR_SINGLETAP, IIO_EV_DIR_DOUBLETAP, + IIO_EV_DIR_FAULT_OPENWIRE, }; #endif /* _UAPI_IIO_TYPES_H_ */ -- cgit v1.2.3 From 31d43141d13aa63587f140884b1f667800ce4e1d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Feb 2025 16:57:09 +0200 Subject: dmaengine: Replace dma_request_slave_channel() by dma_request_chan() Replace dma_request_slave_channel() by dma_request_chan() as suggested since the former is deprecated. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250205145757.889247-2-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 346251bf1026..83cbf7197a76 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1639,8 +1639,8 @@ static inline struct dma_chan { struct dma_chan *chan; - chan = dma_request_slave_channel(dev, name); - if (chan) + chan = dma_request_chan(dev, name); + if (!IS_ERR(chan)) return chan; if (!fn || !fn_param) -- cgit v1.2.3 From 1c83d3dfa0905590408595560629627cba4f9261 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Feb 2025 16:57:10 +0200 Subject: dmaengine: Use dma_request_channel() instead of __dma_request_channel() Reduce use of internal __dma_request_channel() function in public dmaengine.h. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250205145757.889247-3-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 83cbf7197a76..a360e0330436 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1646,7 +1646,7 @@ static inline struct dma_chan if (!fn || !fn_param) return NULL; - return __dma_request_channel(&mask, fn, fn_param, NULL); + return dma_request_channel(mask, fn, fn_param); } static inline char * -- cgit v1.2.3 From 4fe7fd17fe66a5e243c1de7ff32c29fac7039b8d Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 7 Feb 2025 14:09:05 -0600 Subject: iio: buffer-dmaengine: split requesting DMA channel from allocating buffer Refactor the IIO dmaengine buffer code to split requesting the DMA channel from allocating the buffer. We want to be able to add a new function where the IIO device driver manages the DMA channel, so these two actions need to be separate. To do this, calling dma_request_chan() is moved from iio_dmaengine_buffer_alloc() to iio_dmaengine_buffer_setup_ext(). A new __iio_dmaengine_buffer_setup_ext() helper function is added to simplify error unwinding and will also be used by a new function in a later patch. 
iio_dmaengine_buffer_free() now only frees the buffer and does not release the DMA channel. A new iio_dmaengine_buffer_teardown() function is added to unwind everything done in iio_dmaengine_buffer_setup_ext(). This keeps things more symmetrical with obvious pairs alloc/free and setup/teardown. Calling dma_get_slave_caps() in iio_dmaengine_buffer_alloc() is moved so that we can avoid any gotos for error unwinding. Reviewed-by: Nuno Sa Signed-off-by: David Lechner Link: https://patch.msgid.link/20250207-dlech-mainline-spi-engine-offload-2-v8-8-e48a489be48c@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer-dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iio/buffer-dmaengine.h b/include/linux/iio/buffer-dmaengine.h index 81d9a19aeb91..72a2e3fd8a5b 100644 --- a/include/linux/iio/buffer-dmaengine.h +++ b/include/linux/iio/buffer-dmaengine.h @@ -12,7 +12,7 @@ struct iio_dev; struct device; -void iio_dmaengine_buffer_free(struct iio_buffer *buffer); +void iio_dmaengine_buffer_teardown(struct iio_buffer *buffer); struct iio_buffer *iio_dmaengine_buffer_setup_ext(struct device *dev, struct iio_dev *indio_dev, const char *channel, -- cgit v1.2.3 From 79f24971b4ffbdba9733365e298982c45338e6b1 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 7 Feb 2025 14:09:06 -0600 Subject: iio: buffer-dmaengine: add devm_iio_dmaengine_buffer_setup_with_handle() Add a new devm_iio_dmaengine_buffer_setup_with_handle() function to handle cases where the DMA channel is managed by the caller rather than being requested and released by the iio_dmaengine module. Reviewed-by: Nuno Sa Signed-off-by: David Lechner Link: https://patch.msgid.link/20250207-dlech-mainline-spi-engine-offload-2-v8-9-e48a489be48c@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer-dmaengine.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/iio/buffer-dmaengine.h b/include/linux/iio/buffer-dmaengine.h index 72a2e3fd8a5b..37f27545f69f 100644 --- a/include/linux/iio/buffer-dmaengine.h +++ b/include/linux/iio/buffer-dmaengine.h @@ -11,6 +11,7 @@ struct iio_dev; struct device; +struct dma_chan; void iio_dmaengine_buffer_teardown(struct iio_buffer *buffer); struct iio_buffer *iio_dmaengine_buffer_setup_ext(struct device *dev, @@ -26,6 +27,10 @@ int devm_iio_dmaengine_buffer_setup_ext(struct device *dev, struct iio_dev *indio_dev, const char *channel, enum iio_buffer_direction dir); +int devm_iio_dmaengine_buffer_setup_with_handle(struct device *dev, + struct iio_dev *indio_dev, + struct dma_chan *chan, + enum iio_buffer_direction dir); #define devm_iio_dmaengine_buffer_setup(dev, indio_dev, channel) \ devm_iio_dmaengine_buffer_setup_ext(dev, indio_dev, channel, \ -- cgit v1.2.3 From b7c1e069f54668461856f03696812ab7176c400d Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 7 Feb 2025 14:09:10 -0600 Subject: dt-bindings: iio: adc: adi,ad4695: add SPI offload properties Add a pwms property to the adi,ad4695 binding to specify an optional PWM output connected to the CNV pin on the ADC. Also add #trigger-source-cells property to allow the BUSY output to be used as a SPI offload trigger source to indicate when a sample is ready to be read. Macros are added to adi,ad4695.h for the cell values to help with readability since they are arbitrary values. 
Reviewed-by: Rob Herring (Arm) Signed-off-by: David Lechner Link: https://patch.msgid.link/20250207-dlech-mainline-spi-engine-offload-2-v8-13-e48a489be48c@baylibre.com Signed-off-by: Jonathan Cameron --- include/dt-bindings/iio/adc/adi,ad4695.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/iio/adc/adi,ad4695.h b/include/dt-bindings/iio/adc/adi,ad4695.h index 9fbef542bf67..fea4525d2710 100644 --- a/include/dt-bindings/iio/adc/adi,ad4695.h +++ b/include/dt-bindings/iio/adc/adi,ad4695.h @@ -6,4 +6,11 @@ #define AD4695_COMMON_MODE_REFGND 0xFF #define AD4695_COMMON_MODE_COM 0xFE +#define AD4695_TRIGGER_EVENT_BUSY 0 +#define AD4695_TRIGGER_EVENT_ALERT 1 + +#define AD4695_TRIGGER_PIN_GP0 0 +#define AD4695_TRIGGER_PIN_GP2 2 +#define AD4695_TRIGGER_PIN_GP3 3 + #endif /* _DT_BINDINGS_ADI_AD4695_H */ -- cgit v1.2.3 From fbc54ae4f8d7cef3b11f70945bf8df8f952814b6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 13 Feb 2025 13:45:05 +0200 Subject: i2c: Unexport i2c_of_match_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit i2c_of_match_device() is not used anymore outside of I²C framework, unexport it. Signed-off-by: Andy Shevchenko Reviewed-by: Vladimir Zapolskiy Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 2b2af24d2a43..997e80649889 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1029,10 +1029,6 @@ static inline struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node return i2c_get_adapter_by_fwnode(of_fwnode_handle(node)); } -const struct of_device_id -*i2c_of_match_device(const struct of_device_id *matches, - struct i2c_client *client); - int of_i2c_get_board_info(struct device *dev, struct device_node *node, struct i2c_board_info *info); @@ -1053,13 +1049,6 @@ static inline struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node return NULL; } -static inline const struct of_device_id -*i2c_of_match_device(const struct of_device_id *matches, - struct i2c_client *client) -{ - return NULL; -} - static inline int of_i2c_get_board_info(struct device *dev, struct device_node *node, struct i2c_board_info *info) -- cgit v1.2.3 From 2c0ae8ef1e5edfd0e42727fba4617694f3aac2eb Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Fri, 7 Feb 2025 12:28:38 +0530 Subject: soundwire: amd: add support for ACP7.0 & ACP7.1 platforms Add SoundWire support for ACP7.0 and ACP7.1 platforms. 
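For illustration only (not part of this commit): a sketch of the kind of revision check a probe path might perform with the new IDs; the helper name is an assumption.

    #include <linux/types.h>
    #include <linux/soundwire/sdw_amd.h>

    static bool foo_acp_rev_has_sdw(u32 rev)
    {
        /* ACP revisions that expose SoundWire managers */
        switch (rev) {
        case ACP63_PCI_REV_ID:
        case ACP70_PCI_REV_ID:
        case ACP71_PCI_REV_ID:
            return true;
        default:
            return false;
        }
    }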
Signed-off-by: Vijendar Mukunda Link: https://lore.kernel.org/r/20250207065841.4718-4-Vijendar.Mukunda@amd.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_amd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index 799f8578137b..6b839987f14c 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -28,6 +28,8 @@ #define ACP_SDW1 1 #define AMD_SDW_MAX_MANAGER_COUNT 2 #define ACP63_PCI_REV_ID 0x63 +#define ACP70_PCI_REV_ID 0x70 +#define ACP71_PCI_REV_ID 0x71 struct acp_sdw_pdata { u16 instance; -- cgit v1.2.3 From dcba69711fff8af4370cb4c00460db0bf47c7093 Mon Sep 17 00:00:00 2001 From: Jameson Thies Date: Tue, 4 Feb 2025 02:45:58 +0000 Subject: platform/chrome: add PD_EVENT_INIT bit definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update cros_ec_commands.h to include a definition for PD_EVENT_INIT. On platforms supporting UCSI, this host event type is sent when the PPM initializes. Signed-off-by: Jameson Thies Reviewed-by: Benson Leung Acked-by: Tzung-Bi Shih Reviewed-by: Łukasz Bartosik Link: https://lore.kernel.org/r/20250204024600.4138776-2-jthies@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/cros_ec_commands.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index ecf290a0c98f..1f4e4f2b89bb 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -5046,6 +5046,7 @@ struct ec_response_pd_status { #define PD_EVENT_DATA_SWAP BIT(3) #define PD_EVENT_TYPEC BIT(4) #define PD_EVENT_PPM BIT(5) +#define PD_EVENT_INIT BIT(6) struct ec_response_host_event_status { uint32_t status; /* PD MCU host event status */ -- cgit v1.2.3 From c08b0f2c317223fa702616dfa77f10a92b7b34b2 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 29 Jan 2025 08:25:50 -0800 Subject: Revert "tty/serial: Add kgdb_nmi driver" This reverts commit 0c57dfcc6c1d037243c2f8fbf62eab3633326ec0. The functionality was supoosed to be used by a later patch in the series that never landed [1]. Drop it. NOTE: part of functionality was already reverted by commit 39d0be87438a ("serial: kgdb_nmi: Remove unused knock code"). Also note that this revert is not a clean revert given code changes that have happened in the meantime. It's obvious that nobody is using this code since the two exposed functions (kgdb_register_nmi_console() and kgdb_unregister_nmi_console()) are both no-ops if "arch_kgdb_ops.enable_nmi" is not defined. No architectures define it. 
[1] https://lore.kernel.org/lkml/1348522080-32629-9-git-send-email-anton.vorontsov@linaro.org/ Signed-off-by: Douglas Anderson Acked-by: Andy Shevchenko Link: URL [1] Link: https://lore.kernel.org/r/20250129082535.1.Ia095eac1ae357f87d23e7af2206741f5d40788f1@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/kgdb.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 51ef131e66b7..14739952698b 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -306,14 +306,6 @@ extern const struct kgdb_arch arch_kgdb_ops; extern unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs); -#ifdef CONFIG_SERIAL_KGDB_NMI -extern int kgdb_register_nmi_console(void); -extern int kgdb_unregister_nmi_console(void); -#else -static inline int kgdb_register_nmi_console(void) { return 0; } -static inline int kgdb_unregister_nmi_console(void) { return 0; } -#endif - extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops); extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops); extern struct kgdb_io *dbg_io_ops; -- cgit v1.2.3 From a029a219385cfdf9c43fb1ef9eb49d173773388f Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 29 Jan 2025 08:25:52 -0800 Subject: Revert "kernel/debug: Mask KGDB NMI upon entry" This reverts commit 5a14fead07bcf4e0acc877a8d9e1d1f40a441153. No architectures ever implemented `enable_nmi` since the later patches in the series adding it never landed. It's been a long time. Drop it. NOTE: this is not a clean revert due to changes in the file in the meantime. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20250129082535.3.I2254953cd852f31f354456689d68b2d910de3fbe@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/kgdb.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 14739952698b..5eebbe7a3545 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -257,7 +257,6 @@ extern void kgdb_arch_late(void); * hardware breakpoints. * @correct_hw_break: Allow an architecture to specify how to correct the * hardware debug registers. - * @enable_nmi: Manage NMI-triggered entry to KGDB */ struct kgdb_arch { unsigned char gdb_bpt_instr[BREAK_INSTR_SIZE]; @@ -270,8 +269,6 @@ struct kgdb_arch { void (*disable_hw_break)(struct pt_regs *regs); void (*remove_all_hw_break)(void); void (*correct_hw_break)(void); - - void (*enable_nmi)(bool on); }; /** -- cgit v1.2.3 From 633488947ef66b194377411322dc9e12aab79b65 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 13 Feb 2025 15:50:22 +0100 Subject: kernfs: Use RCU to access kernfs_node::parent. kernfs_rename_lock is used to obtain stable kernfs_node::{name|parent} pointer. This is a preparation to access kernfs_node::parent under RCU and ensure that the pointer remains stable under the RCU lifetime guarantees. For a complete path, as it is done in kernfs_path_from_node(), the kernfs_rename_lock is still required in order to obtain a stable parent relationship while computing the relevant node depth. This must not change while the nodes are inspected in order to build the path. If the kernfs user never moves the nodes (changes the parent) then the kernfs_rename_lock is not required and the RCU guarantees are sufficient. This "restriction" can be set with KERNFS_ROOT_INVARIANT_PARENT. Otherwise the lock is required. 
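For illustration only (not part of this commit): a minimal sketch of a kernfs user whose nodes never change parent opting in at root creation; foo_kf_syscall_ops and the use of KERNFS_ROOT_CREATE_DEACTIVATED are assumptions.

    #include <linux/kernfs.h>

    static struct kernfs_syscall_ops foo_kf_syscall_ops;

    static struct kernfs_root *foo_create_root(void)
    {
        /*
         * Nodes in this hierarchy are never moved to a different parent,
         * so path lookups may rely on RCU alone. Callers check the result
         * with IS_ERR() as usual.
         */
        return kernfs_create_root(&foo_kf_syscall_ops,
                      KERNFS_ROOT_CREATE_DEACTIVATED |
                      KERNFS_ROOT_INVARIANT_PARENT, NULL);
    }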
Rename kernfs_node::parent to kernfs_node::__parent to denote the RCU access and use RCU accessor while accessing the node. Make cgroup use KERNFS_ROOT_INVARIANT_PARENT since the parent here can not change. Acked-by: Tejun Heo Cc: Yonghong Song Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20250213145023.2820193-6-bigeasy@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 87c79d076d6d..5dda9a268e44 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -147,6 +147,11 @@ enum kernfs_root_flag { * Support user xattrs to be written to nodes rooted at this root. */ KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008, + + /* + * Renames must not change the parent node. + */ + KERNFS_ROOT_INVARIANT_PARENT = 0x0010, }; /* type-specific structures for kernfs_node union members */ @@ -199,8 +204,8 @@ struct kernfs_node { * never moved to a different parent, it is safe to access the * parent directly. */ - struct kernfs_node *parent; const char *name; + struct kernfs_node __rcu *__parent; struct rb_node rb; @@ -416,6 +421,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); void kernfs_destroy_root(struct kernfs_root *root); +unsigned int kernfs_root_flags(struct kernfs_node *kn); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, @@ -514,6 +520,8 @@ kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, { return ERR_PTR(-ENOSYS); } static inline void kernfs_destroy_root(struct kernfs_root *root) { } +static inline unsigned int kernfs_root_flags(struct kernfs_node *kn) +{ return 0; } static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, -- cgit v1.2.3 From 741c10b096bc4dd79cd9f215b6ef173bb953e75c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 13 Feb 2025 15:50:23 +0100 Subject: kernfs: Use RCU to access kernfs_node::name. Using RCU lifetime rules to access kernfs_node::name can avoid the trouble with kernfs_rename_lock in kernfs_name() and kernfs_path_from_node() if the fs was created with KERNFS_ROOT_INVARIANT_PARENT. This is usefull as it allows to implement kernfs_path_from_node() only with RCU protection and avoiding kernfs_rename_lock. The lock is only required if the __parent node can be changed and the function requires an unchanged hierarchy while it iterates from the node to its parent. The change is needed to allow the lookup of the node's path (kernfs_path_from_node()) from context which runs always with disabled preemption and or interrutps even on PREEMPT_RT. The problem is that kernfs_rename_lock becomes a sleeping lock on PREEMPT_RT. I went through all ::name users and added the required access for the lookup with a few extensions: - rdtgroup_pseudo_lock_create() drops all locks and then uses the name later on. resctrl supports rename with different parents. Here I made a temporal copy of the name while it is used outside of the lock. - kernfs_rename_ns() accepts NULL as new_parent. This simplifies sysfs_move_dir_ns() where it can set NULL in order to reuse the current name. - kernfs_rename_ns() is only using kernfs_rename_lock if the parents are different. 
All users use either kernfs_rwsem (for stable path view) or just RCU for the lookup. The ::name uses always RCU free. Use RCU lifetime guarantees to access kernfs_node::name. Suggested-by: Tejun Heo Acked-by: Tejun Heo Reported-by: syzbot+6ea37e2e6ffccf41a7e6@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/67251dc6.050a0220.529b6.015e.GAE@google.com/ Reported-by: Hillf Danton Closes: https://lore.kernel.org/20241102001224.2789-1-hdanton@sina.com Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20250213145023.2820193-7-bigeasy@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5dda9a268e44..b5a5f32fdfd1 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -204,8 +204,8 @@ struct kernfs_node { * never moved to a different parent, it is safe to access the * parent directly. */ - const char *name; struct kernfs_node __rcu *__parent; + const char __rcu *name; struct rb_node rb; @@ -400,7 +400,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); -int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, +int kernfs_path_from_node(struct kernfs_node *kn_to, struct kernfs_node *kn_from, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); void pr_cont_kernfs_path(struct kernfs_node *kn); -- cgit v1.2.3 From 4018ab42636cbea1bcf4fd69afc31d51cd905ba0 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Fri, 14 Feb 2025 15:19:47 +0200 Subject: iio: backend: add API for interface get Add backend support for obtaining the interface type used. Reviewed-by: Nuno Sa Reviewed-by: David Lechner Signed-off-by: Antoniu Miclaus Link: https://patch.msgid.link/20250214131955.31973-2-antoniu.miclaus@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 10be00f3b120..a0ea6c29d7ba 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -70,6 +70,12 @@ enum iio_backend_sample_trigger { IIO_BACKEND_SAMPLE_TRIGGER_MAX }; +enum iio_backend_interface_type { + IIO_BACKEND_INTERFACE_SERIAL_LVDS, + IIO_BACKEND_INTERFACE_SERIAL_CMOS, + IIO_BACKEND_INTERFACE_MAX +}; + /** * struct iio_backend_ops - operations structure for an iio_backend * @enable: Enable backend. @@ -88,6 +94,7 @@ enum iio_backend_sample_trigger { * @extend_chan_spec: Extend an IIO channel. * @ext_info_set: Extended info setter. * @ext_info_get: Extended info getter. + * @interface_type_get: Interface type. * @read_raw: Read a channel attribute from a backend device * @debugfs_print_chan_status: Print channel status into a buffer. * @debugfs_reg_access: Read or write register value of backend. 
@@ -128,6 +135,8 @@ struct iio_backend_ops { const char *buf, size_t len); int (*ext_info_get)(struct iio_backend *back, uintptr_t private, const struct iio_chan_spec *chan, char *buf); + int (*interface_type_get)(struct iio_backend *back, + enum iio_backend_interface_type *type); int (*read_raw)(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); @@ -186,6 +195,8 @@ ssize_t iio_backend_ext_info_set(struct iio_dev *indio_dev, uintptr_t private, const char *buf, size_t len); ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private, const struct iio_chan_spec *chan, char *buf); +int iio_backend_interface_type_get(struct iio_backend *back, + enum iio_backend_interface_type *type); int iio_backend_read_raw(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); -- cgit v1.2.3 From fc3fdb835eebb2ba88c6fdbf2c8b9eae1ceab106 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Fri, 14 Feb 2025 15:19:48 +0200 Subject: iio: backend: add support for data size set Add backend support for setting the data size used. This setting can be adjusted within the IP cores interfacing devices. Reviewed-by: Nuno Sa Reviewed-by: David Lechner Signed-off-by: Antoniu Miclaus Link: https://patch.msgid.link/20250214131955.31973-3-antoniu.miclaus@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index a0ea6c29d7ba..9ae861a21472 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -95,6 +95,7 @@ enum iio_backend_interface_type { * @ext_info_set: Extended info setter. * @ext_info_get: Extended info getter. * @interface_type_get: Interface type. + * @data_size_set: Data size. * @read_raw: Read a channel attribute from a backend device * @debugfs_print_chan_status: Print channel status into a buffer. * @debugfs_reg_access: Read or write register value of backend. @@ -137,6 +138,7 @@ struct iio_backend_ops { const struct iio_chan_spec *chan, char *buf); int (*interface_type_get)(struct iio_backend *back, enum iio_backend_interface_type *type); + int (*data_size_set)(struct iio_backend *back, unsigned int size); int (*read_raw)(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); @@ -197,6 +199,7 @@ ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private, const struct iio_chan_spec *chan, char *buf); int iio_backend_interface_type_get(struct iio_backend *back, enum iio_backend_interface_type *type); +int iio_backend_data_size_set(struct iio_backend *back, unsigned int size); int iio_backend_read_raw(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); -- cgit v1.2.3 From 22894e0be908791e3df011cdfac02589c2f340bd Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Fri, 14 Feb 2025 15:19:49 +0200 Subject: iio: backend: add API for oversampling Add backend support for setting oversampling ratio. 
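For illustration only (not part of this commit): a minimal sketch of a backend wiring up the new op; the axi_adc_* names are assumptions and the actual register programming is elided.

    #include <linux/iio/backend.h>

    static int axi_adc_oversampling_ratio_set(struct iio_backend *back,
                          unsigned int ratio)
    {
        /* program the oversampling/decimation ratio into the IP core */
        return 0;
    }

    static const struct iio_backend_ops axi_adc_ops = {
        .oversampling_ratio_set = axi_adc_oversampling_ratio_set,
    };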
Reviewed-by: David Lechner Signed-off-by: Antoniu Miclaus Link: https://patch.msgid.link/20250214131955.31973-4-antoniu.miclaus@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 9ae861a21472..e45b7dfbec35 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -96,6 +96,7 @@ enum iio_backend_interface_type { * @ext_info_get: Extended info getter. * @interface_type_get: Interface type. * @data_size_set: Data size. + * @oversampling_ratio_set: Set Oversampling ratio. * @read_raw: Read a channel attribute from a backend device * @debugfs_print_chan_status: Print channel status into a buffer. * @debugfs_reg_access: Read or write register value of backend. @@ -139,6 +140,8 @@ struct iio_backend_ops { int (*interface_type_get)(struct iio_backend *back, enum iio_backend_interface_type *type); int (*data_size_set)(struct iio_backend *back, unsigned int size); + int (*oversampling_ratio_set)(struct iio_backend *back, + unsigned int ratio); int (*read_raw)(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); @@ -200,6 +203,8 @@ ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private, int iio_backend_interface_type_get(struct iio_backend *back, enum iio_backend_interface_type *type); int iio_backend_data_size_set(struct iio_backend *back, unsigned int size); +int iio_backend_oversampling_ratio_set(struct iio_backend *back, + unsigned int ratio); int iio_backend_read_raw(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); -- cgit v1.2.3 From d795e38df4b7ebac1072bbf7d8a5500c1ea83332 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 9 Feb 2025 18:05:58 +0000 Subject: iio: core: Rework claim and release of direct mode to work with sparse. Initial thought was to do something similar to __cond_lock() do_iio_device_claim_direct_mode(iio_dev) ? : ({ __acquire(iio_dev); 0; }) + Appropriate static inline iio_device_release_direct_mode() However with that, sparse generates false positives. E.g. drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c:1811:17: warning: context imbalance in 'st_lsm6dsx_read_raw' - unexpected unlock So instead, this patch rethinks the return type and makes it more 'conditional lock like' (which is part of what is going on under the hood anyway) and return a boolean - true for successfully acquired, false for did not acquire. To allow a migration path given the rework is now non trivial, take a leaf out of the naming of the conditional guard we currently have for IIO device direct mode and drop the _mode postfix from the new functions giving iio_device_claim_direct() and iio_device_release_direct() Whilst the kernel supports __cond_acquires() upstream sparse does not yet do so. Hence rely on sparse expanding a static inline wrapper to explicitly see whether __acquire() is called. Note that even with the solution here, sparse sometimes gives false positives. However in the few cases seen they were complex code structures that benefited from simplification anyway. 
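For illustration only (not part of this commit): a minimal sketch of the driver-side pattern after conversion; foo_read_channel() is a stand-in for the actual device access.

    #include <linux/iio/iio.h>

    /* stand-in for the actual device access */
    static int foo_read_channel(struct iio_dev *indio_dev, int channel, int *val);

    static int foo_read_raw(struct iio_dev *indio_dev,
                struct iio_chan_spec const *chan,
                int *val, int *val2, long mask)
    {
        int ret;

        /* true means direct mode was claimed and must be released */
        if (!iio_device_claim_direct(indio_dev))
            return -EBUSY;

        ret = foo_read_channel(indio_dev, chan->channel, val);

        iio_device_release_direct(indio_dev);

        return ret ?: IIO_VAL_INT;
    }

Since the boolean helper discards the underlying error code, the caller picks the errno to return on failure (-EBUSY in this sketch) itself.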
Reviewed-by: David Lechner Reviewed-by: Nuno Sa Link: https://patch.msgid.link/20250209180624.701140-2-jic23@kernel.org Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 56161e02f002..5ed03e36178f 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include /* IIO TODO LIST */ @@ -662,6 +663,31 @@ int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp); int iio_device_claim_direct_mode(struct iio_dev *indio_dev); void iio_device_release_direct_mode(struct iio_dev *indio_dev); +/* + * Helper functions that allow claim and release of direct mode + * in a fashion that doesn't generate many false positives from sparse. + * Note this must remain static inline in the header so that sparse + * can see the __acquire() marking. Revisit when sparse supports + * __cond_acquires() + */ +static inline bool iio_device_claim_direct(struct iio_dev *indio_dev) +{ + int ret = iio_device_claim_direct_mode(indio_dev); + + if (ret) + return false; + + __acquire(iio_dev); + + return true; +} + +static inline void iio_device_release_direct(struct iio_dev *indio_dev) +{ + iio_device_release_direct_mode(indio_dev); + __release(indio_dev); +} + /* * This autocleanup logic is normally used via * iio_device_claim_direct_scoped(). -- cgit v1.2.3 From 4c571885898c5c98934d086f2ab11b5e27e4f41f Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 9 Feb 2025 18:06:24 +0000 Subject: iio: Drop iio_device_claim_direct_scoped() and related infrastructure Scoped conditional automated cleanup turned out to be harder to work with than expected. Despite several attempts to find a better solution non have surfaced. As such rip it out of the IIO code. Reviewed-by: David Lechner Reviewed-by: Nuno Sa Link: https://patch.msgid.link/20250209180624.701140-28-jic23@kernel.org Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'include') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 5ed03e36178f..07a0e8132e88 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -9,7 +9,6 @@ #include #include -#include #include #include #include @@ -688,32 +687,6 @@ static inline void iio_device_release_direct(struct iio_dev *indio_dev) __release(indio_dev); } -/* - * This autocleanup logic is normally used via - * iio_device_claim_direct_scoped(). - */ -DEFINE_GUARD(iio_claim_direct, struct iio_dev *, iio_device_claim_direct_mode(_T), - iio_device_release_direct_mode(_T)) - -DEFINE_GUARD_COND(iio_claim_direct, _try, ({ - struct iio_dev *dev; - int d = iio_device_claim_direct_mode(_T); - - if (d < 0) - dev = NULL; - else - dev = _T; - dev; - })) - -/** - * iio_device_claim_direct_scoped() - Scoped call to iio_device_claim_direct. - * @fail: What to do on failure to claim device. 
- * @iio_dev: Pointer to the IIO devices structure - */ -#define iio_device_claim_direct_scoped(fail, iio_dev) \ - scoped_cond_guard(iio_claim_direct_try, fail, iio_dev) - int iio_device_claim_buffer_mode(struct iio_dev *indio_dev); void iio_device_release_buffer_mode(struct iio_dev *indio_dev); -- cgit v1.2.3 From 8fd74a31eaf3def0d57264ada57fc981902aeadf Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sat, 8 Feb 2025 22:15:26 +0800 Subject: driver core: class: Remove needless return in void API class_remove_file() Remove return since both class_remove_file() and class_remove_file_ns() are void functions. Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250208-cls_rmv_return-v1-1-091b37945aac@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 45ee3a634999..65880e60c720 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -193,7 +193,7 @@ static inline int __must_check class_create_file(const struct class *class, static inline void class_remove_file(const struct class *class, const struct class_attribute *attr) { - return class_remove_file_ns(class, attr, NULL); + class_remove_file_ns(class, attr, NULL); } /* Simple class attribute that is just a static string */ -- cgit v1.2.3 From a44073c28bc6d4118891d61e31c9fa9dc4333dc0 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sat, 8 Feb 2025 23:18:39 +0800 Subject: driver core: Remove needless return in void API device_remove_group() Remove return since both device_remove_group() and device_remove_groups() are void functions. Fixes: e323b2dddc1c ("driver core: add device_{add|remove}_group() helpers") Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250208-fix_device_remove_group-v1-1-8a5b0ac0ce5c@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 80a5b3268986..605b60254f6d 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1268,7 +1268,7 @@ static inline void device_remove_group(struct device *dev, { const struct attribute_group *groups[] = { grp, NULL }; - return device_remove_groups(dev, groups); + device_remove_groups(dev, groups); } int __must_check devm_device_add_group(struct device *dev, -- cgit v1.2.3 From 10d43ecbb0121fee8ddbc1e9755edc0402e458ca Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 30 Jan 2025 01:26:54 +0000 Subject: mei: Remove unused functions The following functions have been in the mei code for a long time but have never been used. mei_txe_setup_satt2() was added in 2014 by commit 32e2b59fca2c ("mei: txe: add hw-txe.c") mei_me_cl_rm_by_uuid_id() was added in 2015 by commit 79563db9ddd3 ("mei: add reference counting for me clients") mei_cldev_uuid() was added in 2015 by commit baeacd037697 ("mei: bus: export uuid and protocol version to mei_cl bus drivers") mei_cldev_recv_nonblock() was added in 2016 by commit 076802d00615 ("mei: bus: enable non-blocking RX") it is the only user of mei_cldev_recv_nonblock_vtag(). Remove them. Signed-off-by: Dr. 
David Alan Gilbert Acked-by: Arnd Bergmann Reviewed-by: Alexander Usyskin Link: https://lore.kernel.org/r/20250130012654.255119-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mei_cl_bus.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index b38a56a13f39..725fd7727422 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -97,8 +97,6 @@ ssize_t mei_cldev_send(struct mei_cl_device *cldev, const u8 *buf, ssize_t mei_cldev_send_timeout(struct mei_cl_device *cldev, const u8 *buf, size_t length, unsigned long timeout); ssize_t mei_cldev_recv(struct mei_cl_device *cldev, u8 *buf, size_t length); -ssize_t mei_cldev_recv_nonblock(struct mei_cl_device *cldev, u8 *buf, - size_t length); ssize_t mei_cldev_recv_timeout(struct mei_cl_device *cldev, u8 *buf, size_t length, unsigned long timeout); ssize_t mei_cldev_send_vtag(struct mei_cl_device *cldev, const u8 *buf, @@ -107,8 +105,6 @@ ssize_t mei_cldev_send_vtag_timeout(struct mei_cl_device *cldev, const u8 *buf, size_t length, u8 vtag, unsigned long timeout); ssize_t mei_cldev_recv_vtag(struct mei_cl_device *cldev, u8 *buf, size_t length, u8 *vtag); -ssize_t mei_cldev_recv_nonblock_vtag(struct mei_cl_device *cldev, u8 *buf, - size_t length, u8 *vtag); ssize_t mei_cldev_recv_vtag_timeout(struct mei_cl_device *cldev, u8 *buf, size_t length, u8 *vtag, unsigned long timeout); @@ -116,7 +112,6 @@ int mei_cldev_register_rx_cb(struct mei_cl_device *cldev, mei_cldev_cb_t rx_cb); int mei_cldev_register_notif_cb(struct mei_cl_device *cldev, mei_cldev_cb_t notif_cb); -const uuid_le *mei_cldev_uuid(const struct mei_cl_device *cldev); u8 mei_cldev_ver(const struct mei_cl_device *cldev); void *mei_cldev_get_drvdata(const struct mei_cl_device *cldev); -- cgit v1.2.3 From 6866c91f8c2327546d850d5a85025fb81e2089bf Mon Sep 17 00:00:00 2001 From: Billy Tsai Date: Tue, 4 Feb 2025 17:17:01 +0800 Subject: i3c: Remove the const qualifier from i2c_msg pointer in i2c_xfers API The change is necessary to enable the use of the `i2c_get_dma_safe_msg_buf()` API, which requires a non-const `struct i2c_msg *` to operate. The `i2c_get_dma_safe_msg_buf()` function ensures safe handling of I2C messages when using DMA, making it essential for scenarios where DMA transfers are involved. By removing the `const` qualifier, this patch allows drivers to prepare and manage DMA-safe buffers directly. 
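For illustration only (not part of this commit): a sketch of the pattern the change enables inside a controller's ->i2c_xfers() path; foo_xfer_dma()/foo_xfer_pio() and the 8-byte threshold are assumptions.

    #include <linux/i2c.h>
    #include <linux/i3c/master.h>

    /* stand-ins for the controller's actual transfer paths */
    static int foo_xfer_dma(struct i2c_dev_desc *dev, struct i2c_msg *msg, u8 *buf);
    static int foo_xfer_pio(struct i2c_dev_desc *dev, struct i2c_msg *msg);

    static int foo_i2c_xfer_one(struct i2c_dev_desc *dev, struct i2c_msg *msg)
    {
        /* bounce messages of 8 bytes or more into a DMA-safe buffer */
        u8 *dma_buf = i2c_get_dma_safe_msg_buf(msg, 8);
        int ret;

        if (dma_buf)
            ret = foo_xfer_dma(dev, msg, dma_buf);
        else
            ret = foo_xfer_pio(dev, msg);

        /* copies read data back (on success) and frees the bounce buffer */
        i2c_put_dma_safe_msg_buf(dma_buf, msg, ret == 0);

        return ret;
    }

i2c_put_dma_safe_msg_buf() is a no-op when no bounce buffer was allocated, so the cleanup can stay unconditional.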
Signed-off-by: Billy Tsai Reviewed-by: Frank Li Reviewed-by: Wolfram Sang Acked-by: Mukesh Kumar Savaliya Link: https://lore.kernel.org/r/20250204091702.4014466-1-billy_tsai@aspeedtech.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 12d532b012c5..c67922ece617 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -475,7 +475,7 @@ struct i3c_master_controller_ops { int (*attach_i2c_dev)(struct i2c_dev_desc *dev); void (*detach_i2c_dev)(struct i2c_dev_desc *dev); int (*i2c_xfers)(struct i2c_dev_desc *dev, - const struct i2c_msg *xfers, int nxfers); + struct i2c_msg *xfers, int nxfers); int (*request_ibi)(struct i3c_dev_desc *dev, const struct i3c_ibi_setup *req); void (*free_ibi)(struct i3c_dev_desc *dev); -- cgit v1.2.3 From c749f058b4371430a8338e1ca72b9ae38fef613b Mon Sep 17 00:00:00 2001 From: Kannappan R Date: Thu, 20 Feb 2025 16:13:39 +0200 Subject: USB: core: Add eUSB2 descriptor and parsing in USB core Add support for the 'eUSB2 Isochronous Endpoint Companion Descriptor' introduced in the recent USB 2.0 specification 'USB 2.0 Double Isochronous IN Bandwidth' ECN. It allows embedded USB2 (eUSB2) devices to report and use higher bandwidths for isochronous IN transfers in order to support higher camera resolutions on the lid of laptops and tablets with minimal change to the USB2 protocol. The motivation for expanding USB 2.0 is further clarified in an additional Embedded USB2 version 2.0 (eUSB2v2) supplement to the USB 2.0 specification. It points out this is optimized for performance, power and cost by using the USB 2.0 low-voltage, power efficient PHY and half-duplex link for the asymmetric camera bandwidth needs, avoiding the costly and complex full-duplex USB 3.x symmetric link and gigabit receivers. eUSB2 devices that support the higher isochronous IN bandwidth and the new descriptor can be identified by their device descriptor bcdUSB value of 0x0220 Co-developed-by: Amardeep Rai Signed-off-by: Amardeep Rai Signed-off-by: Kannappan R Co-developed-by: Mathias Nyman Signed-off-by: Mathias Nyman Acked-by: Alan Stern Link: https://lore.kernel.org/r/20250220141339.1939448-1-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 8 +++++--- include/uapi/linux/usb/ch9.h | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/usb.h b/include/linux/usb.h index cfa8005e24f9..b46738701f8d 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -51,6 +51,7 @@ struct ep_device; * @desc: descriptor for this endpoint, wMaxPacketSize in native byteorder * @ss_ep_comp: SuperSpeed companion descriptor for this endpoint * @ssp_isoc_ep_comp: SuperSpeedPlus isoc companion descriptor for this endpoint + * @eusb2_isoc_ep_comp: eUSB2 isoc companion descriptor for this endpoint * @urb_list: urbs queued to this endpoint; maintained by usbcore * @hcpriv: for use by HCD; typically holds hardware dma queue head (QH) * with one or more transfer descriptors (TDs) per urb @@ -64,9 +65,10 @@ struct ep_device; * descriptor within an active interface in a given USB configuration. 
*/ struct usb_host_endpoint { - struct usb_endpoint_descriptor desc; - struct usb_ss_ep_comp_descriptor ss_ep_comp; - struct usb_ssp_isoc_ep_comp_descriptor ssp_isoc_ep_comp; + struct usb_endpoint_descriptor desc; + struct usb_ss_ep_comp_descriptor ss_ep_comp; + struct usb_ssp_isoc_ep_comp_descriptor ssp_isoc_ep_comp; + struct usb_eusb2_isoc_ep_comp_descriptor eusb2_isoc_ep_comp; struct list_head urb_list; void *hcpriv; struct ep_device *ep_dev; /* For sysfs info */ diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 91f0f7e214a5..475af9358173 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -253,6 +253,9 @@ struct usb_ctrlrequest { #define USB_DT_BOS 0x0f #define USB_DT_DEVICE_CAPABILITY 0x10 #define USB_DT_WIRELESS_ENDPOINT_COMP 0x11 +/* From the eUSB2 spec */ +#define USB_DT_EUSB2_ISOC_ENDPOINT_COMP 0x12 +/* From Wireless USB spec */ #define USB_DT_WIRE_ADAPTER 0x21 /* From USB Device Firmware Upgrade Specification, Revision 1.1 */ #define USB_DT_DFU_FUNCTIONAL 0x21 @@ -675,6 +678,18 @@ static inline int usb_endpoint_interrupt_type( /*-------------------------------------------------------------------------*/ +/* USB_DT_EUSB2_ISOC_ENDPOINT_COMP: eUSB2 Isoch Endpoint Companion descriptor */ +struct usb_eusb2_isoc_ep_comp_descriptor { + __u8 bLength; + __u8 bDescriptorType; + __le16 wMaxPacketSize; + __le32 dwBytesPerInterval; +} __attribute__ ((packed)); + +#define USB_DT_EUSB2_ISOC_EP_COMP_SIZE 8 + +/*-------------------------------------------------------------------------*/ + /* USB_DT_SSP_ISOC_ENDPOINT_COMP: SuperSpeedPlus Isochronous Endpoint Companion * descriptor */ -- cgit v1.2.3 From ac9c5170a18162d45c6edd1f0fa2d2b2504bc2cb Mon Sep 17 00:00:00 2001 From: Subramanian Mohan Date: Wed, 19 Feb 2025 09:36:15 +0530 Subject: pps: generators: replace copy of pps-gen info struct with const pointer Some PPS generator drivers may need to retrieve a pointer to their internal data while executing the PPS generator enable() method. During the driver registration the pps_gen_device pointer is returned from the framework, and for that reason, there is difficulty in getting generator driver data back in the enable function. We won't be able to use container_of macro as it results in static assert, and we might end up in using static pointer. To solve the issue and to get back the generator driver data back, we should not copy the struct pps_gen_source_info within the struct pps_gen_device during the registration stage, but simply save the pointer of the driver one. In this manner, driver may get a pointer to their internal data as shown below: struct pps_gen_foo_data_s { ... struct pps_gen_source_info gen_info; struct pps_gen_device *pps_gen; ... }; static int __init pps_gen_foo_init(void) { struct pps_gen_foo_data_s *foo; ... foo->pps_gen = pps_gen_register_source(&foo->gen_info); ... } Then, in the enable() method, we can retrieve the pointer to the main struct by using the code below: static int pps_gen_foo_enable(struct pps_gen_device *pps_gen, bool enable) { struct pps_gen_foo_data_s *foo = container_of(pps_gen->info, struct pps_gen_foo_data_s, gen_info); ... 
} Signed-off-by: Rodolfo Giometti Tested-by: Subramanian Mohan Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Signed-off-by: Subramanian Mohan Link: https://lore.kernel.org/r/20250219040618.70962-2-subramanian.mohan@intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/pps_gen_kernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pps_gen_kernel.h b/include/linux/pps_gen_kernel.h index 022ea0ac4440..6214c8aa2e02 100644 --- a/include/linux/pps_gen_kernel.h +++ b/include/linux/pps_gen_kernel.h @@ -43,7 +43,7 @@ struct pps_gen_source_info { /* The main struct */ struct pps_gen_device { - struct pps_gen_source_info info; /* PSS generator info */ + const struct pps_gen_source_info *info; /* PSS generator info */ bool enabled; /* PSS generator status */ unsigned int event; @@ -70,7 +70,7 @@ extern const struct attribute_group *pps_gen_groups[]; */ extern struct pps_gen_device *pps_gen_register_source( - struct pps_gen_source_info *info); + const struct pps_gen_source_info *info); extern void pps_gen_unregister_source(struct pps_gen_device *pps_gen); extern void pps_gen_event(struct pps_gen_device *pps_gen, unsigned int event, void *data); -- cgit v1.2.3 From 46006ceb5d029f92df405db15c9a31f0ee41628c Mon Sep 17 00:00:00 2001 From: Linu Cherian Date: Wed, 12 Feb 2025 17:19:13 +0530 Subject: coresight: core: Add provision for panic callbacks Panic callback handlers allow coresight device drivers to sync relevant trace data and trace metadata to reserved memory regions so that they can be retrieved later in the subsequent boot or in the crashdump kernel. Signed-off-by: Linu Cherian Reviewed-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250212114918.548431-4-lcherian@marvell.com --- include/linux/coresight.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 17276965ff1d..3eef4e91df3f 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -340,6 +340,7 @@ enum cs_mode { #define link_ops(csdev) csdev->ops->link_ops #define helper_ops(csdev) csdev->ops->helper_ops #define ect_ops(csdev) csdev->ops->ect_ops +#define panic_ops(csdev) csdev->ops->panic_ops /** * struct coresight_ops_sink - basic operations for a sink @@ -409,11 +410,22 @@ struct coresight_ops_helper { int (*disable)(struct coresight_device *csdev, void *data); }; + +/** + * struct coresight_ops_panic - Generic device ops for panic handing + * + * @sync : Sync the device register state/trace data + */ +struct coresight_ops_panic { + int (*sync)(struct coresight_device *csdev); +}; + struct coresight_ops { const struct coresight_ops_sink *sink_ops; const struct coresight_ops_link *link_ops; const struct coresight_ops_source *source_ops; const struct coresight_ops_helper *helper_ops; + const struct coresight_ops_panic *panic_ops; }; static inline u32 csdev_access_relaxed_read32(struct csdev_access *csa, -- cgit v1.2.3 From 3b29bcee8f6f703a5952b85fc2ffcbcfb0862db4 Mon Sep 17 00:00:00 2001 From: Robert Budai Date: Mon, 17 Feb 2025 12:57:45 +0200 Subject: iio: imu: adis: Add custom ops struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a custom ops struct letting users define custom read and write functions. Some adis devices might define a completely different spi protocol from the one used in the default implementation.
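As an illustration of how such a hook could be consumed (a sketch only, not code from this patch; the adis16xxx_* names are invented), a driver with a non-standard SPI protocol could supply its own accessors:

static int adis16xxx_read(struct adis *adis, unsigned int reg,
                          unsigned int *value, unsigned int size)
{
        /* device-specific SPI framing would go here */
        return 0;
}

static int adis16xxx_write(struct adis *adis, unsigned int reg,
                           unsigned int value, unsigned int size)
{
        /* device-specific SPI framing would go here */
        return 0;
}

static const struct adis_ops adis16xxx_ops = {
        .read = adis16xxx_read,
        .write = adis16xxx_write,
};

with adis->ops pointed at adis16xxx_ops during probe. Drivers that keep the default protocol are presumably left on an ops struct wrapping the existing __adis_read_reg()/__adis_write_reg() helpers, so the inline register accessors touched in the diff behave as before.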
Co-developed-by: Ramona Gradinariu Signed-off-by: Ramona Gradinariu Co-developed-by: Antoniu Miclaus Signed-off-by: Antoniu Miclaus Co-developed-by: Nuno Sá Signed-off-by: Nuno Sá Signed-off-by: Robert Budai Link: https://patch.msgid.link/20250217105753.605465-2-robert.budai@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 4bb98d9731de..89cfa75ae9ea 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -94,6 +94,18 @@ struct adis_data { unsigned int burst_max_speed_hz; }; +/** + * struct adis_ops: Custom ops for adis devices. + * @write: Custom spi write implementation. + * @read: Custom spi read implementation. + */ +struct adis_ops { + int (*write)(struct adis *adis, unsigned int reg, unsigned int value, + unsigned int size); + int (*read)(struct adis *adis, unsigned int reg, unsigned int *value, + unsigned int size); +}; + /** * struct adis - ADIS device instance data * @spi: Reference to SPI device which owns this ADIS IIO device @@ -101,6 +113,7 @@ struct adis_data { * @data: ADIS chip variant specific data * @burst_extra_len: Burst extra length. Should only be used by devices that can * dynamically change their burst mode length. + * @ops: ops struct for custom read and write functions * @state_lock: Lock used by the device to protect state * @msg: SPI message object * @xfer: SPI transfer objects to be used for a @msg @@ -116,6 +129,7 @@ struct adis { const struct adis_data *data; unsigned int burst_extra_len; + const struct adis_ops *ops; /** * The state_lock is meant to be used during operations that require * a sequence of SPI R/W in order to protect the SPI transfer @@ -168,7 +182,7 @@ int __adis_read_reg(struct adis *adis, unsigned int reg, static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg, u8 val) { - return __adis_write_reg(adis, reg, val, 1); + return adis->ops->write(adis, reg, val, 1); } /** @@ -180,7 +194,7 @@ static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg, static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg, u16 val) { - return __adis_write_reg(adis, reg, val, 2); + return adis->ops->write(adis, reg, val, 2); } /** @@ -192,7 +206,7 @@ static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg, static inline int __adis_write_reg_32(struct adis *adis, unsigned int reg, u32 val) { - return __adis_write_reg(adis, reg, val, 4); + return adis->ops->write(adis, reg, val, 4); } /** @@ -207,7 +221,7 @@ static inline int __adis_read_reg_16(struct adis *adis, unsigned int reg, unsigned int tmp; int ret; - ret = __adis_read_reg(adis, reg, &tmp, 2); + ret = adis->ops->read(adis, reg, &tmp, 2); if (ret == 0) *val = tmp; @@ -226,7 +240,7 @@ static inline int __adis_read_reg_32(struct adis *adis, unsigned int reg, unsigned int tmp; int ret; - ret = __adis_read_reg(adis, reg, &tmp, 4); + ret = adis->ops->read(adis, reg, &tmp, 4); if (ret == 0) *val = tmp; @@ -244,7 +258,7 @@ static inline int adis_write_reg(struct adis *adis, unsigned int reg, unsigned int val, unsigned int size) { guard(mutex)(&adis->state_lock); - return __adis_write_reg(adis, reg, val, size); + return adis->ops->write(adis, reg, val, size); } /** @@ -258,7 +272,7 @@ static int adis_read_reg(struct adis *adis, unsigned int reg, unsigned int *val, unsigned int size) { guard(mutex)(&adis->state_lock); - 
return __adis_read_reg(adis, reg, val, size); + return adis->ops->read(adis, reg, val, size); } /** -- cgit v1.2.3 From 7f15d7a7d12dbcb4a846ca19d9565e0c11ea6694 Mon Sep 17 00:00:00 2001 From: Robert Budai Date: Mon, 17 Feb 2025 12:57:46 +0200 Subject: iio: imu: adis: Add reset to custom ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch allows the custom definition of reset functionality for adis object. It is useful in cases where the driver does not need to sleep after the reset since it is handled by the library. Co-developed-by: Ramona Gradinariu Signed-off-by: Ramona Gradinariu Co-developed-by: Antoniu Miclaus Signed-off-by: Antoniu Miclaus Co-developed-by: Nuno Sá Signed-off-by: Nuno Sá Signed-off-by: Robert Budai Link: https://patch.msgid.link/20250217105753.605465-3-robert.budai@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 89cfa75ae9ea..52652f51db2e 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -98,12 +98,15 @@ struct adis_data { * struct adis_ops: Custom ops for adis devices. * @write: Custom spi write implementation. * @read: Custom spi read implementation. + * @reset: Custom sw reset implementation. The custom implementation does not + * need to sleep after the reset. It's done by the library already. */ struct adis_ops { int (*write)(struct adis *adis, unsigned int reg, unsigned int value, unsigned int size); int (*read)(struct adis *adis, unsigned int reg, unsigned int *value, unsigned int size); + int (*reset)(struct adis *adis); }; /** -- cgit v1.2.3 From 9fa98d94131806cae0441d05213d36da480d7389 Mon Sep 17 00:00:00 2001 From: Robert Budai Date: Mon, 17 Feb 2025 12:57:47 +0200 Subject: iio: imu: adis: Add DIAG_STAT register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some devices may have more than 16 bits of status. This patch allows the user to specify the size of the DIAG_STAT register. It defaults to 2 if not specified. This is mainly for backward compatibility. Co-developed-by: Ramona Gradinariu Signed-off-by: Ramona Gradinariu Co-developed-by: Antoniu Miclaus Signed-off-by: Antoniu Miclaus Co-developed-by: Nuno Sá Signed-off-by: Nuno Sá Signed-off-by: Robert Budai Link: https://patch.msgid.link/20250217105753.605465-4-robert.budai@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 52652f51db2e..aa160511e265 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -44,6 +44,8 @@ struct adis_timeout { * @glob_cmd_reg: Register address of the GLOB_CMD register * @msc_ctrl_reg: Register address of the MSC_CTRL register * @diag_stat_reg: Register address of the DIAG_STAT register + * @diag_stat_size: Length (in bytes) of the DIAG_STAT register. If 0 the + * default length is 2 bytes long. 
* @prod_id_reg: Register address of the PROD_ID register * @prod_id: Product ID code that should be expected when reading @prod_id_reg * @self_test_mask: Bitmask of supported self-test operations @@ -70,6 +72,7 @@ struct adis_data { unsigned int glob_cmd_reg; unsigned int msc_ctrl_reg; unsigned int diag_stat_reg; + unsigned int diag_stat_size; unsigned int prod_id_reg; unsigned int prod_id; -- cgit v1.2.3 From 4ff6039ffb79a4a8a44b63810a8a2f2b43264856 Mon Sep 17 00:00:00 2001 From: Yuanfang Zhang Date: Thu, 16 Jan 2025 17:04:20 +0800 Subject: coresight-etm4x: add isb() before reading the TRCSTATR As recommended by section 4.3.7 ("Synchronization when using system instructions to program the trace unit") of ARM IHI 0064H.b, the self-hosted trace analyzer must perform a Context synchronization event between writing to the TRCPRGCTLR and reading the TRCSTATR. Additionally, add an ISB between each read of TRCSTATR on coresight_timeout() when using system instructions to program the trace unit. Fixes: 1ab3bb9df5e3 ("coresight: etm4x: Add necessary synchronization for sysreg access") Signed-off-by: Yuanfang Zhang Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250116-etm_sync-v4-1-39f2b05e9514@quicinc.com --- include/linux/coresight.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 3eef4e91df3f..c7614ccb3232 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -661,6 +661,10 @@ extern int coresight_enable_sysfs(struct coresight_device *csdev); extern void coresight_disable_sysfs(struct coresight_device *csdev); extern int coresight_timeout(struct csdev_access *csa, u32 offset, int position, int value); +typedef void (*coresight_timeout_cb_t) (struct csdev_access *, u32, int, int); +extern int coresight_timeout_action(struct csdev_access *csa, u32 offset, + int position, int value, + coresight_timeout_cb_t cb); extern int coresight_claim_device(struct coresight_device *csdev); extern int coresight_claim_device_unlocked(struct coresight_device *csdev); -- cgit v1.2.3 From 84b25926fa7abb5b634d4af9544f47ab0aabc399 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 26 Feb 2025 09:21:18 -0700 Subject: acpi: numa: Add support to enumerate and store extended linear address mode Store the address mode as part of the cache attributes. Export the mode attribute to sysfs like all other cache attributes.
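As a rough illustration of where the new field would be filled in (a sketch under assumptions, not code from this patch; the parameter names are made up and the existing node_add_cache() registration helper is assumed), the HMAT memory-side-cache parser would do something along these lines:

/* Illustrative only: propagate parsed HMAT cache info, including the
 * new address mode, into the node cache attributes. */
static void hmat_register_cache(unsigned int nid, u64 size, u16 line_size,
                                u8 level, u16 address_mode)
{
        struct node_cache_attrs cache_attrs = {
                .size = size,
                .line_size = line_size,
                .level = level,
                /* e.g. NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR */
                .address_mode = address_mode,
        };

        node_add_cache(nid, &cache_attrs);
}

The sysfs side then only needs one more read-only attribute so that address_mode is reported alongside size, line_size and level.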
Link: https://lore.kernel.org/linux-cxl/668333b17e4b2_5639294fd@dwillia2-xfh.jf.intel.com.notmuch/ Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20250226162224.3633792-2-dave.jiang@intel.com Signed-off-by: Dave Jiang --- include/linux/node.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/node.h b/include/linux/node.h index 9a881c2208b3..2b7517892230 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -57,6 +57,11 @@ enum cache_write_policy { NODE_CACHE_WRITE_OTHER, }; +enum cache_mode { + NODE_CACHE_ADDR_MODE_RESERVED, + NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR, +}; + /** * struct node_cache_attrs - system memory caching attributes * * @indexing: The ways memory blocks may be placed in cache * @write_policy: Write back or write through policy * @size: Total size of cache in bytes * @line_size: Number of bytes fetched on a cache miss * @level: The cache hierarchy level + * @address_mode: The address mode */ struct node_cache_attrs { enum cache_indexing indexing; @@ -72,6 +78,7 @@ enum cache_write_policy { u64 size; u16 line_size; u8 level; + u16 address_mode; }; #ifdef CONFIG_HMEM_REPORTING -- cgit v1.2.3 From 0ec9849b63338da7883440ed3f52757cd8c847b1 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 26 Feb 2025 09:21:19 -0700 Subject: acpi/hmat / cxl: Add extended linear cache support for CXL The current cxl region size only indicates the size of the CXL memory region without accounting for the extended linear cache size. Retrieve the cache size from HMAT and append that to the cxl region size for the cxl region range that matches the SRAT range that has extended linear cache enabled. The SRAT defines the whole memory range that includes the extended linear cache and the CXL memory region. The new HMAT ECN/ECR to the Memory Side Cache Information Structure defines the size of the extended linear cache and matches it to the SRAT Memory Affinity Structure by the memory proximity domain. Add a helper to match the cxl range to the SRAT memory range in order to retrieve the cache size. There are several places that check the cxl region range against the decoder range. Use the new helper to check between the two ranges and address the new cache size. Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20250226162224.3633792-3-dave.jiang@intel.com Signed-off-by: Dave Jiang --- include/linux/acpi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4e495b29c640..cbd933504dbf 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1095,6 +1095,17 @@ static inline acpi_handle acpi_get_processor_handle(int cpu) #endif /* !CONFIG_ACPI */ +#ifdef CONFIG_ACPI_HMAT +int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, + resource_size_t *size); +#else +static inline int hmat_get_extended_linear_cache_size(struct resource *backing_res, + int nid, resource_size_t *size) +{ + return -EOPNOTSUPP; +} +#endif + extern void arch_post_acpi_subsys_init(void); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC -- cgit v1.2.3 From 0da30874729baeb01889b0eca16cfda122687503 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 24 Feb 2025 16:04:17 +0200 Subject: dmaengine: ti: k3-udma-glue: Drop skip_fdq argument from k3_udma_glue_reset_rx_chn The user of k3_udma_glue_reset_rx_chn() e.g. ti_am65_cpsw_nuss can run on multiple platforms having different DMA architectures.
On some platforms there can be one FDQ for all flows in the RX channel while for others there is a separate FDQ for each flow in the RX channel. So far we have been relying on the skip_fdq argument of k3_udma_glue_reset_rx_chn(). Instead of relying on the user to provide this information, infer it based on DMA architecture during k3_udma_glue_request_rx_chn() and save it in an internal flag 'single_fdq'. Use that flag at k3_udma_glue_reset_rx_chn() to deicide if the FDQ needs to be cleared for every flow or just for flow 0. Fixes the below issue on ti_am65_cpsw_nuss driver on AM62-SK. > ip link set eth1 down > ip link set eth0 down > ethtool -L eth0 rx 8 > ip link set eth0 up > modprobe -r ti_am65_cpsw_nuss [ 103.045726] ------------[ cut here ]------------ [ 103.050505] k3_knav_desc_pool size 512000 != avail 64000 [ 103.050703] WARNING: CPU: 1 PID: 450 at drivers/net/ethernet/ti/k3-cppi-desc-pool.c:33 k3_cppi_desc_pool_destroy+0xa0/0xa8 [k3_cppi_desc_pool] [ 103.068810] Modules linked in: ti_am65_cpsw_nuss(-) k3_cppi_desc_pool snd_soc_hdmi_codec crct10dif_ce snd_soc_simple_card snd_soc_simple_card_utils display_connector rtc_ti_k3 k3_j72xx_bandgap tidss drm_client_lib snd_soc_davinci_mcas p drm_dma_helper tps6598x phylink snd_soc_ti_udma rti_wdt drm_display_helper snd_soc_tlv320aic3x_i2c typec at24 phy_gmii_sel snd_soc_ti_edma snd_soc_tlv320aic3x sii902x snd_soc_ti_sdma sa2ul omap_mailbox drm_kms_helper authenc cfg80211 r fkill fuse drm drm_panel_orientation_quirks backlight ip_tables x_tables ipv6 [last unloaded: k3_cppi_desc_pool] [ 103.119950] CPU: 1 UID: 0 PID: 450 Comm: modprobe Not tainted 6.13.0-rc7-00001-g9c5e3435fa66 #1011 [ 103.119968] Hardware name: Texas Instruments AM625 SK (DT) [ 103.119974] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 103.119983] pc : k3_cppi_desc_pool_destroy+0xa0/0xa8 [k3_cppi_desc_pool] [ 103.148007] lr : k3_cppi_desc_pool_destroy+0xa0/0xa8 [k3_cppi_desc_pool] [ 103.154709] sp : ffff8000826ebbc0 [ 103.158015] x29: ffff8000826ebbc0 x28: ffff0000090b6300 x27: 0000000000000000 [ 103.165145] x26: 0000000000000000 x25: 0000000000000000 x24: ffff0000019df6b0 [ 103.172271] x23: ffff0000019df6b8 x22: ffff0000019df410 x21: ffff8000826ebc88 [ 103.179397] x20: 000000000007d000 x19: ffff00000a3b3000 x18: 0000000000000000 [ 103.186522] x17: 0000000000000000 x16: 0000000000000000 x15: 000001e8c35e1cde [ 103.193647] x14: 0000000000000396 x13: 000000000000035c x12: 0000000000000000 [ 103.200772] x11: 000000000000003a x10: 00000000000009c0 x9 : ffff8000826eba20 [ 103.207897] x8 : ffff0000090b6d20 x7 : ffff00007728c180 x6 : ffff00007728c100 [ 103.215022] x5 : 0000000000000001 x4 : ffff000000508a50 x3 : ffff7ffff6146000 [ 103.222147] x2 : 0000000000000000 x1 : e300b4173ee6b200 x0 : 0000000000000000 [ 103.229274] Call trace: [ 103.231714] k3_cppi_desc_pool_destroy+0xa0/0xa8 [k3_cppi_desc_pool] (P) [ 103.238408] am65_cpsw_nuss_free_rx_chns+0x28/0x4c [ti_am65_cpsw_nuss] [ 103.244942] devm_action_release+0x14/0x20 [ 103.249040] release_nodes+0x3c/0x68 [ 103.252610] devres_release_all+0x8c/0xdc [ 103.256614] device_unbind_cleanup+0x18/0x60 [ 103.260876] device_release_driver_internal+0xf8/0x178 [ 103.266004] driver_detach+0x50/0x9c [ 103.269571] bus_remove_driver+0x6c/0xbc [ 103.273485] driver_unregister+0x30/0x60 [ 103.277401] platform_driver_unregister+0x14/0x20 [ 103.282096] am65_cpsw_nuss_driver_exit+0x18/0xff4 [ti_am65_cpsw_nuss] [ 103.288620] __arm64_sys_delete_module+0x17c/0x25c [ 103.293404] invoke_syscall+0x44/0x100 [ 103.297149] 
el0_svc_common.constprop.0+0xc0/0xe0 [ 103.301845] do_el0_svc+0x1c/0x28 [ 103.305155] el0_svc+0x28/0x98 [ 103.308207] el0t_64_sync_handler+0xc8/0xcc [ 103.312384] el0t_64_sync+0x198/0x19c [ 103.316040] ---[ end trace 0000000000000000 ]--- Signed-off-by: Roger Quadros Acked-by: Jakub Kicinski Acked-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20250224-k3-udma-glue-single-fdq-v2-1-cbe7621f2507@kernel.org Signed-off-by: Vinod Koul --- include/linux/dma/k3-udma-glue.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h index 2dea217629d0..5d43881e6fb7 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -138,8 +138,7 @@ int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_num); void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_num, void *data, - void (*cleanup)(void *data, dma_addr_t desc_dma), - bool skip_fdq); + void (*cleanup)(void *data, dma_addr_t desc_dma)); int k3_udma_glue_rx_flow_enable(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_idx); int k3_udma_glue_rx_flow_disable(struct k3_udma_glue_rx_channel *rx_chn, -- cgit v1.2.3 From 62fb8adc43afad5fa1c9cadc6f3a8e9fb72af194 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 18 Feb 2025 15:22:05 -0700 Subject: mm: Provide address mask in struct follow_pfnmap_args follow_pfnmap_start() walks the page table for a given address and fills out the struct follow_pfnmap_args in pfnmap_args_setup(). The address mask of the page table level is already provided to this latter function for calculating the pfn. This address mask can also be useful for the caller to determine the extent of the contiguous mapping. For example, vfio-pci now supports huge_fault for pfnmaps and is able to insert pud and pmd mappings. When we DMA map these pfnmaps, ex. PCI MMIO BARs, we iterate follow_pfnmap_start() to get each pfn to test for a contiguous pfn range. Providing the mapping address mask allows us to skip the extent of the mapping level. Assuming a 1GB pud level and 4KB page size, iterations are reduced by a factor of 256K. In wall clock time, mapping a 32GB PCI BAR is reduced from ~1s to <1ms. Cc: Andrew Morton Cc: David Hildenbrand Cc: linux-mm@kvack.org Reviewed-by: Peter Xu Reviewed-by: Mitchell Augustin Tested-by: Mitchell Augustin Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Link: https://lore.kernel.org/r/20250218222209.1382449-6-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..92b30dba7e38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2417,11 +2417,13 @@ struct follow_pfnmap_args { * Outputs: * * @pfn: the PFN of the address + * @addr_mask: address mask covering pfn * @pgprot: the pgprot_t of the mapping * @writable: whether the mapping is writable * @special: whether the mapping is a special mapping (real PFN maps) */ unsigned long pfn; + unsigned long addr_mask; pgprot_t pgprot; bool writable; bool special; -- cgit v1.2.3 From 9fab91e0eae3a9ec1861119877a61d571f21536a Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 23 Feb 2025 16:06:02 +0000 Subject: usb: ulpi: Remove unused otg_ulpi_create otg_ulpi_create() has been unused since 2022's commit 8ca79aaad8be ("ARM: pxa: remove unused pxa3xx-ulpi") Remove it. 
The devm_ variant is still used. Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20250223160602.91916-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/ulpi.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/usb/ulpi.h b/include/linux/usb/ulpi.h index 5050f502c1ed..4b651065738a 100644 --- a/include/linux/usb/ulpi.h +++ b/include/linux/usb/ulpi.h @@ -49,19 +49,10 @@ /*-------------------------------------------------------------------------*/ #if IS_ENABLED(CONFIG_USB_ULPI) -struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops, - unsigned int flags); - struct usb_phy *devm_otg_ulpi_create(struct device *dev, struct usb_phy_io_ops *ops, unsigned int flags); #else -static inline struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops, - unsigned int flags) -{ - return NULL; -} - static inline struct usb_phy *devm_otg_ulpi_create(struct device *dev, struct usb_phy_io_ops *ops, unsigned int flags) -- cgit v1.2.3 From dc872c5f527a07124b2ad92d8c70d11829f17ac9 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:22 +0800 Subject: Coresight: Add support for new APB clock name Add support for new APB clock-name. If the function fails to obtain the clock with the name "apb_pclk", it will attempt to acquire the clock with the name "apb". Reviewed-by: James Clark Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-2-quic_jiegan@quicinc.com --- include/linux/coresight.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index c7614ccb3232..2e493d5288d8 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -471,8 +471,11 @@ static inline struct clk *coresight_get_enable_apb_pclk(struct device *dev) int ret; pclk = clk_get(dev, "apb_pclk"); - if (IS_ERR(pclk)) - return NULL; + if (IS_ERR(pclk)) { + pclk = clk_get(dev, "apb"); + if (IS_ERR(pclk)) + return NULL; + } ret = clk_prepare_enable(pclk); if (ret) { -- cgit v1.2.3 From c367a89dec267c65b556ace5d6aafd07fb1d66b1 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:23 +0800 Subject: Coresight: Add trace_id function to retrieving the trace ID Add 'trace_id' function pointer in coresight_ops. It's responsible for retrieving the device's trace ID. 
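To make the shape of the new callback concrete (an illustrative sketch, not code lifted from a driver; the etmX_* names are placeholders), a source driver would wire it up roughly as:

static int etmX_trace_id(struct coresight_device *csdev, enum cs_mode mode,
                         struct coresight_device *sink)
{
        /* return the trace ID this source will emit for the given mode/sink */
        return coresight_etm_get_trace_id(csdev, mode, sink);
}

static const struct coresight_ops etmX_cs_ops = {
        .trace_id = etmX_trace_id,
};

allowing the core to query any source through coresight_ops(csdev)->trace_id() without knowing which driver it is dealing with.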
Co-developed-by: James Clark Signed-off-by: James Clark Reviewed-by: James Clark Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-3-quic_jiegan@quicinc.com --- include/linux/coresight.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2e493d5288d8..1396bda77f3f 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -335,6 +335,7 @@ enum cs_mode { CS_MODE_PERF, }; +#define coresight_ops(csdev) csdev->ops #define source_ops(csdev) csdev->ops->source_ops #define sink_ops(csdev) csdev->ops->sink_ops #define link_ops(csdev) csdev->ops->link_ops @@ -421,6 +422,8 @@ struct coresight_ops_panic { }; struct coresight_ops { + int (*trace_id)(struct coresight_device *csdev, enum cs_mode mode, + struct coresight_device *sink); const struct coresight_ops_sink *sink_ops; const struct coresight_ops_link *link_ops; const struct coresight_ops_source *source_ops; @@ -713,4 +716,6 @@ int coresight_init_driver(const char *drv, struct amba_driver *amba_drv, void coresight_remove_driver(struct amba_driver *amba_drv, struct platform_driver *pdev_drv); +int coresight_etm_get_trace_id(struct coresight_device *csdev, enum cs_mode mode, + struct coresight_device *sink); #endif /* _LINUX_COREISGHT_H */ -- cgit v1.2.3 From 3c03c49b2fa59c4d456e2d673fe5847fa6cbcdf2 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:25 +0800 Subject: Coresight: Introduce a new struct coresight_path Introduce a new structure, 'struct coresight_path', to store the data that is utilized by the devices in the path. The coresight_path will be built/released by coresight_build_path/coresight_release_path functions. Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-5-quic_jiegan@quicinc.com --- include/linux/coresight.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 1396bda77f3f..509e1f256412 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -329,6 +329,16 @@ static struct coresight_dev_list (var) = { \ #define to_coresight_device(d) container_of(d, struct coresight_device, dev) +/** + * struct coresight_path - data needed by enable/disable path + * @path_list: path from source to sink. + * @trace_id: trace_id of the whole path. + */ +struct coresight_path { + struct list_head path_list; + u8 trace_id; +}; + enum cs_mode { CS_MODE_DISABLED, CS_MODE_SYSFS, -- cgit v1.2.3 From 7b365f056d8e02fc70c823bdf736e41a7236a54b Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:27 +0800 Subject: Coresight: Change to read the trace ID from coresight_path The source device can directly read the trace ID from the coresight_path which results in etm_read_alloc_trace_id and etm4_read_alloc_trace_id being deleted.
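For illustration (a sketch only, not taken from the etm drivers; etmX_enable is a placeholder name), the updated source enable() path can now do:

static int etmX_enable(struct coresight_device *csdev, struct perf_event *event,
                       enum cs_mode mode, struct coresight_path *path)
{
        /* the ID allocated once for the whole path is simply read back */
        u8 trace_id = path->trace_id;

        /* ... program trace_id into the source and start tracing ... */
        return trace_id ? 0 : -EINVAL;
}

which is what makes the per-driver etm_read_alloc_trace_id()/etm4_read_alloc_trace_id() helpers redundant.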
Co-developed-by: James Clark Signed-off-by: James Clark Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-7-quic_jiegan@quicinc.com --- include/linux/coresight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 509e1f256412..bc0c853ffa6d 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -401,7 +401,7 @@ struct coresight_ops_link { struct coresight_ops_source { int (*cpu_id)(struct coresight_device *csdev); int (*enable)(struct coresight_device *csdev, struct perf_event *event, - enum cs_mode mode, struct coresight_trace_id_map *id_map); + enum cs_mode mode, struct coresight_path *path); void (*disable)(struct coresight_device *csdev, struct perf_event *event); }; -- cgit v1.2.3 From f78d206f3d73446e4c3568946a26f903da2149b4 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:30 +0800 Subject: Coresight: Add Coresight TMC Control Unit driver The Coresight TMC Control Unit hosts miscellaneous configuration registers which control various features related to TMC ETR sink. Based on the trace ID, which is programmed in the related CTCU ATID register of a specific ETR, trace data with that trace ID gets into the ETR buffer, while other trace data gets dropped. Enabling source device sets one bit of the ATID register based on source device's trace ID. Disabling source device resets the bit according to the source device's trace ID. Reviewed-by: James Clark Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-10-quic_jiegan@quicinc.com --- include/linux/coresight.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index bc0c853ffa6d..89b781e70fe7 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -71,7 +71,8 @@ enum coresight_dev_subtype_source { enum coresight_dev_subtype_helper { CORESIGHT_DEV_SUBTYPE_HELPER_CATU, - CORESIGHT_DEV_SUBTYPE_HELPER_ECT_CTI + CORESIGHT_DEV_SUBTYPE_HELPER_ECT_CTI, + CORESIGHT_DEV_SUBTYPE_HELPER_CTCU, }; /** -- cgit v1.2.3 From e5d5813968217b99ef2b83f13353967b218e3841 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bence=20Cs=C3=B3k=C3=A1s?= Date: Thu, 6 Mar 2025 14:44:36 +0100 Subject: counter: microchip-tcb-capture: Add IRQ handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add interrupt servicing to allow userspace to wait for the following: * Change-of-state caused by external trigger * Capture of timer value into RA/RB * Compare to RC register * Overflow Signed-off-by: Bence Csókás Link: https://lore.kernel.org/r/20250306134441.582819-2-csokas.bence@prolan.hu Signed-off-by: William Breathitt Gray --- include/uapi/linux/counter/microchip-tcb-capture.h | 34 ++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 include/uapi/linux/counter/microchip-tcb-capture.h (limited to 'include') diff --git a/include/uapi/linux/counter/microchip-tcb-capture.h b/include/uapi/linux/counter/microchip-tcb-capture.h new file mode 100644 index 000000000000..f3ef315fe9f6 --- /dev/null +++ b/include/uapi/linux/counter/microchip-tcb-capture.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Channel numbers used by the microchip-tcb-capture driver + * Copyright (C) 2025 Bence Csókás + */ +#ifndef _UAPI_COUNTER_MCHP_TCB_H_ 
+#define _UAPI_COUNTER_MCHP_TCB_H_ + +/* + * The driver defines the following components: + * + * Count 0 + * \__ Synapse 0 -- Signal 0 (Channel A, i.e. TIOA) + * \__ Synapse 1 -- Signal 1 (Channel B, i.e. TIOB) + * + * It also supports the following events: + * + * Channel 0: + * - CV register changed + * - CV overflowed + * - RA captured + * Channel 1: + * - RB captured + * Channel 2: + * - RC compare triggered + */ + +/* Event channels */ +#define COUNTER_MCHP_EVCHN_CV 0 +#define COUNTER_MCHP_EVCHN_RA 0 +#define COUNTER_MCHP_EVCHN_RB 1 +#define COUNTER_MCHP_EVCHN_RC 2 + +#endif /* _UAPI_COUNTER_MCHP_TCB_H_ */ -- cgit v1.2.3 From 1adc6240a80278c613f655b71c6c0d447b2d5932 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bence=20Cs=C3=B3k=C3=A1s?= Date: Thu, 6 Mar 2025 14:44:37 +0100 Subject: counter: microchip-tcb-capture: Add capture extensions for registers RA/RB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TCB hardware is capable of capturing the timer value to registers RA and RB. Add these registers as capture extensions. Signed-off-by: Bence Csókás Link: https://lore.kernel.org/r/20250306134441.582819-3-csokas.bence@prolan.hu Signed-off-by: William Breathitt Gray --- include/uapi/linux/counter/microchip-tcb-capture.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/counter/microchip-tcb-capture.h b/include/uapi/linux/counter/microchip-tcb-capture.h index f3ef315fe9f6..136e2faa7730 100644 --- a/include/uapi/linux/counter/microchip-tcb-capture.h +++ b/include/uapi/linux/counter/microchip-tcb-capture.h @@ -12,6 +12,8 @@ * Count 0 * \__ Synapse 0 -- Signal 0 (Channel A, i.e. TIOA) * \__ Synapse 1 -- Signal 1 (Channel B, i.e. TIOB) + * \__ Extension capture0 (RA register) + * \__ Extension capture1 (RB register) * * It also supports the following events: * @@ -25,6 +27,10 @@ * - RC compare triggered */ +/* Capture extensions */ +#define COUNTER_MCHP_EXCAP_RA 0 +#define COUNTER_MCHP_EXCAP_RB 1 + /* Event channels */ #define COUNTER_MCHP_EVCHN_CV 0 #define COUNTER_MCHP_EVCHN_RA 0 -- cgit v1.2.3 From df896e4f7cf5cc3abffb186e2b6815b785500b57 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:02 +0800 Subject: soundwire: extend sdw_stream_type to BPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPT/BRA need to be special cased, i.e. there's no point in using the bandwidth allocation since the entire frame can be used. 
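To illustrate what the special casing amounts to (a hedged sketch, not code from this series; the helper name below is invented), the parameter/bandwidth computation reached from sdw_compute_params() can simply bypass BPT streams:

static int sdw_compute_stream_params(struct sdw_stream_runtime *stream)
{
        /* BPT/BRA owns the whole frame: no bandwidth bookkeeping needed */
        if (stream->type == SDW_STREAM_BPT)
                return 0;

        /* ... regular PCM/PDM bandwidth allocation ... */
        return 0;
}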
Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-4-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 2d6c30317792..857b670eb9a5 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -150,12 +150,14 @@ enum sdw_dpn_pkg_mode { * * @SDW_STREAM_PCM: PCM data stream * @SDW_STREAM_PDM: PDM data stream + * @SDW_STREAM_BPT: BPT data stream * * spec doesn't define this, but is used in implementation */ enum sdw_stream_type { SDW_STREAM_PCM = 0, SDW_STREAM_PDM = 1, + SDW_STREAM_BPT = 2, }; /** @@ -879,7 +881,7 @@ struct sdw_port_config { * @ch_count: Channel count of the stream * @bps: Number of bits per audio sample * @direction: Data direction - * @type: Stream type PCM or PDM + * @type: Stream type PCM, PDM or BPT */ struct sdw_stream_config { unsigned int frame_rate; @@ -929,7 +931,7 @@ struct sdw_stream_params { * @name: SoundWire stream name * @params: Stream parameters * @state: Current state of the stream - * @type: Stream type PCM or PDM + * @type: Stream type PCM, PDM or BPT * @m_rt_count: Count of Master runtime(s) in this stream * @master_list: List of Master runtime(s) in this stream. * master_list can contain only one m_rt per Master instance -- cgit v1.2.3 From dc90bbefa792031d89fe2af9ad4a6febd6be96a9 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:03 +0800 Subject: soundwire: stream: extend sdw_alloc_stream() to take 'type' parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the existing definition of sdw_stream_runtime, the 'type' member is never set and defaults to PCM. To prepare for the BPT/BRA support, we need to special-case streams and make use of the 'type'. No functional change for now, the implicit PCM type is now explicit. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-5-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 857b670eb9a5..ba58e2a52322 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -1019,7 +1019,7 @@ struct sdw_bus { unsigned int lane_used_bandwidth[SDW_MAX_LANES]; }; -struct sdw_stream_runtime *sdw_alloc_stream(const char *stream_name); +struct sdw_stream_runtime *sdw_alloc_stream(const char *stream_name, enum sdw_stream_type type); void sdw_release_stream(struct sdw_stream_runtime *stream); int sdw_compute_params(struct sdw_bus *bus, struct sdw_stream_runtime *stream); -- cgit v1.2.3 From 9a756289ac5a8517dc643555d784d830b61576ad Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:06 +0800 Subject: soundwire: bus: add send_async/wait APIs for BPT protocol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add definitions and helpers for the BPT/BRA protocol. 
Peripheral drivers (aka ASoC codec drivers) can use this API to send bulk data such as firmware or tables. The design intent is however NOT to directly use this API but to rely on an intermediate regmap layer. The API is only available when no other audio streams have been allocated, and only one BPT/BRA stream is allowed per link. To avoid the addition of yet another lock, the refcount tests are handled in the stream master_runtime alloc/free routines where the bus_lock is already held. Another benefit of this approach is that the same bus_lock is used to handle runtime and port linked lists, which reduces the potential for misaligned configurations. In addition to exclusion with audio streams, BPT transfers have a lot of overhead, specifically register writes are needed to enable transport in DP0. Most DMAs don't handle very small data sets well, and they may have alignment limitations. The size and alignment requirements are for now not handled by the core but must be checked by platform-specific drivers. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-8-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index ba58e2a52322..69f3e700796d 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -824,6 +824,15 @@ struct sdw_defer { struct completion complete; }; +/* + * Add a practical limit to BPT transfer sizes. BPT is typically used + * to transfer firmware, and larger firmware transfers will increase + * the cold latency beyond typical OS or user requirements. + */ +#define SDW_BPT_MSG_MAX_BYTES (1024 * 1024) + +struct sdw_bpt_msg; + /** * struct sdw_master_ops - Master driver ops * @read_prop: Read Master properties @@ -839,6 +848,10 @@ struct sdw_defer { * @get_device_num: Callback for vendor-specific device_number allocation * @put_device_num: Callback for vendor-specific device_number release * @new_peripheral_assigned: Callback to handle enumeration of new peripheral. + * @bpt_send_async: reserve resources for BPT stream and send message + * using BTP protocol + * @bpt_wait: wait for message completion using BTP protocol + * and release resources */ struct sdw_master_ops { int (*read_prop)(struct sdw_bus *bus); @@ -855,6 +868,9 @@ struct sdw_master_ops { void (*new_peripheral_assigned)(struct sdw_bus *bus, struct sdw_slave *slave, int dev_num); + int (*bpt_send_async)(struct sdw_bus *bus, struct sdw_slave *slave, + struct sdw_bpt_msg *msg); + int (*bpt_wait)(struct sdw_bus *bus, struct sdw_slave *slave, struct sdw_bpt_msg *msg); }; int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, @@ -961,6 +977,8 @@ struct sdw_stream_runtime { * @defer_msg: Defer message * @params: Current bus parameters * @stream_refcount: number of streams currently using this bus + * @btp_stream_refcount: number of BTP streams currently using this bus (should + * be zero or one, multiple streams per link is not supported).
* @ops: Master callback ops * @port_ops: Master port callback ops * @prop: Master properties @@ -998,6 +1016,7 @@ struct sdw_bus { struct sdw_defer defer_msg; struct sdw_bus_params params; int stream_refcount; + int bpt_stream_refcount; const struct sdw_master_ops *ops; const struct sdw_master_port_ops *port_ops; struct sdw_master_prop prop; @@ -1045,6 +1064,10 @@ int sdw_compare_devid(struct sdw_slave *slave, struct sdw_slave_id id); void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id); bool is_clock_scaling_supported_by_slave(struct sdw_slave *slave); +int sdw_bpt_send_async(struct sdw_bus *bus, struct sdw_slave *slave, struct sdw_bpt_msg *msg); +int sdw_bpt_wait(struct sdw_bus *bus, struct sdw_slave *slave, struct sdw_bpt_msg *msg); +int sdw_bpt_send_sync(struct sdw_bus *bus, struct sdw_slave *slave, struct sdw_bpt_msg *msg); + #if IS_ENABLED(CONFIG_SOUNDWIRE) int sdw_stream_add_slave(struct sdw_slave *slave, -- cgit v1.2.3 From 8e4a239b403bd8aed8787798de8e4e42f79246c2 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:07 +0800 Subject: soundwire: bus: add bpt_stream pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a convenience pointer to the 'sdw_bus' structure. BPT is a dedicated stream which will typically not be handled by DAIs or dailinks. Since there's only one BPT stream per link, storing the pointer at the link level seems rather natural. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-9-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 69f3e700796d..2362f621d94c 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -979,6 +979,7 @@ struct sdw_stream_runtime { * @stream_refcount: number of streams currently using this bus * @btp_stream_refcount: number of BTP streams currently using this bus (should * be zero or one, multiple streams per link is not supported). + * @bpt_stream: pointer stored to handle BTP streams. * @ops: Master callback ops * @port_ops: Master port callback ops * @prop: Master properties @@ -1017,6 +1018,7 @@ struct sdw_bus { struct sdw_bus_params params; int stream_refcount; int bpt_stream_refcount; + struct sdw_stream_runtime *bpt_stream; const struct sdw_master_ops *ops; const struct sdw_master_port_ops *port_ops; struct sdw_master_prop prop; -- cgit v1.2.3 From 7f17a73a7dd8252aa88c6f5e23310861de3d5423 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:09 +0800 Subject: soundwire: intel_auxdevice: add indirection for BPT send_async/wait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror abstraction added for master ops. 
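A hedged sketch of what that mirroring looks like (the names below are invented for illustration and are not the actual symbols in intel_auxdevice.c; in particular intel_bus_to_sdw() and intel_get_hw_ops() are made-up accessors standing in for however the driver recovers its per-link state):

/* bus-level sdw_master_ops callback forwarding to the hw-specific op */
static int intel_bpt_send_async(struct sdw_bus *bus, struct sdw_slave *slave,
                                struct sdw_bpt_msg *msg)
{
        struct sdw_intel *sdw = intel_bus_to_sdw(bus);          /* made-up */
        const struct sdw_intel_hw_ops *hw_ops = intel_get_hw_ops(sdw); /* made-up */

        if (!hw_ops->bpt_send_async)
                return -EOPNOTSUPP;

        return hw_ops->bpt_send_async(sdw, slave, msg);
}

bpt_wait() would be forwarded the same way, so the generic bus code keeps calling only the sdw_master_ops entry points.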
Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-11-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 580086417e4b..493d9de4e472 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -436,6 +436,10 @@ struct sdw_intel_hw_ops { bool (*sync_check_cmdsync_unlocked)(struct sdw_intel *sdw); void (*program_sdi)(struct sdw_intel *sdw, int dev_num); + + int (*bpt_send_async)(struct sdw_intel *sdw, struct sdw_slave *slave, + struct sdw_bpt_msg *msg); + int (*bpt_wait)(struct sdw_intel *sdw, struct sdw_slave *slave, struct sdw_bpt_msg *msg); }; extern const struct sdw_intel_hw_ops sdw_intel_cnl_hw_ops; -- cgit v1.2.3 From 5d5cb86fb46ea1c7efd3b894f63fe364e5847043 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:10 +0800 Subject: ASoC: SOF: Intel: hda-sdw-bpt: add helpers for SoundWire BPT DMA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SoundWire BPT DMA helpers as a separate module to avoid circular dependencies. For now this assumes no link DMA, only coupled mode. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Acked-by: Mark Brown Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-12-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/sound/hda-sdw-bpt.h | 69 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 include/sound/hda-sdw-bpt.h (limited to 'include') diff --git a/include/sound/hda-sdw-bpt.h b/include/sound/hda-sdw-bpt.h new file mode 100644 index 000000000000..f649549b75d5 --- /dev/null +++ b/include/sound/hda-sdw-bpt.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */ +/* + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * Copyright(c) 2025 Intel Corporation. 
+ */ + +#ifndef __HDA_SDW_BPT_H +#define __HDA_SDW_BPT_H + +#include + +struct hdac_ext_stream; +struct snd_dma_buffer; + +#if IS_ENABLED(CONFIG_SND_SOF_SOF_HDA_SDW_BPT) +int hda_sdw_bpt_open(struct device *dev, int link_id, struct hdac_ext_stream **bpt_tx_stream, + struct snd_dma_buffer *dmab_tx_bdl, u32 bpt_tx_num_bytes, + u32 tx_dma_bandwidth, struct hdac_ext_stream **bpt_rx_stream, + struct snd_dma_buffer *dmab_rx_bdl, u32 bpt_rx_num_bytes, + u32 rx_dma_bandwidth); + +int hda_sdw_bpt_send_async(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, + struct hdac_ext_stream *bpt_rx_stream); + +int hda_sdw_bpt_wait(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, + struct hdac_ext_stream *bpt_rx_stream); + +int hda_sdw_bpt_close(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, + struct snd_dma_buffer *dmab_tx_bdl, struct hdac_ext_stream *bpt_rx_stream, + struct snd_dma_buffer *dmab_rx_bdl); +#else +static inline int hda_sdw_bpt_open(struct device *dev, int link_id, + struct hdac_ext_stream **bpt_tx_stream, + struct snd_dma_buffer *dmab_tx_bdl, u32 bpt_tx_num_bytes, + u32 tx_dma_bandwidth, struct hdac_ext_stream **bpt_rx_stream, + struct snd_dma_buffer *dmab_rx_bdl, u32 bpt_rx_num_bytes, + u32 rx_dma_bandwidth) +{ + WARN_ONCE(1, "SoundWire BPT is disabled"); + return -EOPNOTSUPP; +} + +static inline int hda_sdw_bpt_send_async(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, + struct hdac_ext_stream *bpt_rx_stream) +{ + WARN_ONCE(1, "SoundWire BPT is disabled"); + return -EOPNOTSUPP; +} + +static inline int hda_sdw_bpt_wait(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, + struct hdac_ext_stream *bpt_rx_stream) +{ + WARN_ONCE(1, "SoundWire BPT is disabled"); + return -EOPNOTSUPP; +} + +static inline int hda_sdw_bpt_close(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, + struct snd_dma_buffer *dmab_tx_bdl, + struct hdac_ext_stream *bpt_rx_stream, + struct snd_dma_buffer *dmab_rx_bdl) +{ + WARN_ONCE(1, "SoundWire BPT is disabled"); + return -EOPNOTSUPP; +} +#endif + +#endif /* __HDA_SDW_BPT_H */ -- cgit v1.2.3 From b02d41d884f64d22d5d5a2a4b35550f5e80bdb3d Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 6 Mar 2025 01:54:08 +0000 Subject: phy: core: Remove unused phy_pm_runtime_(allow|forbid) phy_pm_runtime_allow() and phy_pm_runtime_forbid() were added in 2013 as part of commit ff764963479a ("drivers: phy: add generic PHY framework") but have remained unused. Remove them. Fix up the (English) docs - I've left the Chinese translation. Signed-off-by: Dr. 
David Alan Gilbert Link: https://lore.kernel.org/r/20250306015408.277729-1-linux@treblig.org Signed-off-by: Vinod Koul --- include/linux/phy/phy.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index 03cd5bae92d3..e63e6e70e860 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -227,8 +227,6 @@ int phy_pm_runtime_get(struct phy *phy); int phy_pm_runtime_get_sync(struct phy *phy); int phy_pm_runtime_put(struct phy *phy); int phy_pm_runtime_put_sync(struct phy *phy); -void phy_pm_runtime_allow(struct phy *phy); -void phy_pm_runtime_forbid(struct phy *phy); int phy_init(struct phy *phy); int phy_exit(struct phy *phy); int phy_power_on(struct phy *phy); @@ -321,16 +319,6 @@ static inline int phy_pm_runtime_put_sync(struct phy *phy) return -ENOSYS; } -static inline void phy_pm_runtime_allow(struct phy *phy) -{ - return; -} - -static inline void phy_pm_runtime_forbid(struct phy *phy) -{ - return; -} - static inline int phy_init(struct phy *phy) { if (!phy) -- cgit v1.2.3 From b5198201932635e19068cc2e83a99adf38f32c43 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Thu, 6 Mar 2025 16:05:43 +0900 Subject: counter: Introduce the compare component Compare registers are used in devices to compare a counter channel against a particular count value (e.g. to check if a threshold has been reached). A macro COUNTER_COMP_COMPARE() is introduced to facilitate the creation of compare components as Count extensions. Link: https://lore.kernel.org/r/20250306-introduce-compare-component-v1-1-93993b3dca9c@kernel.org Signed-off-by: William Breathitt Gray --- include/linux/counter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/counter.h b/include/linux/counter.h index 426b7d58a438..f208e867dd0f 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -580,6 +580,9 @@ struct counter_array { #define COUNTER_COMP_CEILING(_read, _write) \ COUNTER_COMP_COUNT_U64("ceiling", _read, _write) +#define COUNTER_COMP_COMPARE(_read, _write) \ + COUNTER_COMP_COUNT_U64("compare", _read, _write) + #define COUNTER_COMP_COUNT_MODE(_read, _write, _available) \ { \ .type = COUNTER_COMP_COUNT_MODE, \ -- cgit v1.2.3 From 26f060c106f630e34876c096c1c8997a9e68c371 Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Thu, 6 Mar 2025 12:11:02 +0000 Subject: coresight: change coresight_device lock type to raw_spinlock_t coresight_device->cscfg_csdev_lock can be held during __schedule() by perf_event_task_sched_out()/in(). Since coresight->cscfg_csdev_lock type is spinlock_t and perf_event_task_sched_out()/in() is called after acquiring rq_lock, which is raw_spinlock_t (an unsleepable lock), this poses an issue in PREEMPT_RT kernel where spinlock_t is sleepable. To address this, change type of coresight_device->cscfg_csdev_lock from spinlock_t to raw_spinlock_t. 
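For context, an illustrative fragment (not lifted from the patch; csdev_cscfg_update is a made-up name) showing the resulting locking pattern at the call sites:

static void csdev_cscfg_update(struct coresight_device *csdev)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&csdev->cscfg_csdev_lock, flags);
        /* walk/update csdev->feature_csdev_list and csdev->config_csdev_list */
        raw_spin_unlock_irqrestore(&csdev->cscfg_csdev_lock, flags);
}

A raw_spinlock_t never becomes a sleeping lock on PREEMPT_RT, so the critical section above remains valid even when it runs under the scheduler's rq_lock.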
Reviewed-by: James Clark Reviewed-by: Mike Leach Signed-off-by: Yeoreum Yun Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250306121110.1647948-2-yeoreum.yun@arm.com --- include/linux/coresight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 89b781e70fe7..4541bfc1cc6b 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -302,7 +302,7 @@ struct coresight_device { /* system configuration and feature lists */ struct list_head feature_csdev_list; struct list_head config_csdev_list; - spinlock_t cscfg_csdev_lock; + raw_spinlock_t cscfg_csdev_lock; void *active_cscfg_ctxt; }; -- cgit v1.2.3 From 4cf364ca57d851e192ce02e98d314d22fa514895 Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Thu, 6 Mar 2025 12:11:04 +0000 Subject: coresight: change coresight_trace_id_map's lock type to raw_spinlock_t coresight_trace_id_map->lock can be acquired while holding a coresight device's drvdata_lock. But the drvdata_lock can be a raw_spinlock_t, e.g. in coresight-etm4x. To address this, change the type of coresight_trace_id_map->lock to raw_spinlock_t. Signed-off-by: Yeoreum Yun Reviewed-by: James Clark Reviewed-by: Mike Leach Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250306121110.1647948-4-yeoreum.yun@arm.com --- include/linux/coresight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 4541bfc1cc6b..d79a242b271d 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -239,7 +239,7 @@ struct coresight_trace_id_map { DECLARE_BITMAP(used_ids, CORESIGHT_TRACE_IDS_MAX); atomic_t __percpu *cpu_map; atomic_t perf_cs_etm_session_active; - spinlock_t lock; + raw_spinlock_t lock; }; -- cgit v1.2.3 From b9014a10bdb8e967d85819f5ef61e6f80d0b46c9 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 10 Feb 2025 17:29:11 -0600 Subject: dmaengine: Remove device_prep_dma_imm_data from struct dma_device The device_prep_dma_imm_data() method isn't implemented or invoked by any code since commit 80ade22c06ca ("misc: mic: remove the MIC drivers"). Remove it, shrinking struct dma_device by a few bytes. Signed-off-by: Nathan Lynch Link: https://lore.kernel.org/r/20250210-dmaengine-drop-imm-data-v1-1-e017766da2fa@amd.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index a360e0330436..bb146c5ac3e4 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -839,7 +839,6 @@ struct dma_filter { * The function takes a buffer of size buf_len. The callback function will * be called after period_len bytes have been transferred. * @device_prep_interleaved_dma: Transfer expression in a generic way.
- * @device_prep_dma_imm_data: DMA's 8 byte immediate data to the dst address * @device_caps: May be used to override the generic DMA slave capabilities * with per-channel specific ones * @device_config: Pushes a new configuration to a channel, return 0 or an error @@ -942,9 +941,6 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)( struct dma_chan *chan, struct dma_interleaved_template *xt, unsigned long flags); - struct dma_async_tx_descriptor *(*device_prep_dma_imm_data)( - struct dma_chan *chan, dma_addr_t dst, u64 data, - unsigned long flags); void (*device_caps)(struct dma_chan *chan, struct dma_slave_caps *caps); int (*device_config)(struct dma_chan *chan, struct dma_slave_config *config); -- cgit v1.2.3 From 36f257e3b0ba904f5a4e7fa8dafaa60e88cdd28c Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Mon, 10 Mar 2025 22:38:38 +0000 Subject: acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors When PCIe AER is in FW-First, OS should process CXL Protocol errors from CPER records. Introduce support for handling and logging CXL Protocol errors. The defined trace events cxl_aer_uncorrectable_error and cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them to trace FW-First Protocol errors. Since the CXL code is required to be called from process context and GHES is in interrupt context, use workqueues for processing. Similar to CXL CPER event handling, use kfifo to handle errors as it simplifies queue processing by providing lock free fifo operations. Add the ability for the CXL sub-system to register a workqueue to process CXL CPER protocol errors. [DJ: return cxl_cper_register_prot_err_work() directly in cxl_ras_init()] Signed-off-by: Smita Koralahalli Reviewed-by: Li Ming Reviewed-by: Alison Schofield Reviewed-by: Ira Weiny Reviewed-by: Tony Luck Link: https://patch.msgid.link/20250310223839.31342-2-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- include/cxl/event.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/cxl/event.h b/include/cxl/event.h index 8381a07052d0..f9ae1796da85 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -254,6 +254,9 @@ struct cxl_cper_prot_err_work_data { int cxl_cper_register_work(struct work_struct *work); int cxl_cper_unregister_work(struct work_struct *work); int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd); +int cxl_cper_register_prot_err_work(struct work_struct *work); +int cxl_cper_unregister_prot_err_work(struct work_struct *work); +int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd); #else static inline int cxl_cper_register_work(struct work_struct *work) { @@ -268,6 +271,18 @@ static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd) { return 0; } +static inline int cxl_cper_register_prot_err_work(struct work_struct *work) +{ + return 0; +} +static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work) +{ + return 0; +} +static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd) +{ + return 0; +} #endif #endif /* _LINUX_CXL_EVENT_H */ -- cgit v1.2.3 From 5a0fcb0ef5584caf7da3f31896e08650c532e4c1 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Wed, 19 Feb 2025 18:00:06 +1100 Subject: cxl: Remove driver Remove the cxl driver that provides support for the IBM Coherent Accelerator Processor Interface. Revert or clean up associated code in arch/powerpc that is no longer necessary. 
cxl has received minimal maintenance for several years, and is not supported on the Power10 processor. We aren't aware of any users who are likely to be using recent kernels. Thanks to Mikey Neuling, Ian Munsie, Daniel Axtens, Frederic Barrat, Christophe Lombard, Philippe Bergheaud, Vaibhav Jain and Alastair D'Silva for their work on this driver over the years. Signed-off-by: Andrew Donnellan Acked-by: Frederic Barrat Acked-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://patch.msgid.link/20250219070007.177725-2-ajd@linux.ibm.com --- include/misc/cxl-base.h | 48 --------- include/misc/cxl.h | 265 ------------------------------------------------ include/misc/cxllib.h | 129 ----------------------- include/uapi/misc/cxl.h | 156 ---------------------------- 4 files changed, 598 deletions(-) delete mode 100644 include/misc/cxl-base.h delete mode 100644 include/misc/cxl.h delete mode 100644 include/misc/cxllib.h delete mode 100644 include/uapi/misc/cxl.h (limited to 'include') diff --git a/include/misc/cxl-base.h b/include/misc/cxl-base.h deleted file mode 100644 index 2251da7f32d9..000000000000 --- a/include/misc/cxl-base.h +++ /dev/null @@ -1,48 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2014 IBM Corp. - */ - -#ifndef _MISC_CXL_BASE_H -#define _MISC_CXL_BASE_H - -#ifdef CONFIG_CXL_BASE - -#define CXL_IRQ_RANGES 4 - -struct cxl_irq_ranges { - irq_hw_number_t offset[CXL_IRQ_RANGES]; - irq_hw_number_t range[CXL_IRQ_RANGES]; -}; - -extern atomic_t cxl_use_count; - -static inline bool cxl_ctx_in_use(void) -{ - return (atomic_read(&cxl_use_count) != 0); -} - -static inline void cxl_ctx_get(void) -{ - atomic_inc(&cxl_use_count); -} - -static inline void cxl_ctx_put(void) -{ - atomic_dec(&cxl_use_count); -} - -struct cxl_afu *cxl_afu_get(struct cxl_afu *afu); -void cxl_afu_put(struct cxl_afu *afu); -void cxl_slbia(struct mm_struct *mm); - -#else /* CONFIG_CXL_BASE */ - -static inline bool cxl_ctx_in_use(void) { return false; } -static inline struct cxl_afu *cxl_afu_get(struct cxl_afu *afu) { return NULL; } -static inline void cxl_afu_put(struct cxl_afu *afu) {} -static inline void cxl_slbia(struct mm_struct *mm) {} - -#endif /* CONFIG_CXL_BASE */ - -#endif diff --git a/include/misc/cxl.h b/include/misc/cxl.h deleted file mode 100644 index d8044299d654..000000000000 --- a/include/misc/cxl.h +++ /dev/null @@ -1,265 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2015 IBM Corp. - */ - -#ifndef _MISC_CXL_H -#define _MISC_CXL_H - -#include -#include -#include -#include - -/* - * This documents the in kernel API for driver to use CXL. It allows kernel - * drivers to bind to AFUs using an AFU configuration record exposed as a PCI - * configuration record. - * - * This API enables control over AFU and contexts which can't be part of the - * generic PCI API. This API is agnostic to the actual AFU. - */ - -/* Get the AFU associated with a pci_dev */ -struct cxl_afu *cxl_pci_to_afu(struct pci_dev *dev); - -/* Get the AFU conf record number associated with a pci_dev */ -unsigned int cxl_pci_to_cfg_record(struct pci_dev *dev); - - -/* - * Context lifetime overview: - * - * An AFU context may be inited and then started and stopped multiple times - * before it's released. ie. - * - cxl_dev_context_init() - * - cxl_start_context() - * - cxl_stop_context() - * - cxl_start_context() - * - cxl_stop_context() - * ...repeat... - * - cxl_release_context() - * Once released, a context can't be started again. 
- * - * One context is inited by the cxl driver for every pci_dev. This is to be - * used as a default kernel context. cxl_get_context() will get this - * context. This context will be released by PCI hot unplug, so doesn't need to - * be released explicitly by drivers. - * - * Additional kernel contexts may be inited using cxl_dev_context_init(). - * These must be released using cxl_context_detach(). - * - * Once a context has been inited, IRQs may be configured. Firstly these IRQs - * must be allocated (cxl_allocate_afu_irqs()), then individually mapped to - * specific handlers (cxl_map_afu_irq()). - * - * These IRQs can be unmapped (cxl_unmap_afu_irq()) and finally released - * (cxl_free_afu_irqs()). - * - * The AFU can be reset (cxl_afu_reset()). This will cause the PSL/AFU - * hardware to lose track of all contexts. It's upto the caller of - * cxl_afu_reset() to restart these contexts. - */ - -/* - * On pci_enabled_device(), the cxl driver will init a single cxl context for - * use by the driver. It doesn't start this context (as that will likely - * generate DMA traffic for most AFUs). - * - * This gets the default context associated with this pci_dev. This context - * doesn't need to be released as this will be done by the PCI subsystem on hot - * unplug. - */ -struct cxl_context *cxl_get_context(struct pci_dev *dev); -/* - * Allocate and initalise a context associated with a AFU PCI device. This - * doesn't start the context in the AFU. - */ -struct cxl_context *cxl_dev_context_init(struct pci_dev *dev); -/* - * Release and free a context. Context should be stopped before calling. - */ -int cxl_release_context(struct cxl_context *ctx); - -/* - * Set and get private data associated with a context. Allows drivers to have a - * back pointer to some useful structure. - */ -int cxl_set_priv(struct cxl_context *ctx, void *priv); -void *cxl_get_priv(struct cxl_context *ctx); - -/* - * Allocate AFU interrupts for this context. num=0 will allocate the default - * for this AFU as given in the AFU descriptor. This number doesn't include the - * interrupt 0 (CAIA defines AFU IRQ 0 for page faults). Each interrupt to be - * used must map a handler with cxl_map_afu_irq. - */ -int cxl_allocate_afu_irqs(struct cxl_context *cxl, int num); -/* Free allocated interrupts */ -void cxl_free_afu_irqs(struct cxl_context *cxl); - -/* - * Map a handler for an AFU interrupt associated with a particular context. AFU - * IRQS numbers start from 1 (CAIA defines AFU IRQ 0 for page faults). cookie - * is private data is that will be provided to the interrupt handler. - */ -int cxl_map_afu_irq(struct cxl_context *cxl, int num, - irq_handler_t handler, void *cookie, char *name); -/* unmap mapped IRQ handlers */ -void cxl_unmap_afu_irq(struct cxl_context *cxl, int num, void *cookie); - -/* - * Start work on the AFU. This starts an cxl context and associates it with a - * task. task == NULL will make it a kernel context. - */ -int cxl_start_context(struct cxl_context *ctx, u64 wed, - struct task_struct *task); -/* - * Stop a context and remove it from the PSL - */ -int cxl_stop_context(struct cxl_context *ctx); - -/* Reset the AFU */ -int cxl_afu_reset(struct cxl_context *ctx); - -/* - * Set a context as a master context. - * This sets the default problem space area mapped as the full space, rather - * than just the per context area (for slaves). - */ -void cxl_set_master(struct cxl_context *ctx); - -/* - * Map and unmap the AFU Problem Space area. 
The amount and location mapped - * depends on if this context is a master or slave. - */ -void __iomem *cxl_psa_map(struct cxl_context *ctx); -void cxl_psa_unmap(void __iomem *addr); - -/* Get the process element for this context */ -int cxl_process_element(struct cxl_context *ctx); - -/* - * These calls allow drivers to create their own file descriptors and make them - * identical to the cxl file descriptor user API. An example use case: - * - * struct file_operations cxl_my_fops = {}; - * ...... - * // Init the context - * ctx = cxl_dev_context_init(dev); - * if (IS_ERR(ctx)) - * return PTR_ERR(ctx); - * // Create and attach a new file descriptor to my file ops - * file = cxl_get_fd(ctx, &cxl_my_fops, &fd); - * // Start context - * rc = cxl_start_work(ctx, &work.work); - * if (rc) { - * fput(file); - * put_unused_fd(fd); - * return -ENODEV; - * } - * // No error paths after installing the fd - * fd_install(fd, file); - * return fd; - * - * This inits a context, and gets a file descriptor and associates some file - * ops to that file descriptor. If the file ops are blank, the cxl driver will - * fill them in with the default ones that mimic the standard user API. Once - * completed, the file descriptor can be installed. Once the file descriptor is - * installed, it's visible to the user so no errors must occur past this point. - * - * If cxl_fd_release() file op call is installed, the context will be stopped - * and released when the fd is released. Hence the driver won't need to manage - * this itself. - */ - -/* - * Take a context and associate it with my file ops. Returns the associated - * file and file descriptor. Any file ops which are blank are filled in by the - * cxl driver with the default ops to mimic the standard API. - */ -struct file *cxl_get_fd(struct cxl_context *ctx, struct file_operations *fops, - int *fd); -/* Get the context associated with this file */ -struct cxl_context *cxl_fops_get_context(struct file *file); -/* - * Start a context associated a struct cxl_ioctl_start_work used by the - * standard cxl user API. - */ -int cxl_start_work(struct cxl_context *ctx, - struct cxl_ioctl_start_work *work); -/* - * Export all the existing fops so drivers can use them - */ -int cxl_fd_open(struct inode *inode, struct file *file); -int cxl_fd_release(struct inode *inode, struct file *file); -long cxl_fd_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int cxl_fd_mmap(struct file *file, struct vm_area_struct *vm); -__poll_t cxl_fd_poll(struct file *file, struct poll_table_struct *poll); -ssize_t cxl_fd_read(struct file *file, char __user *buf, size_t count, - loff_t *off); - -/* - * For EEH, a driver may want to assert a PERST will reload the same image - * from flash into the FPGA. - * - * This is a property of the entire adapter, not a single AFU, so drivers - * should set this property with care! - */ -void cxl_perst_reloads_same_image(struct cxl_afu *afu, - bool perst_reloads_same_image); - -/* - * Read the VPD for the card where the AFU resides - */ -ssize_t cxl_read_adapter_vpd(struct pci_dev *dev, void *buf, size_t count); - -/* - * AFU driver ops allow an AFU driver to create their own events to pass to - * userspace through the file descriptor as a simpler alternative to overriding - * the read() and poll() calls that works with the generic cxl events. These - * events are given priority over the generic cxl events, so they will be - * delivered first if multiple types of events are pending. 
- * - * The AFU driver must call cxl_context_events_pending() to notify the cxl - * driver that new events are ready to be delivered for a specific context. - * cxl_context_events_pending() will adjust the current count of AFU driver - * events for this context, and wake up anyone waiting on the context wait - * queue. - * - * The cxl driver will then call fetch_event() to get a structure defining - * the size and address of the driver specific event data. The cxl driver - * will build a cxl header with type and process_element fields filled in, - * and header.size set to sizeof(struct cxl_event_header) + data_size. - * The total size of the event is limited to CXL_READ_MIN_SIZE (4K). - * - * fetch_event() is called with a spin lock held, so it must not sleep. - * - * The cxl driver will then deliver the event to userspace, and finally - * call event_delivered() to return the status of the operation, identified - * by cxl context and AFU driver event data pointers. - * 0 Success - * -EFAULT copy_to_user() has failed - * -EINVAL Event data pointer is NULL, or event size is greater than - * CXL_READ_MIN_SIZE. - */ -struct cxl_afu_driver_ops { - struct cxl_event_afu_driver_reserved *(*fetch_event) ( - struct cxl_context *ctx); - void (*event_delivered) (struct cxl_context *ctx, - struct cxl_event_afu_driver_reserved *event, - int rc); -}; - -/* - * Associate the above driver ops with a specific context. - * Reset the current count of AFU driver events. - */ -void cxl_set_driver_ops(struct cxl_context *ctx, - struct cxl_afu_driver_ops *ops); - -/* Notify cxl driver that new events are ready to be delivered for context */ -void cxl_context_events_pending(struct cxl_context *ctx, - unsigned int new_events); - -#endif /* _MISC_CXL_H */ diff --git a/include/misc/cxllib.h b/include/misc/cxllib.h deleted file mode 100644 index eacc417288fc..000000000000 --- a/include/misc/cxllib.h +++ /dev/null @@ -1,129 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2017 IBM Corp. - */ - -#ifndef _MISC_CXLLIB_H -#define _MISC_CXLLIB_H - -#include -#include - -/* - * cxl driver exports a in-kernel 'library' API which can be called by - * other drivers to help interacting with an IBM XSL. - */ - -/* - * tells whether capi is supported on the PCIe slot where the - * device is seated - * - * Input: - * dev: device whose slot needs to be checked - * flags: 0 for the time being - */ -bool cxllib_slot_is_supported(struct pci_dev *dev, unsigned long flags); - - -/* - * Returns the configuration parameters to be used by the XSL or device - * - * Input: - * dev: device, used to find PHB - * Output: - * struct cxllib_xsl_config: - * version - * capi BAR address, i.e. 0x2000000000000-0x2FFFFFFFFFFFF - * capi BAR size - * data send control (XSL_DSNCTL) - * dummy read address (XSL_DRA) - */ -#define CXL_XSL_CONFIG_VERSION1 1 -struct cxllib_xsl_config { - u32 version; /* format version for register encoding */ - u32 log_bar_size;/* log size of the capi_window */ - u64 bar_addr; /* address of the start of capi window */ - u64 dsnctl; /* matches definition of XSL_DSNCTL */ - u64 dra; /* real address that can be used for dummy read */ -}; - -int cxllib_get_xsl_config(struct pci_dev *dev, struct cxllib_xsl_config *cfg); - - -/* - * Activate capi for the pci host bridge associated with the device. - * Can be extended to deactivate once we know how to do it. - * Device must be ready to accept messages from the CAPP unit and - * respond accordingly (TLB invalidates, ...) 
- * - * PHB is switched to capi mode through calls to skiboot. - * CAPP snooping is activated - * - * Input: - * dev: device whose PHB should switch mode - * mode: mode to switch to i.e. CAPI or PCI - * flags: options related to the mode - */ -enum cxllib_mode { - CXL_MODE_CXL, - CXL_MODE_PCI, -}; - -#define CXL_MODE_NO_DMA 0 -#define CXL_MODE_DMA_TVT0 1 -#define CXL_MODE_DMA_TVT1 2 - -int cxllib_switch_phb_mode(struct pci_dev *dev, enum cxllib_mode mode, - unsigned long flags); - - -/* - * Set the device for capi DMA. - * Define its dma_ops and dma offset so that allocations will be using TVT#1 - * - * Input: - * dev: device to set - * flags: options. CXL_MODE_DMA_TVT1 should be used - */ -int cxllib_set_device_dma(struct pci_dev *dev, unsigned long flags); - - -/* - * Get the Process Element structure for the given thread - * - * Input: - * task: task_struct for the context of the translation - * translation_mode: whether addresses should be translated - * Output: - * attr: attributes to fill up the Process Element structure from CAIA - */ -struct cxllib_pe_attributes { - u64 sr; - u32 lpid; - u32 tid; - u32 pid; -}; -#define CXL_TRANSLATED_MODE 0 -#define CXL_REAL_MODE 1 - -int cxllib_get_PE_attributes(struct task_struct *task, - unsigned long translation_mode, struct cxllib_pe_attributes *attr); - - -/* - * Handle memory fault. - * Fault in all the pages of the specified buffer for the permissions - * provided in ‘flags’ - * - * Shouldn't be called from interrupt context - * - * Input: - * mm: struct mm for the thread faulting the pages - * addr: base address of the buffer to page in - * size: size of the buffer to page in - * flags: permission requested (DSISR_ISSTORE...) - */ -int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags); - - -#endif /* _MISC_CXLLIB_H */ diff --git a/include/uapi/misc/cxl.h b/include/uapi/misc/cxl.h deleted file mode 100644 index 56376d3907d8..000000000000 --- a/include/uapi/misc/cxl.h +++ /dev/null @@ -1,156 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -/* - * Copyright 2014 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#ifndef _UAPI_MISC_CXL_H -#define _UAPI_MISC_CXL_H - -#include -#include - - -struct cxl_ioctl_start_work { - __u64 flags; - __u64 work_element_descriptor; - __u64 amr; - __s16 num_interrupts; - __u16 tid; - __s32 reserved1; - __u64 reserved2; - __u64 reserved3; - __u64 reserved4; - __u64 reserved5; -}; - -#define CXL_START_WORK_AMR 0x0000000000000001ULL -#define CXL_START_WORK_NUM_IRQS 0x0000000000000002ULL -#define CXL_START_WORK_ERR_FF 0x0000000000000004ULL -#define CXL_START_WORK_TID 0x0000000000000008ULL -#define CXL_START_WORK_ALL (CXL_START_WORK_AMR |\ - CXL_START_WORK_NUM_IRQS |\ - CXL_START_WORK_ERR_FF |\ - CXL_START_WORK_TID) - - -/* Possible modes that an afu can be in */ -#define CXL_MODE_DEDICATED 0x1 -#define CXL_MODE_DIRECTED 0x2 - -/* possible flags for the cxl_afu_id flags field */ -#define CXL_AFUID_FLAG_SLAVE 0x1 /* In directed-mode afu is in slave mode */ - -struct cxl_afu_id { - __u64 flags; /* One of CXL_AFUID_FLAG_X */ - __u32 card_id; - __u32 afu_offset; - __u32 afu_mode; /* one of the CXL_MODE_X */ - __u32 reserved1; - __u64 reserved2; - __u64 reserved3; - __u64 reserved4; - __u64 reserved5; - __u64 reserved6; -}; - -/* base adapter image header is included in the image */ -#define CXL_AI_NEED_HEADER 0x0000000000000001ULL -#define CXL_AI_ALL CXL_AI_NEED_HEADER - -#define CXL_AI_HEADER_SIZE 128 -#define CXL_AI_BUFFER_SIZE 4096 -#define CXL_AI_MAX_ENTRIES 256 -#define CXL_AI_MAX_CHUNK_SIZE (CXL_AI_BUFFER_SIZE * CXL_AI_MAX_ENTRIES) - -struct cxl_adapter_image { - __u64 flags; - __u64 data; - __u64 len_data; - __u64 len_image; - __u64 reserved1; - __u64 reserved2; - __u64 reserved3; - __u64 reserved4; -}; - -/* ioctl numbers */ -#define CXL_MAGIC 0xCA -/* AFU devices */ -#define CXL_IOCTL_START_WORK _IOW(CXL_MAGIC, 0x00, struct cxl_ioctl_start_work) -#define CXL_IOCTL_GET_PROCESS_ELEMENT _IOR(CXL_MAGIC, 0x01, __u32) -#define CXL_IOCTL_GET_AFU_ID _IOR(CXL_MAGIC, 0x02, struct cxl_afu_id) -/* adapter devices */ -#define CXL_IOCTL_DOWNLOAD_IMAGE _IOW(CXL_MAGIC, 0x0A, struct cxl_adapter_image) -#define CXL_IOCTL_VALIDATE_IMAGE _IOW(CXL_MAGIC, 0x0B, struct cxl_adapter_image) - -#define CXL_READ_MIN_SIZE 0x1000 /* 4K */ - -/* Events from read() */ -enum cxl_event_type { - CXL_EVENT_RESERVED = 0, - CXL_EVENT_AFU_INTERRUPT = 1, - CXL_EVENT_DATA_STORAGE = 2, - CXL_EVENT_AFU_ERROR = 3, - CXL_EVENT_AFU_DRIVER = 4, -}; - -struct cxl_event_header { - __u16 type; - __u16 size; - __u16 process_element; - __u16 reserved1; -}; - -struct cxl_event_afu_interrupt { - __u16 flags; - __u16 irq; /* Raised AFU interrupt number */ - __u32 reserved1; -}; - -struct cxl_event_data_storage { - __u16 flags; - __u16 reserved1; - __u32 reserved2; - __u64 addr; - __u64 dsisr; - __u64 reserved3; -}; - -struct cxl_event_afu_error { - __u16 flags; - __u16 reserved1; - __u32 reserved2; - __u64 error; -}; - -struct cxl_event_afu_driver_reserved { - /* - * Defines the buffer passed to the cxl driver by the AFU driver. - * - * This is not ABI since the event header.size passed to the user for - * existing events is set in the read call to sizeof(cxl_event_header) - * + sizeof(whatever event is being dispatched) and the user is already - * required to use a 4K buffer on the read call. - * - * Of course the contents will be ABI, but that's up the AFU driver. 
- */ - __u32 data_size; - __u8 data[]; -}; - -struct cxl_event { - struct cxl_event_header header; - union { - struct cxl_event_afu_interrupt irq; - struct cxl_event_data_storage fault; - struct cxl_event_afu_error afu_error; - struct cxl_event_afu_driver_reserved afu_driver_event; - }; -}; - -#endif /* _UAPI_MISC_CXL_H */ -- cgit v1.2.3 From ac053946f5c40ce90ca7ccb75ed687612d9eccf9 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 27 Jan 2025 17:05:06 +0100 Subject: compiler.h: introduce TYPEOF_UNQUAL() macro Define TYPEOF_UNQUAL() to use __typeof_unqual__() as typeof operator when available, to return unqualified type of the expression. Current version of sparse doesn't know anything about __typeof_unqual__() operator. Avoid the usage of __typeof_unqual__() when sparse checking is active to prevent sparse errors with unknowing keyword. Link: https://lkml.kernel.org/r/20250127160709.80604-3-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Thomas Gleixner Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Brian Gerst Cc: Denys Vlasenko Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Arnd Bergmann Cc: Boqun Feng Cc: Borislav Petkov Cc: Dave Hansen Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Kent Overstreet Cc: Nadav Amit Cc: Paolo Abeni Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 8 ++++++++ include/linux/compiler-gcc.h | 8 ++++++++ include/linux/compiler.h | 20 ++++++++++++++++++++ 3 files changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 2e7c2c282f3a..4fc8e26914ad 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -128,3 +128,11 @@ */ #define ASM_INPUT_G "ir" #define ASM_INPUT_RM "r" + +/* + * Declare compiler support for __typeof_unqual__() operator. + * + * Bindgen uses LLVM even if our C compiler is GCC, so we cannot + * rely on the auto-detected CONFIG_CC_HAS_TYPEOF_UNQUAL. + */ +#define CC_HAS_TYPEOF_UNQUAL (__clang_major__ >= 19) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index c9b58188ec61..32048052c64a 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -137,3 +137,11 @@ #if GCC_VERSION < 90100 #undef __alloc_size__ #endif + +/* + * Declare compiler support for __typeof_unqual__() operator. + * + * Bindgen uses LLVM even if our C compiler is GCC, so we cannot + * rely on the auto-detected CONFIG_CC_HAS_TYPEOF_UNQUAL. + */ +#define CC_HAS_TYPEOF_UNQUAL (__GNUC__ >= 14) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 155385754824..3a7a537e13a3 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -210,6 +210,26 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __must_be_cstr(p) \ __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") +/* + * Use __typeof_unqual__() when available. + * + * XXX: Remove test for __CHECKER__ once + * sparse learns about __typeof_unqual__(). + */ +#if CC_HAS_TYPEOF_UNQUAL && !defined(__CHECKER__) +# define USE_TYPEOF_UNQUAL 1 +#endif + +/* + * Define TYPEOF_UNQUAL() to use __typeof_unqual__() as typeof + * operator when available, to return an unqualified type of the exp. 
+ */ +#if defined(USE_TYPEOF_UNQUAL) +# define TYPEOF_UNQUAL(exp) __typeof_unqual__(exp) +#else +# define TYPEOF_UNQUAL(exp) __typeof__(exp) +#endif + #endif /* __KERNEL__ */ /** -- cgit v1.2.3 From 8a3c392388c6a6e0c8937a24712b630ec9ac7016 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 27 Jan 2025 17:05:07 +0100 Subject: percpu: use TYPEOF_UNQUAL() in variable declarations Use TYPEOF_UNQUAL() to declare variables as a corresponding type without named address space qualifier to avoid "`__seg_gs' specified for auto variable `var'" errors. Link: https://lkml.kernel.org/r/20250127160709.80604-4-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Nadav Amit Acked-by: Christoph Lameter Cc: Dennis Zhou Cc: Tejun Heo Cc: Andy Lutomirski Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Kent Overstreet Cc: Arnd Bergmann Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Peter Zijlstra Cc: Will Deacon Cc: Waiman Long Cc: Boqun Feng Cc: Linus Torvalds Cc: Brian Gerst Cc: Denys Vlasenko Signed-off-by: Andrew Morton --- include/asm-generic/percpu.h | 26 +++++++++++++------------- include/linux/part_stat.h | 2 +- include/linux/percpu-defs.h | 4 ++-- include/net/snmp.h | 5 ++--- 4 files changed, 18 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 94cbd50cc870..50597b975a49 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -74,7 +74,7 @@ do { \ #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ \ *__p += val; \ *__p; \ @@ -82,8 +82,8 @@ do { \ #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) __ret; \ __ret = *__p; \ *__p = nval; \ __ret; \ @@ -91,7 +91,7 @@ do { \ #define __cpu_fallback_try_cmpxchg(pcp, ovalp, nval, _cmpxchg) \ ({ \ - typeof(pcp) __val, __old = *(ovalp); \ + TYPEOF_UNQUAL(pcp) __val, __old = *(ovalp); \ __val = _cmpxchg(pcp, __old, nval); \ if (__val != __old) \ *(ovalp) = __val; \ @@ -100,8 +100,8 @@ do { \ #define raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ - typeof(pcp) __val = *__p, ___old = *(ovalp); \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) __val = *__p, ___old = *(ovalp); \ bool __ret; \ if (__val == ___old) { \ *__p = nval; \ @@ -115,14 +115,14 @@ do { \ #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ - typeof(pcp) __old = (oval); \ + TYPEOF_UNQUAL(pcp) __old = (oval); \ raw_cpu_generic_try_cmpxchg(pcp, &__old, nval); \ __old; \ }) #define __this_cpu_generic_read_nopreempt(pcp) \ ({ \ - typeof(pcp) ___ret; \ + TYPEOF_UNQUAL(pcp) ___ret; \ preempt_disable_notrace(); \ ___ret = READ_ONCE(*raw_cpu_ptr(&(pcp))); \ preempt_enable_notrace(); \ @@ -131,7 +131,7 @@ do { \ #define __this_cpu_generic_read_noirq(pcp) \ ({ \ - typeof(pcp) ___ret; \ + TYPEOF_UNQUAL(pcp) ___ret; \ unsigned long ___flags; \ raw_local_irq_save(___flags); \ ___ret = raw_cpu_generic_read(pcp); \ @@ -141,7 +141,7 @@ do { \ #define this_cpu_generic_read(pcp) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ if (__native_word(pcp)) \ __ret = __this_cpu_generic_read_nopreempt(pcp); \ else \ @@ -160,7 +160,7 @@ do { \ #define this_cpu_generic_add_return(pcp, val) \ ({ \ - typeof(pcp) __ret; \ + 
TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_add_return(pcp, val); \ @@ -170,7 +170,7 @@ do { \ #define this_cpu_generic_xchg(pcp, nval) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_xchg(pcp, nval); \ @@ -190,7 +190,7 @@ do { \ #define this_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \ diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index ac8c44dd8237..c5e9cac0575e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -33,7 +33,7 @@ struct disk_stats { #define part_stat_read(part, field) \ ({ \ - typeof((part)->bd_stats->field) res = 0; \ + TYPEOF_UNQUAL((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5b520fe86b60..79b9402404f1 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -317,7 +317,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { } #define __pcpu_size_call_return(stem, variable) \ ({ \ - typeof(variable) pscr_ret__; \ + TYPEOF_UNQUAL(variable) pscr_ret__; \ __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr_ret__ = stem##1(variable); break; \ @@ -332,7 +332,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { } #define __pcpu_size_call_return2(stem, variable, ...) \ ({ \ - typeof(variable) pscr2_ret__; \ + TYPEOF_UNQUAL(variable) pscr2_ret__; \ __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \ diff --git a/include/net/snmp.h b/include/net/snmp.h index 468a67836e2f..4cb4326dfebe 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -159,7 +159,7 @@ struct linux_tls_mib { #define __SNMP_ADD_STATS64(mib, field, addend) \ do { \ - __typeof__(*mib) *ptr = raw_cpu_ptr(mib); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[field] += addend; \ u64_stats_update_end(&ptr->syncp); \ @@ -176,8 +176,7 @@ struct linux_tls_mib { #define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1) #define __SNMP_UPD_PO_STATS64(mib, basefield, addend) \ do { \ - __typeof__(*mib) *ptr; \ - ptr = raw_cpu_ptr((mib)); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[basefield##PKTS]++; \ ptr->mibs[basefield##OCTETS] += addend; \ -- cgit v1.2.3 From 6a39fe05ecaa3946ed0af9efd3e56689d519e420 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 27 Jan 2025 17:05:08 +0100 Subject: percpu: use TYPEOF_UNQUAL() in *_cpu_ptr() accessors Use TYPEOF_UNQUAL() macro to declare the return type of *_cpu_ptr() accessors in the generic named address space to avoid access to data from pointer to non-enclosed address space type of errors. Link: https://lkml.kernel.org/r/20250127160709.80604-5-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Nadav Amit Acked-by: Christoph Lameter Cc: Dennis Zhou Cc: Tejun Heo Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Brian Gerst Cc: Peter Zijlstra Cc: Arnd Bergmann Cc: Boqun Feng Cc: "David S. 
Miller" Cc: Denys Vlasenko Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Kent Overstreet Cc: Paolo Abeni Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 79b9402404f1..a7cf954ea99d 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,7 +221,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ - (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) + (TYPEOF_UNQUAL(*(__p)) __force __kernel *)((__force unsigned long)(__p)) #ifdef CONFIG_SMP -- cgit v1.2.3 From 6cea5ae714ba47ea4807d15903baca9857a450e6 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 27 Jan 2025 17:05:09 +0100 Subject: percpu: repurpose __percpu tag as a named address space qualifier The patch introduces __percpu_qual define and repurposes __percpu tag as a named address space qualifier using the new define. Arches can now conditionally define __percpu_qual as their named address space qualifier for percpu variables. Link: https://lkml.kernel.org/r/20250127160709.80604-6-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Nadav Amit Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Brian Gerst Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Boqun Feng Cc: Borislav Petkov Cc: Dave Hansen Cc: "David S. Miller" Cc: Denys Vlasenko Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Kent Overstreet Cc: Paolo Abeni Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/percpu.h | 13 +++++++++++++ include/linux/compiler_types.h | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 50597b975a49..02aeca21479a 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -6,6 +6,19 @@ #include #include +/* + * __percpu_qual is the qualifier for the percpu named address space. + * + * Most arches use generic named address space for percpu variables but + * some arches define percpu variables in different named address space + * (on the x86 arch, percpu variable may be declared as being relative + * to the %fs or %gs segments using __seg_fs or __seg_gs named address + * space qualifier). + */ +#ifndef __percpu_qual +# define __percpu_qual +#endif + #ifdef CONFIG_SMP /* diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 981cc3d7e3aa..5d6544545658 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -57,7 +57,7 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __user BTF_TYPE_TAG(user) # endif # define __iomem -# define __percpu BTF_TYPE_TAG(percpu) +# define __percpu __percpu_qual BTF_TYPE_TAG(percpu) # define __rcu BTF_TYPE_TAG(rcu) # define __chk_user_ptr(x) (void)0 -- cgit v1.2.3 From 89ce924f0bd447eb52a5f224d879dbf8f09451db Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 24 Jan 2025 00:41:32 -0500 Subject: mm: memcontrol: move memsw charge callbacks to v1 The interweaving of two entirely different swap accounting strategies has been one of the more confusing parts of the memcg code. Split out the v1 code to clarify the implementation and a handful of callsites, and to avoid building the v1 bits when !CONFIG_MEMCG_V1. 
text data bss dec hex filename 39253 6446 4160 49859 c2c3 mm/memcontrol.o.old 38877 6382 4160 49419 c10b mm/memcontrol.o Link: https://lkml.kernel.org/r/20250124054132.45643-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Balbir Singh Acked-by: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 17 +++++++++++------ include/linux/swap.h | 5 ----- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6e74b8254d9b..57664e2a8fb7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -649,8 +649,6 @@ int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); - void __mem_cgroup_uncharge(struct folio *folio); /** @@ -1165,10 +1163,6 @@ static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, return 0; } -static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr) -{ -} - static inline void mem_cgroup_uncharge(struct folio *folio) { } @@ -1848,6 +1842,9 @@ static inline void mem_cgroup_exit_user_fault(void) current->in_user_fault = 0; } +void memcg1_swapout(struct folio *folio, swp_entry_t entry); +void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages); + #else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -1875,6 +1872,14 @@ static inline void mem_cgroup_exit_user_fault(void) { } +static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry) +{ +} + +static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) +{ +} + #endif /* CONFIG_MEMCG_V1 */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index b13b72645db3..91b30701274e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -659,7 +659,6 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) @@ -680,10 +679,6 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else -static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) -{ -} - static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { -- cgit v1.2.3 From 599b684a7854e51a51358fe59bbdfea281f0b461 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 10 Feb 2025 20:37:45 +0100 Subject: mm/rmap: convert make_device_exclusive_range() to make_device_exclusive() The single "real" user in the tree of make_device_exclusive_range() always requests making only a single address exclusive. The current implementation is hard to fix for properly supporting anonymous THP / large folios and for avoiding messing with rmap walks in weird ways. So let's always process a single address/page and return folio + page to minimize page -> folio lookups. This is a preparation for further changes. 
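For driver authors following this interface change, a minimal before/after sketch of a call site (the surrounding variables and the error-handling convention are illustrative assumptions; only the two prototypes themselves come from the rmap.h hunk further below):

        /* Before: the range interface, even when only one page was wanted. */
        struct page *page;
        make_device_exclusive_range(mm, addr, addr + PAGE_SIZE, &page, owner);

        /* After: one address in, the page plus its folio out. */
        struct folio *folio;
        page = make_device_exclusive(mm, addr, owner, &folio);
        if (IS_ERR(page))               /* ERR_PTR-style failure reporting is assumed here */
                return PTR_ERR(page);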
Reject any non-anonymous or hugetlb folios early, directly after GUP. While at it, extend the documentation of make_device_exclusive() to clarify some things. Link: https://lkml.kernel.org/r/20250210193801.781278-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Simona Vetter Reviewed-by: Alistair Popple Tested-by: Alistair Popple Cc: Alex Shi Cc: Danilo Krummrich Cc: Dave Airlie Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Jonathan Corbet Cc: Karol Herbst Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Lyude Cc: "Masami Hiramatsu (Google)" Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: SeongJae Park Cc: Vlastimil Babka Cc: Yanteng Si Cc: Barry Song Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 2 +- include/linux/rmap.h | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index e2dd57ca368b..d4e714661826 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -46,7 +46,7 @@ struct mmu_interval_notifier; * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no * longer have exclusive access to the page. When sent during creation of an * exclusive range the owner will be initialised to the value provided by the - * caller of make_device_exclusive_range(), otherwise the owner will be NULL. + * caller of make_device_exclusive(), otherwise the owner will be NULL. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 683a04088f3f..86425d42c1a9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -663,9 +663,8 @@ int folio_referenced(struct folio *, int is_locked, void try_to_migrate(struct folio *folio, enum ttu_flags flags); void try_to_unmap(struct folio *, enum ttu_flags flags); -int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct page **pages, - void *arg); +struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, + void *owner, struct folio **foliop); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) -- cgit v1.2.3 From c25465eb7630ffcadaab29c1010071512f8c9621 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 10 Feb 2025 20:37:48 +0100 Subject: mm: use single SWP_DEVICE_EXCLUSIVE entry type There is no need for the distinction anymore; let's merge the readable and writable device-exclusive entries into a single device-exclusive entry type. 
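To make the merge concrete, a short sketch of the creation and detection sides as they might appear in a page-table walker ('offset' and the pte handling are placeholders; the constructors and the predicate are the ones declared in the swapops.h hunk below):

        swp_entry_t entry;

        /* Before: read vs. write access was encoded in the entry type. */
        if (pte_write(pte))
                entry = make_writable_device_exclusive_entry(offset);
        else
                entry = make_readable_device_exclusive_entry(offset);

        /* After: a single SWP_DEVICE_EXCLUSIVE type covers both cases. */
        entry = make_device_exclusive_entry(offset);

        /* Detection keeps its shape; only the writable-variant check goes away. */
        if (is_device_exclusive_entry(entry))
                /* handle the device-exclusive mapping */;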
Link: https://lkml.kernel.org/r/20250210193801.781278-7-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Simona Vetter Reviewed-by: Alistair Popple Tested-by: Alistair Popple Cc: Alex Shi Cc: Danilo Krummrich Cc: Dave Airlie Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Jonathan Corbet Cc: Karol Herbst Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Lyude Cc: "Masami Hiramatsu (Google)" Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: SeongJae Park Cc: Vlastimil Babka Cc: Yanteng Si Cc: Barry Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 7 +++---- include/linux/swapops.h | 27 ++++----------------------- 2 files changed, 7 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 91b30701274e..9a48e79a0a52 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -74,14 +74,13 @@ static inline int current_is_kswapd(void) * to a special SWP_DEVICE_{READ|WRITE} entry. * * When a page is mapped by the device for exclusive access we set the CPU page - * table entries to special SWP_DEVICE_EXCLUSIVE_* entries. + * table entries to a special SWP_DEVICE_EXCLUSIVE entry. */ #ifdef CONFIG_DEVICE_PRIVATE -#define SWP_DEVICE_NUM 4 +#define SWP_DEVICE_NUM 3 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) -#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) -#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3) +#define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) #else #define SWP_DEVICE_NUM 0 #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 96f26e29fefe..64ea151a7ae3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -186,26 +186,16 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry) return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); } -static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset) +static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { - return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset); -} - -static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset) -{ - return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset); + return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } static inline bool is_device_exclusive_entry(swp_entry_t entry) { - return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ || - swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE; + return swp_type(entry) == SWP_DEVICE_EXCLUSIVE; } -static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE); -} #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -227,12 +217,7 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry) return false; } -static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset) -{ - return swp_entry(0, 0); -} - -static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset) +static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } @@ -242,10 +227,6 @@ static inline bool is_device_exclusive_entry(swp_entry_t entry) return false; } -static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) -{ - return false; -} 
#endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION -- cgit v1.2.3 From 6df8bae8e851eacf2acf2237860213e002aba74f Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 29 Jan 2025 18:06:32 +0000 Subject: mm: zbud: remove zbud The zbud compressed pages allocator is rarely used, most users use zsmalloc. zbud consumes much more memory (only stores 1 or 2 compressed pages per physical page). The only advantage of zbud is a marginal performance improvement that by no means justify the memory overhead. Historically, zsmalloc had significantly worse latency than zbud and z3fold but offered better memory savings. This is no longer the case as shown by a simple recent analysis [1]. In a kernel build test on tmpfs in a limited cgroup, zbud 2-3% less time than zsmalloc, but at the cost of using ~32% more memory (1.5G vs 1.13G). The tradeoff does not make sense for zbud in any practical scenario. The only alleged advantage of zbud is not having the dependency on CONFIG_MMU, but CONFIG_SWAP already depends on CONFIG_MMU anyway, and zbud is only used by zswap. Remove zbud after z3fold's removal, leaving zsmalloc as the one and only zpool allocator. Leave the removal of the zpool API (and its associated config options) to a followup cleanup after no more allocators show up. Deprecating zbud for a few cycles before removing it was initially proposed [2], like z3fold was marked as deprecated for 2 cycles [3]. However, Johannes rightfully pointed out that the 2 cycles is too short for most downstream consumers, and z3fold was deprecated first only as a courtesy anyway. [1]https://lore.kernel.org/lkml/CAJD7tkbRF6od-2x_L8-A1QL3=2Ww13sCj4S3i4bNndqF+3+_Vg@mail.gmail.com/ [2]https://lore.kernel.org/lkml/Z5gdnSX5Lv-nfjQL@google.com/ [3]https://lore.kernel.org/lkml/20240904233343.933462-1-yosryahmed@google.com/ Link: https://lkml.kernel.org/r/20250129180633.3501650-3-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Alexander Gordeev Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Dan Streetman Cc: Heiko Carstens Cc: Huacai Chen Cc: Miaohe Lin Cc: Seth Jennings Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vitaly Wool Cc: Vlastimil Babka Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/zpool.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index a67d62b79698..5e6dc46b8cc4 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -4,9 +4,8 @@ * * Copyright (C) 2014 Dan Streetman * - * This is a common frontend for the zbud and zsmalloc memory - * storage pool implementations. Typically, this is used to - * store compressed memory. + * This is a common frontend for the zswap compressed memory storage + * implementations. */ #ifndef _ZPOOL_H_ -- cgit v1.2.3 From c372473a545edff2fdbc002fc67c181e17c7557b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 31 Jan 2025 12:31:53 +0000 Subject: mm: completely abstract unnecessary adj_start calculation The adj_start calculation has been a constant source of confusion in the VMA merge code. There are two cases to consider, one where we adjust the start of the vmg->middle VMA (i.e. the vmg->__adjust_middle_start merge flag is set), in which case adj_start is calculated as: (1) adj_start = vmg->end - vmg->middle->vm_start And the case where we adjust the start of the vmg->next VMA (i.e. 
the vmg->__adjust_next_start merge flag is set), in which case adj_start is calculated as: (2) adj_start = -(vmg->middle->vm_end - vmg->end) We apply (1) thusly: vmg->middle->vm_start = vmg->middle->vm_start + vmg->end - vmg->middle->vm_start Which simplifies to: vmg->middle->vm_start = vmg->end Similarly, we apply (2) as: vmg->next->vm_start = vmg->next->vm_start + -(vmg->middle->vm_end - vmg->end) Noting that for these VMAs to be mergeable vmg->middle->vm_end == vmg->next->vm_start and so this simplifies to: vmg->next->vm_start = vmg->next->vm_start + -(vmg->next->vm_start - vmg->end) Which simplifies to: vmg->next->vm_start = vmg->end Therefore in each case, we simply need to adjust the start of the VMA to vmg->end (!) and can do away with this adj_start calculation. The only caveat is that we must ensure we update the vm_pgoff field correctly. We therefore abstract this entire calculation to a new function vmg_adjust_set_range() which performs this calculation and sets the adjusted VMA's new range using the general vma_set_range() function. We also must update vma_adjust_trans_huge() which expects the now-abstracted adj_start parameter. It turns out this is wholly unnecessary. In vma_adjust_trans_huge() the relevant code is: if (adjust_next > 0) { struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); unsigned long nstart = next->vm_start; nstart += adjust_next; split_huge_pmd_if_needed(next, nstart); } The only case where this is relevant is when vmg->__adjust_middle_start is specified (in which case adj_next would have been positive), i.e. the one in which the vma specified is vmg->prev and this the sought 'next' VMA would be vmg->middle. We can therefore eliminate the find_vma() invocation altogether and simply provide the vmg->middle VMA in this instance, or NULL otherwise. Again we have an adj_next offset calculation: next->vm_start + vmg->end - vmg->middle->vm_start Where next == vmg->middle this simplifies to vmg->end as previously demonstrated. Therefore nstart is equal to vmg->end, which is already passed to vma_adjust_trans_huge() via the 'end' parameter and so this code (rather delightfully) simplifies to: if (next) split_huge_pmd_if_needed(next, end); With these changes in place, it becomes silly for commit_merge() to return vmg->target, as it is always the same and threaded through vmg, so we finally change commit_merge() to return an error value once again. This patch has no change in functional behaviour. 
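The algebra above collapses into a very small helper. A sketch of the idea behind vmg_adjust_set_range() (the real helper lives in the mm core rather than in any include/ hunk shown here, and may differ in detail):

        static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
        {
                struct vm_area_struct *adjust;
                pgoff_t pgoff;

                if (vmg->__adjust_middle_start) {
                        /* middle's start moves forward to vmg->end. */
                        adjust = vmg->middle;
                        pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start);
                } else {
                        /* next's start moves backward to vmg->end. */
                        adjust = vmg->next;
                        pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end);
                }

                vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff);
        }

Either way the adjusted VMA's new start is simply vmg->end, as derived above; only vm_pgoff needs recomputing so the file offset of the first page stays consistent with the moved start.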
Link: https://lkml.kernel.org/r/7bce2cd4b5afb56211822835d145471280c3dccc.1738326519.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Jann Horn Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 93e509b6c00e..e1bea54820ff 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -404,7 +404,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, - unsigned long end, long adjust_next); + unsigned long end, struct vm_area_struct *next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); @@ -571,7 +571,7 @@ static inline int madvise_collapse(struct vm_area_struct *vma, static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, - long adjust_next) + struct vm_area_struct *next) { } static inline int is_swap_pmd(pmd_t pmd) -- cgit v1.2.3 From 51ff4d7486f0c0b4110a6da4af805b179dd7b11e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Feb 2025 15:18:00 -0800 Subject: mm: avoid extra mem_alloc_profiling_enabled() checks Refactor code to avoid extra mem_alloc_profiling_enabled() checks inside pgalloc_tag_get() function which is often called after that check was already done. Link: https://lkml.kernel.org/r/20250201231803.2661189-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Cc: David Wang <00107082@163.com> Cc: Steven Rostedt Cc: Kent Overstreet Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Vlastimil Babka Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 3469c4b20105..4a82b6b4820e 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -205,28 +205,32 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) } } -static inline struct alloc_tag *pgalloc_tag_get(struct page *page) +/* Should be called only if mem_alloc_profiling_enabled() */ +static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) { struct alloc_tag *tag = NULL; - - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_sub_check(&ref); - if (ref.ct) - tag = ct_to_alloc_tag(ref.ct); - put_page_tag_ref(handle); - } + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub_check(&ref); + if (ref.ct) + tag = ct_to_alloc_tag(ref.ct); + put_page_tag_ref(handle); } return tag; } -static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) +static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) { - if (mem_alloc_profiling_enabled() && tag) + struct alloc_tag *tag; + + if (!mem_alloc_profiling_enabled()) + return; + + tag = __pgalloc_tag_get(page); + if (tag) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } @@ -241,8 +245,7 @@ static inline void 
clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } -static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} +static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} -- cgit v1.2.3 From 93d5440ece3c0aa341fb02e3a44a1b7ab44304c8 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Feb 2025 15:18:02 -0800 Subject: alloc_tag: uninline code gated by mem_alloc_profiling_key in page allocator When a sizable code section is protected by a disabled static key, that code gets into the instruction cache even though it's not executed and consumes the cache, increasing cache misses. This can be remedied by moving such code into a separate uninlined function. On a Pixel6 phone, page allocation profiling overhead measured with CONFIG_MEM_ALLOC_PROFILING=y and profiling disabled is: baseline modified Big core 4.93% 1.53% Medium core 4.39% 1.41% Little core 1.02% 0.36% This improvement comes at the expense of the configuration when profiling gets enabled, since there is now an additional function call. The overhead from this additional call on Pixel6 is: Big core 0.24% Middle core 0.63% Little core 1.1% However this is negligible when compared with the overall overhead of the memory allocation profiling when it is enabled. On x86 this patch does not make noticeable difference because the overhead with mem_alloc_profiling_key disabled is much lower (under 1%) to start with, so any improvement is less visible and hard to distinguish from the noise. The overhead from additional call when profiling is enabled is also within noise levels. 
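The pattern is general: keep only the static-key test inline and push the cold body out of line, so it stops occupying instruction cache while the key is disabled. A generic sketch of the shape (names are hypothetical, not taken from this patch):

        /* header */
        DECLARE_STATIC_KEY_FALSE(feature_key);
        void __feature_slowpath(struct page *page);     /* defined out of line in a .c file */

        static inline void feature_hook(struct page *page)
        {
                if (static_branch_unlikely(&feature_key))
                        __feature_slowpath(page);
        }

The pgalloc_tag.h hunk below applies this shape to clear_page_tag_ref()/__clear_page_tag_ref(), with mem_alloc_profiling_enabled() playing the role of the key test.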
Link: https://lkml.kernel.org/r/20250201231803.2661189-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Steven Rostedt Cc: Vlastimil Babka Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 60 ++++----------------------------------------- 1 file changed, 5 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 4a82b6b4820e..c74077977830 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -162,47 +162,13 @@ static inline void update_page_tag_ref(union pgtag_ref_handle handle, union code } } -static inline void clear_page_tag_ref(struct page *page) -{ - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - set_codetag_empty(&ref); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} - -static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, - unsigned int nr) -{ - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} +/* Should be called only if mem_alloc_profiling_enabled() */ +void __clear_page_tag_ref(struct page *page); -static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) +static inline void clear_page_tag_ref(struct page *page) { - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_sub(&ref, PAGE_SIZE * nr); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } + if (mem_alloc_profiling_enabled()) + __clear_page_tag_ref(page); } /* Should be called only if mem_alloc_profiling_enabled() */ @@ -222,18 +188,6 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) return tag; } -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) -{ - struct alloc_tag *tag; - - if (!mem_alloc_profiling_enabled()) - return; - - tag = __pgalloc_tag_get(page); - if (tag) - this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); -} - void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); void pgalloc_tag_swap(struct folio *new, struct folio *old); @@ -242,10 +196,6 @@ void __init alloc_tag_sec_init(void); #else /* CONFIG_MEM_ALLOC_PROFILING */ static inline void clear_page_tag_ref(struct page *page) {} -static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, - unsigned int nr) {} -static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} -- cgit v1.2.3 From 0eb7d2c337f9df3b6adbcbdce213595d94781e05 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:12 +0800 Subject: mm/swap: remove SWAP_FLAG_PRIO_SHIFT It doesn't make sense to have a zero value of shift. Remove it to avoid confusion. 
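The justification in code form: with a shift of zero the two expressions below yield the same value, so callers such as the swapon path only ever needed the mask (sketch, not quoted from mm/swapfile.c):

        prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
        prio = swap_flags & SWAP_FLAG_PRIO_MASK;        /* identical when the shift is 0 */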
Link: https://lkml.kernel.org/r/20250205092721.9395-4-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 9a48e79a0a52..bbd06cbd1f2b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -24,7 +24,6 @@ struct pagevec; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff -#define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ -- cgit v1.2.3 From c523aa890760b004eea47a0e00b5750a528f3ced Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:19 +0800 Subject: mm/swap: rename swap_swapcount() to swap_entry_swapped() The new function name can reflect the real behaviour of the function more clearly and more accurately. And the renaming avoids the confusion between swap_swapcount() and swp_swapcount(). Link: https://lkml.kernel.org/r/20250205092721.9395-11-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index bbd06cbd1f2b..2fe91c293636 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -499,7 +499,7 @@ int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); -extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); +extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; @@ -582,9 +582,9 @@ static inline int __swap_count(swp_entry_t entry) return 0; } -static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { - return 0; + return false; } static inline int swp_swapcount(swp_entry_t entry) -- cgit v1.2.3 From 94ba17adaba0f651fdcf745c8891a88e2e028cfa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 7 Feb 2025 13:20:33 -0800 Subject: mm/damon: avoid applying DAMOS action to same entity multiple times 'paddr' DAMON operations set can apply a DAMOS scheme's action to a large folio multiple times in single DAMOS-regions-walk if the folio is laid on multiple DAMON regions. Add a field for DAMOS scheme object that can be used by the underlying ops to know what was the last entity that the scheme's action has applied. The core layer unsets the field when each DAMOS-regions-walk is done for the given scheme. And update 'paddr' ops to use the infrastructure to avoid the problem. 
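A rough sketch of how a 'paddr' regions walk can use the new field; the helper name here is illustrative, the real change is in mm/damon/paddr.c and is not part of this include-only log:

static void damos_pa_apply_once(struct damos *s, struct folio *folio)
{
	/* same large folio was already handled earlier in this walk */
	if (s->last_applied == folio)
		return;
	damos_pa_apply_action(s, folio);	/* hypothetical action helper */
	s->last_applied = folio;
}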
Link: https://lkml.kernel.org/r/20250207212033.45269-3-sj@kernel.org Fixes: 57223ac29584 ("mm/damon/paddr: support the pageout scheme") Signed-off-by: SeongJae Park Reported-by: Usama Arif Closes: https://lore.kernel.org/20250203225604.44742-3-usamaarif642@gmail.com Cc: Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index c9074d569596..b4d37d9b9221 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -432,6 +432,7 @@ struct damos_access_pattern { * @wmarks: Watermarks for automated (in)activation of this scheme. * @target_nid: Destination node if @action is "migrate_{hot,cold}". * @filters: Additional set of &struct damos_filter for &action. + * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. * @list: List head for siblings. * @@ -454,6 +455,15 @@ struct damos_access_pattern { * implementation could check pages of the region and skip &action to respect * &filters * + * The minimum entity that @action can be applied depends on the underlying + * &struct damon_operations. Since it may not be aligned with the core layer + * abstract, namely &struct damon_region, &struct damon_operations could apply + * @action to same entity multiple times. Large folios that underlying on + * multiple &struct damon region objects could be such examples. The &struct + * damon_operations can use @last_applied to avoid that. DAMOS core logic + * unsets @last_applied when each regions walking for applying the scheme is + * finished. + * * After applying the &action to each region, &stat_count and &stat_sz is * updated to reflect the number of regions and total size of regions that the * &action is applied. @@ -482,6 +492,7 @@ struct damos { int target_nid; }; struct list_head filters; + void *last_applied; struct damos_stat stat; struct list_head list; }; -- cgit v1.2.3 From a4811f53bb89b3524629b1be41add9349fe0a750 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sat, 8 Feb 2025 15:52:55 +0000 Subject: mm: provide mapping_wrprotect_range() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the fb_defio video driver, page dirty state is used to determine when frame buffer pages have been changed, allowing for batched, deferred I/O to be performed for efficiency. This implementation had only one means of doing so effectively - the use of the folio_mkclean() function. However, this use of the function is inappropriate, as the fb_defio implementation allocates kernel memory to back the framebuffer, and then is forced to specified page->index, mapping fields in order to permit the folio_mkclean() rmap traversal to proceed correctly. It is not correct to specify these fields on kernel-allocated memory, and moreover since these are not folios, page->index, mapping are deprecated fields, soon to be removed. We therefore need to provide a means by which we can correctly traverse the reverse mapping and write-protect mappings for a page backing an address_space page cache object at a given offset. This patch provides this - mapping_wrprotect_range() - which allows for this operation to be performed for a specified address_space, offset, PFN and size, without requiring a folio nor, of course, an inappropriate use of page->index, mapping. 
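Unlike folio_mkclean(), which needs a folio with valid mapping and index fields for the rmap walk, the new helper takes those parameters explicitly. A minimal, hypothetical caller, using the prototype added below:

	/* write-protect all mappings of one kernel-allocated page that the
	 * driver tracks at page offset 'pgoff' in its address_space */
	mapping_wrprotect_range(mapping, pgoff, page_to_pfn(page), 1);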
With this provided, we can subsequently adjust the fb_defio implementation to make use of this function and avoid incorrect invocation of folio_mkclean() and more importantly, incorrect manipulation of page->index and mapping fields. Link: https://lkml.kernel.org/r/e5bf969d64e7f2f2ae944d42341fc8994b736a81.1739029358.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Helge Deller Cc: Jaya Kumar Cc: Kajtar Zsolt Cc: Maíra Canal Cc: Matthew Wilcox Cc: Simona Vetter Cc: Thomas Zimemrmann Signed-off-by: Andrew Morton --- include/linux/rmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 86425d42c1a9..69e9a431a40e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -738,6 +738,9 @@ unsigned long page_address_in_vma(const struct folio *folio, */ int folio_mkclean(struct folio *); +int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, + unsigned long pfn, unsigned long nr_pages); + int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -- cgit v1.2.3 From 6cdef2ddce2b7de8faca69061a6780080cfecc10 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sat, 8 Feb 2025 15:52:56 +0000 Subject: fb_defio: do not use deprecated page->mapping, index fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the introduction of mapping_wrprotect_range() there is no need to use folio_mkclean() in order to write-protect mappings of frame buffer pages, and therefore no need to inappropriately set kernel-allocated page->index, mapping fields to permit this operation. Instead, store the pointer to the page cache object for the mapped driver in the fb_deferred_io object, and use the already stored page offset from the pageref object to look up mappings in order to write-protect them. This is justified, as for the page objects to store a mapping pointer at the point of assignment of pages, they must all reference the same underlying address_space object. Since the life time of the pagerefs is also the lifetime of the fb_deferred_io object, storing the pointer here makes sense. This eliminates the need for all of the logic around setting and maintaining page->index,mapping which we remove. This eliminates the use of folio_mkclean() entirely but otherwise should have no functional change. 
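A sketch of the resulting deferred-I/O write-protect loop; field names follow struct fb_deferred_io_pageref and the fb_deferred_io member added below, while the actual driver change is outside include/ and not shown in this log:

	list_for_each_entry(pageref, &fbdefio->pagereflist, list) {
		pgoff_t pgoff = pageref->offset >> PAGE_SHIFT;

		mapping_wrprotect_range(fbdefio->mapping, pgoff,
					page_to_pfn(pageref->page), 1);
	}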
[lorenzo.stoakes@oracle.com: fixup unused variable warnings] Link: https://lkml.kernel.org/r/d4018405-2762-4385-a816-e54cc23839ac@lucifer.local Link: https://lkml.kernel.org/r/81171ab16c14e3df28f6de9d14982cee528d8519.1739029358.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: Kajtar Zsolt Acked-by: Thomas Zimmermann Cc: David Hildenbrand Cc: Helge Deller Cc: Jaya Kumar Cc: Maíra Canal Cc: Matthew Wilcox Cc: Simona Vetter Signed-off-by: Andrew Morton --- include/linux/fb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fb.h b/include/linux/fb.h index 5ba187e08cf7..cd653862ab99 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -225,6 +225,7 @@ struct fb_deferred_io { int open_count; /* number of opened files; protected by fb_info lock */ struct mutex lock; /* mutex that protects the pageref list */ struct list_head pagereflist; /* list of pagerefs for touched pages */ + struct address_space *mapping; /* page cache object for fb device */ /* callback */ struct page *(*get_page)(struct fb_info *info, unsigned long offset); void (*deferred_io)(struct fb_info *info, struct list_head *pagelist); -- cgit v1.2.3 From 6340584e489f1f8ddb65e44a7ad2ab9f3503c4d3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 11 Feb 2025 12:59:11 -0800 Subject: mm/vmstat: revert "fix a W=1 clang compiler warning" Commit 75b5ab134bb5 enabled -Wenum-enum-conversion in W=1 builds. Commit 30c2de0a267c ("mm/vmstat: fix a W=1 clang compiler warning") fixed a -Wenum-enum-conversion warning. Commit 8f6629c004b1 removed the -Wenum-enum-conversion option again from W=1 builds. Since the W=1 compiler warning fix is no longer necessary, revert it. Link: https://lkml.kernel.org/r/20250211205911.1707684-1-bvanassche@acm.org Signed-off-by: Bart Van Assche Acked-by: Vlastimil Babka Cc: Matthew Wilcox Cc: Ivan Shapovalov Cc: David Laight Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9f3a04345b86..d2761bf8ff32 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -515,7 +515,7 @@ static inline const char *node_stat_name(enum node_stat_item item) static inline const char *lru_list_name(enum lru_list lru) { - return node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru) + 3; // skip "nr_" + return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) -- cgit v1.2.3 From 0431c42622612a96cce97cdd4cfbe7d487606cf3 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Tue, 11 Feb 2025 12:43:40 +0000 Subject: mm/damon: introduce DAMOS filter type hugepage_size Patch series "mm/damon: add support for hugepage_size DAMOS filter", v5. hugepage_size DAMOS filter can be used to gather statistics to check if memory regions of specific access temperatures are backed by hugepages of a size in a specific range. This filter can help to observe and prove the effectiveness of different schemes for shrinking/collapsing hugepages. This patch (of 4): This is to gather statistics to check if memory regions of specific access temperatures are backed by pages of a size in a specific range. This filter can help to observe and prove the effectiveness of different schemes for shrinking/collapsing hugepages.
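For illustration, such a filter could be described roughly as follows, using only the fields added by this patch (how the filter is actually constructed and installed is not shown in this include-only log):

	/* match folios backed by hugepages sized between 64K and 2M */
	struct damos_filter filter = {
		.type = DAMOS_FILTER_TYPE_HUGEPAGE_SIZE,
		.sz_range = { .min = SZ_64K, .max = SZ_2M },
	};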
[sj@kernel.org: add kernel-doc comment for damos_filter->sz_range] Link: https://lkml.kernel.org/r/20250218223058.52459-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250211124437.278873-1-usamaarif642@gmail.com Link: https://lkml.kernel.org/r/20250211124437.278873-2-usamaarif642@gmail.com Signed-off-by: Usama Arif Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Johannes Weiner Cc: Usama Arif Signed-off-by: Andrew Morton --- include/linux/damon.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index b4d37d9b9221..5e7ae7bca5dc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -35,6 +35,16 @@ struct damon_addr_range { unsigned long end; }; +/** + * struct damon_size_range - Represents size for filter to operate on [@min, @max]. + * @min: Min size (inclusive). + * @max: Max size (inclusive). + */ +struct damon_size_range { + unsigned long min; + unsigned long max; +}; + /** * struct damon_region - Represents a monitoring target region. * @ar: The address range of the region. @@ -326,6 +336,7 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_YOUNG: Recently accessed pages. + * @DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: Page is part of a hugepage. * @DAMOS_FILTER_TYPE_ADDR: Address range. * @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target. * @NR_DAMOS_FILTER_TYPES: Number of filter types. @@ -345,6 +356,7 @@ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_YOUNG, + DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, DAMOS_FILTER_TYPE_ADDR, DAMOS_FILTER_TYPE_TARGET, NR_DAMOS_FILTER_TYPES, @@ -360,6 +372,7 @@ enum damos_filter_type { * @target_idx: Index of the &struct damon_target of * &damon_ctx->adaptive_targets if @type is * DAMOS_FILTER_TYPE_TARGET. + * @sz_range: Size range if @type is DAMOS_FILTER_TYPE_HUGEPAGE_SIZE. * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each @@ -376,6 +389,7 @@ struct damos_filter { unsigned short memcg_id; struct damon_addr_range addr_range; int target_idx; + struct damon_size_range sz_range; }; struct list_head list; }; -- cgit v1.2.3 From 1d23b9403aedea2d8eb4175e8eeb05fcd2967219 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 11 Feb 2025 18:13:39 +0800 Subject: mm/mmu_gather: remove unused __tlb_remove_page() Nobody is using __tlb_remove_page() now, clean it up. And also remove the code comment above tlb_remove_page() because it's not meaningful any more. Link: https://lkml.kernel.org/r/Z6si0/A/zzEF/bFJ@MiWiFi-R3L-srv Signed-off-by: Baoquan He Reviewed-by: Qi Zheng Cc: "Aneesh Kumar K.V" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index e402aef79c93..1fac1985127d 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -489,16 +489,6 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, tlb_flush_mmu(tlb); } -static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, - struct page *page, bool delay_rmap) -{ - return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE); -} - -/* tlb_remove_page - * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when - * required. 
- */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); -- cgit v1.2.3 From 7bd1fa0d5624e78628ad7ce70d7e69082c34c634 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 11 Feb 2025 11:43:48 +0800 Subject: mm/mmu_gather: clean up the stale code comment In commit d7f861b9c43a ("mm/mmu_gather: add __tlb_remove_folio_pages()"), helper function __tlb_remove_folio_pages_size() was added. And based on the helper, wrapper functions __tlb_remove_folio_pages() and __tlb_remove_page_size() are created and used by upper level functions. So let's update the code comment to reflect the current code about tlb_remove_page()/tlb_remove_page_size(), etc. Link: https://lkml.kernel.org/r/20250211034348.39531-2-bhe@redhat.com Signed-off-by: Baoquan He Cc: "Aneesh Kumar K.V" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Qi Zheng Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 1fac1985127d..d1adfba8387e 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -67,22 +67,21 @@ * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * - * - tlb_remove_page() / __tlb_remove_page() - * - tlb_remove_page_size() / __tlb_remove_page_size() - * - __tlb_remove_folio_pages() + * - tlb_remove_page() / tlb_remove_page_size() + * - __tlb_remove_folio_pages() / __tlb_remove_page_size() + * - __tlb_remove_folio_pages_size() * - * __tlb_remove_page_size() is the basic primitive that queues a page for - * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a - * boolean indicating if the queue is (now) full and a call to - * tlb_flush_mmu() is required. + * __tlb_remove_folio_pages_size() is the basic primitive that queues pages + * for freeing. It will return a boolean indicating if the queue is (now) + * full and a call to tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * - * __tlb_remove_folio_pages() is similar to __tlb_remove_page(), however, - * instead of removing a single page, remove the given number of consecutive - * pages that are all part of the same (large) folio: just like calling - * __tlb_remove_page() on each page individually. + * __tlb_remove_folio_pages() is similar to __tlb_remove_page_size(), + * however, instead of removing a single page, assume PAGE_SIZE and remove + * the given number of consecutive pages that are all part of the + * same (large) folio. * * - tlb_change_page_size() * -- cgit v1.2.3 From b2ae5fccb8c0ec2167e6e6c5c5a5eb8d656f70ef Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:38 -0800 Subject: mm: introduce vma_start_read_locked{_nested} helpers Patch series "reimplement per-vma lock as a refcount", v10. Back when per-vma locks were introduced, vm_lock was moved out of vm_area_struct in [1] because of the performance regression caused by false cacheline sharing. Recent investigation [2] revealed that the regression is limited to a rather old Broadwell microarchitecture and even there it can be mitigated by disabling adjacent cacheline prefetching, see [3]. Splitting single logical structure into multiple ones leads to more complicated management, extra pointer dereferences and overall less maintainable code.
When that split-away part is a lock, it complicates things even further. With no performance benefits, there are no reasons for this split. Merging the vm_lock back into vm_area_struct also allows vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset. This patchset: 1. moves vm_lock back into vm_area_struct, aligning it at the cacheline boundary and changing the cache to be cacheline-aligned to minimize cacheline sharing; 2. changes vm_area_struct initialization to mark new vma as detached until it is inserted into vma tree; 3. replaces vm_lock and vma->detached flag with a reference counter; 4. regroups vm_area_struct members to fit them into 3 cachelines; 5. changes vm_area_struct cache to SLAB_TYPESAFE_BY_RCU to allow for their reuse and to minimize call_rcu() calls. Pagefault microbenchmarks show performance improvement: Hmean faults/cpu-1 507926.5547 ( 0.00%) 506519.3692 * -0.28%* Hmean faults/cpu-4 479119.7051 ( 0.00%) 481333.6802 * 0.46%* Hmean faults/cpu-7 452880.2961 ( 0.00%) 455845.6211 * 0.65%* Hmean faults/cpu-12 347639.1021 ( 0.00%) 352004.2254 * 1.26%* Hmean faults/cpu-21 200061.2238 ( 0.00%) 229597.0317 * 14.76%* Hmean faults/cpu-30 145251.2001 ( 0.00%) 164202.5067 * 13.05%* Hmean faults/cpu-48 106848.4434 ( 0.00%) 120641.5504 * 12.91%* Hmean faults/cpu-56 92472.3835 ( 0.00%) 103464.7916 * 11.89%* Hmean faults/sec-1 507566.1468 ( 0.00%) 506139.0811 * -0.28%* Hmean faults/sec-4 1880478.2402 ( 0.00%) 1886795.6329 * 0.34%* Hmean faults/sec-7 3106394.3438 ( 0.00%) 3140550.7485 * 1.10%* Hmean faults/sec-12 4061358.4795 ( 0.00%) 4112477.0206 * 1.26%* Hmean faults/sec-21 3988619.1169 ( 0.00%) 4577747.1436 * 14.77%* Hmean faults/sec-30 3909839.5449 ( 0.00%) 4311052.2787 * 10.26%* Hmean faults/sec-48 4761108.4691 ( 0.00%) 5283790.5026 * 10.98%* Hmean faults/sec-56 4885561.4590 ( 0.00%) 5415839.4045 * 10.85%* This patch (of 18): Introduce helper functions which can be used to read-lock a VMA when holding mmap_lock for read. Replace direct accesses to vma->vm_lock with these new helpers. Link: https://lkml.kernel.org/r/20250213224655.1680278-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Reviewed-by: Davidlohr Bueso Reviewed-by: Shakeel Butt Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index fd1e85b4b48a..6a4914bc1a38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -735,6 +735,30 @@ static inline bool vma_start_read(struct vm_area_struct *vma) return true; } +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. 
+ * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +{ + mmap_assert_locked(vma->vm_mm); + down_read_nested(&vma->vm_lock->lock, subclass); +} + +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline void vma_start_read_locked(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); + down_read(&vma->vm_lock->lock); +} + static inline void vma_end_read(struct vm_area_struct *vma) { rcu_read_lock(); /* keeps vma alive till the end of up_read */ -- cgit v1.2.3 From 7b6218ae1253491d56f21f4b1f3609f3dd873600 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:39 -0800 Subject: mm: move per-vma lock into vm_area_struct Back when per-vma locks were introduces, vm_lock was moved out of vm_area_struct in [1] because of the performance regression caused by false cacheline sharing. Recent investigation [2] revealed that the regressions is limited to a rather old Broadwell microarchitecture and even there it can be mitigated by disabling adjacent cacheline prefetching, see [3]. Splitting single logical structure into multiple ones leads to more complicated management, extra pointer dereferences and overall less maintainable code. When that split-away part is a lock, it complicates things even further. With no performance benefits, there are no reasons for this split. Merging the vm_lock back into vm_area_struct also allows vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset. Move vm_lock back into vm_area_struct, aligning it at the cacheline boundary and changing the cache to be cacheline-aligned as well. With kernel compiled using defconfig, this causes VMA memory consumption to grow from 160 (vm_area_struct) + 40 (vm_lock) bytes to 256 bytes: slabinfo before: ... : ... vma_lock ... 40 102 1 : ... vm_area_struct ... 160 51 2 : ... slabinfo after moving vm_lock: ... : ... vm_area_struct ... 256 32 2 : ... Aggregate VMA memory consumption per 1000 VMAs grows from 50 to 64 pages, which is 5.5MB per 100000 VMAs. Note that the size of this structure is dependent on the kernel configuration and typically the original size is higher than 160 bytes. Therefore these calculations are close to the worst case scenario. A more realistic vm_area_struct usage before this change is: ... : ... vma_lock ... 40 102 1 : ... vm_area_struct ... 176 46 2 : ... Aggregate VMA memory consumption per 1000 VMAs grows from 54 to 64 pages, which is 3.9MB per 100000 VMAs. This memory consumption growth can be addressed later by optimizing the vm_lock. [1] https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/ [2] https://lore.kernel.org/all/ZsQyI%2F087V34JoIt@xsang-OptiPlex-9020/ [3] https://lore.kernel.org/all/CAJuCfpEisU8Lfe96AYJDZ+OM4NoPmnw9bP53cT_kbfP_pR+-2g@mail.gmail.com/ Link: https://lkml.kernel.org/r/20250213224655.1680278-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Reviewed-by: Shakeel Butt Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. 
Howlett Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++------------ include/linux/mm_types.h | 6 ++++-- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6a4914bc1a38..bf3b6362927f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -697,6 +697,12 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK +static inline void vma_lock_init(struct vm_area_struct *vma) +{ + init_rwsem(&vma->vm_lock.lock); + vma->vm_lock_seq = UINT_MAX; +} + /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to @@ -714,7 +720,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; - if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) + if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0)) return false; /* @@ -729,7 +735,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { - up_read(&vma->vm_lock->lock); + up_read(&vma->vm_lock.lock); return false; } return true; @@ -744,7 +750,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { mmap_assert_locked(vma->vm_mm); - down_read_nested(&vma->vm_lock->lock, subclass); + down_read_nested(&vma->vm_lock.lock, subclass); } /* @@ -756,13 +762,13 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int static inline void vma_start_read_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); - down_read(&vma->vm_lock->lock); + down_read(&vma->vm_lock.lock); } static inline void vma_end_read(struct vm_area_struct *vma) { rcu_read_lock(); /* keeps vma alive till the end of up_read */ - up_read(&vma->vm_lock->lock); + up_read(&vma->vm_lock.lock); rcu_read_unlock(); } @@ -791,7 +797,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - down_write(&vma->vm_lock->lock); + down_write(&vma->vm_lock.lock); /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). @@ -799,7 +805,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. 
*/ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock->lock); + up_write(&vma->vm_lock.lock); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -811,7 +817,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) static inline void vma_assert_locked(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_lock->lock)) + if (!rwsem_is_locked(&vma->vm_lock.lock)) vma_assert_write_locked(vma); } @@ -844,6 +850,7 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ +static inline void vma_lock_init(struct vm_area_struct *vma) {} static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} @@ -878,10 +885,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf) extern const struct vm_operations_struct vma_dummy_vm_ops; -/* - * WARNING: vma_init does not initialize vma->vm_lock. - * Use vm_area_alloc()/vm_area_free() if vma needs locking. - */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); @@ -890,6 +893,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); vma_numab_state_init(vma); + vma_lock_init(vma); } /* Use when VMA is not part of the VMA tree and needs no locking */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0234f14f2aa6..36dea20cd101 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -730,8 +730,6 @@ struct vm_area_struct { * slowpath. */ unsigned int vm_lock_seq; - /* Unstable RCU readers are allowed to read this. */ - struct vma_lock *vm_lock; #endif /* @@ -784,6 +782,10 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_PER_VMA_LOCK + /* Unstable RCU readers are allowed to read this. */ + struct vma_lock vm_lock ____cacheline_aligned_in_smp; +#endif } __randomize_layout; #ifdef CONFIG_NUMA -- cgit v1.2.3 From 8ef95d8f15f9e785341775814f0ed2ee22017aa5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:40 -0800 Subject: mm: mark vma as detached until it's added into vma tree Current implementation does not set detached flag when a VMA is first allocated. This does not represent the real state of the VMA, which is detached until it is added into mm's VMA tree. Fix this by marking new VMAs as detached and resetting detached flag only after VMA is added into a tree. Introduce vma_mark_attached() to make the API more readable and to simplify possible future cleanup when vma->vm_mm might be used to indicate detached vma and vma_mark_attached() will need an additional mm parameter. Link: https://lkml.kernel.org/r/20250213224655.1680278-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . 
McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index bf3b6362927f..d333d4070362 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -821,12 +821,21 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) vma_assert_write_locked(vma); } -static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) +static inline void vma_mark_attached(struct vm_area_struct *vma) +{ + vma->detached = false; +} + +static inline void vma_mark_detached(struct vm_area_struct *vma) { /* When detaching vma should be write-locked */ - if (detached) - vma_assert_write_locked(vma); - vma->detached = detached; + vma_assert_write_locked(vma); + vma->detached = true; +} + +static inline bool is_vma_detached(struct vm_area_struct *vma) +{ + return vma->detached; } static inline void release_fault_lock(struct vm_fault *vmf) @@ -857,8 +866,8 @@ static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } -static inline void vma_mark_detached(struct vm_area_struct *vma, - bool detached) {} +static inline void vma_mark_attached(struct vm_area_struct *vma) {} +static inline void vma_mark_detached(struct vm_area_struct *vma) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) @@ -891,7 +900,10 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_mark_detached(vma, false); +#ifdef CONFIG_PER_VMA_LOCK + /* vma is not locked, can't use vma_mark_detached() */ + vma->detached = true; +#endif vma_numab_state_init(vma); vma_lock_init(vma); } @@ -1086,6 +1098,7 @@ static inline int vma_iter_bulk_store(struct vma_iterator *vmi, if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; + vma_mark_attached(vma); return 0; } -- cgit v1.2.3 From 55e50223bf3e06abceaf68e2ad125458bb5f874f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:41 -0800 Subject: mm: introduce vma_iter_store_attached() to use with attached vmas vma_iter_store() functions can be used both when adding a new vma and when updating an existing one. However for existing ones we do not need to mark them attached as they are already marked that way. With vma->detached being a separate flag, double-marking a vmas as attached or detached is not an issue because the flag will simply be overwritten with the same value. However once we fold this flag into the refcount later in this series, re-attaching or re-detaching a vma becomes an issue since these operations will be incrementing/decrementing a refcount. Introduce vma_iter_store_new() and vma_iter_store_overwrite() to replace vma_iter_store() and avoid re-attaching a vma during vma update. Add assertions in vma_mark_attached()/vma_mark_detached() to catch invalid usage. Update vma tests to check for vma detached state correctness. 
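Conceptually the split looks as follows; this is a sketch, not the exact mm-internal definitions, and it mirrors the order used by vma_iter_bulk_store() in the hunk above (tree store first, then attach):

/* a "new" store attaches the vma after writing it into the maple tree,
 * an "overwrite" store updates the tree for an already-attached vma and
 * must not touch the attached state */
static inline void vma_iter_store_new(struct vma_iterator *vmi,
				      struct vm_area_struct *vma)
{
	vma_iter_store_overwrite(vmi, vma);	/* tree update only */
	vma_mark_attached(vma);
}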
Link: https://lkml.kernel.org/r/20250213224655.1680278-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d333d4070362..003c3e5c0a96 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -821,8 +821,19 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) vma_assert_write_locked(vma); } +static inline void vma_assert_attached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(vma->detached); +} + +static inline void vma_assert_detached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(!vma->detached); +} + static inline void vma_mark_attached(struct vm_area_struct *vma) { + vma_assert_detached(vma); vma->detached = false; } @@ -830,6 +841,7 @@ static inline void vma_mark_detached(struct vm_area_struct *vma) { /* When detaching vma should be write-locked */ vma_assert_write_locked(vma); + vma_assert_attached(vma); vma->detached = true; } @@ -866,6 +878,8 @@ static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } +static inline void vma_assert_attached(struct vm_area_struct *vma) {} +static inline void vma_assert_detached(struct vm_area_struct *vma) {} static inline void vma_mark_attached(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma) {} -- cgit v1.2.3 From 2c2bd11caba2a45445c1f0c722451fe29e59db79 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:43 -0800 Subject: types: move struct rcuwait into types.h Move rcuwait struct definition into types.h so that rcuwait can be used without including rcuwait.h which includes other headers. Without this change mm_types.h can't use rcuwait due to a the following circular dependency: mm_types.h -> rcuwait.h -> signal.h -> mm_types.h Link: https://lkml.kernel.org/r/20250213224655.1680278-7-surenb@google.com Suggested-by: Matthew Wilcox Signed-off-by: Suren Baghdasaryan Acked-by: Davidlohr Bueso Acked-by: Liam R. Howlett Acked-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . 
McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/rcuwait.h | 13 +------------ include/linux/types.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 27343424225c..9ad134a04b41 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -4,18 +4,7 @@ #include #include - -/* - * rcuwait provides a way of blocking and waking up a single - * task in an rcu-safe manner. - * - * The only time @task is non-nil is when a user is blocked (or - * checking if it needs to) on a condition, and reset as soon as we - * know that the condition has succeeded and are awoken. - */ -struct rcuwait { - struct task_struct __rcu *task; -}; +#include #define __RCUWAIT_INITIALIZER(name) \ { .task = NULL, } diff --git a/include/linux/types.h b/include/linux/types.h index 1c509ce8f7f6..a3d2182c2686 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -248,5 +248,17 @@ typedef void (*swap_func_t)(void *a, void *b, int size); typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv); typedef int (*cmp_func_t)(const void *a, const void *b); +/* + * rcuwait provides a way of blocking and waking up a single + * task in an rcu-safe manner. + * + * The only time @task is non-nil is when a user is blocked (or + * checking if it needs to) on a condition, and reset as soon as we + * know that the condition has succeeded and are awoken. + */ +struct rcuwait { + struct task_struct __rcu *task; +}; + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_TYPES_H */ -- cgit v1.2.3 From 7440adb405dfc4abe21bc95abf3481e6a6649c05 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:44 -0800 Subject: mm: allow vma_start_read_locked/vma_start_read_locked_nested to fail With upcoming replacement of vm_lock with vm_refcnt, we need to handle a possibility of vma_start_read_locked/vma_start_read_locked_nested failing due to refcount overflow. Prepare for such possibility by changing these APIs and adjusting their users. Link: https://lkml.kernel.org/r/20250213224655.1680278-8-surenb@google.com Signed-off-by: Suren Baghdasaryan Cc: Lokesh Gidra Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 003c3e5c0a96..09b48af68699 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -747,10 +747,11 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. 
*/ -static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { mmap_assert_locked(vma->vm_mm); down_read_nested(&vma->vm_lock.lock, subclass); + return true; } /* @@ -759,10 +760,11 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. */ -static inline void vma_start_read_locked(struct vm_area_struct *vma) +static inline bool vma_start_read_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); down_read(&vma->vm_lock.lock); + return true; } static inline void vma_end_read(struct vm_area_struct *vma) -- cgit v1.2.3 From ce0853966085dd8eab7153ce0b815c4a07d86698 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:45 -0800 Subject: mm: move mmap_init_lock() out of the header file mmap_init_lock() is used only from mm_init() in fork.c, therefore it does not have to reside in the header file. This move lets us avoid including additional headers in mmap_lock.h later, when mmap_init_lock() needs to initialize rcuwait object. Link: https://lkml.kernel.org/r/20250213224655.1680278-9-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 45a21faa3ff6..4706c6769902 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -122,12 +122,6 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int #endif /* CONFIG_PER_VMA_LOCK */ -static inline void mmap_init_lock(struct mm_struct *mm) -{ - init_rwsem(&mm->mmap_lock); - mm_lock_seqcount_init(mm); -} - static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); -- cgit v1.2.3 From 45ad9f5290dc4bb2249e951d4b3756d3ebda2d66 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:46 -0800 Subject: mm: uninline the main body of vma_start_write() vma_start_write() is used in many places and will grow in size very soon. It is not used in performance critical paths and uninlining it should limit the future code size growth. No functional changes. 
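The out-of-line body is essentially the code removed from the header below; sketched here since its new home is outside include/ and therefore not part of this diffstat:

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
	down_write(&vma->vm_lock.lock);
	/* WRITE_ONCE() pairs with the lockless check in vma_start_read() */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
	up_write(&vma->vm_lock.lock);
}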
Link: https://lkml.kernel.org/r/20250213224655.1680278-10-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 09b48af68699..c24c521e38a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -787,6 +787,8 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l return (vma->vm_lock_seq == *mm_lock_seq); } +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); + /* * Begin writing to a VMA. * Exclude concurrent readers under the per-VMA lock until the currently @@ -799,15 +801,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - down_write(&vma->vm_lock.lock); - /* - * We should use WRITE_ONCE() here because we can have concurrent reads - * from the early lockless pessimistic check in vma_start_read(). - * We don't really care about the correctness of that early check, but - * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. - */ - WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock.lock); + __vma_start_write(vma, mm_lock_seq); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) -- cgit v1.2.3 From 7f8ceea0c58039dcea3d31b8d5da58aa5f6e12bf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:47 -0800 Subject: refcount: provide ops for cases when object's memory can be reused For speculative lookups where a successful inc_not_zero() pins the object, but where we still need to double check if the object acquired is indeed the one we set out to acquire (identity check), needs this validation to happen *after* the increment. Similarly, when a new object is initialized and its memory might have been previously occupied by another object, all stores to initialize the object should happen *before* refcount initialization. Notably SLAB_TYPESAFE_BY_RCU is one such an example when this ordering is required for reference counting. Add refcount_{add|inc}_not_zero_acquire() to guarantee the proper ordering between acquiring a reference count on an object and performing the identity check for that object. Add refcount_set_release() to guarantee proper ordering between stores initializing object attributes and the store initializing the refcount. refcount_set_release() should be done after all other object attributes are initialized. Once refcount_set_release() is called, the object should be considered visible to other tasks even if it was not yet added into an object collection normally used to discover it. 
This is because other tasks might have discovered the object previously occupying the same memory and after memory reuse they can succeed in taking refcount for the new object and start using it. Object reuse example to consider: consumer: obj = lookup(collection, key); if (!refcount_inc_not_zero_acquire(&obj->ref)) return; if (READ_ONCE(obj->key) != key) { /* identity check */ put_ref(obj); return; } use(obj->value); producer: remove(collection, obj->key); if (!refcount_dec_and_test(&obj->ref)) return; obj->key = KEY_INVALID; free(obj); obj = malloc(); /* obj is reused */ obj->key = new_key; obj->value = new_value; refcount_set_release(obj->ref, 1); add(collection, new_key, obj); refcount_{add|inc}_not_zero_acquire() is required to prevent the following reordering when refcount_inc_not_zero() is used instead: consumer: obj = lookup(collection, key); if (READ_ONCE(obj->key) != key) { /* reordered identity check */ put_ref(obj); return; } producer: remove(collection, obj->key); if (!refcount_dec_and_test(&obj->ref)) return; obj->key = KEY_INVALID; free(obj); obj = malloc(); /* obj is reused */ obj->key = new_key; obj->value = new_value; refcount_set_release(obj->ref, 1); add(collection, new_key, obj); if (!refcount_inc_not_zero(&obj->ref)) return; use(obj->value); /* USING WRONG OBJECT */ refcount_set_release() is required to prevent the following reordering when refcount_set() is used instead: consumer: obj = lookup(collection, key); producer: remove(collection, obj->key); if (!refcount_dec_and_test(&obj->ref)) return; obj->key = KEY_INVALID; free(obj); obj = malloc(); /* obj is reused */ obj->key = new_key; /* new_key == old_key */ refcount_set(obj->ref, 1); if (!refcount_inc_not_zero_acquire(&obj->ref)) return; if (READ_ONCE(obj->key) != key) { /* pass since new_key == old_key */ put_ref(obj); return; } use(obj->value); /* USING STALE obj->value */ obj->value = new_value; /* reordered store */ add(collection, key, obj); [surenb@google.com: fix title underlines in refcount-vs-atomic.rst] Link: https://lkml.kernel.org/r/20250217161645.3137927-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-11-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Vlastimil Babka [slab] Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Peter Zijlstra Cc: Will Deacon Cc: Paul E. McKenney Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/refcount.h | 106 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/slab.h | 9 ++++ 2 files changed, 115 insertions(+) (limited to 'include') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 35f039ecb272..4589d2e7bfea 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -87,6 +87,15 @@ * The decrements dec_and_test() and sub_and_test() also provide acquire * ordering on success. * + * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() provide + * acquire and release ordering for cases when the memory occupied by the + * object might be reused to store another object. 
This is important for the + * cases where secondary validation is required to detect such reuse, e.g. + * SLAB_TYPESAFE_BY_RCU. The secondary validation checks have to happen after + * the refcount is taken, hence acquire order is necessary. Similarly, when the + * object is initialized, all stores to its attributes should be visible before + * the refcount is set, otherwise a stale attribute value might be used by + * another task which succeeds in taking a refcount to the new object. */ #ifndef _LINUX_REFCOUNT_H @@ -125,6 +134,31 @@ static inline void refcount_set(refcount_t *r, int n) atomic_set(&r->refs, n); } +/** + * refcount_set_release - set a refcount's value with release ordering + * @r: the refcount + * @n: value to which the refcount will be set + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides release memory ordering which will order previous memory operations + * against this store. This ensures all updates to this object are visible + * once the refcount is set and stale values from the object previously + * occupying this memory are overwritten with new ones. + * + * This function should be called only after new object is fully initialized. + * After this call the object should be considered visible to other tasks even + * if it was not yet added into an object collection normally used to discover + * it. This is because other tasks might have discovered the object previously + * occupying the same memory and after memory reuse they can succeed in taking + * refcount to the new object and start using it. + */ +static inline void refcount_set_release(refcount_t *r, int n) +{ + atomic_set_release(&r->refs, n); +} + /** * refcount_read - get a refcount's value * @r: the refcount @@ -178,6 +212,52 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) return __refcount_add_not_zero(i, r, NULL); } +static inline __must_check __signed_wrap +bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +{ + int old = refcount_read(r); + + do { + if (!old) + break; + } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); + + if (oldp) + *oldp = old; + + if (unlikely(old < 0 || old + i < 0)) + refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); + + return old; +} + +/** + * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0 + * + * @i: the value to add to the refcount + * @r: the refcount + * + * Will saturate at REFCOUNT_SATURATED and WARN. + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides acquire memory ordering on success, it is assumed the caller has + * guaranteed the object memory to be stable (RCU, etc.). It does provide a + * control dependency and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc_not_zero_acquire() should instead be used to increment a + * reference count. 
+ * + * Return: false if the passed refcount is 0, true otherwise + */ +static inline __must_check bool refcount_add_not_zero_acquire(int i, refcount_t *r) +{ + return __refcount_add_not_zero_acquire(i, r, NULL); +} + static inline __signed_wrap void __refcount_add(int i, refcount_t *r, int *oldp) { @@ -236,6 +316,32 @@ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) return __refcount_inc_not_zero(r, NULL); } +static inline __must_check bool __refcount_inc_not_zero_acquire(refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero_acquire(1, r, oldp); +} + +/** + * refcount_inc_not_zero_acquire - increment a refcount with acquire ordering unless it is 0 + * @r: the refcount to increment + * + * Similar to refcount_inc_not_zero(), but provides acquire memory ordering on + * success. + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides acquire memory ordering on success, it is assumed the caller has + * guaranteed the object memory to be stable (RCU, etc.). It does provide a + * control dependency and thereby orders future stores. See the comment on top. + * + * Return: true if the increment was successful, false otherwise + */ +static inline __must_check bool refcount_inc_not_zero_acquire(refcount_t *r) +{ + return __refcount_inc_not_zero_acquire(r, NULL); +} + static inline void __refcount_inc(refcount_t *r, int *oldp) { __refcount_add(1, r, oldp); diff --git a/include/linux/slab.h b/include/linux/slab.h index 09eedaecf120..ad902a2d692b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -136,6 +136,15 @@ enum _slab_flag_bits { * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. * + * Note that object identity check has to be done *after* acquiring a + * reference, therefore user has to ensure proper ordering for loads. + * Similarly, when initializing objects allocated with SLAB_TYPESAFE_BY_RCU, + * the newly allocated object has to be fully initialized *before* its + * refcount gets initialized and proper ordering for stores is required. + * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() are + * designed with the proper fences required for reference counting objects + * allocated with SLAB_TYPESAFE_BY_RCU. + * * Note that it is not possible to acquire a lock within a structure * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference * as described above. The reason is that SLAB_TYPESAFE_BY_RCU pages -- cgit v1.2.3 From 4e0dbe105d5088c77eb09de6f049aaf44711a2ec Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:48 -0800 Subject: refcount: introduce __refcount_{add|inc}_not_zero_limited_acquire Introduce functions to increase refcount but with a top limit above which they will fail to increase (the limit is inclusive). Setting the limit to INT_MAX indicates no limit. Link: https://lkml.kernel.org/r/20250213224655.1680278-12-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. 
Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/refcount.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 4589d2e7bfea..80dc023ac2bf 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -213,13 +213,20 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) } static inline __must_check __signed_wrap -bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +bool __refcount_add_not_zero_limited_acquire(int i, refcount_t *r, int *oldp, + int limit) { int old = refcount_read(r); do { if (!old) break; + + if (i > limit - old) { + if (oldp) + *oldp = old; + return false; + } } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); if (oldp) @@ -231,6 +238,18 @@ bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) return old; } +static inline __must_check bool +__refcount_inc_not_zero_limited_acquire(refcount_t *r, int *oldp, int limit) +{ + return __refcount_add_not_zero_limited_acquire(1, r, oldp, limit); +} + +static inline __must_check __signed_wrap +bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero_limited_acquire(i, r, oldp, INT_MAX); +} + /** * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0 * -- cgit v1.2.3 From f35ab95ca0af7a27feab57b9d7e906405bddb093 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:49 -0800 Subject: mm: replace vm_lock and detached flag with a reference count rw_semaphore is a sizable structure of 40 bytes and consumes considerable space for each vm_area_struct. However vma_lock has two important specifics which can be used to replace rw_semaphore with a simpler structure: 1. Readers never wait. They try to take the vma_lock and fall back to mmap_lock if that fails. 2. Only one writer at a time will ever try to write-lock a vma_lock because writers first take mmap_lock in write mode. Because of these requirements, full rw_semaphore functionality is not needed and we can replace rw_semaphore and the vma->detached flag with a refcount (vm_refcnt). When vma is in detached state, vm_refcnt is 0 and only a call to vma_mark_attached() can take it out of this state. Note that unlike before, now we enforce both vma_mark_attached() and vma_mark_detached() to be done only after vma has been write-locked. vma_mark_attached() changes vm_refcnt to 1 to indicate that it has been attached to the vma tree. When a reader takes read lock, it increments vm_refcnt, unless the top usable bit of vm_refcnt (0x40000000) is set, indicating presence of a writer. When writer takes write lock, it sets the top usable bit to indicate its presence. If there are readers, writer will wait using newly introduced mm->vma_writer_wait. Since all writers take mmap_lock in write mode first, there can be only one writer at a time. The last reader to release the lock will signal the writer to wake up. refcount might overflow if there are many competing readers, in which case read-locking will fail. 
Readers are expected to handle such failures. In summary: 1. all readers increment the vm_refcnt; 2. writer sets top usable (writer) bit of vm_refcnt; 3. readers cannot increment the vm_refcnt if the writer bit is set; 4. in the presence of readers, writer must wait for the vm_refcnt to drop to 1 (plus the VMA_LOCK_OFFSET writer bit), indicating an attached vma with no readers; 5. vm_refcnt overflow is handled by the readers. While this vm_lock replacement does not yet result in a smaller vm_area_struct (it stays at 256 bytes due to cacheline alignment), it allows for further size optimization by structure member regrouping to bring the size of vm_area_struct below 192 bytes. [surenb@google.com: fix a crash due to vma_end_read() that should have been removed] Link: https://lkml.kernel.org/r/20250220200208.323769-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-13-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Peter Zijlstra Suggested-by: Matthew Wilcox Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 128 +++++++++++++++++++++++++++++++---------------- include/linux/mm_types.h | 22 ++++---- 2 files changed, 95 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c24c521e38a2..06f179c844c3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -697,19 +698,54 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK -static inline void vma_lock_init(struct vm_area_struct *vma) +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) { - init_rwsem(&vma->vm_lock.lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key lockdep_key; + + lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); +#endif + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); vma->vm_lock_seq = UINT_MAX; } +static inline bool is_vma_writer_only(int refcnt) +{ + /* + * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma + * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on + * a detached vma happens only in vma_mark_detached() and is a rare + * case, therefore most of the time there will be no unnecessary wakeup. + */ + return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; +} + +static inline void vma_refcount_put(struct vm_area_struct *vma) +{ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *mm = vma->vm_mm; + int oldcnt; + + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { + + if (is_vma_writer_only(oldcnt - 1)) + rcuwait_wake_up(&mm->vma_writer_wait); + } +} + /* * Try to read-lock a vma. 
The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got + * detached. */ -static inline bool vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) { + int oldcnt; + /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need @@ -718,15 +754,25 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * need ordering is below. */ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) - return false; + return NULL; - if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0)) - return false; + /* + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * Acquire fence is required here to avoid reordering against later + * vm_lock_seq check and checks inside lock_vma_under_rcu(). + */ + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) { + /* return EAGAIN if vma got detached from under us */ + return oldcnt ? NULL : ERR_PTR(-EAGAIN); + } + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); /* - * Overflow might produce false locked result. + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are @@ -735,10 +781,11 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { - up_read(&vma->vm_lock.lock); - return false; + vma_refcount_put(vma); + return NULL; } - return true; + + return vma; } /* @@ -749,8 +796,14 @@ static inline bool vma_start_read(struct vm_area_struct *vma) */ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { + int oldcnt; + mmap_assert_locked(vma->vm_mm); - down_read_nested(&vma->vm_lock.lock, subclass); + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) + return false; + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); return true; } @@ -762,16 +815,12 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int */ static inline bool vma_start_read_locked(struct vm_area_struct *vma) { - mmap_assert_locked(vma->vm_mm); - down_read(&vma->vm_lock.lock); - return true; + return vma_start_read_locked_nested(vma, 0); } static inline void vma_end_read(struct vm_area_struct *vma) { - rcu_read_lock(); /* keeps vma alive till the end of up_read */ - up_read(&vma->vm_lock.lock); - rcu_read_unlock(); + vma_refcount_put(vma); } /* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ @@ -813,38 +862,35 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) static inline void vma_assert_locked(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_lock.lock)) - vma_assert_write_locked(vma); + unsigned int mm_lock_seq; + + VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && + !__is_vma_write_locked(vma, &mm_lock_seq), vma); } +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. + */ static inline void vma_assert_attached(struct vm_area_struct *vma) { - WARN_ON_ONCE(vma->detached); + WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); } static inline void vma_assert_detached(struct vm_area_struct *vma) { - WARN_ON_ONCE(!vma->detached); + WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); } static inline void vma_mark_attached(struct vm_area_struct *vma) { - vma_assert_detached(vma); - vma->detached = false; -} - -static inline void vma_mark_detached(struct vm_area_struct *vma) -{ - /* When detaching vma should be write-locked */ vma_assert_write_locked(vma); - vma_assert_attached(vma); - vma->detached = true; + vma_assert_detached(vma); + refcount_set(&vma->vm_refcnt, 1); } -static inline bool is_vma_detached(struct vm_area_struct *vma) -{ - return vma->detached; -} +void vma_mark_detached(struct vm_area_struct *vma); static inline void release_fault_lock(struct vm_fault *vmf) { @@ -867,9 +913,9 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ -static inline void vma_lock_init(struct vm_area_struct *vma) {} -static inline bool vma_start_read(struct vm_area_struct *vma) - { return false; } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} +static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) + { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -910,12 +956,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); -#ifdef CONFIG_PER_VMA_LOCK - /* vma is not locked, can't use vma_mark_detached() */ - vma->detached = true; -#endif vma_numab_state_init(vma); - vma_lock_init(vma); + vma_lock_init(vma, false); } /* Use when VMA is not part of the VMA tree and needs no locking */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36dea20cd101..9de0a6cb3c2d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -629,9 +630,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) } #endif -struct vma_lock { - struct rw_semaphore lock; -}; +#define VMA_LOCK_OFFSET 0x40000000 +#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1) struct vma_numab_state { /* @@ -709,19 +709,13 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK - /* - * Flag to indicate areas detached from the mm->mm_mt tree. - * Unstable RCU readers are allowed to read this. 
- */ - bool detached; - /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_lock->lock (in write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_lock->lock (in read or write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * @@ -784,7 +778,10 @@ struct vm_area_struct { struct vm_userfaultfd_ctx vm_userfaultfd_ctx; #ifdef CONFIG_PER_VMA_LOCK /* Unstable RCU readers are allowed to read this. */ - struct vma_lock vm_lock ____cacheline_aligned_in_smp; + refcount_t vm_refcnt ____cacheline_aligned_in_smp; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map vmlock_dep_map; +#endif #endif } __randomize_layout; @@ -920,6 +917,7 @@ struct mm_struct { * by mmlist_lock */ #ifdef CONFIG_PER_VMA_LOCK + struct rcuwait vma_writer_wait; /* * This field has lock-like semantics, meaning it is sometimes * accessed with ACQUIRE/RELEASE semantics. -- cgit v1.2.3 From 6bef4c2f97221f3b595d08c8656eb5845ef80fe9 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:50 -0800 Subject: mm: move lesser used vma_area_struct members into the last cacheline Move several vma_area_struct members which are rarely or never used during page fault handling into the last cacheline to better pack vm_area_struct. As a result vm_area_struct will fit into 3 as opposed to 4 cachelines. New typical vm_area_struct layout: struct vm_area_struct { union { struct { long unsigned int vm_start; /* 0 8 */ long unsigned int vm_end; /* 8 8 */ }; /* 0 16 */ freeptr_t vm_freeptr; /* 0 8 */ }; /* 0 16 */ struct mm_struct * vm_mm; /* 16 8 */ pgprot_t vm_page_prot; /* 24 8 */ union { const vm_flags_t vm_flags; /* 32 8 */ vm_flags_t __vm_flags; /* 32 8 */ }; /* 32 8 */ unsigned int vm_lock_seq; /* 40 4 */ /* XXX 4 bytes hole, try to pack */ struct list_head anon_vma_chain; /* 48 16 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct anon_vma * anon_vma; /* 64 8 */ const struct vm_operations_struct * vm_ops; /* 72 8 */ long unsigned int vm_pgoff; /* 80 8 */ struct file * vm_file; /* 88 8 */ void * vm_private_data; /* 96 8 */ atomic_long_t swap_readahead_info; /* 104 8 */ struct mempolicy * vm_policy; /* 112 8 */ struct vma_numab_state * numab_state; /* 120 8 */ /* --- cacheline 2 boundary (128 bytes) --- */ refcount_t vm_refcnt (__aligned__(64)); /* 128 4 */ /* XXX 4 bytes hole, try to pack */ struct { struct rb_node rb (__aligned__(8)); /* 136 24 */ long unsigned int rb_subtree_last; /* 160 8 */ } __attribute__((__aligned__(8))) shared; /* 136 32 */ struct anon_vma_name * anon_name; /* 168 8 */ struct vm_userfaultfd_ctx vm_userfaultfd_ctx; /* 176 8 */ /* size: 192, cachelines: 3, members: 18 */ /* sum members: 176, holes: 2, sum holes: 8 */ /* padding: 8 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 4 */ } __attribute__((__aligned__(64))); Memory consumption per 1000 VMAs becomes 48 pages: slabinfo after vm_area_struct changes: ... : ... vm_area_struct ... 192 42 2 : ... 
Link: https://lkml.kernel.org/r/20250213224655.1680278-14-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9de0a6cb3c2d..c3aa0e20be41 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -725,17 +725,6 @@ struct vm_area_struct { */ unsigned int vm_lock_seq; #endif - - /* - * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree. - * - */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma @@ -755,14 +744,6 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ -#ifdef CONFIG_ANON_VMA_NAME - /* - * For private and shared anonymous mappings, a pointer to a null - * terminated string containing the name given to the vma, or NULL if - * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. - */ - struct anon_vma_name *anon_name; -#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif @@ -775,7 +756,6 @@ struct vm_area_struct { #ifdef CONFIG_NUMA_BALANCING struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif - struct vm_userfaultfd_ctx vm_userfaultfd_ctx; #ifdef CONFIG_PER_VMA_LOCK /* Unstable RCU readers are allowed to read this. */ refcount_t vm_refcnt ____cacheline_aligned_in_smp; @@ -783,6 +763,24 @@ struct vm_area_struct { struct lockdep_map vmlock_dep_map; #endif #endif + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. + * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; #ifdef CONFIG_NUMA -- cgit v1.2.3 From e218d9fedd056a0b17468d6e19fddaf2c3550f97 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:52 -0800 Subject: mm: remove extra vma_numab_state_init() call vma_init() already memset's the whole vm_area_struct to 0, so there is no need to an additional vma_numab_state_init(). 
Link: https://lkml.kernel.org/r/20250213224655.1680278-16-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 06f179c844c3..aad932c4bcf0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -956,7 +956,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_numab_state_init(vma); vma_lock_init(vma, false); } -- cgit v1.2.3 From e49510bf00de4f832eaaebcfc113795f127ad519 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:53 -0800 Subject: mm: prepare lock_vma_under_rcu() for vma reuse possibility Once we make vma cache SLAB_TYPESAFE_BY_RCU, it will be possible for a vma to be reused and attached to another mm after lock_vma_under_rcu() locks the vma. lock_vma_under_rcu() should ensure that vma_start_read() is using the original mm and after locking the vma it should ensure that vma->vm_mm has not changed from under us. Link: https://lkml.kernel.org/r/20250213224655.1680278-17-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index aad932c4bcf0..e3f962de1677 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -739,10 +739,13 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. 
*/ -static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) { int oldcnt; @@ -753,7 +756,7 @@ static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) return NULL; /* @@ -780,7 +783,7 @@ static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { vma_refcount_put(vma); return NULL; } @@ -914,7 +917,8 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} -- cgit v1.2.3 From 3104138517fc66aad21f4a2487bb572e9fc2e3ec Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:54 -0800 Subject: mm: make vma cache SLAB_TYPESAFE_BY_RCU To enable SLAB_TYPESAFE_BY_RCU for vma cache we need to ensure that object reuse before RCU grace period is over will be detected by lock_vma_under_rcu(). Current checks are sufficient as long as vma is detached before it is freed. The only place this is not currently happening is in exit_mmap(). Add the missing vma_mark_detached() in exit_mmap(). Another issue which might trick lock_vma_under_rcu() during vma reuse is vm_area_dup(), which copies the entire content of the vma into a new one, overriding new vma's vm_refcnt and temporarily making it appear as attached. This might trick a racing lock_vma_under_rcu() to operate on a reused vma if it found the vma before it got reused. To prevent this situation, we should ensure that vm_refcnt stays at detached state (0) when it is copied and advances to attached state only after it is added into the vma tree. Introduce vm_area_init_from() which preserves new vma's vm_refcnt and use it in vm_area_dup(). Since all vmas are in detached state with no current readers when they are freed, lock_vma_under_rcu() will not be able to take vm_refcnt after vma got detached even if vma is reused. vma_mark_attached() in modified to include a release fence to ensure all stores to the vma happen before vm_refcnt gets initialized. Finally, make vm_area_cachep SLAB_TYPESAFE_BY_RCU. This will facilitate vm_area_struct reuse and will minimize the number of call_rcu() calls. 
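For illustration only (none of the names below come from the patch), a minimal sketch of the publish/lookup pattern these fences support, assuming a hypothetical object kept in a cache created with SLAB_TYPESAFE_BY_RCU: refcount_set_release() publishes a fully initialized object, and refcount_inc_not_zero_acquire() lets an RCU reader take a reference before re-checking the object's identity.

        #include <linux/slab.h>
        #include <linux/rcupdate.h>
        #include <linux/refcount.h>

        /* Hypothetical object kept in a SLAB_TYPESAFE_BY_RCU cache. */
        struct obj {
                refcount_t ref;
                unsigned long key;
        };

        static struct kmem_cache *obj_cachep;   /* created with SLAB_TYPESAFE_BY_RCU */

        static struct obj *obj_create(unsigned long key)
        {
                struct obj *o = kmem_cache_alloc(obj_cachep, GFP_KERNEL);

                if (!o)
                        return NULL;
                o->key = key;
                /* Make all stores above visible before the refcount becomes live. */
                refcount_set_release(&o->ref, 1);
                return o;       /* safe to publish into a lookup structure now */
        }

        static void obj_put(struct obj *o)
        {
                if (refcount_dec_and_test(&o->ref))
                        kmem_cache_free(obj_cachep, o);
        }

        /*
         * Caller holds rcu_read_lock(); @candidate came from an RCU-protected
         * lookup and its memory may already have been reused for another
         * object, so take a reference first and only then re-check identity
         * (the acquire fence orders the re-check against the increment).
         */
        static struct obj *obj_tryget(unsigned long key, struct obj *candidate)
        {
                if (!refcount_inc_not_zero_acquire(&candidate->ref))
                        return NULL;            /* object is being freed */
                if (candidate->key != key) {
                        obj_put(candidate);     /* memory was reused, back off */
                        return NULL;
                }
                return candidate;
        }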
[surenb@google.com: remove atomic_set_release() usage in tools/] Link: https://lkml.kernel.org/r/20250217054351.2973666-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-18-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 +--- include/linux/mm_types.h | 13 ++++++++++--- include/linux/slab.h | 6 ------ 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e3f962de1677..14115c9949d8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -258,8 +258,6 @@ void setup_initial_init_mm(void *start_code, void *end_code, struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); -/* Use only if VMA has no other users */ -void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; @@ -890,7 +888,7 @@ static inline void vma_mark_attached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_detached(vma); - refcount_set(&vma->vm_refcnt, 1); + refcount_set_release(&vma->vm_refcnt, 1); } void vma_mark_detached(struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3aa0e20be41..6a93abb4452b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -574,6 +574,12 @@ static inline void *folio_get_private(struct folio *folio) typedef unsigned long vm_flags_t; +/* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. These are held in a global tree and are pinned by the VMAs that @@ -677,6 +683,9 @@ struct vma_numab_state { * * Only explicitly marked struct members may be accessed by RCU readers before * getting a stable reference. + * + * WARNING: when adding new members, please update vm_area_init_from() to copy + * them during vm_area_struct content duplication. */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -687,9 +696,7 @@ struct vm_area_struct { unsigned long vm_start; unsigned long vm_end; }; -#ifdef CONFIG_PER_VMA_LOCK - struct rcu_head vm_rcu; /* Used for deferred freeing. 
*/ -#endif + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ }; /* diff --git a/include/linux/slab.h b/include/linux/slab.h index ad902a2d692b..f8924fd6ea26 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -243,12 +243,6 @@ enum _slab_flag_bits { #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif -/* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * -- cgit v1.2.3 From 63a23847dc47113b879a5f53cc0ca5cedc881ffd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:06 +0000 Subject: fs: convert block_commit_write() to take a folio All callers now have a folio, so pass it in instead of converting folio->page->folio. Link: https://lkml.kernel.org/r/20250217192009.437916-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 932139c5d46f..6672e1a5031c 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -271,7 +271,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct folio **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); -void block_commit_write(struct page *page, unsigned int from, unsigned int to); +void block_commit_write(struct folio *folio, size_t from, size_t to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); -- cgit v1.2.3 From 52d671a1a36a16f3a0dd9a2beff964e75bce9787 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:07 +0000 Subject: fs: remove page_file_mapping() This wrapper has no more callers. Delete it. Link: https://lkml.kernel.org/r/20250217192009.437916-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..975c56fb4f85 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -575,11 +575,6 @@ static inline struct address_space *folio_flush_mapping(struct folio *folio) return folio_mapping(folio); } -static inline struct address_space *page_file_mapping(struct page *page) -{ - return folio_file_mapping(page_folio(page)); -} - /** * folio_inode - Get the host inode for this folio. * @folio: The folio. -- cgit v1.2.3 From 0d40cfe63a2f19b9d375382e6d90b9ebd412901e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:08 +0000 Subject: fs: remove folio_file_mapping() No callers of this function remain as filesystems no longer see swapfile pages through their normal read/write paths. 
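As a rough sketch of what callers do instead (my_folio_mapping is a made-up name, not from the patch): ordinary filesystem code asks for the mapping directly, and only flush-time callers that still need to handle swap-cache folios use folio_flush_mapping(), which falls back to swapcache_mapping().

        #include <linux/pagemap.h>

        /* Sketch: the common case simply reads the page-cache mapping. */
        static struct address_space *my_folio_mapping(struct folio *folio)
        {
                return folio_mapping(folio);
        }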
Link: https://lkml.kernel.org/r/20250217192009.437916-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 975c56fb4f85..ad7c0f615e9b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -535,26 +535,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); -/** - * folio_file_mapping - Find the mapping this folio belongs to. - * @folio: The folio. - * - * For folios which are in the page cache, return the mapping that this - * page belongs to. Folios in the swap cache return the mapping of the - * swap file or swap device where the data is stored. This is different - * from the mapping returned by folio_mapping(). The only reason to - * use it is if, like NFS, you return 0 from ->activate_swapfile. - * - * Do not call this for folios which aren't in the page cache or swap cache. - */ -static inline struct address_space *folio_file_mapping(struct folio *folio) -{ - if (unlikely(folio_test_swapcache(folio))) - return swapcache_mapping(folio); - - return folio->mapping; -} - /** * folio_flush_mapping - Find the file mapping this folio belongs to. * @folio: The folio. -- cgit v1.2.3 From 86758b504864913233f6a16076184ba784cd4466 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 18 Feb 2025 15:49:54 +0530 Subject: mm/ioremap: pass pgprot_t to ioremap_prot() instead of unsigned long ioremap_prot() currently accepts pgprot_val parameter as an unsigned long, thus implicitly assuming that pgprot_val and pgprot_t could never be bigger than unsigned long. But this assumption soon will not be true on arm64 when using D128 pgtables. In 128 bit page table configuration, unsigned long is 64 bit, but pgprot_t is 128 bit. Passing platform abstracted pgprot_t argument is better as compared to size based data types. Let's change the parameter to directly pass pgprot_t like another similar helper generic_ioremap_prot(). Without this change in place, D128 configuration does not work on arm64 as the top 64 bits gets silently stripped when passing the protection value to this function. 
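A hedged sketch of a caller after the change; the device name, base address and size are invented, and _PAGE_IOREMAP is the arch-supplied protection value used by the generic ioremap() wrapper, so a real driver may pass a different pgprot.

        #include <linux/io.h>

        /* Invented device register window, for illustration only. */
        #define MYDEV_REG_BASE  0xfed00000UL
        #define MYDEV_REG_SIZE  0x1000

        static void __iomem *mydev_map_regs(void)
        {
                /*
                 * The protection value is now a pgprot_t, so wrap the raw
                 * bits in __pgprot() instead of passing an unsigned long.
                 */
                return ioremap_prot(MYDEV_REG_BASE, MYDEV_REG_SIZE,
                                    __pgprot(_PAGE_IOREMAP));
        }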
Link: https://lkml.kernel.org/r/20250218101954.415331-1-anshuman.khandual@arm.com Signed-off-by: Ryan Roberts Co-developed-by: Anshuman Khandual Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Signed-off-by: Andrew Morton --- include/asm-generic/io.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index a5cbbf3e26ec..402020b23423 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1111,7 +1111,7 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot); void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot); + pgprot_t prot); void iounmap(volatile void __iomem *addr); void generic_iounmap(volatile void __iomem *addr); @@ -1120,7 +1120,7 @@ void generic_iounmap(volatile void __iomem *addr); static inline void __iomem *ioremap(phys_addr_t addr, size_t size) { /* _PAGE_IOREMAP needs to be supplied by the architecture */ - return ioremap_prot(addr, size, _PAGE_IOREMAP); + return ioremap_prot(addr, size, __pgprot(_PAGE_IOREMAP)); } #endif #endif /* !CONFIG_MMU || CONFIG_GENERIC_IOREMAP */ -- cgit v1.2.3 From c009da4258f9885c5a3749fc004870db9c0e7a99 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:03 +0000 Subject: mm, cma: support multiple contiguous ranges, if requested Currently, CMA manages one range of physically contiguous memory. Creation of larger CMA areas with hugetlb_cma may run in to gaps in physical memory, so that they are not able to allocate that contiguous physical range from memblock when creating the CMA area. This can happen, for example, on an AMD system with > 1TB of memory, where there will be a gap just below the 1TB (40bit DMA) line. If you have set aside most of memory for potential hugetlb CMA allocation, cma_declare_contiguous_nid will fail. hugetlb_cma doesn't need the entire area to be one physically contiguous range. It just cares about being able to get physically contiguous chunks of a certain size (e.g. 1G), and it is fine to have the CMA area backed by multiple physical ranges, as long as it gets 1G contiguous allocations. Multi-range support is implemented by introducing an array of ranges, instead of just one big one. Each range has its own bitmap. Effectively, the allocate and release operations work as before, just per-range. So, instead of going through one large bitmap, they now go through a number of smaller ones. The maximum number of supported ranges is 8, as defined in CMA_MAX_RANGES. Since some current users of CMA expect a CMA area to just use one physically contiguous range, only allow for multiple ranges if a new interface, cma_declare_contiguous_nid_multi, is used. The other interfaces will work like before, creating only CMA areas with 1 range. cma_declare_contiguous_nid_multi works as follows, mimicking the default "bottom-up, above 4G" reservation approach: 0) Try cma_declare_contiguous_nid, which will use only one region. If this succeeds, return. This makes sure that for all the cases that currently work, the behavior remains unchanged even if the caller switches from cma_declare_contiguous_nid to cma_declare_contiguous_nid_multi. 1) Select the largest free memblock ranges above 4G, with a maximum number of CMA_MAX_RANGES. 2) If we did not find at most CMA_MAX_RANGES that add up to the total size requested, return -ENOMEM. 3) Sort the selected ranges by base address. 
4) Reserve them bottom-up until we get what we wanted. Link: https://lkml.kernel.org/r/20250228182928.2645936-3-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Arnd Bergmann Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/cma.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index d15b64f51336..863427c27dc2 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -40,6 +40,9 @@ static inline int __init cma_declare_contiguous(phys_addr_t base, return cma_declare_contiguous_nid(base, size, limit, alignment, order_per_bit, fixed, name, res_cma, NUMA_NO_NODE); } +extern int __init cma_declare_contiguous_multi(phys_addr_t size, + phys_addr_t align, unsigned int order_per_bit, + const char *name, struct cma **res_cma, int nid); extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, -- cgit v1.2.3 From 624ab90b7b8758090654520b03c97cecc438a414 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:04 +0000 Subject: mm/cma: introduce cma_intersects function Now that CMA areas can have multiple physical ranges, code can't assume a CMA struct represents a base_pfn plus a size, as returned from cma_get_base. Most cases are ok though, since they all explicitly refer to CMA areas that were created using existing interfaces (cma_declare_contiguous_nid or cma_init_reserved_mem), which guarantees they have just one physical range. An exception is the s390 code, which walks all CMA ranges to see if they intersect with a range of memory that is about to be hotremoved. So, in the future, it might run in to multi-range areas. To keep this check working, define a cma_intersects function. This just checks if a physaddr range intersects any of the ranges. Use it in the s390 check. 
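For illustration, a sketch of the kind of hotremove check described here; the function and structure names are invented rather than copied from the s390 code, and start/end are treated as physical addresses as the commit message suggests.

        #include <linux/cma.h>
        #include <linux/pfn.h>

        struct range_check {
                unsigned long start;    /* physical start of the range */
                unsigned long end;      /* physical end of the range   */
        };

        /* cma_for_each_area() callback: a non-zero return stops the walk. */
        static int overlaps_cma(struct cma *cma, void *data)
        {
                struct range_check *r = data;

                return cma_intersects(cma, r->start, r->end);
        }

        static bool range_is_removable(unsigned long start_pfn, unsigned long nr_pages)
        {
                struct range_check r = {
                        .start = PFN_PHYS(start_pfn),
                        .end   = PFN_PHYS(start_pfn + nr_pages),
                };

                return !cma_for_each_area(overlaps_cma, &r);
        }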
Link: https://lkml.kernel.org/r/20250228182928.2645936-4-fvdl@google.com Signed-off-by: Frank van der Linden Acked-by: Alexander Gordeev Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/cma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index 863427c27dc2..03d85c100dcc 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -53,6 +53,7 @@ extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); +extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end); extern void cma_reserve_pages_on_error(struct cma *cma); -- cgit v1.2.3 From 5b47c02967ab770aa7661c8863a21b2fd59e35ff Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:08 +0000 Subject: mm/hugetlb: convert cmdline parameters from setup to early Convert the cmdline parameters (hugepagesz, hugepages, default_hugepagesz and hugetlb_free_vmemmap) to early parameters. Since parse_early_param might run before MMU setups on some platforms (powerpc), validation of huge page sizes as specified in command line parameters would fail. So instead, for the hstate-related values, just record the them and parse them on demand, from hugetlb_bootmem_alloc. The allocation of hugetlb bootmem pages is now done in hugetlb_bootmem_alloc, which is called explicitly at the start of mm_core_init(). core_initcall would be too late, as that happens with memblock already torn down. This change will allow earlier allocation and initialization of bootmem hugetlb pages later on. No functional change intended. 
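A simplified sketch of the record-now, parse-later pattern the conversion relies on; record_hugepages and hugepages_requested are stand-in names, and the real handlers keep considerably more per-hstate state before hugetlb_bootmem_alloc() acts on it from mm_core_init().

        #include <linux/init.h>
        #include <linux/kernel.h>

        static unsigned long hugepages_requested __initdata;

        static int __init record_hugepages(char *s)
        {
                /*
                 * parse_early_param() may run before the MMU is fully set up
                 * on some platforms, so only record the request here; it is
                 * validated on demand later, from hugetlb_bootmem_alloc().
                 */
                return kstrtoul(s, 10, &hugepages_requested);
        }
        early_param("hugepages", record_hugepages);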
Link: https://lkml.kernel.org/r/20250228182928.2645936-8-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 76a75ec03dd6..a596aaa178d1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -174,6 +174,8 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages[MAX_NUMNODES]; +void hugetlb_bootmem_alloc(void); + /* arch callbacks */ #ifndef CONFIG_HIGHPTE @@ -1257,6 +1259,10 @@ static inline bool hugetlbfs_pagecache_present( { return false; } + +static inline void hugetlb_bootmem_alloc(void) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, -- cgit v1.2.3 From 243a75e236802614075511266c8d1834d4bcffe9 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:10 +0000 Subject: mm/bootmem_info: export register_page_bootmem_memmap If other mm code wants to use this function for early memmap inialization (on the platforms that have it), it should be made available properly, not just unconditionally in mm.h Make this function available for such cases. Link: https://lkml.kernel.org/r/20250228182928.2645936-10-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/bootmem_info.h | 7 +++++++ include/linux/mm.h | 3 --- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index d8a8d245824a..4c506e76a808 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -18,6 +18,8 @@ enum bootmem_type { #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE void __init register_page_bootmem_info_node(struct pglist_data *pgdat); +void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, + unsigned long nr_pages); void get_page_bootmem(unsigned long info, struct page *page, enum bootmem_type type); @@ -58,6 +60,11 @@ static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } +static inline void register_page_bootmem_memmap(unsigned long section_nr, + struct page *map, unsigned long nr_pages) +{ +} + static inline void put_page_bootmem(struct page *page) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index 14115c9949d8..e90ab0bdcc8c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4018,9 +4018,6 @@ static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, } #endif -void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, - unsigned long nr_pages); - enum mf_flags { MF_COUNT_INCREASED = 1 << 0, MF_ACTION_REQUIRED = 
1 << 1, -- cgit v1.2.3 From d65917c42373f70159a3fc453f8f028fd665e04f Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:11 +0000 Subject: mm/sparse: allow for alternate vmemmap section init at boot Add functions that are called just before the per-section memmap is initialized and just before the memmap page structures are initialized. They are called sparse_vmemmap_init_nid_early and sparse_vmemmap_init_nid_late, respectively. This allows for mm subsystems to add calls to initialize memmap and page structures in a specific way, if using SPARSEMEM_VMEMMAP. Specifically, hugetlb can pre-HVO bootmem allocated pages that way, so that no time and resources are wasted on allocating vmemmap pages, only to free them later (and possibly unnecessarily running the system out of memory in the process). Refactor some code and export a few convenience functions for external use. In sparse_init_nid, skip any sections that are already initialized, e.g. they have been initialized by sparse_vmemmap_init_nid_early already. The hugetlb code to use these functions will be added in a later commit. Export section_map_size, as any alternate memmap init code will want to use it. The internal config option to enable this is SPARSEMEM_VMEMMAP_PREINIT, which is selected if an architecture-specific option, ARCH_WANT_HUGETLB_VMEMMAP_PREINIT, is set. In the future, if other subsystems want to do preinit too, they can do it in a similar fashion. The internal config option is there because a section flag is used, and the number of flags available is architecture-dependent (see mmzone.h). Architecures can decide if there is room for the flag when enabling options that select SPARSEMEM_VMEMMAP_PREINIT. Fortunately, as of right now, all sparse vmemmap using architectures do have room. 
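A rough sketch of the skip logic this enables; the helper below is hypothetical, only preinited_vmemmap_section() and the two sparse_vmemmap_init_nid_{early,late}() hooks come from the patch.

        #include <linux/mmzone.h>

        /*
         * A section whose memmap was already populated by
         * sparse_vmemmap_init_nid_early() must not be populated again by the
         * regular sparse_init_nid() path.
         */
        static bool my_section_needs_memmap(unsigned long pnum)
        {
                return !preinited_vmemmap_section(__nr_to_section(pnum));
        }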
Link: https://lkml.kernel.org/r/20250228182928.2645936-11-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Johannes Weiner Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + include/linux/mmzone.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e90ab0bdcc8c..03e4807bd911 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3928,6 +3928,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) #endif void *sparse_buffer_alloc(unsigned long size); +unsigned long section_map_size(void); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9540b41894da..44ecb2f90db4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1933,6 +1933,9 @@ enum { SECTION_IS_EARLY_BIT, #ifdef CONFIG_ZONE_DEVICE SECTION_TAINT_ZONE_DEVICE_BIT, +#endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT + SECTION_IS_VMEMMAP_PREINIT_BIT, #endif SECTION_MAP_LAST_BIT, }; @@ -1944,6 +1947,9 @@ enum { #ifdef CONFIG_ZONE_DEVICE #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT +#define SECTION_IS_VMEMMAP_PREINIT BIT(SECTION_IS_VMEMMAP_PREINIT_BIT) +#endif #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT @@ -1998,6 +2004,30 @@ static inline int online_device_section(struct mem_section *section) } #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT +static inline int preinited_vmemmap_section(struct mem_section *section) +{ + return (section && + (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); +} + +void sparse_vmemmap_init_nid_early(int nid); +void sparse_vmemmap_init_nid_late(int nid); + +#else +static inline int preinited_vmemmap_section(struct mem_section *section) +{ + return 0; +} +static inline void sparse_vmemmap_init_nid_early(int nid) +{ +} + +static inline void sparse_vmemmap_init_nid_late(int nid) +{ +} +#endif + static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); @@ -2035,6 +2065,9 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) } #endif +void sparse_init_early_section(int nid, struct page *map, unsigned long pnum, + unsigned long flags); + #ifndef CONFIG_HAVE_ARCH_PFN_VALID /** * pfn_valid - check if there is a valid memory map entry for a PFN @@ -2116,6 +2149,8 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) +#define sparse_vmemmap_init_nid_early(_nid, _use) do {} while (0) +#define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -- cgit v1.2.3 From 9eb6207b7812cee13431831c9661d21b53f3e93f Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:15 +0000 Subject: mm/sparse: add vmemmap_*_hvo functions Add a few functions to enable early 
HVO: vmemmap_populate_hvo vmemmap_undo_hvo vmemmap_wrprotect_hvo The populate and undo functions are expected to be used in early init, from the sparse_init_nid_early() function. The wrprotect function is to be used, potentially, later. To implement these functions, mostly re-use the existing compound pages vmemmap logic used by DAX. vmemmap_populate_address has its argument changed a bit in this commit: the page structure passed in to be reused in the mapping is replaced by a PFN and a flag. The flag indicates whether an extra ref should be taken on the vmemmap page containing the head page structure. Taking the ref is appropriate to for DAX / ZONE_DEVICE, but not for HugeTLB HVO. The HugeTLB vmemmap optimization maps tail page structure pages read-only. The vmemmap_wrprotect_hvo function that does this is implemented separately, because it cannot be guaranteed that reserved page structures will not be write accessed during memory initialization. Even with CONFIG_DEFERRED_STRUCT_PAGE_INIT, they might still be written to (if they are at the bottom of a zone). So, vmemmap_populate_hvo leaves the tail page structure pages RW initially, and then later during initialization, after memmap init is fully done, vmemmap_wrprotect_hvo must be called to finish the job. Subsequent commits will use these functions for early HugeTLB HVO. Link: https://lkml.kernel.org/r/20250228182928.2645936-15-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 03e4807bd911..9a74a3ee68bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3937,7 +3937,8 @@ p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, - struct vmem_altmap *altmap, struct page *reuse); + struct vmem_altmap *altmap, unsigned long ptpfn, + unsigned long flags); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, @@ -3953,6 +3954,12 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); +int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); +void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, -- cgit v1.2.3 From d58b2498200724e4f8c12d71a5953da03c8c8bdf Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:16 +0000 Subject: mm/hugetlb: deal with multiple calls to hugetlb_bootmem_alloc 
Architectures that want pre-HVO of hugetlb vmemmap pages will need to call hugetlb_bootmem_alloc from an earlier spot in boot (before sparse_init). To facilitate some architectures doing this, protect hugetlb_bootmem_alloc against multiple calls. Also provide a helper function to check if it's been called, so that the early HVO code, to be added later, can see if there is anything to do. Link: https://lkml.kernel.org/r/20250228182928.2645936-16-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a596aaa178d1..f0ab4ca4ecf2 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -175,6 +175,7 @@ extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); +bool hugetlb_bootmem_allocated(void); /* arch callbacks */ @@ -1263,6 +1264,11 @@ static inline bool hugetlbfs_pagecache_present( static inline void hugetlb_bootmem_alloc(void) { } + +static inline bool hugetlb_bootmem_allocated(void) +{ + return false; +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, -- cgit v1.2.3 From 752fe17af693323dba5622b19c858a46fed219a1 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:18 +0000 Subject: mm/hugetlb: add pre-HVO framework Define flags for pre-HVOed bootmem hugetlb pages, and act on them. The most important flag is the HVO flag, signalling that a bootmem allocated gigantic page has already been HVO-ed. If this flag is seen by the hugetlb bootmem gather code, the page is marked as HVO optimized. The HVO code will then not try to optimize it again. Instead, it will just map the tail page mirror pages read-only, completing the HVO steps. No functional change, as nothing sets the flags yet. 
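Illustrative only; a sketch of the gather-time decision the new flag enables, with a made-up function name around the real HUGE_BOOTMEM_HVO bit. A page carrying the flag already had its tail vmemmap trimmed, so only the read-only remapping of the remaining mirror pages is left to do.

        #include <linux/hugetlb.h>

        static bool my_bootmem_page_needs_full_hvo(struct huge_bootmem_page *m)
        {
                return !(m->flags & HUGE_BOOTMEM_HVO);
        }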
Link: https://lkml.kernel.org/r/20250228182928.2645936-18-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f0ab4ca4ecf2..bbccc3e6b9dd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -681,8 +681,12 @@ struct hstate { struct huge_bootmem_page { struct list_head list; struct hstate *hstate; + unsigned long flags; }; +#define HUGE_BOOTMEM_HVO 0x0001 +#define HUGE_BOOTMEM_ZONES_VALID 0x0002 + int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); -- cgit v1.2.3 From b1222550fbf73840e31a103305d03ad53b8f5a59 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:20 +0000 Subject: mm/hugetlb: do pre-HVO for bootmem allocated pages For large systems, the overhead of vmemmap pages for hugetlb is substantial. It's about 1.5% of memory, which is about 45G for a 3T system. If you want to configure most of that system for hugetlb (e.g. to use as backing memory for VMs), there is a chance of running out of memory on boot, even though you know that the 45G will become available later. To avoid this scenario, and since it's a waste to first allocate and then free that 45G during boot, do pre-HVO for hugetlb bootmem allocated pages ('gigantic' pages). pre-HVO is done by adding functions that are called from sparse_init_nid_early and sparse_init_nid_late. The first is called before memmap allocation, so it takes care of allocating memmap HVO-style. The second verifies that all bootmem pages look good, specifically it checks that they do not intersect with multiple zones. This can only be done from sparse_init_nid_late path, when zones have been initialized. The hugetlb page size must be aligned to the section size, and aligned to the size of memory described by the number of page structures contained in one PMD (since pre-HVO is not prepared to split PMDs). This should be true for most 'gigantic' pages, it is for 1G pages on x86, where both of these alignment requirements are 128M. This will only have an effect if hugetlb_bootmem_alloc was called early in boot. If not, it won't do anything, and HVO for bootmem hugetlb pages works as before. 
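To illustrate how the HUGE_BOOTMEM_HVO flag defined in the pre-HVO framework diff above is meant to be consumed, here is a sketch of the check the bootmem gather code performs (the function name is invented; this is not the actual mm/hugetlb.c code):

static void __init gather_one_bootmem_page(struct huge_bootmem_page *m,
					   struct folio *folio)
{
	if (m->flags & HUGE_BOOTMEM_HVO) {
		/*
		 * The vmemmap was already optimized when the page was
		 * allocated; record that so the HVO code does not try to
		 * optimize it again and only write-protects the tail
		 * mirror pages.
		 */
		folio_set_hugetlb_vmemmap_optimized(folio);
	}

	/* ... add the folio to the hstate's boot-time pool ... */
}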
Link: https://lkml.kernel.org/r/20250228182928.2645936-20-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index bbccc3e6b9dd..f6b82b0524ed 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -687,6 +687,8 @@ struct huge_bootmem_page { #define HUGE_BOOTMEM_HVO 0x0001 #define HUGE_BOOTMEM_ZONES_VALID 0x0002 +bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); + int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); -- cgit v1.2.3 From 9320fa2717810a7d3451dd1a18c31f986a1d4068 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:24 +0000 Subject: mm/cma: introduce a cma validate function Define a function to check if a CMA area is valid, which means: do its ranges not cross any zone boundaries. Store the result in the newly created flags for each CMA area, so that multiple calls are dealt with. This allows for checking the validity of a CMA area early, which is needed later in order to be able to allocate hugetlb bootmem pages from it with pre-HVO. Link: https://lkml.kernel.org/r/20250228182928.2645936-24-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/cma.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index 03d85c100dcc..62d9c1cf6326 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -60,6 +60,7 @@ extern void cma_reserve_pages_on_error(struct cma *cma); #ifdef CONFIG_CMA struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp); bool cma_free_folio(struct cma *cma, const struct folio *folio); +bool cma_validate_zones(struct cma *cma); #else static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) { @@ -70,6 +71,10 @@ static inline bool cma_free_folio(struct cma *cma, const struct folio *folio) { return false; } +static inline bool cma_validate_zones(struct cma *cma) +{ + return false; +} #endif #endif -- cgit v1.2.3 From d2d78671408061b5332d9ad357686812bbec5070 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:27 +0000 Subject: mm/hugetlb: enable bootmem allocation from CMA areas If hugetlb_cma_only is enabled, we know that hugetlb pages can only be allocated from CMA. Now that there is an interface to do early reservations from a CMA area (returning memblock memory), it can be used to allocate hugetlb pages from CMA. This also allows for doing pre-HVO on these pages (if enabled). 
Make sure to initialize the page structures and associated data correctly. Create a flag to signal that a hugetlb page has been allocated from CMA to make things a little easier. Some configurations of powerpc have a special hugetlb bootmem allocator, so introduce a boolean arch_specific_huge_bootmem_alloc that returns true if such an allocator is present. In that case, CMA bootmem allocations can't be used, so check that function before trying. Link: https://lkml.kernel.org/r/20250228182928.2645936-27-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f6b82b0524ed..8f3ac832ee7f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -591,6 +591,7 @@ enum hugetlb_page_flags { HPG_freed, HPG_vmemmap_optimized, HPG_raw_hwp_unreliable, + HPG_cma, __NR_HPAGEFLAGS, }; @@ -650,6 +651,7 @@ HPAGEFLAG(Temporary, temporary) HPAGEFLAG(Freed, freed) HPAGEFLAG(VmemmapOptimized, vmemmap_optimized) HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) +HPAGEFLAG(Cma, cma) #ifdef CONFIG_HUGETLB_PAGE @@ -678,14 +680,18 @@ struct hstate { char name[HSTATE_NAME_LEN]; }; +struct cma; + struct huge_bootmem_page { struct list_head list; struct hstate *hstate; unsigned long flags; + struct cma *cma; }; #define HUGE_BOOTMEM_HVO 0x0001 #define HUGE_BOOTMEM_ZONES_VALID 0x0002 +#define HUGE_BOOTMEM_CMA 0x0004 bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); @@ -824,6 +830,17 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, } #endif +#ifndef arch_has_huge_bootmem_alloc +/* + * Some architectures do their own bootmem allocation, so they can't use + * early CMA allocation. + */ +static inline bool arch_has_huge_bootmem_alloc(void) +{ + return false; +} +#endif + static inline struct hstate *folio_hstate(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); -- cgit v1.2.3 From f809b9f3046f702bc1ae40695acb27c1b0abc346 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 19 Feb 2025 14:01:45 -0800 Subject: mm/damon: implement a new DAMOS filter type for unmapped pages Patch series "mm/damon: introduce DAMOS filter type for unmapped pages". User decides whether their memory will be mapped or unmapped. It implies that the two types of memory can have different characteristics and management requirements. Provide the DAMON-observaibility DAMOS-operation capability for the different types by introducing a new DAMOS filter type for unmapped pages. This patch (of 2): Implement yet another DAMOS filter type for unmapped pages on DAMON kernel API, and add support of it from the physical address space DAMON operations set (paddr). Since it is for only unmapped pages, support from the virtual address spaces DAMON operations set (vaddr) is not required. 
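Stepping back to the hugetlb/CMA patch above: a sketch of how an architecture with its own bootmem hugetlb allocator might override the arch_has_huge_bootmem_alloc() hook (purely illustrative; any real override would live in that architecture's asm/hugetlb.h):

/* In <asm/hugetlb.h> of the architecture: */
#define arch_has_huge_bootmem_alloc arch_has_huge_bootmem_alloc

static inline bool arch_has_huge_bootmem_alloc(void)
{
	/*
	 * Returning true tells the generic code not to attempt early CMA
	 * allocation of bootmem hugetlb pages on this platform.  The
	 * condition is a placeholder; real code would check the platform.
	 */
	return true;
}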
Link: https://lkml.kernel.org/r/20250219220146.133650-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250219220146.133650-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5e7ae7bca5dc..242910b190c9 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -337,6 +337,7 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_YOUNG: Recently accessed pages. * @DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: Page is part of a hugepage. + * @DAMOS_FILTER_TYPE_UNMAPPED: Unmapped pages. * @DAMOS_FILTER_TYPE_ADDR: Address range. * @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target. * @NR_DAMOS_FILTER_TYPES: Number of filter types. @@ -357,6 +358,7 @@ enum damos_filter_type { DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_YOUNG, DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, + DAMOS_FILTER_TYPE_UNMAPPED, DAMOS_FILTER_TYPE_ADDR, DAMOS_FILTER_TYPE_TARGET, NR_DAMOS_FILTER_TYPES, -- cgit v1.2.3 From 58abac769b05f6e972d34a02a46a04589d6e9853 Mon Sep 17 00:00:00 2001 From: Liu Ye Date: Wed, 12 Feb 2025 10:58:42 +0800 Subject: mm/folio_queue: delete __folio_order and use folio_order directly __folio_order is the same as folio_order, remove __folio_order and then just include mm.h and use folio_order directly. Link: https://lkml.kernel.org/r/20250212025843.80283-2-liuye@kylinos.cn Signed-off-by: Liu Ye Reviewed-by: Shivank Garg Reviewed-by: Dev Jain Acked-by: David Howells Cc: Christian Brauner Signed-off-by: Andrew Morton --- include/linux/folio_queue.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 4d3f8074c137..45ad2408a80c 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -15,6 +15,7 @@ #define _LINUX_FOLIO_QUEUE_H #include +#include /* * Segment in a queue of running buffers. Each segment can hold a number of @@ -216,13 +217,6 @@ static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot) clear_bit(slot, &folioq->marks3); } -static inline unsigned int __folio_order(struct folio *folio) -{ - if (!folio_test_large(folio)) - return 0; - return folio->_flags_1 & 0xff; -} - /** * folioq_append: Add a folio to a folio queue segment * @folioq: The segment to add to @@ -241,7 +235,7 @@ static inline unsigned int folioq_append(struct folio_queue *folioq, struct foli unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; - folioq->orders[slot] = __folio_order(folio); + folioq->orders[slot] = folio_order(folio); return slot; } @@ -263,7 +257,7 @@ static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; - folioq->orders[slot] = __folio_order(folio); + folioq->orders[slot] = folio_order(folio); folioq_mark(folioq, slot); return slot; } -- cgit v1.2.3 From 44f76413496ec343da0d8292ceecdcabe3e6ec16 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 3 Mar 2025 11:03:23 +0900 Subject: zsmalloc: introduce new object mapping API Current object mapping API is a little cumbersome. First, it's inconsistent, sometimes it returns with page-faults disabled and sometimes with page-faults enabled. Second, and most importantly, it enforces atomicity restrictions on its users. 
zs_map_object() has to return a liner object address which is not always possible because some objects span multiple physical (non-contiguous) pages. For such objects zsmalloc uses a per-CPU buffer to which object's data is copied before a pointer to that per-CPU buffer is returned back to the caller. This leads to another, final, issue - extra memcpy(). Since the caller gets a pointer to per-CPU buffer it can memcpy() data only to that buffer, and during zs_unmap_object() zsmalloc will memcpy() from that per-CPU buffer to physical pages that object in question spans across. New API splits functions by access mode: - zs_obj_read_begin(handle, local_copy) Returns a pointer to handle memory. For objects that span two physical pages a local_copy buffer is used to store object's data before the address is returned to the caller. Otherwise the object's page is kmap_local mapped directly. - zs_obj_read_end(handle, buf) Unmaps the page if it was kmap_local mapped by zs_obj_read_begin(). - zs_obj_write(handle, buf, len) Copies len-bytes from compression buffer to handle memory (takes care of objects that span two pages). This does not need any additional (e.g. per-CPU) buffers and writes the data directly to zsmalloc pool pages. In terms of performance, on a synthetic and completely reproducible test that allocates fixed number of objects of fixed sizes and iterates over those objects, first mapping in RO then in RW mode: OLD API ======= 3 first results out of 10 369,205,778 instructions # 0.80 insn per cycle 40,467,926 branches # 113.732 M/sec 369,002,122 instructions # 0.62 insn per cycle 40,426,145 branches # 189.361 M/sec 369,036,706 instructions # 0.63 insn per cycle 40,430,860 branches # 204.105 M/sec [..] NEW API ======= 3 first results out of 10 265,799,293 instructions # 0.51 insn per cycle 29,834,567 branches # 170.281 M/sec 265,765,970 instructions # 0.55 insn per cycle 29,829,019 branches # 161.602 M/sec 265,764,702 instructions # 0.51 insn per cycle 29,828,015 branches # 189.677 M/sec [..] T-test on all 10 runs ===================== Difference at 95.0% confidence -1.03219e+08 +/- 55308.7 -27.9705% +/- 0.0149878% (Student's t, pooled s = 58864.4) The old API will stay around until the remaining users switch to the new one. After that we'll also remove zsmalloc per-CPU buffer and CPU hotplug handling. The split of map(RO) and map(WO) into read_{begin/end}/write is suggested by Yosry Ahmed. 
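A usage sketch of the new read and write entry points described above (the prototypes follow in the diff below). This is illustrative code, not the actual zram conversion; note that local_copy must be large enough to hold the largest possible object, since it is only used when an object spans two physical pages.

static void read_compressed_obj(struct zs_pool *pool, unsigned long handle,
				void *dst, size_t len, void *local_copy)
{
	void *src;

	src = zs_obj_read_begin(pool, handle, local_copy);
	memcpy(dst, src, len);
	zs_obj_read_end(pool, handle, src);
}

static void write_compressed_obj(struct zs_pool *pool, unsigned long handle,
				 void *src, size_t len)
{
	/* Writes go straight to the pool pages, no mapping step needed. */
	zs_obj_write(pool, handle, src, len);
}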
Link: https://lkml.kernel.org/r/20250303022425.285971-15-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Suggested-by: Yosry Ahmed Reviewed-by: Yosry Ahmed Cc: Hillf Danton Cc: Kairui Song Cc: Minchan Kim Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/zsmalloc.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index a48cd0ffe57d..7d70983cf398 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -58,4 +58,12 @@ unsigned long zs_compact(struct zs_pool *pool); unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); + +void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, + void *local_copy); +void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, + void *handle_mem); +void zs_obj_write(struct zs_pool *pool, unsigned long handle, + void *handle_mem, size_t mem_len); + #endif -- cgit v1.2.3 From 1b7e90020eb770aa52991a34f21552b4c38ea690 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 14 Mar 2025 00:59:33 +0800 Subject: mm, swap: use percpu cluster as allocation fast path Current allocation workflow first traverses the plist with a global lock held, after choosing a device, it uses the percpu cluster on that swap device. This commit moves the percpu cluster variable out of being tied to individual swap devices, making it a global percpu variable, and will be used directly for allocation as a fast path. The global percpu cluster variable will never point to a HDD device, and allocations on a HDD device are still globally serialized. This improves the allocator performance and prepares for removal of the slot cache in later commits. There shouldn't be much observable behavior change, except one thing: this changes how swap device allocation rotation works. Currently, each allocation will rotate the plist, and because of the existence of slot cache (one order 0 allocation usually returns 64 entries), swap devices of the same priority are rotated for every 64 order 0 entries consumed. High order allocations are different, they will bypass the slot cache, and so swap device is rotated for every 16K, 32K, or up to 2M allocation. The rotation rule was never clearly defined or documented, it was changed several times without mentioning. After this commit, and once slot cache is gone in later commits, swap device rotation will happen for every consumed cluster. Ideally non-HDD devices will be rotated if 2M space has been consumed for each order. Fragmented clusters will rotate the device faster, which seems OK. HDD devices is rotated for every allocation regardless of the allocation order, which should be OK too and trivial. This commit also slightly changes allocation behaviour for slot cache. The new added cluster allocation fast path may allocate entries from different device to the slot cache, this is not observable from user space, only impact performance very slightly, and slot cache will be just gone in next commit, so this can be ignored. 
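A conceptual sketch of the fast path described above. All names here (struct percpu_swap_cluster, alloc_from_percpu_cluster(), slow_path_alloc()) are invented for illustration; the real logic lives in mm/swapfile.c.

struct percpu_swap_cluster {
	local_lock_t lock;
	/* per-order next offsets, preferred device, ... */
};

static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static swp_entry_t swap_alloc(int order)
{
	swp_entry_t entry = (swp_entry_t){ 0 };

	/* Try the global per-CPU cluster first, without the plist lock. */
	local_lock(&percpu_swap_cluster.lock);
	entry = alloc_from_percpu_cluster(order);
	local_unlock(&percpu_swap_cluster.lock);

	/* Fall back to the locked plist scan, which rotates devices. */
	if (!entry.val)
		entry = slow_path_alloc(order);

	return entry;
}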
Link: https://lkml.kernel.org/r/20250313165935.63303-6-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kalesh Singh Cc: Matthew Wilcow (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe91c293636..374bffc87427 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -284,12 +284,10 @@ enum swap_cluster_flags { #endif /* - * We assign a cluster to each CPU, so each CPU can allocate swap entry from - * its own cluster and swapout sequentially. The purpose is to optimize swapout - * throughput. + * We keep using same cluster for rotational device so IO will be sequential. + * The purpose is to optimize SWAP throughput on these device. */ -struct percpu_cluster { - local_lock_t lock; /* Protect the percpu_cluster above */ +struct swap_sequential_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; @@ -315,8 +313,7 @@ struct swap_info_struct { atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ - struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ - struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ + struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ -- cgit v1.2.3 From 0ff67f990bd45726e0d9e91111d998e7a3595b32 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 14 Mar 2025 00:59:34 +0800 Subject: mm, swap: remove swap slot cache Slot cache is no longer needed now, removing it and all related code. - vm-scalability with: `usemem --init-time -O -y -x -R -31 1G`, 12G memory cgroup using simulated pmem as SWAP (32G pmem, 32 CPUs), 16 test runs for each case, measuring the total throughput: Before (KB/s) (stdev) After (KB/s) (stdev) Random (4K): 424907.60 (24410.78) 414745.92 (34554.78) Random (64K): 163308.82 (11635.72) 167314.50 (18434.99) Sequential (4K, !-R): 6150056.79 (103205.90) 6321469.06 (115878.16) The performance changes are below noise level. - Build linux kernel with make -j96, using 4K folio with 1.5G memory cgroup limit and 64K folio with 2G memory cgroup limit, on top of tmpfs, 12 test runs, measuring the system time: Before (s) (stdev) After (s) (stdev) make -j96 (4K): 6445.69 (61.95) 6408.80 (69.46) make -j96 (64K): 6841.71 (409.04) 6437.99 (435.55) Similar to above, 64k mTHP case showed a slight improvement. 
Link: https://lkml.kernel.org/r/20250313165935.63303-7-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baoquan He Cc: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kalesh Singh Cc: Matthew Wilcow (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 --- include/linux/swap_slots.h | 28 ---------------------------- 2 files changed, 31 deletions(-) delete mode 100644 include/linux/swap_slots.h (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 374bffc87427..c5856dcc263a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -465,7 +465,6 @@ void free_pages_and_swap_cache(struct encoded_page **, int); extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; -extern bool has_usable_swap(void); /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) @@ -483,13 +482,11 @@ swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); -extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h deleted file mode 100644 index 840aec3523b2..000000000000 --- a/include/linux/swap_slots.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SWAP_SLOTS_H -#define _LINUX_SWAP_SLOTS_H - -#include -#include -#include - -#define SWAP_SLOTS_CACHE_SIZE SWAP_BATCH -#define THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE (5*SWAP_SLOTS_CACHE_SIZE) -#define THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE (2*SWAP_SLOTS_CACHE_SIZE) - -struct swap_slots_cache { - bool lock_initialized; - struct mutex alloc_lock; /* protects slots, nr, cur */ - swp_entry_t *slots; - int nr; - int cur; - int n_ret; -}; - -void disable_swap_slots_cache_lock(void); -void reenable_swap_slots_cache_unlock(void); -void enable_swap_slots_cache(void); - -extern bool swap_slot_cache_enabled; - -#endif /* _LINUX_SWAP_SLOTS_H */ -- cgit v1.2.3 From b487a2da3575b6cdfb6d6559311830c8fea70bb9 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 14 Mar 2025 00:59:35 +0800 Subject: mm, swap: simplify folio swap allocation With slot cache gone, clean up the allocation helpers even more. folio_alloc_swap will be the only entry for allocation and adding the folio to swap cache (except suspend), making it opposite of folio_free_swap. 
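To show the new calling convention of folio_alloc_swap() described above, a small sketch (not the actual mm/vmscan.c or mm/shmem.c code; the gfp flags are illustrative):

static bool try_add_to_swap(struct folio *folio)
{
	/*
	 * New convention: 0 on success, in which case the folio has a swap
	 * entry allocated and has been added to the swap cache; a negative
	 * errno on failure.
	 */
	if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
		return false;

	return true;
}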
Link: https://lkml.kernel.org/r/20250313165935.63303-8-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kalesh Singh Cc: Matthew Wilcow (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index c5856dcc263a..9c99eee160f9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -478,7 +478,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -swp_entry_t folio_alloc_swap(struct folio *folio); +int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); @@ -586,11 +586,9 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline swp_entry_t folio_alloc_swap(struct folio *folio) +static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) { - swp_entry_t entry; - entry.val = 0; - return entry; + return -EINVAL; } static inline bool folio_free_swap(struct folio *folio) -- cgit v1.2.3 From 7b54a96f30dec4b812841d179a26e88a7887dc06 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Fri, 31 Jan 2025 17:08:25 +0530 Subject: crash: remove an unused argument from reserve_crashkernel_generic() cmdline argument is not used in reserve_crashkernel_generic() so remove it. Correspondingly, all the callers have been updated as well. No functional change intended. Link: https://lkml.kernel.org/r/20250131113830.925179-3-sourabhjain@linux.ibm.com Signed-off-by: Sourabh Jain Acked-by: Hari Bathini Acked-by: Baoquan He Cc: Madhavan Srinivasan Cc: Mahesh Salgaonkar Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/crash_reserve.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h index 5a9df944fb80..1fe7e7d1b214 100644 --- a/include/linux/crash_reserve.h +++ b/include/linux/crash_reserve.h @@ -32,13 +32,12 @@ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, #define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() #endif -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high); +void __init reserve_crashkernel_generic(unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high); #else -static inline void __init reserve_crashkernel_generic(char *cmdline, +static inline void __init reserve_crashkernel_generic( unsigned long long crash_size, unsigned long long crash_base, unsigned long long crash_low_size, -- cgit v1.2.3 From 9f0552c978f8950b4661f1222290fb57af811a8b Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Fri, 31 Jan 2025 17:08:26 +0530 Subject: crash: let arch decide usable memory range in reserved area Although the crashkernel area is reserved, on architectures like PowerPC, it is possible for the crashkernel reserved area to contain components like RTAS, TCE, OPAL, etc. To avoid placing kexec segments over these components, PowerPC has its own set of APIs to locate holes in the crashkernel reserved area. 
Add an arch hook in the generic locate mem hole APIs so that architectures can handle such special regions in the crashkernel area while locating memory holes for kexec segments using generic APIs. With this, a lot of redundant arch-specific code can be removed, as it performs the exact same job as the generic APIs. To keep the generic and arch-specific changes separate, the changes related to moving PowerPC to use the generic APIs and the removal of PowerPC-specific APIs for memory hole allocation are done in a subsequent patch titled "powerpc/crash: Use generic APIs to locate memory hole for kdump. Link: https://lkml.kernel.org/r/20250131113830.925179-4-sourabhjain@linux.ibm.com Signed-off-by: Sourabh Jain Acked-by: Baoquan He Cc: Hari Bathini Cc: Madhavan Srinivasan Cc: Mahesh Salgaonkar Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/kexec.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f0e9f8eda7a3..407f8b0346aa 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -205,6 +205,15 @@ static inline int arch_kimage_file_post_load_cleanup(struct kimage *image) } #endif +#ifndef arch_check_excluded_range +static inline int arch_check_excluded_range(struct kimage *image, + unsigned long start, + unsigned long end) +{ + return 0; +} +#endif + #ifdef CONFIG_KEXEC_SIG #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len); -- cgit v1.2.3 From 8c6bbda879b62f16bb03321a84554b4f63415c55 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 3 Feb 2025 16:05:22 +0100 Subject: rcu: provide a static initializer for hlist_nulls_head Patch series "ucount: Simplify refcounting with rcuref_t". I noticed that the atomic_dec_and_lock_irqsave() in put_ucounts() loops sometimes even during boot. Something like 2-3 iterations but still. This series replaces the refcounting with rcuref_t and adds a RCU lookup. This allows a lockless lookup in alloc_ucounts() if the entry is available and a cmpxchg()less put of the item. This patch (of 4): Provide a static initializer for hlist_nulls_head so that it can be used in statically defined data structures. Link: https://lkml.kernel.org/r/20250203150525.456525-1-bigeasy@linutronix.de Link: https://lkml.kernel.org/r/20250203150525.456525-2-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Steven Rostedt Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: "Paul E . 
McKenney" Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/list_nulls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index fa6e8471bd22..248db9b77ee2 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -28,6 +28,7 @@ struct hlist_nulls_node { #define NULLS_MARKER(value) (1UL | (((long)value) << 1)) #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) +#define HLIST_NULLS_HEAD_INIT(nulls) {.first = (struct hlist_nulls_node *)NULLS_MARKER(nulls)} #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) -- cgit v1.2.3 From 5f01a22c5b231dd590f61a2591b3090665733bcb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 3 Feb 2025 16:05:24 +0100 Subject: ucount: use RCU for ucounts lookups The ucounts element is looked up under ucounts_lock. This can be optimized by using RCU for a lockless lookup and return and element if the reference can be obtained. Replace hlist_head with hlist_nulls_head which is RCU compatible. Let find_ucounts() search for the required item within a RCU section and return the item if a reference could be obtained. This means alloc_ucounts() will always return an element (unless the memory allocation failed). Let put_ucounts() RCU free the element if the reference counter dropped to zero. Link: https://lkml.kernel.org/r/20250203150525.456525-4-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: Steven Rostedt Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/user_namespace.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 7183e5aca282..ad4dbef92597 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -115,9 +116,10 @@ struct user_namespace { } __randomize_layout; struct ucounts { - struct hlist_node node; + struct hlist_nulls_node node; struct user_namespace *ns; kuid_t uid; + struct rcu_head rcu; atomic_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; -- cgit v1.2.3 From b4dc0bee2a749083028afba346910e198653f42a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 3 Feb 2025 16:05:25 +0100 Subject: ucount: use rcuref_t for reference counting Use rcuref_t for reference counting. This eliminates the cmpxchg loop in the get and put path. This also eliminates the need to acquire the lock in the put path because once the final user returns the reference, it can no longer be obtained anymore. Use rcuref_t for reference counting. Link: https://lkml.kernel.org/r/20250203150525.456525-5-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. 
McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: Steven Rostedt Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/user_namespace.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index ad4dbef92597..a0bb6d012137 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -120,7 +121,7 @@ struct ucounts { struct user_namespace *ns; kuid_t uid; struct rcu_head rcu; - atomic_t count; + rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; @@ -133,9 +134,15 @@ void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); -struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); void put_ucounts(struct ucounts *ucounts); +static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) +{ + if (rcuref_get(&ucounts->count)) + return ucounts; + return NULL; +} + static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); -- cgit v1.2.3 From 318f05a057157c400a92f17c5f2fa3dd96f57c7e Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 17 Feb 2025 21:39:41 +0100 Subject: reboot: replace __hw_protection_shutdown bool action parameter with an enum Patch series "reboot: support runtime configuration of emergency hw_protection action", v3. We currently leave the decision of whether to shutdown or reboot to protect hardware in an emergency situation to the individual drivers. This works out in some cases, where the driver detecting the critical failure has inside knowledge: It binds to the system management controller for example or is guided by hardware description that defines what to do. This is inadequate in the general case though as a driver reporting e.g. an imminent power failure can't know whether a shutdown or a reboot would be more appropriate for a given hardware platform. To address this, this series adds a hw_protection kernel parameter and sysfs toggle that can be used to change the action from the shutdown default to reboot. A new hw_protection_trigger API then makes use of this default action. My particular use case is unattended embedded systems that don't have support for shutdown and that power on automatically when power is supplied: - A brief power cycle gets detected by the driver - The kernel powers down the system and SoC goes into shutdown mode - Power is restored - The system remains oblivious to the restored power - System needs to be manually power cycled for a duration long enough to drain the capacitors With this series, such systems can configure the kernel with hw_protection=reboot to have the boot firmware worry about critical conditions. This patch (of 12): Currently __hw_protection_shutdown() either reboots or shuts down the system according to its shutdown argument. To make the logic easier to follow, both inside __hw_protection_shutdown and at caller sites, lets replace the bool parameter with an enum. 
This will be extra useful, when in a later commit, a third action is added to the enumeration. No functional change. Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-0-e1c09b090c0c@pengutronix.de Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-1-e1c09b090c0c@pengutronix.de Signed-off-by: Ahmad Fatoum Reviewed-by: Tzung-Bi Shih Cc: Benson Leung Cc: Mark Brown Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Liam Girdwood Cc: Lukasz Luba Cc: Matteo Croce Cc: Matti Vaittinen Cc: "Rafael J. Wysocki" Cc: Rob Herring Cc: Rui Zhang Cc: Sascha Hauer Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton --- include/linux/reboot.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index abcdde4df697..e97f6b8e8586 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -177,16 +177,28 @@ void ctrl_alt_del(void); extern void orderly_poweroff(bool force); extern void orderly_reboot(void); -void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown); + +/** + * enum hw_protection_action - Hardware protection action + * + * @HWPROT_ACT_SHUTDOWN: + * The system should be shut down (powered off) for HW protection. + * @HWPROT_ACT_REBOOT: + * The system should be rebooted for HW protection. + */ +enum hw_protection_action { HWPROT_ACT_SHUTDOWN, HWPROT_ACT_REBOOT }; + +void __hw_protection_shutdown(const char *reason, int ms_until_forced, + enum hw_protection_action action); static inline void hw_protection_reboot(const char *reason, int ms_until_forced) { - __hw_protection_shutdown(reason, ms_until_forced, false); + __hw_protection_shutdown(reason, ms_until_forced, HWPROT_ACT_REBOOT); } static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) { - __hw_protection_shutdown(reason, ms_until_forced, true); + __hw_protection_shutdown(reason, ms_until_forced, HWPROT_ACT_SHUTDOWN); } /* -- cgit v1.2.3 From 81cab0f94ab864f13e29e561382d722daec6a55a Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 17 Feb 2025 21:39:45 +0100 Subject: reboot: rename now misleading __hw_protection_shutdown symbols The __hw_protection_shutdown function name has become misleading since it can cause either a shutdown (poweroff) or a reboot depending on its argument. To avoid further confusion, let's rename it, so it doesn't suggest that a poweroff is all it can do. Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-5-e1c09b090c0c@pengutronix.de Signed-off-by: Ahmad Fatoum Reviewed-by: Tzung-Bi Shih Cc: Benson Leung Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Liam Girdwood Cc: Lukasz Luba Cc: Mark Brown Cc: Matteo Croce Cc: Matti Vaittinen Cc: "Rafael J. Wysocki" Cc: Rob Herring (Arm) Cc: Rui Zhang Cc: Sascha Hauer Cc: "Serge E. 
Hallyn" Signed-off-by: Andrew Morton --- include/linux/reboot.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index e97f6b8e8586..53c64e31b3cf 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -188,17 +188,17 @@ extern void orderly_reboot(void); */ enum hw_protection_action { HWPROT_ACT_SHUTDOWN, HWPROT_ACT_REBOOT }; -void __hw_protection_shutdown(const char *reason, int ms_until_forced, - enum hw_protection_action action); +void __hw_protection_trigger(const char *reason, int ms_until_forced, + enum hw_protection_action action); static inline void hw_protection_reboot(const char *reason, int ms_until_forced) { - __hw_protection_shutdown(reason, ms_until_forced, HWPROT_ACT_REBOOT); + __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_REBOOT); } static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) { - __hw_protection_shutdown(reason, ms_until_forced, HWPROT_ACT_SHUTDOWN); + __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_SHUTDOWN); } /* -- cgit v1.2.3 From e016173f656b89f5f71b7d45dfc599ada8107eef Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 17 Feb 2025 21:39:47 +0100 Subject: reboot: add support for configuring emergency hardware protection action We currently leave the decision of whether to shutdown or reboot to protect hardware in an emergency situation to the individual drivers. This works out in some cases, where the driver detecting the critical failure has inside knowledge: It binds to the system management controller for example or is guided by hardware description that defines what to do. In the general case, however, the driver detecting the issue can't know what the appropriate course of action is and shouldn't be dictating the policy of dealing with it. Therefore, add a global hw_protection toggle that allows the user to specify whether shutdown or reboot should be the default action when the driver doesn't set policy. This introduces no functional change yet as hw_protection_trigger() has no callers, but these will be added in subsequent commits. [arnd@arndb.de: hide unused hw_protection_attr] Link: https://lkml.kernel.org/r/20250224141849.1546019-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-7-e1c09b090c0c@pengutronix.de Signed-off-by: Ahmad Fatoum Reviewed-by: Tzung-Bi Shih Cc: Benson Leung Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Liam Girdwood Cc: Lukasz Luba Cc: Mark Brown Cc: Matteo Croce Cc: Matti Vaittinen Cc: "Rafael J. Wysocki" Cc: Rob Herring (Arm) Cc: Rui Zhang Cc: Sascha Hauer Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton --- include/linux/reboot.h | 22 +++++++++++++++++++++- include/uapi/linux/capability.h | 1 + 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 53c64e31b3cf..79e02876f2ba 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -181,16 +181,36 @@ extern void orderly_reboot(void); /** * enum hw_protection_action - Hardware protection action * + * @HWPROT_ACT_DEFAULT: + * The default action should be taken. This is HWPROT_ACT_SHUTDOWN + * by default, but can be overridden. * @HWPROT_ACT_SHUTDOWN: * The system should be shut down (powered off) for HW protection. * @HWPROT_ACT_REBOOT: * The system should be rebooted for HW protection. 
*/ -enum hw_protection_action { HWPROT_ACT_SHUTDOWN, HWPROT_ACT_REBOOT }; +enum hw_protection_action { HWPROT_ACT_DEFAULT, HWPROT_ACT_SHUTDOWN, HWPROT_ACT_REBOOT }; void __hw_protection_trigger(const char *reason, int ms_until_forced, enum hw_protection_action action); +/** + * hw_protection_trigger - Trigger default emergency system hardware protection action + * + * @reason: Reason of emergency shutdown or reboot to be printed. + * @ms_until_forced: Time to wait for orderly shutdown or reboot before + * triggering it. Negative value disables the forced + * shutdown or reboot. + * + * Initiate an emergency system shutdown or reboot in order to protect + * hardware from further damage. The exact action taken is controllable at + * runtime and defaults to shutdown. + */ +static inline void hw_protection_trigger(const char *reason, int ms_until_forced) +{ + __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_DEFAULT); +} + static inline void hw_protection_reboot(const char *reason, int ms_until_forced) { __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_REBOOT); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 5bb906098697..2e21b5594f81 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -275,6 +275,7 @@ struct vfs_ns_cap_data { /* Allow setting encryption key on loopback filesystem */ /* Allow setting zone reclaim policy */ /* Allow everything under CAP_BPF and CAP_PERFMON for backward compatibility */ +/* Allow setting hardware protection emergency action */ #define CAP_SYS_ADMIN 21 -- cgit v1.2.3 From 5e40d0d196595874c3f0606f0642021e6ade8054 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 17 Feb 2025 21:39:52 +0100 Subject: reboot: retire hw_protection_reboot and hw_protection_shutdown helpers The hw_protection_reboot and hw_protection_shutdown functions mix mechanism with policy: They let the driver requesting an emergency action for hardware protection also decide how to deal with it. This is inadequate in the general case as a driver reporting e.g. an imminent power failure can't know whether a shutdown or a reboot would be more appropriate for a given hardware platform. With the addition of the hw_protection parameter, it's now possible to configure at runtime the default emergency action and drivers are expected to use hw_protection_trigger to have this parameter dictate policy. As no current users of either hw_protection_shutdown or hw_protection_shutdown helpers remain, remove them, as not to tempt driver authors to call them. Existing users now either defer to hw_protection_trigger or call __hw_protection_trigger with a suitable argument directly when they have inside knowledge on whether a reboot or shutdown would be more appropriate. Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-12-e1c09b090c0c@pengutronix.de Signed-off-by: Ahmad Fatoum Reviewed-by: Tzung-Bi Shih Cc: Benson Leung Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Liam Girdwood Cc: Lukasz Luba Cc: Mark Brown Cc: Matteo Croce Cc: Matti Vaittinen Cc: "Rafael J. Wysocki" Cc: Rob Herring (Arm) Cc: Rui Zhang Cc: Sascha Hauer Cc: "Serge E. 
Hallyn" Signed-off-by: Andrew Morton --- include/linux/reboot.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 79e02876f2ba..aa08c3bbbf59 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -211,16 +211,6 @@ static inline void hw_protection_trigger(const char *reason, int ms_until_forced __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_DEFAULT); } -static inline void hw_protection_reboot(const char *reason, int ms_until_forced) -{ - __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_REBOOT); -} - -static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) -{ - __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_SHUTDOWN); -} - /* * Emergency restart, callable from an interrupt handler. */ -- cgit v1.2.3 From 0ac451ecec6dd516fdac1ca064467e753ef6ae1a Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 16 Feb 2025 00:56:18 +0800 Subject: lib min_heap: use size_t for array size and index variables Replace the int type with size_t for variables representing array sizes and indices in the min-heap implementation. Using size_t aligns with standard practices for size-related variables and avoids potential issues on platforms where int may be insufficient to represent all valid sizes or indices. Link: https://lkml.kernel.org/r/20250215165618.1757219-1-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: Yu-Chun Lin Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 1160bed6579e..79ddc0adbf2b 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -218,7 +218,7 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size) /* Initialize a min-heap. */ static __always_inline -void __min_heap_init_inline(min_heap_char *heap, void *data, int size) +void __min_heap_init_inline(min_heap_char *heap, void *data, size_t size) { heap->nr = 0; heap->size = size; @@ -254,7 +254,7 @@ bool __min_heap_full_inline(min_heap_char *heap) /* Sift the element at pos down the heap. 
*/ static __always_inline -void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, +void __min_heap_sift_down_inline(min_heap_char *heap, size_t pos, size_t elem_size, const struct min_heap_callbacks *func, void *args) { const unsigned long lsbit = elem_size & -elem_size; @@ -324,7 +324,7 @@ static __always_inline void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size, const struct min_heap_callbacks *func, void *args) { - int i; + ssize_t i; for (i = heap->nr / 2 - 1; i >= 0; i--) __min_heap_sift_down_inline(heap, i, elem_size, func, args); @@ -379,7 +379,7 @@ bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t ele const struct min_heap_callbacks *func, void *args) { void *data = heap->data; - int pos; + size_t pos; if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap")) return false; @@ -428,10 +428,10 @@ bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, __min_heap_del_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ __minheap_obj_size(_heap), _idx, _func, _args) -void __min_heap_init(min_heap_char *heap, void *data, int size); +void __min_heap_init(min_heap_char *heap, void *data, size_t size); void *__min_heap_peek(struct min_heap_char *heap); bool __min_heap_full(min_heap_char *heap); -void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, +void __min_heap_sift_down(min_heap_char *heap, size_t pos, size_t elem_size, const struct min_heap_callbacks *func, void *args); void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, const struct min_heap_callbacks *func, void *args); -- cgit v1.2.3 From 758502918e77323bf999a3eddd71466bf6135173 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 21 Feb 2025 05:02:21 -0800 Subject: rhashtable: remove needless return in three void APIs Remove needless 'return' in the following void APIs: rhltable_walk_enter() rhltable_free_and_destroy() rhltable_destroy() Since both the API and callee involved are void functions. Link: https://lkml.kernel.org/r/20250221-rmv_return-v1-16-cc8dff275827@quicinc.com Signed-off-by: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/rhashtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 8463a128e2f4..6c85b28ea30b 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1259,7 +1259,7 @@ static inline int rhashtable_replace_fast( static inline void rhltable_walk_enter(struct rhltable *hlt, struct rhashtable_iter *iter) { - return rhashtable_walk_enter(&hlt->ht, iter); + rhashtable_walk_enter(&hlt->ht, iter); } /** @@ -1275,12 +1275,12 @@ static inline void rhltable_free_and_destroy(struct rhltable *hlt, void *arg), void *arg) { - return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); + rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); } static inline void rhltable_destroy(struct rhltable *hlt) { - return rhltable_free_and_destroy(hlt, NULL, NULL); + rhltable_free_and_destroy(hlt, NULL, NULL); } #endif /* _LINUX_RHASHTABLE_H */ -- cgit v1.2.3 From ede7cd607a1d206b9579264a7405d78316b2d7fd Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 21 Feb 2025 05:02:07 -0800 Subject: cpu: remove needless return in void API suspend_enable_secondary_cpus() Remove needless 'return' in void API suspend_enable_secondary_cpus() since both the API and thaw_secondary_cpus() are void functions. 
Link: https://lkml.kernel.org/r/20250221-rmv_return-v1-2-cc8dff275827@quicinc.com Signed-off-by: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/cpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 6a0a8f1c7c90..e3049543008b 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -148,7 +148,7 @@ static inline int suspend_disable_secondary_cpus(void) } static inline void suspend_enable_secondary_cpus(void) { - return thaw_secondary_cpus(); + thaw_secondary_cpus(); } #else /* !CONFIG_PM_SLEEP_SMP */ -- cgit v1.2.3 From 720ba85040a6549674e5599685ca7df86a902448 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 26 Feb 2025 14:22:57 +0100 Subject: mm/mmu_notifier: use MMU_NOTIFY_CLEAR in remove_device_exclusive_entry() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's limit the use of MMU_NOTIFY_EXCLUSIVE to the case where we convert a present PTE to device-exclusive. For the other case, we can simply use MMU_NOTIFY_CLEAR, because it really is clearing the device-exclusive entry first, to then install the present entry. Update the documentation of MMU_NOTIFY_EXCLUSIVE, to document the single use case more thoroughly. If ever required, we could add a separate MMU_NOTIFY_CLEAR_EXCLUSIVE; for now using MMU_NOTIFY_CLEAR seems to be sufficient. Link: https://lkml.kernel.org/r/20250226132257.2826043-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Alistair Popple Cc: Jason Gunthorpe Cc: Jérôme Glisse Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d4e714661826..bc2402a45741 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -43,10 +43,10 @@ struct mmu_interval_notifier; * a device driver to possibly ignore the invalidation if the * owner field matches the driver's device private pgmap owner. * - * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no - * longer have exclusive access to the page. When sent during creation of an - * exclusive range the owner will be initialised to the value provided by the - * caller of make_device_exclusive(), otherwise the owner will be NULL. + * @MMU_NOTIFY_EXCLUSIVE: conversion of a page table entry to device-exclusive. + * The owner is initialized to the value provided by the caller of + * make_device_exclusive(), such that this caller can filter out these + * events. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, -- cgit v1.2.3 From 1eb3471bf5749ff3769ec52723bd9b8d773c7a62 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 3 Mar 2025 14:17:19 -0800 Subject: mm/damon: add data structure for monitoring intervals auto-tuning Patch series "mm/damon: auto-tune aggregation interval". DAMON requires time-consuming and repetitive aggregation interval tuning. Introduce a feature for automating it using a feedback loop that aims an amount of observed access events, like auto-exposing cameras. Background: Access Frequency Monitoring and Aggregation Interval ================================================================ DAMON checks if each memory element (damon_region) is accessed or not for every user-specified time interval called 'sampling interval'. It aggregates the check intervals on per-element counter called 'nr_accesses'. 
DAMON users can read the counters to get the access temperature of a given element. The counters are reset every 'aggregation interval', a second user-specified time interval. This can be pictured as DAMON continuously capturing a snapshot of the access events that happened and were captured within the last aggregation interval. This implies the aggregation interval plays a key role in the quality of the snapshots, like the camera exposure time. If it is too short, the number of access events captured in each snapshot is small, so each snapshot will not show many interesting things but just a cold and dark world with hopefully one pale blue dot or two. If it is too long, too many events are aggregated in a single shot, so each snapshot will look like a world of flames, or Muspellheim. It will be difficult to find practical insights in both cases. Problem: Time Consuming and Repetitive Tuning ============================================= The appropriate length of the aggregation interval depends on how frequently the system and workloads are making access events that DAMON can observe. Hence, users have to tune the interval with an excessive amount of tests against the target system and workloads. If the system and workloads are changed, the tuning should be done again. If the characteristics of the workloads are dynamic, it becomes more challenging. It is therefore time-consuming and repetitive. The tuning challenge mainly stems from the wrong question. It is not asking users what quality of monitoring results they want, but how DAMON should operate for their hidden goal. To give the right answer, users need to fully understand DAMON's mechanisms and the characteristics of their workloads. Users shouldn't be asked to understand the underlying mechanism. Understanding the characteristics of the workloads shouldn't be the role of users but of DAMON. Aim-oriented Feedback-driven Auto-Tuning ========================================= Fortunately, the appropriate length of the aggregation interval can be inferred using a feedback loop. If the current snapshots are showing not much interesting information, in other words, if they show only rare access events, increasing the aggregation interval helps, and vice versa. We tested this theory on a few real-world workloads, and documented one of those experiences in an official DAMON monitoring intervals tuning guideline. Since it is a simple theory that requires repeatable tries, it can be a good job for machines. Based on the guideline's theory, we designed an automation of aggregation interval tuning, in a way similar to a camera's auto-exposure feature. It defines the amount of interesting information as the ratio of the access events that DAMON actually observed to the theoretical maximum amount of such events within each snapshot. Events are accounted at byte and sampling-attempt granularity. For example, let's say there is a region of size 'X' bytes. DAMON tried access check sampling for the region 'Y' times in total for a given aggregation. Among the 'Y' attempts, 'Z' times it showed positive results. Then, the theoretical maximum number of access events for the region is 'X * Y', and the number of access events that DAMON has observed for the region is 'X * Z'. The amount of interesting information is therefore '(X * Z) / (X * Y)'. Note that each snapshot would have multiple regions. Users can set an arbitrary value of the ratio as their target.
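Restating the example above as a formula (the whole-snapshot value is presumably the same ratio accumulated over all regions of the snapshot):

	single region:   ratio = (X * Z) / (X * Y) = Z / Y
	whole snapshot:  ratio = sum_i(X_i * Z_i) / sum_i(X_i * Y_i)

For one region the byte factor cancels out, so the ratio is simply the fraction of sampling attempts that found an access; the byte weighting only matters when regions of different sizes are combined.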
Once the target is set, the automation periodically measures the current value of the ratio and increases or decreases the aggregation interval if the ratio value is lower or higher than the target. The amount of the change is proportional to the distance between the current and the target values. To avoid the auto-tuning going too far, users can set the minimum and the maximum aggregation interval. Changing only the aggregation interval while the sampling interval is kept makes the maximum level of access frequency in each snapshot, or the discernment of regions, inconsistent. Also, an unnecessarily short sampling interval causes meaningless monitoring overhead. The automation therefore adjusts the sampling interval together with the aggregation interval, while keeping the ratio between the two intervals. Users can set the ratio, or the discernment. Discussion ========== The modified question (aimed amount of access events, or light, in each snapshot) is easy to answer for both the users and the kernel. If users are interested in finding more cold regions, the value should be lower, and vice versa. If users have no idea, the kernel can suggest a fair default value based on some theories and experiments. For example, based on the Pareto principle (80/20 rule), we could expect a 20% target ratio to capture 80% of real access events. Since 80% might be too high, applying the rule once again, 4% (20% * 20%) may capture about 56% (80% * 80%) of real access events. The sampling to aggregation intervals ratio and the min/max aggregation intervals are also arguably easy to answer. What users want is discernment of regions for efficient system operation, for example, X amount of colder regions or Y amount of warmer regions, not exactly how many times each cache line is accessed at nanosecond granularity. The appropriate min/max aggregation interval can be set relatively naively, and may better be set based on the aimed monitoring overhead. Since the sampling interval directly decides the overhead, setting it based on the sampling interval can be easy. From my experience, I'd argue an intervals ratio of 0.05, and a 5 millisecond to 20 second sampling interval range (100 millisecond to 400 second aggregation interval) can be a good default suggestion. Evaluation ========== On a machine running a real world server workload, I ran DAMON to monitor its physical address space for about 23 hours, with this feature turned on. We set it to tune the sampling interval in a range from 5 milliseconds to 10 seconds, aiming for a 4 % DAMON-observed access ratio per three aggregation intervals. The exact command I used is as below. damo start --monitoring_intervals_goal 4% 3 5ms 10s --damos_action stat During the test run, DAMON continuously updated the sampling and aggregation intervals as designed, within the given range. For all the time, DAMON was able to find intervals that meet the target access events ratio in the given intervals range (sampling interval between 5 milliseconds and 10 seconds). For most of the time, the tuned sampling interval converged to 300-400 milliseconds. It made only a small amount of changes within the range. The average of the tuned sampling interval during the test was about 380 milliseconds. The workload periodically gets less load and decreases its CPU usage. Presumably this also caused it to make fewer memory access events. Reacting to such events, DAMON also increased the intervals as expected. It was still able to find the optimum interval satisfying the target access ratio within the given intervals range. 
Usually it converged to about 5 seconds. Once the workload got a normal amount of load again, DAMON reactively reduced the intervals to the normal range. I collected and visualized DAMON's monitoring results on the server a few times. Every time, the visualized access pattern looked not biased to only cold or hot pages but diverse and balanced. Let me show some of the snapshots that I collected near the end of the test (after about 23 hours had passed since starting DAMON on the server). The recency histogram looks as below. Please note that this visualization shows only very coarse grained information. For more details about the visualization format, please refer to the DAMON user-space tool documentation[1]. # ./damo report access --style recency-sz-hist --tried_regions_of 0 0 0 --access_rate 0 0 [-19 h 7 m 45.514 s, -17 h 12 m 58.963 s) 6.198 GiB |**** | [-17 h 12 m 58.963 s, -15 h 18 m 12.412 s) 0 B | | [-15 h 18 m 12.412 s, -13 h 23 m 25.860 s) 0 B | | [-13 h 23 m 25.860 s, -11 h 28 m 39.309 s) 0 B | | [-11 h 28 m 39.309 s, -9 h 33 m 52.757 s) 0 B | | [-9 h 33 m 52.757 s, -7 h 39 m 6.206 s) 0 B | | [-7 h 39 m 6.206 s, -5 h 44 m 19.654 s) 0 B | | [-5 h 44 m 19.654 s, -3 h 49 m 33.103 s) 0 B | | [-3 h 49 m 33.103 s, -1 h 54 m 46.551 s) 0 B | | [-1 h 54 m 46.551 s, -0 ns) 16.967 GiB |********* | [-0 ns, --6886551440000 ns) 38.835 GiB |********************| memory bw estimate: 9.425 GiB per second total size: 62.000 GiB It shows about 38 GiB of memory was accessed at least once within the last aggregation interval (given the ~300 millisecond tuned sampling interval, this is about six seconds). This is about 61 % of the total memory. In other words, DAMON found the warmest 61 % of the system's memory. The number is particularly interesting given our Pareto principle based theory for the tuning goal value. We set it as 20 % of 20 % (4 %), thinking it would capture 80 % of 80 % (64 %) of real access events. And it found 61 % hot memory, or working set. Nevertheless, to make the theory clearer, much more discussion and testing would be needed. At the moment, nonetheless, we can say making the target value higher helps finding more hot memory regions. The histogram also shows an amount of cold memory. About 17 GiB of the system's memory has not been accessed for at least the last aggregation interval (about six seconds), and at most for about the last two hours. The real longest unaccessed time of the 17 GiB memory was about 19 minutes, though. This is a limitation of this visualization format. It further found a very cold 6 GiB of memory. It has not been accessed for at least the last 17 hours and at most 19 hours. What about the hot memory distribution? To see this, I captured and visualized the snapshot as an access temperature histogram. Again, please refer to the DAMON user-space tool documentation[1] for the format and what access temperature means. Both the visualization and the metric show only very coarse grained and limited information. The resulting histogram looks like below. 
# ./damo report access --style temperature-sz-hist --tried_regions_of 0 0 0 [-6,840,763,776,000, -5,501,580,939,800) 6.198 GiB |*** | [-5,501,580,939,800, -4,162,398,103,600) 0 B | | [-4,162,398,103,600, -2,823,215,267,400) 0 B | | [-2,823,215,267,400, -1,484,032,431,200) 0 B | | [-1,484,032,431,200, -144,849,595,000) 0 B | | [-144,849,595,000, 1,194,333,241,200) 55.802 GiB |********************| [1,194,333,241,200, 2,533,516,077,400) 4.000 KiB |* | [2,533,516,077,400, 3,872,698,913,600) 4.000 KiB |* | [3,872,698,913,600, 5,211,881,749,800) 8.000 KiB |* | [5,211,881,749,800, 6,551,064,586,000) 12.000 KiB |* | [6,551,064,586,000, 7,890,247,422,200) 4.000 KiB |* | memory bw estimate: 5.178 GiB per second total size: 62.000 GiB We can see most of the memory is in similar access temperature range, and definitely some pages are extremely hot. To see the picture in more detail, let's capture and visualize the snapshot per DAMON-region, sorted by their access temperature. The total number of the regions was about 300. Due to the limited space, I'm showing only a few parts of the output here. # ./damo report access --style hot --tried_regions_of 0 0 0 heatmap: 00000000888888889999999888888888888888888888888888888888888888888888888888888888 # min/max temperatures: -6,827,258,184,000, 17,589,052,500, column size: 793.600 MiB |999999999999999999999999999999999999999| 4.000 KiB access 100 % 18 h 9 m 43.918 s |999999999999999999999999999999999999999| 8.000 KiB access 100 % 17 h 56 m 5.351 s |999999999999999999999999999999999999999| 4.000 KiB access 100 % 15 h 24 m 19.634 s |999999999999999999999999999999999999999| 4.000 KiB access 100 % 14 h 10 m 55.606 s |999999999999999999999999999999999999999| 4.000 KiB access 100 % 11 h 34 m 18.993 s [...] |99999999999999999999999999999| 8.000 KiB access 100 % 1 m 27.945 s |11111111111111111111111111111| 80.000 KiB access 15 % 1 m 21.180 s |00000000000000000000000000000| 24.000 KiB access 5 % 1 m 21.180 s |00000000000000000000000000000| 5.919 GiB access 10 % 1 m 14.415 s |99999999999999999999999999999| 12.000 KiB access 100 % 1 m 7.650 s [...] |0| 4.000 KiB access 5 % 0 ns |0| 12.000 KiB access 5 % 0 ns |0| 188.000 KiB access 0 % 0 ns |0| 24.000 KiB access 0 % 0 ns |0| 48.000 KiB access 0 % 0 ns [...] |0000000000000000000000000000000| 8.000 KiB access 0 % 6 m 45.901 s |00000000000000000000000000000000| 36.000 KiB access 0 % 7 m 26.491 s |00000000000000000000000000000000| 4.000 KiB access 0 % 12 m 37.682 s |000000000000000000000000000000000| 8.000 KiB access 0 % 18 m 9.168 s |000000000000000000000000000000000| 16.000 KiB access 0 % 19 m 3.288 s |0000000000000000000000000000000000000000| 6.198 GiB access 0 % 18 h 57 m 52.582 s memory bw estimate: 8.798 GiB per second total size: 62.000 GiB We can see DAMON found small and extremely hot regions that accessed for all access check sampling (once per about 300 milliseconds) for more than 10 hours. The access temperature rapidly decreases. DAMON was also able to find small and big regions that not accessed for up to about 19 minutes. It even found an outlier cold region of 6 GiB that not accessed for about 19 hours. It is unclear what the outlier region is, as of this writing. For the testing, DAMON was consuming about 0.1% of single CPU time. This is again expected results, since DAMON was using about 370 milliseconds sampling interval in most case. # ps -p $kdamond_pid -o %cpu %CPU 0.1 I also ran similar tests against kernel build workload and an in-memory cache workload benchmark[2]. 
Detialed results including tuned intervals and captured access pattern were of course different sicne those depend on the workloads. But the auto-tuning feature was always working as expected like the above results for the real world workload. To wrap up, with intervals auto-tuning feature, DAMON was able to capture access pattern snapshots of a quality on a real world server workload. The auto-tuning feature was able to adaptively react to the dynamic access patterns of the workload and reliably provide consistent monitoring results without manual human interventions. Also, the auto-tuning made DAMON consumes only necessary amount of resource for the required quality. References ========== [1] https://github.com/damonitor/damo/blob/next/USAGE.md#access-report-styles [2] https://github.com/facebookresearch/DCPerf/blob/main/packages/tao_bench/README.md This patch (of 8): Add data structures for DAMON sampling and aggregation intervals automatic tuning that aims specific amount of DAMON-observed access events per snapshot. In more detail, define the data structure for the tuning goal, link it to the monitoring attributes data structure so that DAMON kernel API callers can make the request, and update parameters setup DAMON function to respect the new parameter. Link: https://lkml.kernel.org/r/20250303221726.484227-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250303221726.484227-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 242910b190c9..5f2609f24761 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -659,12 +659,38 @@ struct damon_call_control { bool canceled; }; +/** + * struct damon_intervals_goal - Monitoring intervals auto-tuning goal. + * + * @access_bp: Access events observation ratio to achieve in bp. + * @aggrs: Number of aggregations to acheive @access_bp within. + * @min_sample_us: Minimum resulting sampling interval in microseconds. + * @max_sample_us: Maximum resulting sampling interval in microseconds. + * + * DAMON automatically tunes &damon_attrs->sample_interval and + * &damon_attrs->aggr_interval aiming the ratio in bp (1/10,000) of + * DAMON-observed access events to theoretical maximum amount within @aggrs + * aggregations be same to @access_bp. The logic increases + * &damon_attrs->aggr_interval and &damon_attrs->sampling_interval in same + * ratio if the current access events observation ratio is lower than the + * target for each @aggrs aggregations, and vice versa. + * + * If @aggrs is zero, the tuning is disabled and hence this struct is ignored. + */ +struct damon_intervals_goal { + unsigned long access_bp; + unsigned long aggrs; + unsigned long min_sample_us; + unsigned long max_sample_us; +}; + /** * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. * @ops_update_interval: The time between monitoring operations updates. + * @intervals_goal: Intervals auto-tuning goal. * @min_nr_regions: The minimum number of adaptive monitoring * regions. 
* @max_nr_regions: The maximum number of adaptive monitoring @@ -684,6 +710,7 @@ struct damon_attrs { unsigned long sample_interval; unsigned long aggr_interval; unsigned long ops_update_interval; + struct damon_intervals_goal intervals_goal; unsigned long min_nr_regions; unsigned long max_nr_regions; }; -- cgit v1.2.3 From f04b0fedbe714f822bd066b319a60faa39a985a1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 3 Mar 2025 14:17:20 -0800 Subject: mm/damon/core: implement intervals auto-tuning Implement the DAMON sampling and aggregation intervals auto-tuning mechanism as briefly described on 'struct damon_intervals_goal'. The core part for deciding the direction and amount of the changes is implemented reusing the feedback loop function which is being used for DAMOS quotas auto-tuning. Unlike the DAMOS quotas auto-tuning use case, limit the maximum decreasing amount after the adjustment to 50% of the current value, though. This is because the intervals have no good merits at rapid reductions since it could unnecessarily increase the monitoring overhead. Link: https://lkml.kernel.org/r/20250303221726.484227-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5f2609f24761..b3e2c793c1f4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -713,6 +713,17 @@ struct damon_attrs { struct damon_intervals_goal intervals_goal; unsigned long min_nr_regions; unsigned long max_nr_regions; +/* private: internal use only */ + /* + * @aggr_interval to @sample_interval ratio. + * Core-external components call damon_set_attrs() with &damon_attrs + * that this field is unset. In the case, damon_set_attrs() sets this + * field of resulting &damon_attrs. Core-internal components such as + * kdamond_tune_intervals() calls damon_set_attrs() with &damon_attrs + * that this field is set. In the case, damon_set_attrs() just keep + * it. + */ + unsigned long aggr_samples; }; /** @@ -761,6 +772,11 @@ struct damon_ctx { * update */ unsigned long next_ops_update_sis; + /* + * number of sample intervals that should be passed before next + * intervals tuning + */ + unsigned long next_intervals_tune_sis; /* for waiting until the execution of the kdamond_fn is started */ struct completion kdamond_started; /* for scheme quotas prioritization */ -- cgit v1.2.3 From 691ee97e1a9de0cdb3efb893c1f180e3f4a35e32 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 3 Mar 2025 14:15:35 +0000 Subject: mm: fix lazy mmu docs and usage Patch series "Fix lazy mmu mode", v2. I'm planning to implement lazy mmu mode for arm64 to optimize vmalloc. As part of that, I will extend lazy mmu mode to cover kernel mappings in vmalloc table walkers. While lazy mmu mode is already used for kernel mappings in a few places, this will extend it's use significantly. Having reviewed the existing lazy mmu implementations in powerpc, sparc and x86, it looks like there are a bunch of bugs, some of which may be more likely to trigger once I extend the use of lazy mmu. So this series attempts to clarify the requirements and fix all the bugs in advance of that series. See patch #1 commit log for all the details. This patch (of 5): The docs, implementations and use of arch_[enter|leave]_lazy_mmu_mode() is a bit of a mess (to put it politely). 
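As a usage sketch for the DAMON intervals auto-tuning structures introduced by the two patches above (struct damon_intervals_goal and the new damon_attrs fields), a kernel API caller might set up the goal roughly as follows. The values mirror the 4%-per-three-aggregations example from the cover letter and are not defaults; the final call is only indicative:

    	/* Hypothetical caller of the DAMON kernel API. */
    	struct damon_attrs attrs = {
    		.sample_interval = 5000,		/* 5 ms, in microseconds */
    		.aggr_interval = 100000,		/* 100 ms */
    		.ops_update_interval = 1000000,		/* 1 s */
    		.min_nr_regions = 10,
    		.max_nr_regions = 1000,
    		.intervals_goal = {
    			.access_bp = 400,		/* aim for 4% observed events */
    			.aggrs = 3,			/* ...per three aggregations */
    			.min_sample_us = 5000,		/* 5 ms floor */
    			.max_sample_us = 10000000,	/* 10 s ceiling */
    		},
    	};

    	/* damon_set_attrs(ctx, &attrs) would then let kdamond tune toward this goal. */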
There are a number of issues related to nesting of lazy mmu regions and confusion over whether the task, when in a lazy mmu region, is preemptible or not. Fix all the issues relating to the core-mm. Follow up commits will fix the arch-specific implementations. 3 arches implement lazy mmu; powerpc, sparc and x86. When arch_[enter|leave]_lazy_mmu_mode() was first introduced by commit 6606c3e0da53 ("[PATCH] paravirt: lazy mmu mode hooks.patch"), it was expected that lazy mmu regions would never nest and that the appropriate page table lock(s) would be held while in the region, thus ensuring the region is non-preemptible. Additionally lazy mmu regions were only used during manipulation of user mappings. Commit 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy updates") started invoking the lazy mmu mode in apply_to_pte_range(), which is used for both user and kernel mappings. For kernel mappings the region is no longer protected by any lock so there is no longer any guarantee about non-preemptibility. Additionally, for RT configs, the holding the PTL only implies no CPU migration, it doesn't prevent preemption. Commit bcc6cc832573 ("mm: add default definition of set_ptes()") added arch_[enter|leave]_lazy_mmu_mode() to the default implementation of set_ptes(), used by x86. So after this commit, lazy mmu regions can be nested. Additionally commit 1a10a44dfc1d ("sparc64: implement the new page table range API") and commit 9fee28baa601 ("powerpc: implement the new page table range API") did the same for the sparc and powerpc set_ptes() overrides. powerpc couldn't deal with preemption so avoids it in commit b9ef323ea168 ("powerpc/64s: Disable preemption in hash lazy mmu mode"), which explicitly disables preemption for the whole region in its implementation. x86 can support preemption (or at least it could until it tried to add support nesting; more on this below). Sparc looks to be totally broken in the face of preemption, as far as I can tell. powerpc can't deal with nesting, so avoids it in commit 47b8def9358c ("powerpc/mm: Avoid calling arch_enter/leave_lazy_mmu() in set_ptes"), which removes the lazy mmu calls from its implementation of set_ptes(). x86 attempted to support nesting in commit 49147beb0ccb ("x86/xen: allow nesting of same lazy mode") but as far as I can tell, this breaks its support for preemption. In short, it's all a mess; the semantics for arch_[enter|leave]_lazy_mmu_mode() are not clearly defined and as a result the implementations all have different expectations, sticking plasters and bugs. arm64 is aiming to start using these hooks, so let's clean everything up before adding an arm64 implementation. Update the documentation to state that lazy mmu regions can never be nested, must not be called in interrupt context and preemption may or may not be enabled for the duration of the region. And fix the generic implementation of set_ptes() to avoid nesting. arch-specific fixes to conform to the new spec will proceed this one. These issues were spotted by code review and I have no evidence of issues being reported in the wild. Link: https://lkml.kernel.org/r/20250303141542.3371656-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20250303141542.3371656-2-ryan.roberts@arm.com Fixes: bcc6cc832573 ("mm: add default definition of set_ptes()") Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Acked-by: Juergen Gross Cc: Andreas Larsson Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Juegren Gross Cc: Matthew Wilcow (Oracle) Cc: Thomas Gleinxer Cc: Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 94d267d02372..787c632ee2c9 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -222,10 +222,14 @@ static inline int pmd_dirty(pmd_t pmd) * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be - * up to date. This mode can only be entered and left under the protection of - * the page table locks for all page tables which may be modified. In the UP - * case, this is required so that preemption is disabled, and in the SMP case, - * it must synchronize the delayed page table writes properly on other CPUs. + * up to date. + * + * In the general case, no lock is guaranteed to be held between entry and exit + * of the lazy mode. So the implementation must assume preemption may be enabled + * and cpu migration is possible; it must take steps to be robust against this. + * (In practice, for user PTE updates, the appropriate page table lock(s) are + * held, but for kernel PTE updates, no lock is held). Nesting is not permitted + * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE #define arch_enter_lazy_mmu_mode() do {} while (0) @@ -287,7 +291,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, { page_table_check_ptes_set(mm, ptep, pte, nr); - arch_enter_lazy_mmu_mode(); for (;;) { set_pte(ptep, pte); if (--nr == 0) @@ -295,7 +298,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, ptep++; pte = pte_next_pfn(pte); } - arch_leave_lazy_mmu_mode(); } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) -- cgit v1.2.3 From 0e2759afcaf9bff25a63201856fa89b64181749f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 27 Feb 2025 23:58:07 -0800 Subject: page_counter: track failcnt only for legacy cgroups Currently page_counter tracks failcnt for counters used by v1 and v2 controllers. However failcnt is only exported for v1 deployment and thus there is no need to maintain it in v2. The oom report does expose failcnt for memory and swap in v2 but v2 already maintains MEMCG_MAX and MEMCG_SWAP_MAX event counters which can be used. Link: https://lkml.kernel.org/r/20250228075808.207484-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin (Cruise) Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 46406f3fe34d..e4bd8fd427be 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -28,12 +28,13 @@ struct page_counter { unsigned long watermark; /* Latest cg2 reset watermark */ unsigned long local_watermark; - unsigned long failcnt; + unsigned long failcnt; /* v1-only field */ /* Keep all the read most fields in a separete cacheline. 
*/ CACHELINE_PADDING(_pad2_); bool protection_support; + bool track_failcnt; unsigned long min; unsigned long low; unsigned long high; @@ -58,6 +59,7 @@ static inline void page_counter_init(struct page_counter *counter, counter->max = PAGE_COUNTER_MAX; counter->parent = parent; counter->protection_support = protection_support; + counter->track_failcnt = false; } static inline unsigned long page_counter_read(struct page_counter *counter) -- cgit v1.2.3 From f7b0797d36e75d2a622580b56b9bfd3130d5d0e9 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 27 Feb 2025 23:58:08 -0800 Subject: page_counter: reduce struct page_counter size The struct page_counter has explicit padding for better cache alignment. The commit c6f53ed8f213a ("mm, memcg: cg2 memory{.swap,}.peak write handlers") added a field to the struct page_counter and accidently increased its size. Let's move the failcnt field which is v1-only field to the same cacheline of usage to reduce the size of struct page_counter. Link: https://lkml.kernel.org/r/20250228075808.207484-4-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin (Cruise) Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index e4bd8fd427be..d649b6bbbc87 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -9,10 +9,12 @@ struct page_counter { /* - * Make sure 'usage' does not share cacheline with any other field. The - * memcg->memory.usage is a hot member of struct mem_cgroup. + * Make sure 'usage' does not share cacheline with any other field in + * v2. The memcg->memory.usage is a hot member of struct mem_cgroup. */ atomic_long_t usage; + unsigned long failcnt; /* v1-only field */ + CACHELINE_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ @@ -28,7 +30,6 @@ struct page_counter { unsigned long watermark; /* Latest cg2 reset watermark */ unsigned long local_watermark; - unsigned long failcnt; /* v1-only field */ /* Keep all the read most fields in a separete cacheline. */ CACHELINE_PADDING(_pad2_); -- cgit v1.2.3 From f1ab2831e2a4312046bca79256b2efc41d373eaf Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Tue, 4 Mar 2025 19:03:16 +0800 Subject: writeback: let trace_balance_dirty_pages() take struct dtc as parameter Patch series "Fix calculations in trace_balance_dirty_pages() for cgwb", v2. In my experiment, I found that the output of trace_balance_dirty_pages() in the cgroup writeback scenario was strange because trace_balance_dirty_pages() always uses global_wb_domain.dirty_limit for related calculations instead of the dirty_limit of the corresponding memcg's wb_domain. The basic idea of the fix is to store the hard dirty limit value computed in wb_position_ratio() into struct dirty_throttle_control and use it for calculations in trace_balance_dirty_pages(). This patch (of 3): Currently, trace_balance_dirty_pages() already has 12 parameters. In the patch #3, I initially attempted to introduce an additional parameter. However, in include/linux/trace_events.h, bpf_trace_run12() only supports up to 12 parameters and bpf_trace_run13() does not exist. To reduce the number of parameters in trace_balance_dirty_pages(), we can make it accept a pointer to struct dirty_throttle_control as a parameter. 
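The call site in balance_dirty_pages() is not part of this diff, but with the prototype change shown below it would roughly become the following (the local variable names here are assumptions):

    	/* Before: twelve scalar arguments, the bpf_trace_run12() ceiling. */
    	trace_balance_dirty_pages(wb, thresh, bg_thresh, dirty, bdi_thresh,
    				  bdi_dirty, dirty_ratelimit, task_ratelimit,
    				  pages_dirtied, period, pause, start_time);

    	/* After: the consolidated throttling state travels in one struct. */
    	trace_balance_dirty_pages(wb, sdtc, dirty_ratelimit, task_ratelimit,
    				  pages_dirtied, period, pause, start_time);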
To achieve this, we need to move the definition of struct dirty_throttle_control from mm/page-writeback.c to include/linux/writeback.h. Link: https://lkml.kernel.org/r/20250304110318.159567-1-yizhou.tang@shopee.com Link: https://lkml.kernel.org/r/20250304110318.159567-2-yizhou.tang@shopee.com Signed-off-by: Tang Yizhou Cc: Alexei Starovoitov Cc: Christian Brauner Cc: Steven Rostedt Cc: Jan Kara Cc: "Masami Hiramatsu (Google)" Cc: Matthew Wilcow (Oracle) Cc: Tang Yizhou Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/writeback.h | 23 +++++++++++++++++++++++ include/trace/events/writeback.h | 16 ++++++---------- 2 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d11b903c2edb..32095928365c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -313,6 +313,29 @@ static inline void cgroup_writeback_umount(struct super_block *sb) /* * mm/page-writeback.c */ +/* consolidated parameters for balance_dirty_pages() and its subroutines */ +struct dirty_throttle_control { +#ifdef CONFIG_CGROUP_WRITEBACK + struct wb_domain *dom; + struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ +#endif + struct bdi_writeback *wb; + struct fprop_local_percpu *wb_completions; + + unsigned long avail; /* dirtyable */ + unsigned long dirty; /* file_dirty + write + nfs */ + unsigned long thresh; /* dirty threshold */ + unsigned long bg_thresh; /* dirty background threshold */ + + unsigned long wb_dirty; /* per-wb counterparts */ + unsigned long wb_thresh; + unsigned long wb_bg_thresh; + + unsigned long pos_ratio; + bool freerun; + bool dirty_exceeded; +}; + void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); void laptop_mode_timer_fn(struct timer_list *t); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index a261e86e61fa..3213b9023794 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -629,11 +629,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, + struct dirty_throttle_control *dtc, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, @@ -641,7 +637,7 @@ TRACE_EVENT(balance_dirty_pages, long pause, unsigned long start_time), - TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, + TP_ARGS(wb, dtc, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), @@ -664,16 +660,16 @@ TRACE_EVENT(balance_dirty_pages, ), TP_fast_assign( - unsigned long freerun = (thresh + bg_thresh) / 2; + unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + freerun) / 2; - __entry->dirty = dirty; + __entry->dirty = dtc->dirty; __entry->bdi_setpoint = __entry->setpoint * - bdi_thresh / (thresh + 1); - __entry->bdi_dirty = bdi_dirty; + dtc->wb_thresh / (dtc->thresh + 1); + __entry->bdi_dirty = dtc->wb_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; -- cgit v1.2.3 From 28c24ef9e04f95672b72b1297eff7dae91cceea8 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Tue, 4 Mar 2025 19:03:17 +0800 Subject: writeback: rename variables in 
trace_balance_dirty_pages() Rename bdi_setpoint and bdi_dirty in the tracepoint to wb_setpoint and wb_dirty, respectively. These changes were omitted by Tejun in the cgroup writeback patchset. Link: https://lkml.kernel.org/r/20250304110318.159567-3-yizhou.tang@shopee.com Signed-off-by: Tang Yizhou Cc: Alexei Starovoitov Cc: Christian Brauner Cc: Jan Kara Cc: "Masami Hiramatsu (Google)" Cc: Matthew Wilcow (Oracle) Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/trace/events/writeback.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 3213b9023794..3046ca6b08ea 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -646,8 +646,8 @@ TRACE_EVENT(balance_dirty_pages, __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) - __field(unsigned long, bdi_setpoint) - __field(unsigned long, bdi_dirty) + __field(unsigned long, wb_setpoint) + __field(unsigned long, wb_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned int, dirtied) @@ -667,9 +667,9 @@ TRACE_EVENT(balance_dirty_pages, __entry->setpoint = (global_wb_domain.dirty_limit + freerun) / 2; __entry->dirty = dtc->dirty; - __entry->bdi_setpoint = __entry->setpoint * + __entry->wb_setpoint = __entry->setpoint * dtc->wb_thresh / (dtc->thresh + 1); - __entry->bdi_dirty = dtc->wb_dirty; + __entry->wb_dirty = dtc->wb_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; @@ -685,7 +685,7 @@ TRACE_EVENT(balance_dirty_pages, TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " - "bdi_setpoint=%lu bdi_dirty=%lu " + "wb_setpoint=%lu wb_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", @@ -693,8 +693,8 @@ TRACE_EVENT(balance_dirty_pages, __entry->limit, __entry->setpoint, __entry->dirty, - __entry->bdi_setpoint, - __entry->bdi_dirty, + __entry->wb_setpoint, + __entry->wb_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, -- cgit v1.2.3 From 6cc4c3aa714bc58ec5d20f3054ca5f23534984d1 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Tue, 4 Mar 2025 19:03:18 +0800 Subject: writeback: fix calculations in trace_balance_dirty_pages() for cgwb In the commit dcc25ae76eb7 ("writeback: move global_dirty_limit into wb_domain") of the cgroup writeback backpressure propagation patchset, Tejun made some adaptations to trace_balance_dirty_pages() for cgroup writeback. However, this adaptation was incomplete and Tejun missed further adaptation in the subsequent patches. In the cgroup writeback scenario, if sdtc in balance_dirty_pages() is assigned to mdtc, then upon entering trace_balance_dirty_pages(), __entry->limit should be assigned based on the dirty_limit of the corresponding memcg's wb_domain, rather than global_wb_domain. To address this issue and simplify the implementation, introduce a 'limit' field in struct dirty_throttle_control to store the hard_limit value computed in wb_position_ratio() by calling hard_dirty_limit(). This field will then be used in trace_balance_dirty_pages() to assign the value to __entry->limit. 
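With purely illustrative numbers, the effect is easy to see: for a memcg wb_domain whose hard dirty limit is 1,000 pages and whose freerun point is (thresh + bg_thresh) / 2 = 400 pages, the tracepoint should report limit=1000 and setpoint=(1000 + 400) / 2 = 700; computing them from global_wb_domain.dirty_limit of, say, 100,000 pages instead reports limit=100000 and setpoint=50200, values that bear no relation to that cgroup's writeback state.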
Link: https://lkml.kernel.org/r/20250304110318.159567-4-yizhou.tang@shopee.com Fixes: dcc25ae76eb7 ("writeback: move global_dirty_limit into wb_domain") Signed-off-by: Tang Yizhou Acked-by: Tejun Heo Cc: Alexei Starovoitov Cc: Christian Brauner Cc: Jan Kara Cc: "Masami Hiramatsu (Google)" Cc: Matthew Wilcow (Oracle) Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/linux/writeback.h | 1 + include/trace/events/writeback.h | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 32095928365c..58bda3347914 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -326,6 +326,7 @@ struct dirty_throttle_control { unsigned long dirty; /* file_dirty + write + nfs */ unsigned long thresh; /* dirty threshold */ unsigned long bg_thresh; /* dirty background threshold */ + unsigned long limit; /* hard dirty limit */ unsigned long wb_dirty; /* per-wb counterparts */ unsigned long wb_thresh; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 3046ca6b08ea..0ff388131fc9 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -663,9 +663,8 @@ TRACE_EVENT(balance_dirty_pages, unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); - __entry->limit = global_wb_domain.dirty_limit; - __entry->setpoint = (global_wb_domain.dirty_limit + - freerun) / 2; + __entry->limit = dtc->limit; + __entry->setpoint = (dtc->limit + freerun) / 2; __entry->dirty = dtc->dirty; __entry->wb_setpoint = __entry->setpoint * dtc->wb_thresh / (dtc->thresh + 1); -- cgit v1.2.3 From ab82e57981d0e4cb46d2817e7b65b9d5fdcf3832 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 4 Mar 2025 13:19:05 -0800 Subject: mm/damon/core: introduce damos->ops_filters Patch series "mm/damon: make allow filters after reject filters useful and intuitive". DAMOS filters do allow or reject elements of memory for given DAMOS scheme only if those match the filter criterias. For elements that don't match any DAMOS filter, 'allowing' is the default behavior. This makes allow-filters that don't have any reject-filter after them meaningless sources of overhead. The decision was made to keep the behavior consistent with that before the introduction of allow-filters. This, however, makes usage of DAMOS filters confusing and inefficient. It is more intuitive and still consistent behavior to reject by default unless there is no filter at all or the last filter is a reject filter. Update the filtering logic in the way and update documents to clarify the behavior. Note that this is changing the old behavior. But the old behavior for the problematic filter combination was definitely confusing, inefficient and anyway useless. Also, the behavior has relatively recently introduced. It is difficult to anticipate any user that depends on the behavior. Hence this is not a user-breaking behavior change but an obvious improvement. This patch (of 9): DAMOS filters can be categorized into two groups depending on which layer they are handled, namely core layer and ops layer. The groups are important because the filtering behavior depends on evaluation sequence of filters, and core layer-handled filters are evaluated before operations layer-handled ones. The behavior is clearly documented, but the implementation is bit inefficient and complicated. All filters are maintained in a single list (damos->filters) in mix. 
Filters evaluation logics in core layer and operations layer iterates all the filters on the list, while skipping filters that should be not handled by the layer of the logic. It is inefficient. Making future extensions having differentiations for filters of different handling layers will also be complicated. Add a new list that will be used for having all operations layer-handled DAMOS filters to DAMOS scheme data structure. Also add the support of its initialization and basic traversal functions. Link: https://lkml.kernel.org/r/20250304211913.53574-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250304211913.53574-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index b3e2c793c1f4..7f76e2e99f37 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -448,6 +448,7 @@ struct damos_access_pattern { * @wmarks: Watermarks for automated (in)activation of this scheme. * @target_nid: Destination node if @action is "migrate_{hot,cold}". * @filters: Additional set of &struct damos_filter for &action. + * @ops_filters: ops layer handling &struct damos_filter objects list. * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. * @list: List head for siblings. @@ -508,6 +509,7 @@ struct damos { int target_nid; }; struct list_head filters; + struct list_head ops_filters; void *last_applied; struct damos_stat stat; struct list_head list; @@ -858,6 +860,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damos_for_each_filter_safe(f, next, scheme) \ list_for_each_entry_safe(f, next, &(scheme)->filters, list) +#define damos_for_each_ops_filter(f, scheme) \ + list_for_each_entry(f, &(scheme)->ops_filters, list) + +#define damos_for_each_ops_filter_safe(f, next, scheme) \ + list_for_each_entry_safe(f, next, &(scheme)->ops_filters, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); -- cgit v1.2.3 From dd038b728c8a2a0e1a632b767a50f09f076dab79 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 4 Mar 2025 13:19:10 -0800 Subject: mm/damon: add default allow/reject behavior fields to struct damos Current default allow/reject behavior of filters handling stage has made before introduction of the allow behavior. For allow-filters usage, it is confusing and inefficient. It is more intuitive to decide the default filtering stage allow/reject behavior as opposite to the last filter's behavior. The decision should be made separately for core and operations layers' filtering stages, since last core layer-handled filter is not really a last filter if there are operations layer handling filters. Keeping separate decisions for the two categories can make the logic simpler. Add fields for storing the two decisions. Link: https://lkml.kernel.org/r/20250304211913.53574-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 7f76e2e99f37..52559475dbe7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -502,6 +502,9 @@ struct damos { * layer-handled filters. If true, operations layer allows it, too. 
*/ bool core_filters_allowed; + /* whether to reject core/ops filters umatched regions */ + bool core_filters_default_reject; + bool ops_filters_default_reject; /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; -- cgit v1.2.3 From 9bbe033c75a56d72fc35e7c8ca6f3258d9782fa5 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:29 +0000 Subject: mm: zpool: add interfaces for object read/write APIs Patch series "Switch zswap to object read/write APIs". This patch series updates zswap to use the new object read/write APIs defined by zsmalloc in [1], and remove the old object mapping APIs and the related code from zpool and zsmalloc. This patch (of 5): Zsmalloc introduced new APIs to read/write objects besides mapping them. Add the necessary zpool interfaces. Link: https://lkml.kernel.org/r/20250305061134.4105762-1-yosry.ahmed@linux.dev Link: https://lkml.kernel.org/r/20250305061134.4105762-2-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/zpool.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 5e6dc46b8cc4..1784e735ee04 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -52,6 +52,16 @@ void *zpool_map_handle(struct zpool *pool, unsigned long handle, void zpool_unmap_handle(struct zpool *pool, unsigned long handle); + +void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, + void *local_copy); + +void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, + void *handle_mem); + +void zpool_obj_write(struct zpool *zpool, unsigned long handle, + void *handle_mem, size_t mem_len); + u64 zpool_get_total_pages(struct zpool *pool); @@ -90,6 +100,13 @@ struct zpool_driver { enum zpool_mapmode mm); void (*unmap)(void *pool, unsigned long handle); + void *(*obj_read_begin)(void *pool, unsigned long handle, + void *local_copy); + void (*obj_read_end)(void *pool, unsigned long handle, + void *handle_mem); + void (*obj_write)(void *pool, unsigned long handle, + void *handle_mem, size_t mem_len); + u64 (*total_pages)(void *pool); }; -- cgit v1.2.3 From fcbea574754c63f7035d0c4ef7dfb161b60b5bde Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:31 +0000 Subject: mm: zpool: remove object mapping APIs zpool_map_handle(), zpool_unmap_handle(), and zpool_can_sleep_mapped() are no longer used. Remove them with the underlying driver callbacks. Link: https://lkml.kernel.org/r/20250305061134.4105762-4-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/zpool.h | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'include') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 1784e735ee04..2c8a9d2654f6 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -13,25 +13,6 @@ struct zpool; -/* - * Control how a handle is mapped. It will be ignored if the - * implementation does not support it. Its use is optional. 
- * Note that this does not refer to memory protection, it - * refers to how the memory will be copied in/out if copying - * is necessary during mapping; read-write is the safest as - * it copies the existing memory in on map, and copies the - * changed memory back out on unmap. Write-only does not copy - * in the memory and should only be used for initialization. - * If in doubt, use ZPOOL_MM_DEFAULT which is read-write. - */ -enum zpool_mapmode { - ZPOOL_MM_RW, /* normal read-write mapping */ - ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */ - ZPOOL_MM_WO, /* write-only (no copy-in at map time) */ - - ZPOOL_MM_DEFAULT = ZPOOL_MM_RW -}; - bool zpool_has_pool(char *type); struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp); @@ -47,12 +28,6 @@ int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, void zpool_free(struct zpool *pool, unsigned long handle); -void *zpool_map_handle(struct zpool *pool, unsigned long handle, - enum zpool_mapmode mm); - -void zpool_unmap_handle(struct zpool *pool, unsigned long handle); - - void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, void *local_copy); @@ -95,11 +70,6 @@ struct zpool_driver { unsigned long *handle); void (*free)(void *pool, unsigned long handle); - bool sleep_mapped; - void *(*map)(void *pool, unsigned long handle, - enum zpool_mapmode mm); - void (*unmap)(void *pool, unsigned long handle); - void *(*obj_read_begin)(void *pool, unsigned long handle, void *local_copy); void (*obj_read_end)(void *pool, unsigned long handle, -- cgit v1.2.3 From 07864f1a57fb1f798a7d21f13e4929c9cb52daf7 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:32 +0000 Subject: mm: zsmalloc: remove object mapping APIs and per-CPU map areas zs_map_object() and zs_unmap_object() are no longer used, remove them. Since these are the only users of per-CPU mapping_areas, remove them and the associated CPU hotplug callbacks too. [yosry.ahmed@linux.dev: update the docs] Link: https://lkml.kernel.org/r/Z8ier-ZZp8T6MOTH@google.com Link: https://lkml.kernel.org/r/20250305061134.4105762-5-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Acked-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/cpuhotplug.h | 1 - include/linux/zsmalloc.h | 21 --------------------- 2 files changed, 22 deletions(-) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 6cc5e484547c..1987400000b4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -116,7 +116,6 @@ enum cpuhp_state { CPUHP_NET_IUCV_PREPARE, CPUHP_ARM_BL_PREPARE, CPUHP_TRACE_RB_PREPARE, - CPUHP_MM_ZS_PREPARE, CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 7d70983cf398..c26baf9fb331 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -16,23 +16,6 @@ #include -/* - * zsmalloc mapping modes - * - * NOTE: These only make a difference when a mapped object spans pages. - */ -enum zs_mapmode { - ZS_MM_RW, /* normal read-write mapping */ - ZS_MM_RO, /* read-only (no copy-out at unmap time) */ - ZS_MM_WO /* write-only (no copy-in at map time) */ - /* - * NOTE: ZS_MM_WO should only be used for initializing new - * (uninitialized) allocations. 
Partial writes to already - * initialized allocations should use ZS_MM_RW to preserve the - * existing data. - */ -}; - struct zs_pool_stats { /* How many pages were migrated (freed) */ atomic_long_t pages_compacted; @@ -48,10 +31,6 @@ void zs_free(struct zs_pool *pool, unsigned long obj); size_t zs_huge_class_size(struct zs_pool *pool); -void *zs_map_object(struct zs_pool *pool, unsigned long handle, - enum zs_mapmode mm); -void zs_unmap_object(struct zs_pool *pool, unsigned long handle); - unsigned long zs_get_total_pages(struct zs_pool *pool); unsigned long zs_compact(struct zs_pool *pool); -- cgit v1.2.3 From 7b60041156079f256a7df3f7de469de7db618580 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:33 +0000 Subject: mm: zpool: remove zpool_malloc_support_movable() zpool_malloc_support_movable() always returns true for zsmalloc, the only remaining zpool driver. Remove it and set the gfp flags in zswap_compress() accordingly. Opportunistically use GFP_NOWAIT instead of __GFP_NOWARN | __GFP_KSWAPD_RECLAIM for conciseness as they are equivalent. Link: https://lkml.kernel.org/r/20250305061134.4105762-6-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Thomas Gleixner Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/zpool.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 2c8a9d2654f6..52f30e526607 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -21,8 +21,6 @@ const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); -bool zpool_malloc_support_movable(struct zpool *pool); - int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, unsigned long *handle); @@ -65,7 +63,6 @@ struct zpool_driver { void *(*create)(const char *name, gfp_t gfp); void (*destroy)(void *pool); - bool malloc_support_movable; int (*malloc)(void *pool, size_t size, gfp_t gfp, unsigned long *handle); void (*free)(void *pool, unsigned long handle); -- cgit v1.2.3 From bba38b874886b227283d56ecb2f7ebbaa01a781d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 19 Feb 2025 14:41:14 +0100 Subject: rtc: pm8xxx: add support for uefi offset On many Qualcomm platforms the PMIC RTC control and time registers are read-only so that the RTC time can not be updated. Instead an offset needs be stored in some machine-specific non-volatile memory, which the driver can take into account. Add support for storing a 32-bit offset from the GPS time epoch in a UEFI variable so that the RTC time can be set on such platforms. The UEFI variable is 882f8c2b-9646-435f-8de5-f208ff80c1bd-RTCInfo and holds a 12-byte structure where the first four bytes is a GPS time offset in little-endian byte order. Note that this format is not arbitrary as the variable is shared with the UEFI firmware (and Windows). 
Tested-by: Jens Glathe Tested-by: Steev Klimaszewski Tested-by: Joel Stanley Tested-by: Sebastian Reichel # Lenovo T14s Gen6 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20250219134118.31017-3-johan+linaro@kernel.org Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 3f4d315aaec9..95da051fb155 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -170,6 +170,7 @@ struct rtc_device { /* useful timestamps */ #define RTC_TIMESTAMP_BEGIN_0000 -62167219200ULL /* 0000-01-01 00:00:00 */ #define RTC_TIMESTAMP_BEGIN_1900 -2208988800LL /* 1900-01-01 00:00:00 */ +#define RTC_TIMESTAMP_EPOCH_GPS 315964800LL /* 1980-01-06 00:00:00 */ #define RTC_TIMESTAMP_BEGIN_2000 946684800LL /* 2000-01-01 00:00:00 */ #define RTC_TIMESTAMP_END_2063 2966371199LL /* 2063-12-31 23:59:59 */ #define RTC_TIMESTAMP_END_2079 3471292799LL /* 2079-12-31 23:59:59 */ -- cgit v1.2.3 From 16b1936ae6d1858252413bf4bae8bcf247eb4b4c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 10 Mar 2025 07:49:34 +0000 Subject: lib/rbtree: add random seed Current test use pseudo rand function with fixed seed, which means the test data is the same pattern each time. Add random seed parameter to randomize the test. Link: https://lkml.kernel.org/r/20250310074938.26756-4-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- include/linux/types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/types.h b/include/linux/types.h index 1c509ce8f7f6..864ab701f297 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -92,6 +92,7 @@ typedef unsigned char unchar; typedef unsigned short ushort; typedef unsigned int uint; typedef unsigned long ulong; +typedef unsigned long long ullong; #ifndef __BIT_TYPES_DEFINED__ #define __BIT_TYPES_DEFINED__ -- cgit v1.2.3 From 19811285784f7f3d4abaa6e94623f2e7d10219d0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 10 Mar 2025 07:49:37 +0000 Subject: lib/interval_tree: skip the check before go to the right subtree The interval_tree_subtree_search() holds the loop invariant: start <= node->ITSUBTREE Let's say we have a following tree: node / \ left right So we know node->ITSUBTREE is contributed by one of the following: * left->ITSUBTREE * ITLAST(node) * right->ITSUBTREE When we come to the right node, we are sure the first two don't contribute to node->ITSUBTREE and it must be the right node does the job. So skip the check before go to the right subtree. 
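A concrete instance of the argument, with made-up numbers: suppose a query for [50, 60] reaches a node holding [10, 20] whose subtree maximum ITSUBTREE is 90 and whose left child's ITSUBTREE is 40. The loop invariant guarantees 50 <= 90. Since the left subtree (40) and the node's own last value (20) are both below 50, the 90 can only come from the right subtree, so the right child must exist and already satisfies start <= right->ITSUBTREE; re-checking it before descending is redundant.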
Link: https://lkml.kernel.org/r/20250310074938.26756-7-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- include/linux/interval_tree_generic.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index aaa8a0767aa3..1b400f26f63d 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -104,12 +104,8 @@ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ if (ITSTART(node) <= last) { /* Cond1 */ \ if (start <= ITLAST(node)) /* Cond2 */ \ return node; /* node is leftmost match */ \ - if (node->ITRB.rb_right) { \ - node = rb_entry(node->ITRB.rb_right, \ - ITSTRUCT, ITRB); \ - if (start <= node->ITSUBTREE) \ - continue; \ - } \ + node = rb_entry(node->ITRB.rb_right, ITSTRUCT, ITRB); \ + continue; \ } \ return NULL; /* No match */ \ } \ -- cgit v1.2.3 From 35a566a24e58f1b5f89737edf60b77de58719ed0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Feb 2025 18:14:26 -0500 Subject: NFSv4: Avoid unnecessary scans of filesystems for returning delegations The amount of looping through the list of delegations is occasionally leading to soft lockups. If the state manager was asked to return delegations asynchronously, it should only scan those filesystems that hold delegations that need to be returned. Fixes: af3b61bf6131 ("NFSv4: Clean up nfs_client_return_marked_delegations()") Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index f00bfcee7120..4e9ad6f6e907 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -250,6 +250,8 @@ struct nfs_server { struct list_head ss_copies; struct list_head ss_src_copies; + unsigned long delegation_flags; +#define NFS4SERV_DELEGRETURN (1) unsigned long delegation_gen; unsigned long mig_gen; unsigned long mig_status; -- cgit v1.2.3 From f163aa81a799e2d46d7f8f0b42a0e7770eaa0d06 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Feb 2025 19:03:21 -0500 Subject: NFSv4: Avoid unnecessary scans of filesystems for expired delegations The amount of looping through the list of delegations is occasionally leading to soft lockups. If the state manager was asked to reap the expired delegations, it should scan only those filesystems that hold delegations that need to be reaped. Fixes: 7f156ef0bf45 ("NFSv4: Clean up nfs_delegation_reap_expired()") Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 4e9ad6f6e907..7d6f164036fa 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -252,6 +252,7 @@ struct nfs_server { unsigned long delegation_flags; #define NFS4SERV_DELEGRETURN (1) +#define NFS4SERV_DELEGATION_EXPIRED (2) unsigned long delegation_gen; unsigned long mig_gen; unsigned long mig_status; -- cgit v1.2.3 From e767b59e29b8327d25edde65efc743f479f30d0a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Feb 2025 18:37:51 -0500 Subject: NFSv4: Avoid unnecessary scans of filesystems for delayed delegations The amount of looping through the list of delegations is occasionally leading to soft lockups. 
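The per-server delegation_flags bits added across these three patches enable a scan-skipping pattern along the following lines. This is a sketch only; the real users live in fs/nfs/delegation.c, which is outside these hunks, and the helper names here are made up:

    /* Mark a server when one of its delegations needs follow-up work... */
    static void example_mark_server(struct nfs_server *server)
    {
    	set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags);
    }

    /* ...so the state manager can skip servers with nothing to do. */
    static void example_scan(struct nfs_client *clp)
    {
    	struct nfs_server *server;

    	rcu_read_lock();
    	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
    		if (!test_and_clear_bit(NFS4SERV_DELEGRETURN,
    					&server->delegation_flags))
    			continue;
    		/* walk only this server's delegation list here */
    	}
    	rcu_read_unlock();
    }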
If the state manager was asked to manage the delayed return of delegations, then only scan those filesystems containing delegations that were marked as being delayed. Fixes: be20037725d1 ("NFSv4: Fix delegation return in cases where we have to retry") Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 7d6f164036fa..108862d81b57 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -253,6 +253,7 @@ struct nfs_server { unsigned long delegation_flags; #define NFS4SERV_DELEGRETURN (1) #define NFS4SERV_DELEGATION_EXPIRED (2) +#define NFS4SERV_DELEGRETURN_DELAYED (3) unsigned long delegation_gen; unsigned long mig_gen; unsigned long mig_status; -- cgit v1.2.3 From 8955e7ce61fb6ba28f88e0a38479c992991ba6ab Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 13 Jan 2025 10:32:39 -0500 Subject: NFS: Implement NFSv4.2's OFFLOAD_STATUS XDR Add XDR encoding and decoding functions for the NFSv4.2 OFFLOAD_STATUS operation. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Link: https://lore.kernel.org/r/20250113153235.48706-13-cel@kernel.org Signed-off-by: Trond Myklebust --- include/linux/nfs4.h | 1 + include/linux/nfs_xdr.h | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 9ac83ca88326..5fa60fe441b5 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -691,6 +691,7 @@ enum { NFSPROC4_CLNT_LISTXATTRS, NFSPROC4_CLNT_REMOVEXATTR, NFSPROC4_CLNT_READ_PLUS, + NFSPROC4_CLNT_OFFLOAD_STATUS, }; /* nfs41 types */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9155a6ffc370..dc5288111999 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1515,8 +1515,9 @@ struct nfs42_offload_status_args { struct nfs42_offload_status_res { struct nfs4_sequence_res osr_seq_res; - uint64_t osr_count; - int osr_status; + u64 osr_count; + int complete_count; + u32 osr_complete; }; struct nfs42_copy_notify_args { -- cgit v1.2.3 From 77dd8a302f56679178924f82a29eaf437857ea75 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 13 Jan 2025 10:32:40 -0500 Subject: NFS: Implement NFSv4.2's OFFLOAD_STATUS operation Enable the Linux NFS client to observe the progress of an offloaded asynchronous COPY operation. This new operation will be put to use in a subsequent patch. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Link: https://lore.kernel.org/r/20250113153235.48706-14-cel@kernel.org Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 108862d81b57..1711eefcf24f 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -293,6 +293,7 @@ struct nfs_server { #define NFS_CAP_CASE_INSENSITIVE (1U << 6) #define NFS_CAP_CASE_PRESERVING (1U << 7) #define NFS_CAP_REBOOT_LAYOUTRETURN (1U << 8) +#define NFS_CAP_OFFLOAD_STATUS (1U << 9) #define NFS_CAP_OPEN_XOR (1U << 12) #define NFS_CAP_DELEGTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) -- cgit v1.2.3 From e6fa3963a30d53427d33a577c5ba4bfd3373bc93 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:30:59 +1100 Subject: fs/dax: refactor wait for dax idle page A FS DAX page is considered idle when its refcount drops to one. 
This is currently open-coded in all file systems supporting FS DAX. Move the idle detection to a common function to make future changes easier. Link: https://lkml.kernel.org/r/c2c9d269110b90224eeb1dc661ffbc1d82aa20c9.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Acked-by: Theodore Ts'o Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chunyan Zhang Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index df41a0017b31..9b1ce984d410 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -207,6 +207,14 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); +static inline int dax_wait_page_idle(struct page *page, + void (cb)(struct inode *), + struct inode *inode) +{ + return ___wait_var_event(page, page_ref_count(page) == 1, + TASK_INTERRUPTIBLE, 0, 0, cb(inode)); +} + #if IS_ENABLED(CONFIG_DAX) int dax_read_lock(void); void dax_read_unlock(int id); -- cgit v1.2.3 From d5b3afea22a52517e6bc835432e6dfd079b8bf7c Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:00 +1100 Subject: fs/dax: create a common implementation to break DAX layouts Prior to freeing a block file systems supporting FS DAX must check that the associated pages are both unmapped from user-space and not undergoing DMA or other access from eg. get_user_pages(). This is achieved by unmapping the file range and scanning the FS DAX page-cache to see if any pages within the mapping have an elevated refcount. This is done using two functions - dax_layout_busy_page_range() which returns a page to wait for the refcount to become idle on. Rather than open-code this introduce a common implementation to both unmap and wait for the page to become idle. Link: https://lkml.kernel.org/r/c4d381e41fc618296cee2820403c166d80599d5c.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Dan Williams Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 9b1ce984d410..a6b277f1e13a 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -207,12 +207,9 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -static inline int dax_wait_page_idle(struct page *page, - void (cb)(struct inode *), - struct inode *inode) +static inline bool dax_page_is_idle(struct page *page) { - return ___wait_var_event(page, page_ref_count(page) == 1, - TASK_INTERRUPTIBLE, 0, 0, cb(inode)); + return page && page_ref_count(page) == 1; } #if IS_ENABLED(CONFIG_DAX) @@ -228,6 +225,15 @@ static inline void dax_read_unlock(int id) { } #endif /* CONFIG_DAX */ + +#if !IS_ENABLED(CONFIG_FS_DAX) +static inline int __must_check dax_break_layout(struct inode *inode, + loff_t start, loff_t end, void (cb)(struct inode *)) +{ + return 0; +} +#endif + bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, @@ -251,6 +257,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); +int __must_check dax_break_layout(struct inode *inode, loff_t start, + loff_t end, void (cb)(struct inode *)); +static inline int __must_check dax_break_layout_inode(struct inode *inode, + void (cb)(struct inode *)) +{ + return dax_break_layout(inode, 0, LLONG_MAX, cb); +} int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same, -- cgit v1.2.3 From bde708f1a65d025c45575bfe1e7bf7bdf7e71e87 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:01 +1100 Subject: fs/dax: always remove DAX page-cache entries when breaking layouts Prior to any truncation operations file systems call dax_break_mapping() to ensure pages in the range are not under going DMA. Later DAX page-cache entries will be removed by truncate_folio_batch_exceptionals() in the generic page-cache code. However this makes it possible for folios to be removed from the page-cache even though they are still DMA busy if the file-system hasn't called dax_break_mapping(). It also means they can never be waited on in future because FS DAX will lose track of them once the page-cache entry has been deleted. Instead it is better to delete the FS DAX entry when the file-system calls dax_break_mapping() as part of it's truncate operation. This ensures only idle pages can be removed from the FS DAX page-cache and makes it easy to detect if a file-system hasn't called dax_break_mapping() prior to a truncate operation. 
Link: https://lkml.kernel.org/r/3be6115eaaa8d28fee37fcba3287be4f226a7d24.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Dan Williams Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index a6b277f1e13a..2fbb262092ca 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -255,6 +255,8 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); +void dax_delete_mapping_range(struct address_space *mapping, + loff_t start, loff_t end); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); int __must_check dax_break_layout(struct inode *inode, loff_t start, -- cgit v1.2.3 From 0e2f80afcfa699ce722c01afc9286a942bd57211 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:02 +1100 Subject: fs/dax: ensure all pages are idle prior to filesystem unmount File systems call dax_break_mapping() prior to reallocating file system blocks to ensure the page is not undergoing any DMA or other accesses. Generally this is needed when a file is truncated to ensure that if a block is reallocated nothing is writing to it. However filesystems currently don't call this when an FS DAX inode is evicted. This can cause problems when the file system is unmounted as a page can continue to be under going DMA or other remote access after unmount. This means if the file system is remounted any truncate or other operation which requires the underlying file system block to be freed will not wait for the remote access to complete. Therefore a busy block may be reallocated to a new file leading to corruption. Link: https://lkml.kernel.org/r/2d3cf575bbd095084993154be2f0aa7442e5cd28.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 2fbb262092ca..2333c30f6d36 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -232,6 +232,10 @@ static inline int __must_check dax_break_layout(struct inode *inode, { return 0; } + +static inline void dax_break_layout_final(struct inode *inode) +{ +} #endif bool dax_alive(struct dax_device *dax_dev); @@ -266,6 +270,7 @@ static inline int __must_check dax_break_layout_inode(struct inode *inode, { return dax_break_layout(inode, 0, LLONG_MAX, cb); } +void dax_break_layout_final(struct inode *inode); int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same, -- cgit v1.2.3 From cbe298d82cf70aa6bb16cfc8ae4db522f39a0034 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:03 +1100 Subject: fs/dax: remove PAGE_MAPPING_DAX_SHARED mapping flag The page ->mapping pointer can have magic values like PAGE_MAPPING_DAX_SHARED and PAGE_MAPPING_ANON for page owner specific usage. Currently PAGE_MAPPING_DAX_SHARED and PAGE_MAPPING_ANON alias to the same value. This isn't a problem because FS DAX pages are never seen by the anonymous mapping code and vice versa. However a future change will make FS DAX pages more like normal pages, so folio_test_anon() must not return true for a FS DAX page. We could explicitly test for a FS DAX page in folio_test_anon(), etc. however the PAGE_MAPPING_DAX_SHARED flag isn't actually needed. Instead we can use the page->mapping field to implicitly track the first mapping of a page. If page->mapping is non-NULL it implies the page is associated with a single mapping at page->index. If the page is associated with a second mapping clear page->mapping and set page->share to 1. This is possible because a shared mapping implies the file-system implements dax_holder_operations which makes the ->mapping and ->index, which is a union with ->share, unused. The page is considered shared when page->mapping == NULL and page->share > 0 or page->mapping != NULL, implying it is present in at least one address space. This also makes it easier for a future change to detect when a page is first mapped into an address space which requires special handling. Link: https://lkml.kernel.org/r/c22f699202db0acee2f7039eb026e68261ce42d6.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Tested-by: Alison Schofield Cc: Asahi Lina Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Ted Ts'o Cc: Vishal Verma Cc: WANG Xuerui Cc: Will Deacon Cc: Alexander Gordeev Cc: Balbir Singh Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Jason Gunthorpe Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vivek Goyal Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 36d283552f80..cab382bd965e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -673,12 +673,6 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) -/* - * Different with flags above, this flag is used only for fsdax mode. It - * indicates that this page->mapping is now under reflink case. - */ -#define PAGE_MAPPING_DAX_SHARED ((void *)0x1) - static __always_inline bool folio_mapping_flags(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; -- cgit v1.2.3 From 82ba975e4c43d98afebced82d940ddb7aec42a9d Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:06 +1100 Subject: mm: allow compound zone device pages Zone device pages are used to represent various type of device memory managed by device drivers. Currently compound zone device pages are not supported. This is because MEMORY_DEVICE_FS_DAX pages are the only user of higher order zone device pages and have their own page reference counting. A future change will unify FS DAX reference counting with normal page reference counting rules and remove the special FS DAX reference counting. Supporting that requires compound zone device pages. Supporting compound zone device pages requires compound_head() to distinguish between head and tail pages whilst still preserving the special struct page fields that are specific to zone device pages. A tail page is distinguished by having bit zero being set in page->compound_head, with the remaining bits pointing to the head page. For zone device pages page->compound_head is shared with page->pgmap. The page->pgmap field must be common to all pages within a folio, even if the folio spans memory sections. Therefore pgmap is the same for both head and tail pages and can be moved into the folio and we can use the standard scheme to find compound_head from a tail page. Link: https://lkml.kernel.org/r/67055d772e6102accf85161d0b57b0b3944292bf.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Signed-off-by: Balbir Singh Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memremap.h | 6 +++--- include/linux/migrate.h | 4 ++-- include/linux/mm_types.h | 9 +++++++-- include/linux/mmzone.h | 12 +++++++++++- 4 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 3f7143ade32c..0256a4218dc3 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -161,7 +161,7 @@ static inline bool is_device_private_page(const struct page *page) { return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; + page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE; } static inline bool folio_is_device_private(const struct folio *folio) @@ -173,13 +173,13 @@ static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; + page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA; } static inline bool is_device_coherent_page(const struct page *page) { return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_COHERENT; + page_pgmap(page)->type == MEMORY_DEVICE_COHERENT; } static inline bool folio_is_device_coherent(const struct folio *folio) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 29919faea2f1..61899ec7a9a3 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -205,8 +205,8 @@ struct migrate_vma { unsigned long end; /* - * Set to the owner value also stored in page->pgmap->owner for - * migrating out of device private memory. The flags also need to + * Set to the owner value also stored in page_pgmap(page)->owner + * for migrating out of device private memory. The flags also need to * be set to MIGRATE_VMA_SELECT_DEVICE_PRIVATE. * The caller should always set this field when using mmu notifier * callbacks to avoid device MMU invalidations for device private diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a93abb4452b..0fa7907d437e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -130,8 +130,11 @@ struct page { unsigned long compound_head; /* Bit zero is set */ }; struct { /* ZONE_DEVICE pages */ - /** @pgmap: Points to the hosting device page map. */ - struct dev_pagemap *pgmap; + /* + * The first word is used for compound_head or folio + * pgmap + */ + void *_unused_pgmap_compound_head; void *zone_device_data; /* * ZONE_DEVICE private pages are counted as being @@ -300,6 +303,7 @@ typedef struct { * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. + * @pgmap: Metadata for ZONE_DEVICE mappings * @virtual: Virtual address in the kernel direct map. * @_last_cpupid: IDs of last CPU and last process that accessed the folio. * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). 
@@ -338,6 +342,7 @@ struct folio { /* private: */ }; /* public: */ + struct dev_pagemap *pgmap; }; struct address_space *mapping; pgoff_t index; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 44ecb2f90db4..550dbba92521 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1158,6 +1158,12 @@ static inline bool is_zone_device_page(const struct page *page) return page_zonenum(page) == ZONE_DEVICE; } +static inline struct dev_pagemap *page_pgmap(const struct page *page) +{ + VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page); + return page_folio(page)->pgmap; +} + /* * Consecutive zone device pages should not be merged into the same sgl * or bvec segment with other types of pages or if they belong to different @@ -1173,7 +1179,7 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, return false; if (!is_zone_device_page(a)) return true; - return a->pgmap == b->pgmap; + return page_pgmap(a) == page_pgmap(b); } extern void memmap_init_zone_device(struct zone *, unsigned long, @@ -1188,6 +1194,10 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, { return true; } +static inline struct dev_pagemap *page_pgmap(const struct page *page) +{ + return NULL; +} #endif static inline bool folio_is_zone_device(const struct folio *folio) -- cgit v1.2.3 From ec2e0cc67f9c3ed5926da390ad86b25c142a2e4a Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:08 +1100 Subject: mm/memory: add vmf_insert_page_mkwrite() Currently to map a DAX page the DAX driver calls vmf_insert_pfn. This creates a special devmap PTE entry for the pfn but does not take a reference on the underlying struct page for the mapping. This is because DAX page refcounts are treated specially, as indicated by the presence of a devmap entry. To allow DAX page refcounts to be managed the same as normal page refcounts introduce vmf_insert_page_mkwrite(). This will take a reference on the underlying page much the same as vmf_insert_page, except it also permits upgrading an existing mapping to be writable if requested/possible. Link: https://lkml.kernel.org/r/4ce3aa984c060f370105e0bfef1035869578be47.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9a74a3ee68bc..c6fb9300f6c0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3644,6 +3644,8 @@ int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num); +vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, + bool write); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, -- cgit v1.2.3 From 349994cf61e6eaa5996d53e45ffd2272d32d5e4e Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:09 +1100 Subject: mm/rmap: add support for PUD sized mappings to rmap The rmap doesn't currently support adding a PUD mapping of a folio. This patch adds support for entire PUD mappings of folios, primarily to allow for more standard refcounting of device DAX folios. Currently DAX is the only user of this and it doesn't require support for partially mapped PUD-sized folios so we don't support for that for now. Link: https://lkml.kernel.org/r/248582c07896e30627d1aeaeebc6949cfd91b851.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Reviewed-by: Dan Williams Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/rmap.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 69e9a431a40e..6abf7960077a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -192,6 +192,7 @@ typedef int __bitwise rmap_t; enum rmap_level { RMAP_LEVEL_PTE = 0, RMAP_LEVEL_PMD, + RMAP_LEVEL_PUD, }; static inline void __folio_rmap_sanity_checks(const struct folio *folio, @@ -228,6 +229,14 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); break; + case RMAP_LEVEL_PUD: + /* + * Assume that we are creating a single "entire" mapping of the + * folio. 
+ */ + VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio); + VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); + break; default: VM_WARN_ON_ONCE(true); } @@ -251,12 +260,16 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, folio_add_file_rmap_ptes(folio, page, 1, vma) void folio_add_file_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); +void folio_add_file_rmap_pud(struct folio *, struct page *, + struct vm_area_struct *); void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_remove_rmap_pte(folio, page, vma) \ folio_remove_rmap_ptes(folio, page, 1, vma) void folio_remove_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); +void folio_remove_rmap_pud(struct folio *, struct page *, + struct vm_area_struct *); void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); @@ -341,6 +354,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: + case RMAP_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); atomic_inc(&folio->_large_mapcount); break; @@ -437,6 +451,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: + case RMAP_LEVEL_PUD: if (PageAnonExclusive(page)) { if (unlikely(maybe_pinned)) return -EBUSY; -- cgit v1.2.3 From dbe54153296d0931144a63295fc9367794bbe797 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:10 +1100 Subject: mm/huge_memory: add vmf_insert_folio_pud() Currently DAX folio/page reference counts are managed differently to normal pages. To allow these to be managed the same as normal pages introduce vmf_insert_folio_pud. This will map the entire PUD-sized folio and take references as it would for a normally mapped page. This is distinct from the current mechanism, vmf_insert_pfn_pud, which simply inserts a special devmap PUD entry into the page table without holding a reference to the page for the mapping. Link: https://lkml.kernel.org/r/649a1ef91d556593948351e94f51ef73a14f6794.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Dan Williams Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e1bea54820ff..4eeb54b50621 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -39,6 +39,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); +vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, + bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_UNSUPPORTED, -- cgit v1.2.3 From 6c88f72691f88fd1fc493a3c2eed97f91b82b3f0 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:11 +1100 Subject: mm/huge_memory: add vmf_insert_folio_pmd() Currently DAX folio/page reference counts are managed differently to normal pages. To allow these to be managed the same as normal pages introduce vmf_insert_folio_pmd. This will map the entire PMD-sized folio and take references as it would for a normally mapped page. This is distinct from the current mechanism, vmf_insert_pfn_pmd, which simply inserts a special devmap PMD entry into the page table without holding a reference to the page for the mapping. It is not currently useful to implement a more generic vmf_insert_folio() which selects the correct behaviour based on folio_order(). This is because PTE faults require only a subpage of the folio to be PTE mapped rather than the entire folio. It would be possible to add this context somewhere but callers already need to handle PTE faults and PMD faults separately so a more generic function is not useful. Link: https://lkml.kernel.org/r/7bf92a2e68225d13ea368d53bbfee327314d1c40.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4eeb54b50621..e57e811cfd3c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -39,6 +39,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); +vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, + bool write); vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, bool write); -- cgit v1.2.3 From e5cb23256347469c80138a2a8804887239465d0b Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:12 +1100 Subject: mm/gup: don't allow FOLL_LONGTERM pinning of FS DAX pages Longterm pinning of FS DAX pages should already be disallowed by various pXX_devmap checks. However a future change will cause these checks to be invalid for FS DAX pages so make folio_is_longterm_pinnable() return false for FS DAX pages. Link: https://lkml.kernel.org/r/250a31876704b79f7c65b159f3c835e547f052df.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: John Hubbard Reviewed-by: Dan Williams Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memremap.h | 11 +++++++++++ include/linux/mm.h | 7 +++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 0256a4218dc3..4aa151914eab 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -187,6 +187,17 @@ static inline bool folio_is_device_coherent(const struct folio *folio) return is_device_coherent_page(&folio->page); } +static inline bool is_fsdax_page(const struct page *page) +{ + return is_zone_device_page(page) && + page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX; +} + +static inline bool folio_is_fsdax(const struct folio *folio) +{ + return is_fsdax_page(&folio->page); +} + #ifdef CONFIG_ZONE_DEVICE void zone_device_page_init(struct page *page); void *memremap_pages(struct dev_pagemap *pgmap, int nid); diff --git a/include/linux/mm.h b/include/linux/mm.h index c6fb9300f6c0..009484a07074 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2115,6 +2115,13 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio) if (folio_is_device_coherent(folio)) return false; + /* + * Filesystems can only tolerate transient delays to truncate and + * hole-punch operations + */ + if (folio_is_fsdax(folio)) + return false; + /* Otherwise, non-movable zone folios can be pinned. */ return !folio_is_zone_movable(folio); -- cgit v1.2.3 From 38607c62b34b46317c46d5baf1df03ac6e48a1c6 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:14 +1100 Subject: fs/dax: properly refcount fs dax pages Currently fs dax pages are considered free when the refcount drops to one and their refcounts are not increased when mapped via PTEs or decreased when unmapped. This requires special logic in mm paths to detect that these pages should not be properly refcounted, and to detect when the refcount drops to one instead of zero. On the other hand get_user_pages(), etc. will properly refcount fs dax pages by taking a reference and dropping it when the page is unpinned. Tracking this special behaviour requires extra PTE bits (eg. pte_devmap) and introduces rules that are potentially confusing and specific to FS DAX pages. To fix this, and to possibly allow removal of the special PTE bits in future, convert the fs dax page refcounts to be zero based and instead take a reference on the page each time it is mapped as is currently the case for normal pages. This may also allow a future clean-up to remove the pgmap refcounting that is currently done in mm/gup.c. Link: https://lkml.kernel.org/r/c7d886ad7468a20452ef6e0ddab6cfe220874e7c.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Dan Williams Tested-by: Alison Schofield Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 2 +- include/linux/mm.h | 27 ++------------------------- include/linux/mm_types.h | 7 ++++++- 3 files changed, 9 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 2333c30f6d36..dcc9fcdf14e4 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -209,7 +209,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, static inline bool dax_page_is_idle(struct page *page) { - return page && page_ref_count(page) == 1; + return page && page_ref_count(page) == 0; } #if IS_ENABLED(CONFIG_DAX) diff --git a/include/linux/mm.h b/include/linux/mm.h index 009484a07074..de008efd96aa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1192,6 +1192,8 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; +extern void prep_compound_page(struct page *page, unsigned int order); + /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be @@ -1513,25 +1515,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf); * back into memory. */ -#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) -DECLARE_STATIC_KEY_FALSE(devmap_managed_key); - -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs); -static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) -{ - if (!static_branch_unlikely(&devmap_managed_key)) - return false; - if (!folio_is_zone_device(folio)) - return false; - return __put_devmap_managed_folio_refs(folio, refs); -} -#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ -static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) -{ - return false; -} -#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ - /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) @@ -1652,12 +1635,6 @@ static inline void put_page(struct page *page) if (folio_test_slab(folio)) return; - /* - * For some devmap managed pages we need to catch refcount transition - * from 2 to 1: - */ - if (put_devmap_managed_folio_refs(folio, 1)) - return; folio_put(folio); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0fa7907d437e..b1827d78ff89 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -296,6 +296,8 @@ typedef struct { * anonymous memory. * @index: Offset within the file, in units of pages. For anonymous memory, * this is the index from the beginning of the mmap. + * @share: number of DAX mappings that reference this folio. See + * dax_associate_entry. * @private: Filesystem per-folio data (see folio_attach_private()). * @swap: Used for swp_entry_t if folio_test_swapcache(). * @_mapcount: Do not access this member directly. 
Use folio_mapcount() to @@ -345,7 +347,10 @@ struct folio { struct dev_pagemap *pgmap; }; struct address_space *mapping; - pgoff_t index; + union { + pgoff_t index; + unsigned long share; + }; union { void *private; swp_entry_t swap; -- cgit v1.2.3 From 6220ea5583e977e1eeea06c237cc440e5ac3de46 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:54 +0100 Subject: mm: factor out large folio handling from folio_order() into folio_large_order() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: MM owner tracking for large folios (!hugetlb) + CONFIG_NO_PAGE_MAPCOUNT", v3. Let's add an "easy" way to decide -- without false positives, without page-mapcounts and without page table/rmap scanning -- whether a large folio is "certainly mapped exclusively" into a single MM, or whether it "maybe mapped shared" into multiple MMs. Use that information to implement Copy-on-Write reuse, to convert folio_likely_mapped_shared() to folio_maybe_mapped_share(), and to introduce a kernel config option that lets us not use+maintain per-page mapcounts in large folios anymore. The bigger picture was presented at LSF/MM [1]. This series is effectively a follow-up on my early work [2], which implemented a more precise, but also more complicated, way to identify whether a large folio is "mapped shared" into multiple MMs or "mapped exclusively" into a single MM. 1 Patch Organization ==================== Patch #1 -> #6: make more room in order-1 folios, so we have two "unsigned long" available for our purposes Patch #7 -> #11: preparations Patch #12: MM owner tracking for large folios Patch #13: COW reuse for PTE-mapped anon THP Patch #14: folio_maybe_mapped_shared() Patch #15 -> #20: introduce and implement CONFIG_NO_PAGE_MAPCOUNT 2 MM owner tracking =================== We assign each MM a unique ID ("MM ID"), to be able to squeeze more information in our folios. On 32bit we use 15-bit IDs, on 64bit we use 31-bit IDs. For each large folios, we now store two MM-ID+mapcount ("slot") combinations: * mm0_id + mm0_mapcount * mm1_id + mm1_mapcount On 32bit, we use a 16-bit per-MM mapcount, on 64bit an ordinary 32bit mapcount. This way, we require 2x "unsigned long" on 32bit and 64bit for both slots. Paired with the large mapcount, we can reliably identify whether one of these MMs is the current owner (-> owns all mappings) or even holds all folio references (-> owns all mappings, and all references are from mappings). As long as only two MMs map folio pages at a time, we can reliably and precisely identify whether a large folio is "mapped shared" or "mapped exclusively". Any additional MM that starts mapping the folio while there are no free slots becomes an "untracked MM". If one such "untracked MM" is the last one mapping a folio exclusively, we will not detect the folio as "mapped exclusively" but instead as "maybe mapped shared". (exception: only a single mapping remains) So that's where the approach gets imprecise. For now, we use a bit-spinlock to sync the large mapcount + slots, and make sure we do keep the machinery fast, to not degrade (un)map performance drastically: for example, we make sure to only use a single atomic (when grabbing the bit-spinlock), like we would already perform when updating the large mapcount. 3 CONFIG_NO_PAGE_MAPCOUNT ========================= patch #15 -> #20 spell out and document what exactly is affected when not maintaining the per-page mapcounts in large folios anymore. 
Most importantly, as we cannot maintain folio->_nr_pages_mapped anymore when (un)mapping pages, we'll account a complete folio as mapped if a single page is mapped. In addition, we'll not detect partially mapped anonymous folios as such in all cases yet. Likely less relevant changes include that we might now under-estimate the USS (Unique Set Size) of a process, but never over-estimate it. The goal is to make CONFIG_NO_PAGE_MAPCOUNT the default at some point, to then slowly make it the only option, as we learn about real-life impacts and possible ways to mitigate them. 4 Performance ============= Detailed performance numbers were included in v1 [3], and not that much changed between v1 and v2. I did plenty of measurements on different systems in the meantime, which all revealed slightly different results. The pte-mapped-folio micro-benchmarks [4] are fairly sensitive to code layout changes on some systems. Especially the fork() benchmark started being more-shaky-than-before on recent kernels for some reason. In summary, with my micro-benchmarks: * Small folios are not impacted. * CoW performance seems to be mostly unchanged across all folio sizes. * CoW reuse performance of large folios now matches CoW reuse performance of small folios, because we now actually implement the CoW reuse optimization. On an Intel Xeon Silver 4210R I measured a ~65% reduction in runtime, on an arm64 system I measured ~54% reduction. * munmap() performance improves with CONFIG_NO_PAGE_MAPCOUNT. I saw double-digit % reduction (up to ~30% on an Intel Xeon Silver 4210R and up to ~70% on an AmpereOne A192-32X) with larger folios. The larger the folios, the larger the performance improvement. * munmap() performance very slightly (a couple of percent) degrades without CONFIG_NO_PAGE_MAPCOUNT for smaller folios. For larger folios, there seems to be no change at all. * fork() performance improves with CONFIG_NO_PAGE_MAPCOUNT. I saw double-digit % reduction (up to ~20% on an Intel Xeon Silver 4210R and up to ~10% on an AmpereOne A192-32X) with larger folios. The larger the folios, the larger the performance improvement. * While fork() performance without CONFIG_NO_PAGE_MAPCOUNT seems to be almost unchanged on some systems, I saw some degradation for smaller folios on the AmpereOne A192-32X. I did not investigate the details yet, but I suspect code layout changes or suboptimal code placement / inlining. I'm not too worried about the fork() micro-benchmarks for smaller folios given how shaky the results are lately and by how much we improved fork() performance recently. I also ran case-anon-cow-rand and case-anon-cow-seq, part of vm-scalability, to assess the scalability and the impact of the bit-spinlock. My measurements on a 2-socket 10-core Intel Xeon Silver 4210R CPU revealed no significant changes. Similarly, running these benchmarks with 2 MiB THPs enabled on the AmpereOne A192-32X with 192 cores, I got < 1% difference with < 1% stdev, which is nice. So far, I did not get my hands on a similarly large system with multiple sockets. I found no other fitting scalability benchmarks that seem to really hammer on concurrent mapping/unmapping of large folio pages like case-anon-cow-seq does. 5 Concerns ========== 5.1 Bit spinlock ---------------- I'm not quite happy about the bit-spinlock, but so far it does not seem to affect scalability in my measurements.
If it ever becomes a problem we could either investigate improving the locking, or simply stopping the MM tracking once there are "too many mappings" and simply assume that the folio is "mapped shared" until it was freed. This would be similar (but slightly different) to the "0,1,2,stopped" counting idea Willy had at some point. Adding that logic to "stop tracking" adds more code to the hot path, so I avoided that for now. 5.2 folio_maybe_mapped_shared() ------------------------------- I documented the change from folio_likely_mapped_shared() to folio_maybe_mapped_shared() quite extensively. If we run into surprises, I have some ideas on how to resolve them. For now, I think we should be fine. 5.3 Added code to map/unmap hot path ------------------------------------ So far, it looks like the added code on the rmap hot path does not really seem to matter much in the bigger picture. I'd like to further reduce it (and possibly improve fork() performance further), but I don't easily see how right now. Well, and I am out of puff 🙂 Having that said, alternatives I considered (e.g., per-MM per-folio mapcount) would add a lot more overhead to these hot paths. 6 Future Work ============= 6.1 Large mapcount ------------------ It would be very handy if the large mapcount would count how often folio pages are actually mapped into page tables: a PMD on x86-64 would count 512 times. Calculating the average per-page mapcount will be easy, and remapping (PMD->PTE) folios would get even faster. That would also remove the need for the entire mapcount (except for PMD-sized folios for memory statistics reasons ...), and allow for mapping folios larger than PMDs (e.g., 4 MiB) easily. We likely would also have to take the same number of folio references to make our folio_mapcount() == folio_ref_count() work, and we'd want to be able to avoid mapcount+refcount overflows: this could already become an issue with pte-mapped PUD-sized folios (fsdax). One approach we discussed in the THP cabal meeting is (1) extending the mapcount for large folios to 64bit (at least on 64bit systems) and (2) keeping the refcount at 32bit, but (3) having exactly one reference if the the mapcount != 0. It should be doable, but there are some corner cases to consider on the unmap path; it is something that I will be looking into next. 6.2 hugetlb ----------- I'd love to make use of the same tracking also for hugetlb. The real problem is PMD table sharing: getting a page mapped by MM X and unmapped by MM Y will not work. With mshare, that problem should not exist (all mapping/unmapping will be routed through the mshare MM). [1] https://lwn.net/Articles/974223/ [2] https://lore.kernel.org/linux-mm/a9922f58-8129-4f15-b160-e0ace581bcbe@redhat.com/T/ [3] https://lkml.kernel.org/r/20240829165627.2256514-1-david@redhat.com [4] https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/pte-mapped-folio-benchmarks.c This patch (of 20): Let's factor it out into a simple helper function. This helper will also come in handy when working with code where we know that our folio is large. Maybe in the future we'll have the order readily available for small and large folios; in that case, folio_large_order() would simply translate to folio_order(). Link: https://lkml.kernel.org/r/20250303163014.1128035-1-david@redhat.com Link: https://lkml.kernel.org/r/20250303163014.1128035-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lance Yang Reviewed-by: Kirill A. 
Shutemov Cc: Thomas Gleixner Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: David Hildenbrand Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index de008efd96aa..e5cdbb94ef81 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1194,6 +1194,11 @@ struct inode; extern void prep_compound_page(struct page *page, unsigned int order); +static inline unsigned int folio_large_order(const struct folio *folio) +{ + return folio->_flags_1 & 0xff; +} + /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be @@ -1207,7 +1212,7 @@ static inline unsigned int compound_order(struct page *page) if (!test_bit(PG_head, &folio->flags)) return 0; - return folio->_flags_1 & 0xff; + return folio_large_order(folio); } /** @@ -1223,7 +1228,7 @@ static inline unsigned int folio_order(const struct folio *folio) { if (!folio_test_large(folio)) return 0; - return folio->_flags_1 & 0xff; + return folio_large_order(folio); } #include @@ -2145,7 +2150,7 @@ static inline long folio_nr_pages(const struct folio *folio) #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else - return 1L << (folio->_flags_1 & 0xff); + return 1L << folio_large_order(folio); #endif } @@ -2170,7 +2175,7 @@ static inline unsigned long compound_nr(struct page *page) #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else - return 1L << (folio->_flags_1 & 0xff); + return 1L << folio_large_order(folio); #endif } -- cgit v1.2.3 From 1ea5212aed0682ae1249cba9df7ad7ef539d0a7d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:55 +0100 Subject: mm: factor out large folio handling from folio_nr_pages() into folio_large_nr_pages() Let's factor it out into a simple helper function. This helper will also come in handy when working with code where we know that our folio is large. While at it, let's consistently return a "long" value from all these similar functions. Note that we cannot use "unsigned int" (even though _folio_nr_pages is of that type), because it would break some callers that do stuff like "-folio_nr_pages()". Both "int" or "unsigned long" would work as well. Maybe in the future we'll have the nr_pages readily available for all large folios, maybe even for small folios, or maybe for none. Link: https://lkml.kernel.org/r/20250303163014.1128035-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Kirill A. 
Shutemov Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e5cdbb94ef81..cf892c08a88c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1199,6 +1199,18 @@ static inline unsigned int folio_large_order(const struct folio *folio) return folio->_flags_1 & 0xff; } +#ifdef CONFIG_64BIT +static inline long folio_large_nr_pages(const struct folio *folio) +{ + return folio->_folio_nr_pages; +} +#else +static inline long folio_large_nr_pages(const struct folio *folio) +{ + return 1L << folio_large_order(folio); +} +#endif + /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be @@ -2147,11 +2159,7 @@ static inline long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; -#ifdef CONFIG_64BIT - return folio->_folio_nr_pages; -#else - return 1L << folio_large_order(folio); -#endif + return folio_large_nr_pages(folio); } /* Only hugetlbfs can allocate folios larger than MAX_ORDER */ @@ -2166,24 +2174,20 @@ static inline long folio_nr_pages(const struct folio *folio) * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ -static inline unsigned long compound_nr(struct page *page) +static inline long compound_nr(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 1; -#ifdef CONFIG_64BIT - return folio->_folio_nr_pages; -#else - return 1L << folio_large_order(folio); -#endif + return folio_large_nr_pages(folio); } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. */ -static inline int thp_nr_pages(struct page *page) +static inline long thp_nr_pages(struct page *page) { return folio_nr_pages((struct folio *)page); } -- cgit v1.2.3 From 4996fc547f5b49f4a43c261dfadb02cf165cdb51 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:56 +0100 Subject: mm: let _folio_nr_pages overlay memcg_data in first tail page Let's free up some more of the "unconditionally available on 64BIT" space in order-1 folios by letting _folio_nr_pages overlay memcg_data in the first tail page (second folio page). Consequently, we have the optimization now whenever we have CONFIG_MEMCG, independent of 64BIT. We have to make sure that page->memcg on tail pages does not return "surprises". page_memcg_check() already properly refuses PageTail(). Let's do that earlier in print_page_owner_memcg() to avoid printing wrong "Slab cache page" information. No other code should touch that field on tail pages of compound pages. Reset the "_nr_pages" to 0 when splitting folios, or when freeing them back to the buddy (to avoid false page->memcg_data "bad page" reports). Note that in __split_huge_page(), folio_nr_pages() would stop working already as soon as we start messing with the subpages. Most kernel configs should have at least CONFIG_MEMCG enabled, even if disabled at runtime. 64byte "struct memmap" is what we usually have on 64BIT. 
While at it, rename "_folio_nr_pages" to "_nr_pages". Hopefully memdescs / dynamically allocating "strut folio" in the future will further clean this up, e.g., making _nr_pages available in all configs and maybe even in small folios. Doing that should be fairly easy on top of this change. [david@redhat.com: make "make htmldoc" happy] Link: https://lkml.kernel.org/r/a97f8a91-ec41-4796-81e3-7c9e0e491ba4@redhat.com Link: https://lkml.kernel.org/r/20250303163014.1128035-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Kirill A. Shutemov Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- include/linux/mm_types.h | 32 ++++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index cf892c08a88c..c9c2ca345350 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1199,10 +1199,10 @@ static inline unsigned int folio_large_order(const struct folio *folio) return folio->_flags_1 & 0xff; } -#ifdef CONFIG_64BIT +#ifdef NR_PAGES_IN_LARGE_FOLIO static inline long folio_large_nr_pages(const struct folio *folio) { - return folio->_folio_nr_pages; + return folio->_nr_pages; } #else static inline long folio_large_nr_pages(const struct folio *folio) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b1827d78ff89..f3b519fbeca1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -287,6 +287,11 @@ typedef struct { unsigned long val; } swp_entry_t; +#if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT) +/* We have some extra room after the refcount in tail pages. */ +#define NR_PAGES_IN_LARGE_FOLIO +#endif + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. @@ -312,7 +317,7 @@ typedef struct { * @_large_mapcount: Do not use directly, call folio_mapcount(). * @_nr_pages_mapped: Do not use outside of rmap and debug code. * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). - * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). + * @_nr_pages: Do not use directly, call folio_nr_pages(). * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. 
@@ -376,14 +381,23 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; + union { + struct { /* public: */ - atomic_t _large_mapcount; - atomic_t _entire_mapcount; - atomic_t _nr_pages_mapped; - atomic_t _pincount; -#ifdef CONFIG_64BIT - unsigned int _folio_nr_pages; -#endif + atomic_t _large_mapcount; + atomic_t _entire_mapcount; + atomic_t _nr_pages_mapped; + atomic_t _pincount; + /* private: the union with struct page is transitional */ + }; + unsigned long _usable_1[4]; + }; + atomic_t _mapcount_1; + atomic_t _refcount_1; + /* public: */ +#ifdef NR_PAGES_IN_LARGE_FOLIO + unsigned int _nr_pages; +#endif /* NR_PAGES_IN_LARGE_FOLIO */ /* private: the union with struct page is transitional */ }; struct page __page_1; @@ -435,6 +449,8 @@ FOLIO_MATCH(_last_cpupid, _last_cpupid); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); +FOLIO_MATCH(_mapcount, _mapcount_1); +FOLIO_MATCH(_refcount, _refcount_1); #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ -- cgit v1.2.3 From 4eeec8c89a0c4a8c20fb13a4e7093cc8efce383d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:57 +0100 Subject: mm: move hugetlb specific things in folio to page[3] Let's just move the hugetlb specific stuff to a separate page, and stop letting it overlay other fields for now. This frees up some space in page[2], which we will use on 32bit to free up some space in page[1]. While we could move these things to page[3] instead, it's cleaner to just move the hugetlb specific things out of the way and pack the core-folio stuff as tight as possible. ... and we can minimize the work required in dump_folio. We can now avoid re-initializing &folio->_deferred_list in hugetlb code. Hopefully dynamically allocating "struct folio" in the future will further clean this up. Link: https://lkml.kernel.org/r/20250303163014.1128035-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. 
Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f3b519fbeca1..727322ecbfdd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -407,20 +407,23 @@ struct folio { unsigned long _flags_2; unsigned long _head_2; /* public: */ - void *_hugetlb_subpool; - void *_hugetlb_cgroup; - void *_hugetlb_cgroup_rsvd; - void *_hugetlb_hwpoison; + struct list_head _deferred_list; /* private: the union with struct page is transitional */ }; + struct page __page_2; + }; + union { struct { - unsigned long _flags_2a; - unsigned long _head_2a; + unsigned long _flags_3; + unsigned long _head_3; /* public: */ - struct list_head _deferred_list; + void *_hugetlb_subpool; + void *_hugetlb_cgroup; + void *_hugetlb_cgroup_rsvd; + void *_hugetlb_hwpoison; /* private: the union with struct page is transitional */ }; - struct page __page_2; + struct page __page_3; }; }; @@ -457,8 +460,12 @@ FOLIO_MATCH(_refcount, _refcount_1); offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); -FOLIO_MATCH(flags, _flags_2a); -FOLIO_MATCH(compound_head, _head_2a); +#undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + 3 * sizeof(struct page)) +FOLIO_MATCH(flags, _flags_3); +FOLIO_MATCH(compound_head, _head_3); #undef FOLIO_MATCH /** -- cgit v1.2.3 From 31a31da8a6187f1e5448ec73222e01d7d3fed4aa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:58 +0100 Subject: mm: move _pincount in folio to page[2] on 32bit Let's free up some space on 32bit in page[1] by moving the _pincount to page[2]. For order-1 folios (never anon folios!) on 32bit, we will now also use the GUP_PIN_COUNTING_BIAS approach. A fully-mapped order-1 folio requires 2 references. With GUP_PIN_COUNTING_BIAS being 1024, we'd detect such folios as "maybe pinned" with 512 full mappings, instead of 1024 for order-0. As anon folios are out of the picture (which are the most relevant users of checking for pinnings on *mapped* pages) and we are talking about 32bit, this is not expected to cause any trouble. In __dump_page(), copy one additional folio page if we detect a folio with an order > 1, so we can dump the pincount on order > 1 folios reliably. Note that THPs on 32bit are not particularly common (and we don't care too much about performance), but we want to keep it working reliably, because likely we want to use large folios there as well in the future, independent of PMD leaf support. Once we dynamically allocate "struct folio", fortunately the 32bit specifics will likely go away again; even small folios could then have a pincount and folio_has_pincount() would essentially always return "true". Link: https://lkml.kernel.org/r/20250303163014.1128035-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. 
Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 +++++++++-- include/linux/mm_types.h | 5 +++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c9c2ca345350..860082ba8978 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2010,6 +2010,13 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } +static inline bool folio_has_pincount(const struct folio *folio) +{ + if (IS_ENABLED(CONFIG_64BIT)) + return folio_test_large(folio); + return folio_order(folio) > 1; +} + /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. @@ -2026,7 +2033,7 @@ static inline struct folio *pfn_folio(unsigned long pfn) * get that many refcounts, and b) all the callers of this routine are * expected to be able to deal gracefully with a false positive. * - * For large folios, the result will be exactly correct. That's because + * For most large folios, the result will be exactly correct. That's because * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * @@ -2037,7 +2044,7 @@ static inline struct folio *pfn_folio(unsigned long pfn) */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { - if (folio_test_large(folio)) + if (folio_has_pincount(folio)) return atomic_read(&folio->_pincount) > 0; /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 727322ecbfdd..3ea2019a1aac 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -387,7 +387,9 @@ struct folio { atomic_t _large_mapcount; atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; +#ifdef CONFIG_64BIT atomic_t _pincount; +#endif /* CONFIG_64BIT */ /* private: the union with struct page is transitional */ }; unsigned long _usable_1[4]; @@ -408,6 +410,9 @@ struct folio { unsigned long _head_2; /* public: */ struct list_head _deferred_list; +#ifndef CONFIG_64BIT + atomic_t _pincount; +#endif /* !CONFIG_64BIT */ /* private: the union with struct page is transitional */ }; struct page __page_2; -- cgit v1.2.3 From 845d2be6d41f016da670dcc4c8f5357c22172be8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:59 +0100 Subject: mm: move _entire_mapcount in folio to page[2] on 32bit Let's free up some space on 32bit in page[1] by moving the _entire_mapcount to page[2]. Ordinary folios only use the entire mapcount with PMD mappings, so order-1 folios don't apply. Similarly, hugetlb folios are always larger than order-1, turning the entire mapcount essentially unused for all order-1 folios. Moving it out of page[1] therefore changes nothing for order-1 folios. On 32bit, simply check in folio_entire_mapcount() whether we have an order-1 folio, and return 0 in that case. Note that THPs on 32bit are not particularly common (and we don't care too much about performance), but we want to keep it working reliably, because likely we want to use large folios there as well in the future, independent of PMD leaf support. Once we dynamically allocate "struct folio", the 32bit specifics will go away again; even small folios could then have a pincount. 
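To make the GUP_PIN_COUNTING_BIAS trade-off described for the _pincount move concrete, here is a rough editorial sketch (not part of either patch; it only assumes the mainline bias value of 1024):

/*
 * Sketch only, not from the patches above: how many full mappings of a
 * folio can coexist with zero pins before the refcount-bias fallback in
 * folio_maybe_dma_pinned() could report a false positive.
 * Assumes GUP_PIN_COUNTING_BIAS == 1024, as in mainline.
 */
static unsigned int example_bias_false_positive_threshold(unsigned int order)
{
	/* each full mapping of an order-N folio contributes 1 << N references */
	return 1024 / (1U << order);	/* 1024 for order-0, 512 for order-1 */
}
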
Link: https://lkml.kernel.org/r/20250303163014.1128035-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ include/linux/mm_types.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 860082ba8978..f366c180f2b6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1333,6 +1333,8 @@ static inline int is_vmalloc_or_module_addr(const void *x) static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1)) + return 0; return atomic_read(&folio->_entire_mapcount) + 1; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3ea2019a1aac..9499eb8e8e66 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -385,9 +385,9 @@ struct folio { struct { /* public: */ atomic_t _large_mapcount; - atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; #ifdef CONFIG_64BIT + atomic_t _entire_mapcount; atomic_t _pincount; #endif /* CONFIG_64BIT */ /* private: the union with struct page is transitional */ @@ -411,6 +411,7 @@ struct folio { /* public: */ struct list_head _deferred_list; #ifndef CONFIG_64BIT + atomic_t _entire_mapcount; atomic_t _pincount; #endif /* !CONFIG_64BIT */ /* private: the union with struct page is transitional */ -- cgit v1.2.3 From 405c4ef769c7c5e83e3556b48a190fe1afe82425 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:00 +0100 Subject: mm/rmap: pass dst_vma to folio_dup_file_rmap_pte() and friends We'll need access to the destination MM when modifying the large mapcount of a non-hugetlb large folios next. So pass in the destination VMA. Link: https://lkml.kernel.org/r/20250303163014.1128035-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. 
Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/rmap.h | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6abf7960077a..e795610bade8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -335,7 +335,8 @@ static inline void hugetlb_remove_rmap(struct folio *folio) } static __always_inline void __folio_dup_file_rmap(struct folio *folio, - struct page *page, int nr_pages, enum rmap_level level) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma, + enum rmap_level level) { const int orig_nr_pages = nr_pages; @@ -366,45 +367,47 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, * @folio: The folio to duplicate the mappings of * @page: The first page to duplicate the mappings of * @nr_pages: The number of pages of which the mapping will be duplicated + * @dst_vma: The destination vm area * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ static inline void folio_dup_file_rmap_ptes(struct folio *folio, - struct page *page, int nr_pages) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE); } static __always_inline void folio_dup_file_rmap_pte(struct folio *folio, - struct page *page) + struct page *page, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, 1, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE); } /** * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio * @folio: The folio to duplicate the mapping of * @page: The first page to duplicate the mapping of + * @dst_vma: The destination vm area * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ static inline void folio_dup_file_rmap_pmd(struct folio *folio, - struct page *page) + struct page *page, struct vm_area_struct *dst_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PTE); #else WARN_ON_ONCE(true); #endif } static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, - struct page *page, int nr_pages, struct vm_area_struct *src_vma, - enum rmap_level level) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, enum rmap_level level) { const int orig_nr_pages = nr_pages; bool maybe_pinned; @@ -470,6 +473,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, * @folio: The folio to duplicate the mappings of * @page: The first page to duplicate the mappings of * @nr_pages: The number of pages of which the mapping will be duplicated + * @dst_vma: The destination vm area * @src_vma: The vm area from which the mappings are duplicated * * The page range of the folio is defined by [page, page + nr_pages) @@ -488,16 +492,18 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise. 
*/ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, - struct page *page, int nr_pages, struct vm_area_struct *src_vma) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { - return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma, - RMAP_LEVEL_PTE); + return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma, + src_vma, RMAP_LEVEL_PTE); } static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, - struct page *page, struct vm_area_struct *src_vma) + struct page *page, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { - return __folio_try_dup_anon_rmap(folio, page, 1, src_vma, + return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma, RMAP_LEVEL_PTE); } @@ -506,6 +512,7 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, * of a folio * @folio: The folio to duplicate the mapping of * @page: The first page to duplicate the mapping of + * @dst_vma: The destination vm area * @src_vma: The vm area from which the mapping is duplicated * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) @@ -524,11 +531,12 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise. */ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, - struct page *page, struct vm_area_struct *src_vma) + struct page *page, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma, - RMAP_LEVEL_PMD); + return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma, + src_vma, RMAP_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; -- cgit v1.2.3 From 932961c4b666937e27f7ec5358fb7aabf0f17d41 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:02 +0100 Subject: mm/rmap: abstract large mapcount operations for large folios (!hugetlb) Let's abstract the operations so we can extend these operations easily. Link: https://lkml.kernel.org/r/20250303163014.1128035-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/rmap.h | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e795610bade8..d1e888cc97a5 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -173,6 +173,30 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *folio_get_anon_vma(const struct folio *folio); +static inline void folio_set_large_mapcount(struct folio *folio, int mapcount, + struct vm_area_struct *vma) +{ + /* Note: mapcounts start at -1. 
*/ + atomic_set(&folio->_large_mapcount, mapcount - 1); +} + +static inline void folio_add_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + atomic_add(diff, &folio->_large_mapcount); +} + +static inline void folio_sub_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + atomic_sub(diff, &folio->_large_mapcount); +} + +#define folio_inc_large_mapcount(folio, vma) \ + folio_add_large_mapcount(folio, 1, vma) +#define folio_dec_large_mapcount(folio, vma) \ + folio_sub_large_mapcount(folio, 1, vma) + /* RMAP flags, currently only relevant for some anon rmap operations. */ typedef int __bitwise rmap_t; @@ -352,12 +376,12 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, do { atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); - atomic_add(orig_nr_pages, &folio->_large_mapcount); + folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case RMAP_LEVEL_PMD: case RMAP_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); - atomic_inc(&folio->_large_mapcount); + folio_inc_large_mapcount(folio, dst_vma); break; } } @@ -451,7 +475,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, ClearPageAnonExclusive(page); atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); - atomic_add(orig_nr_pages, &folio->_large_mapcount); + folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case RMAP_LEVEL_PMD: case RMAP_LEVEL_PUD: @@ -461,7 +485,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, ClearPageAnonExclusive(page); } atomic_inc(&folio->_entire_mapcount); - atomic_inc(&folio->_large_mapcount); + folio_inc_large_mapcount(folio, dst_vma); break; } return 0; -- cgit v1.2.3 From b85aa9b11b67ed4b086bf5366392adec1b4a6a81 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:03 +0100 Subject: bit_spinlock: __always_inline (un)lock functions The compiler might decide that it is a smart idea to not inline bit_spin_lock(), primarily when a couple of functions in the same file end up calling it. Especially when used in RMAP map/unmap code next, the compiler sometimes decides to not inline, which is then observable in some micro-benchmarks. Let's simply flag all lock/unlock functions as __always_inline; arch_test_and_set_bit_lock() and friends are already tagged like that (but not test_and_set_bit_lock() for some reason). If ever a problem, we could split it into a fast and a slow path, and only force the fast path to be inlined. But there is nothing particularly "big" here. Link: https://lkml.kernel.org/r/20250303163014.1128035-11-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/bit_spinlock.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index bbc4730a6505..c0989b5b0407 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -13,7 +13,7 @@ * Don't use this unless you really need to: spin_lock() and spin_unlock() * are significantly faster. 
*/ -static inline void bit_spin_lock(int bitnum, unsigned long *addr) +static __always_inline void bit_spin_lock(int bitnum, unsigned long *addr) { /* * Assuming the lock is uncontended, this never enters @@ -38,7 +38,7 @@ static inline void bit_spin_lock(int bitnum, unsigned long *addr) /* * Return true if it was acquired */ -static inline int bit_spin_trylock(int bitnum, unsigned long *addr) +static __always_inline int bit_spin_trylock(int bitnum, unsigned long *addr) { preempt_disable(); #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) @@ -54,7 +54,7 @@ static inline int bit_spin_trylock(int bitnum, unsigned long *addr) /* * bit-based spin_unlock() */ -static inline void bit_spin_unlock(int bitnum, unsigned long *addr) +static __always_inline void bit_spin_unlock(int bitnum, unsigned long *addr) { #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); @@ -71,7 +71,7 @@ static inline void bit_spin_unlock(int bitnum, unsigned long *addr) * non-atomic version, which can be used eg. if the bit lock itself is * protecting the rest of the flags in the word. */ -static inline void __bit_spin_unlock(int bitnum, unsigned long *addr) +static __always_inline void __bit_spin_unlock(int bitnum, unsigned long *addr) { #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); -- cgit v1.2.3 From 6af8cb80d3a9a6bbd521d8a7c949b4eafb7dba5d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:05 +0100 Subject: mm/rmap: basic MM owner tracking for large folios (!hugetlb) For small folios, we traditionally use the mapcount to decide whether it was "certainly mapped exclusively" by a single MM (mapcount == 1) or whether it "maybe mapped shared" by multiple MMs (mapcount > 1). For PMD-sized folios that were PMD-mapped, we were able to use a similar mechanism (single PMD mapping), but for PTE-mapped folios and in the future folios that span multiple PMDs, this does not work. So we need a different mechanism to handle large folios. Let's add a new mechanism to detect whether a large folio is "certainly mapped exclusively", or whether it is "maybe mapped shared". We'll use this information next to optimize CoW reuse for PTE-mapped anonymous THP, and to convert folio_likely_mapped_shared() to folio_maybe_mapped_shared(), independent of per-page mapcounts. For each large folio, we'll have two slots, whereby a slot stores: (1) an MM id: unique id assigned to each MM (2) a per-MM mapcount If a slot is unoccupied, it can be taken by the next MM that maps folio page. In addition, we'll remember the current state -- "mapped exclusively" vs. "maybe mapped shared" -- and use a bit spinlock to sync on updates and to reduce the total number of atomic accesses on updates. In the future, it might be possible to squeeze a proper spinlock into "struct folio". For now, keep it simple, as we require the whole thing with THP only, that is incompatible with RT. As we have to squeeze this information into the "struct folio" of even folios of order-1 (2 pages), and we generally want to reduce the required metadata, we'll assign each MM a unique ID that can fit into an int. In total, we can squeeze everything into 4x int (2x long) on 64bit. 32bit support is a bit challenging, because we only have 2x long == 2x int in order-1 folios. But we can make it work for now, because we neither expect many MMs nor very large folios on 32bit. We will reliably detect folios as "mapped exclusively" vs. 
"mapped shared" as long as only two MMs map pages of a folio at one point in time -- for example with fork() and short-lived child processes, or with apps that hand over state from one instance to another. As soon as three MMs are involved at the same time, we might detect "maybe mapped shared" although the folio is "mapped exclusively". Example 1: (1) App1 faults in a (shmem/file-backed) folio page -> Tracked as MM0 (2) App2 faults in a folio page -> Tracked as MM1 (4) App1 unmaps all folio pages -> We will detect "mapped exclusively". Example 2: (1) App1 faults in a (shmem/file-backed) folio page -> Tracked as MM0 (2) App2 faults in a folio page -> Tracked as MM1 (3) App3 faults in a folio page -> No slot available, tracked as "unknown" (4) App1 and App2 unmap all folio pages -> We will detect "maybe mapped shared". Make use of __always_inline to keep possible performance degradation when (un)mapping large folios to a minimum. Note: by squeezing the two flags into the "unsigned long" that stores the MM ids, we can use non-atomic __bit_spin_unlock() and non-atomic setting/clearing of the "maybe mapped shared" bit, effectively not adding any new atomics on the hot path when updating the large mapcount + new metadata, which further helps reduce the runtime overhead in micro-benchmarks. Link: https://lkml.kernel.org/r/20250303163014.1128035-13-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 49 ++++++++++++++ include/linux/page-flags.h | 4 ++ include/linux/rmap.h | 165 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 218 insertions(+) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9499eb8e8e66..aac7c87b04e1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -292,6 +292,44 @@ typedef struct { #define NR_PAGES_IN_LARGE_FOLIO #endif +/* + * On 32bit, we can cut the required metadata in half, because: + * (a) PID_MAX_LIMIT implicitly limits the number of MMs we could ever have, + * so we can limit MM IDs to 15 bit (32767). + * (b) We don't expect folios where even a single complete PTE mapping by + * one MM would exceed 15 bits (order-15). + */ +#ifdef CONFIG_64BIT +typedef int mm_id_mapcount_t; +#define MM_ID_MAPCOUNT_MAX INT_MAX +typedef unsigned int mm_id_t; +#else /* !CONFIG_64BIT */ +typedef short mm_id_mapcount_t; +#define MM_ID_MAPCOUNT_MAX SHRT_MAX +typedef unsigned short mm_id_t; +#endif /* CONFIG_64BIT */ + +/* We implicitly use the dummy ID for init-mm etc. where we never rmap pages. */ +#define MM_ID_DUMMY 0 +#define MM_ID_MIN (MM_ID_DUMMY + 1) + +/* + * We leave the highest bit of each MM id unused, so we can store a flag + * in the highest bit of each folio->_mm_id[]. + */ +#define MM_ID_BITS ((sizeof(mm_id_t) * BITS_PER_BYTE) - 1) +#define MM_ID_MASK ((1U << MM_ID_BITS) - 1) +#define MM_ID_MAX MM_ID_MASK + +/* + * In order to use bit_spin_lock(), which requires an unsigned long, we + * operate on folio->_mm_ids when working on flags. 
+ */ +#define FOLIO_MM_IDS_LOCK_BITNUM MM_ID_BITS +#define FOLIO_MM_IDS_LOCK_BIT BIT(FOLIO_MM_IDS_LOCK_BITNUM) +#define FOLIO_MM_IDS_SHARED_BITNUM (2 * MM_ID_BITS + 1) +#define FOLIO_MM_IDS_SHARED_BIT BIT(FOLIO_MM_IDS_SHARED_BITNUM) + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. @@ -318,6 +356,9 @@ typedef struct { * @_nr_pages_mapped: Do not use outside of rmap and debug code. * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_nr_pages: Do not use directly, call folio_nr_pages(). + * @_mm_id: Do not use outside of rmap code. + * @_mm_ids: Do not use outside of rmap code. + * @_mm_id_mapcount: Do not use outside of rmap code. * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. @@ -390,6 +431,11 @@ struct folio { atomic_t _entire_mapcount; atomic_t _pincount; #endif /* CONFIG_64BIT */ + mm_id_mapcount_t _mm_id_mapcount[2]; + union { + mm_id_t _mm_id[2]; + unsigned long _mm_ids; + }; /* private: the union with struct page is transitional */ }; unsigned long _usable_1[4]; @@ -1114,6 +1160,9 @@ struct mm_struct { #endif } lru_gen; #endif /* CONFIG_LRU_GEN_WALKS_MMU */ +#ifdef CONFIG_MM_ID + mm_id_t mm_id; +#endif /* CONFIG_MM_ID */ } __randomize_layout; /* diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cab382bd965e..f26ce54c7aa4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1185,6 +1185,10 @@ static inline int folio_has_private(const struct folio *folio) return !!(folio->flags & PAGE_FLAGS_PRIVATE); } +static inline bool folio_test_large_maybe_mapped_shared(const struct folio *folio) +{ + return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); +} #undef PF_ANY #undef PF_HEAD #undef PF_NO_TAIL diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d1e888cc97a5..c131b0efff0f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -13,6 +13,7 @@ #include #include #include +#include /* * The anon_vma heads a list of private "related" vmas, to scan if @@ -173,6 +174,169 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *folio_get_anon_vma(const struct folio *folio); +#ifdef CONFIG_MM_ID +static __always_inline void folio_lock_large_mapcount(struct folio *folio) +{ + bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids); +} + +static __always_inline void folio_unlock_large_mapcount(struct folio *folio) +{ + __bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids); +} + +static inline unsigned int folio_mm_id(const struct folio *folio, int idx) +{ + VM_WARN_ON_ONCE(idx != 0 && idx != 1); + return folio->_mm_id[idx] & MM_ID_MASK; +} + +static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id) +{ + VM_WARN_ON_ONCE(idx != 0 && idx != 1); + folio->_mm_id[idx] &= ~MM_ID_MASK; + folio->_mm_id[idx] |= id; +} + +static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio, + int diff, mm_id_t mm_id) +{ + VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio)); + VM_WARN_ON_ONCE(diff <= 0); + VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX); + + /* + * Make sure we can detect at least one complete PTE mapping of the + * folio in a single MM as "exclusively mapped". This is primarily + * a check on 32bit, where we currently reduce the size of the per-MM + * mapcount to a short. 
+ */ + VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio)); + VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX); + + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY && + folio->_mm_id_mapcount[0] != -1); + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY && + folio->_mm_id_mapcount[0] < 0); + VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY && + folio->_mm_id_mapcount[1] != -1); + VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY && + folio->_mm_id_mapcount[1] < 0); + VM_WARN_ON_ONCE(!folio_mapped(folio) && + folio_test_large_maybe_mapped_shared(folio)); +} + +static __always_inline void folio_set_large_mapcount(struct folio *folio, + int mapcount, struct vm_area_struct *vma) +{ + __folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id); + + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY); + VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY); + + /* Note: mapcounts start at -1. */ + atomic_set(&folio->_large_mapcount, mapcount - 1); + folio->_mm_id_mapcount[0] = mapcount - 1; + folio_set_mm_id(folio, 0, vma->vm_mm->mm_id); +} + +static __always_inline void folio_add_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + const mm_id_t mm_id = vma->vm_mm->mm_id; + int new_mapcount_val; + + folio_lock_large_mapcount(folio); + __folio_large_mapcount_sanity_checks(folio, diff, mm_id); + + new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff; + atomic_set(&folio->_large_mapcount, new_mapcount_val); + + /* + * If a folio is mapped more than once into an MM on 32bit, we + * can in theory overflow the per-MM mapcount (although only for + * fairly large folios), turning it negative. In that case, just + * free up the slot and mark the folio "mapped shared", otherwise + * we might be in trouble when unmapping pages later. + */ + if (folio_mm_id(folio, 0) == mm_id) { + folio->_mm_id_mapcount[0] += diff; + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) { + folio->_mm_id_mapcount[0] = -1; + folio_set_mm_id(folio, 0, MM_ID_DUMMY); + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } + } else if (folio_mm_id(folio, 1) == mm_id) { + folio->_mm_id_mapcount[1] += diff; + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) { + folio->_mm_id_mapcount[1] = -1; + folio_set_mm_id(folio, 1, MM_ID_DUMMY); + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } + } else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) { + folio_set_mm_id(folio, 0, mm_id); + folio->_mm_id_mapcount[0] = diff - 1; + /* We might have other mappings already. */ + if (new_mapcount_val != diff - 1) + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) { + folio_set_mm_id(folio, 1, mm_id); + folio->_mm_id_mapcount[1] = diff - 1; + /* Slot 0 certainly has mappings as well. 
*/ + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } + folio_unlock_large_mapcount(folio); +} + +static __always_inline void folio_sub_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + const mm_id_t mm_id = vma->vm_mm->mm_id; + int new_mapcount_val; + + folio_lock_large_mapcount(folio); + __folio_large_mapcount_sanity_checks(folio, diff, mm_id); + + new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff; + atomic_set(&folio->_large_mapcount, new_mapcount_val); + + /* + * There are valid corner cases where we might underflow a per-MM + * mapcount (some mappings added when no slot was free, some mappings + * added once a slot was free), so we always set it to -1 once we go + * negative. + */ + if (folio_mm_id(folio, 0) == mm_id) { + folio->_mm_id_mapcount[0] -= diff; + if (folio->_mm_id_mapcount[0] >= 0) + goto out; + folio->_mm_id_mapcount[0] = -1; + folio_set_mm_id(folio, 0, MM_ID_DUMMY); + } else if (folio_mm_id(folio, 1) == mm_id) { + folio->_mm_id_mapcount[1] -= diff; + if (folio->_mm_id_mapcount[1] >= 0) + goto out; + folio->_mm_id_mapcount[1] = -1; + folio_set_mm_id(folio, 1, MM_ID_DUMMY); + } + + /* + * If one MM slot owns all mappings, the folio is mapped exclusively. + * Note that if the folio is now unmapped (new_mapcount_val == -1), both + * slots must be free (mapcount == -1), and we'll also mark it as + * exclusive. + */ + if (folio->_mm_id_mapcount[0] == new_mapcount_val || + folio->_mm_id_mapcount[1] == new_mapcount_val) + folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT; +out: + folio_unlock_large_mapcount(folio); +} +#else /* !CONFIG_MM_ID */ +/* + * See __folio_rmap_sanity_checks(), we might map large folios even without + * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now. + */ static inline void folio_set_large_mapcount(struct folio *folio, int mapcount, struct vm_area_struct *vma) { @@ -191,6 +355,7 @@ static inline void folio_sub_large_mapcount(struct folio *folio, { atomic_sub(diff, &folio->_large_mapcount); } +#endif /* CONFIG_MM_ID */ #define folio_inc_large_mapcount(folio, vma) \ folio_add_large_mapcount(folio, 1, vma) -- cgit v1.2.3 From 003fde4492c88ac3a1fee3d97b3834a679780af3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:07 +0100 Subject: mm: convert folio_likely_mapped_shared() to folio_maybe_mapped_shared() Let's reuse our new MM ownership tracking infrastructure for large folios to make folio_likely_mapped_shared() never return false negatives -- never indicating "not mapped shared" although the folio *is* mapped shared. With that, we can rename it to folio_maybe_mapped_shared() and get rid of the dependency on the mapcount of the first folio page. The semantics are now arguably clearer: no mixture of "false negatives" and "false positives", only the remaining possibility for "false positives". Thoroughly document the new semantics. We might now detect that a large folio is "maybe mapped shared" although it *no longer* is -- but once was. Now, if more than two MMs mapped a folio at the same time, and the MM mapping the folio exclusively at the end is not one tracked in the two folio MM slots, we will detect the folio as "maybe mapped shared". For anonymous folios, usually (except weird corner cases) all PTEs that target a "maybe mapped shared" folio are R/O. As soon as a child process would write to them (iow, actively use them), we would CoW and effectively replace these PTEs. Most cases (below) are not expected to really matter with large anonymous folios for this reason. 
Most importantly, there will be no change at all for: * small folios * hugetlb folios * PMD-mapped PMD-sized THPs (single mapping) This change has the potential to affect existing callers of folio_likely_mapped_shared() -> folio_maybe_mapped_shared(): (1) fs/proc/task_mmu.c: no change (hugetlb) (2) khugepaged counts PTEs that target shared folios towards max_ptes_shared (default: HPAGE_PMD_NR / 2), meaning we could skip a collapse where we would have previously collapsed. This only applies to anonymous folios and is not expected to matter in practice. Worth noting that this change sorts out case (A) documented in commit 1bafe96e89f0 ("mm/khugepaged: replace page_mapcount() check by folio_likely_mapped_shared()") by removing the possibility for "false negatives". (3) MADV_COLD / MADV_PAGEOUT / MADV_FREE will not try splitting PTE-mapped THPs that are considered shared but not fully covered by the requested range, consequently not processing them. PMD-mapped PMD-sized THP are not affected, or when all PTEs are covered. These functions are usually only called on anon/file folios that are exclusively mapped most of the time (no other file mappings or no fork()), so the "false negatives" are not expected to matter in practice. (4) mbind() / migrate_pages() / move_pages() will refuse to migrate shared folios unless MPOL_MF_MOVE_ALL is effective (requires CAP_SYS_NICE). We will now reject some folios that could be migrated. Similar to (3), especially with MPOL_MF_MOVE_ALL, so this is not expected to matter in practice. Note that cpuset_migrate_mm_workfn() calls do_migrate_pages() with MPOL_MF_MOVE_ALL. (5) NUMA hinting mm/migrate.c:migrate_misplaced_folio_prepare() will skip file folios that are probably shared libraries (-> "mapped shared" and executable). This check would have detected it as a shared library at some point (at least 3 MMs mapping it), so detecting it afterwards does not sound wrong (still a shared library). Not expected to matter. mm/memory.c:numa_migrate_check() will indicate TNF_SHARED in MAP_SHARED file mappings when encountering a shared folio. Similar reasoning, not expected to matter. mm/mprotect.c:change_pte_range() will skip folios detected as shared in CoW mappings. Similarly, this is not expected to matter in practice, but if it would ever be a problem we could relax that check a bit (e.g., basing it on the average page-mapcount in a folio), because it was only an optimization when many (e.g., 288) processes were mapping the same folios -- see commit 859d4adc3415 ("mm: numa: do not trap faults on shared data section pages.") (6) mm/rmap.c:folio_referenced_one() will skip exclusive swapbacked folios in dying processes. Applies to anonymous folios only. Without "false negatives", we'll now skip all actually shared ones. Skipping ones that are actually exclusive won't really matter, it's a pure optimization, and is not expected to matter in practice. In theory, one can detect the problematic scenario: folio_mapcount() > 0 and no folio MM slot is occupied ("state unknown"). One could reset the MM slots while doing an rmap walk, which migration / folio split already do when setting everything up. Further, when batching PTEs we might naturally learn about a owner (e.g., folio_mapcount() == nr_ptes) and could update the owner. However, we'll defer that until the scenarios where it would really matter are clear. 
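As a hedged illustration of how callers consume the new semantics (the helper and policy below are hypothetical, not part of the patch), a migration-style check might read:

/*
 * Hypothetical caller sketch: folio_maybe_mapped_shared() can return a
 * false positive (the folio was mapped by more than two MMs at some
 * point), but never a false negative, so skipping here errs on the
 * safe side.
 */
static bool example_can_migrate_folio(struct folio *folio, bool move_all)
{
	if (!move_all && folio_maybe_mapped_shared(folio))
		return false;	/* possibly mapped by more than one MM */
	return true;
}
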
Link: https://lkml.kernel.org/r/20250303163014.1128035-15-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f366c180f2b6..82776b409391 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2251,23 +2251,18 @@ static inline size_t folio_size(const struct folio *folio) } /** - * folio_likely_mapped_shared - Estimate if the folio is mapped into the page - * tables of more than one MM + * folio_maybe_mapped_shared - Whether the folio is mapped into the page + * tables of more than one MM * @folio: The folio. * - * This function checks if the folio is currently mapped into more than one - * MM ("mapped shared"), or if the folio is only mapped into a single MM - * ("mapped exclusively"). + * This function checks if the folio maybe currently mapped into more than one + * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single + * MM ("mapped exclusively"). * * For KSM folios, this function also returns "mapped shared" when a folio is * mapped multiple times into the same MM, because the individual page mappings * are independent. * - * As precise information is not easily available for all folios, this function - * estimates the number of MMs ("sharers") that are currently mapping a folio - * using the number of times the first page of the folio is currently mapped - * into page tables. - * * For small anonymous folios and anonymous hugetlb folios, the return * value will be exactly correct: non-KSM folios can only be mapped at most once * into an MM, and they cannot be partially mapped. KSM folios are @@ -2275,8 +2270,8 @@ static inline size_t folio_size(const struct folio *folio) * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly - * indicate "mapped exclusively" (false negative) when the folio is - * only partially mapped into at least one MM. + * indicate "mapped shared" (false positive) if a folio was mapped by + * more than two MMs at one point in time. * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. @@ -2293,7 +2288,7 @@ static inline size_t folio_size(const struct folio *folio) * * Return: Whether the folio is estimated to be mapped into more than one MM. */ -static inline bool folio_likely_mapped_shared(struct folio *folio) +static inline bool folio_maybe_mapped_shared(struct folio *folio) { int mapcount = folio_mapcount(folio); @@ -2301,16 +2296,22 @@ static inline bool folio_likely_mapped_shared(struct folio *folio) if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) return mapcount > 1; - /* A single mapping implies "mapped exclusively". */ - if (mapcount <= 1) - return false; - - /* If any page is mapped more than once we treat it "mapped shared". 
*/ - if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio)) + /* + * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ... + * simply assume "mapped shared", nobody should really care + * about this for arbitrary kernel allocations. + */ + if (!IS_ENABLED(CONFIG_MM_ID)) return true; - /* Let's guess based on the first subpage. */ - return atomic_read(&folio->_mapcount) > 0; + /* + * A single mapping implies "mapped exclusively", even if the + * folio flag says something different: it's easier to handle this + * case here instead of on the RMAP hot path. + */ + if (mapcount <= 1) + return false; + return folio_test_large_maybe_mapped_shared(folio); } #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE -- cgit v1.2.3 From 749492229e3bd6222dda7267b8244135229d1fd8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:13 +0100 Subject: mm: stop maintaining the per-page mapcount of large folios (CONFIG_NO_PAGE_MAPCOUNT) Everything is in place to stop using the per-page mapcounts in large folios: the mapcount of tail pages will always be logically 0 (-1 value), just like it currently is for hugetlb folios already, and the page mapcount of the head page is either 0 (-1 value) or contains a page type (e.g., hugetlb). Maintaining _nr_pages_mapped without per-page mapcounts is impossible, so that one also has to go with CONFIG_NO_PAGE_MAPCOUNT. There are two remaining implications: (1) Per-node, per-cgroup and per-lruvec stats of "NR_ANON_MAPPED" ("mapped anonymous memory") and "NR_FILE_MAPPED" ("mapped file memory"): As soon as any page of the folio is mapped -- folio_mapped() -- we now account the complete folio as mapped. Once the last page is unmapped -- !folio_mapped() -- we account the complete folio as unmapped. This implies that ... * "AnonPages" and "Mapped" in /proc/meminfo and /sys/devices/system/node/*/meminfo * cgroup v2: "anon" and "file_mapped" in "memory.stat" and "memory.numa_stat" * cgroup v1: "rss" and "mapped_file" in "memory.stat" and "memory.numa_stat" ... can now appear higher than before. But note that these folios do consume that memory, simply not all pages are actually currently mapped. It's worth noting that other accounting in the kernel (esp. cgroup charging on allocation) is not affected by this change. [why oh why is "anon" called "rss" in cgroup v1] (2) Detecting partial mappings Detecting whether anon THPs are partially mapped gets a bit more unreliable. As long as a single MM maps such a large folio ("exclusively mapped"), we can reliably detect it. Especially before fork() / after a short-lived child process quit, we will detect partial mappings reliably, which is the common case. In essence, if the average per-page mapcount in an anon THP is < 1, we know for sure that we have a partial mapping. However, as soon as multiple MMs are involved, we might miss detecting partial mappings: this might be relevant with long-lived child processes. If we have a fully-mapped anon folio before fork(), once our child processes and our parent all unmap (zap/COW) the same pages (but not the complete folio), we might not detect the partial mapping. However, once the child processes quit we would detect the partial mapping. How relevant this case is in practice remains to be seen. Swapout/migration will likely mitigate this. In the future, RMAP walkers could check for that case (e.g., when collecting access bits during reclaim) and simply flag them for deferred-splitting. 
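The partial-mapping heuristic described in (2) can be written out as a small sketch (illustrative only; the helper name is made up and not part of the patch):

/*
 * Sketch, not from the patch: when the total mapcount cannot cover
 * every page (average per-page mapcount < 1), the anon folio is
 * certainly partially mapped. The converse does not hold once
 * multiple MMs are involved.
 */
static bool example_certainly_partially_mapped(struct folio *folio)
{
	return folio_test_anon(folio) && folio_mapped(folio) &&
	       folio_mapcount(folio) < folio_nr_pages(folio);
}
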
Link: https://lkml.kernel.org/r/20250303163014.1128035-21-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/rmap.h | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c131b0efff0f..6b82b618846e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -240,7 +240,7 @@ static __always_inline void folio_set_large_mapcount(struct folio *folio, folio_set_mm_id(folio, 0, vma->vm_mm->mm_id); } -static __always_inline void folio_add_large_mapcount(struct folio *folio, +static __always_inline int folio_add_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { const mm_id_t mm_id = vma->vm_mm->mm_id; @@ -286,9 +286,11 @@ static __always_inline void folio_add_large_mapcount(struct folio *folio, folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; } folio_unlock_large_mapcount(folio); + return new_mapcount_val + 1; } +#define folio_add_large_mapcount folio_add_return_large_mapcount -static __always_inline void folio_sub_large_mapcount(struct folio *folio, +static __always_inline int folio_sub_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { const mm_id_t mm_id = vma->vm_mm->mm_id; @@ -331,7 +333,9 @@ static __always_inline void folio_sub_large_mapcount(struct folio *folio, folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT; out: folio_unlock_large_mapcount(folio); + return new_mapcount_val + 1; } +#define folio_sub_large_mapcount folio_sub_return_large_mapcount #else /* !CONFIG_MM_ID */ /* * See __folio_rmap_sanity_checks(), we might map large folios even without @@ -350,17 +354,33 @@ static inline void folio_add_large_mapcount(struct folio *folio, atomic_add(diff, &folio->_large_mapcount); } +static inline int folio_add_return_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + BUILD_BUG(); +} + static inline void folio_sub_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { atomic_sub(diff, &folio->_large_mapcount); } + +static inline int folio_sub_return_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + BUILD_BUG(); +} #endif /* CONFIG_MM_ID */ #define folio_inc_large_mapcount(folio, vma) \ folio_add_large_mapcount(folio, 1, vma) +#define folio_inc_return_large_mapcount(folio, vma) \ + folio_add_return_large_mapcount(folio, 1, vma) #define folio_dec_large_mapcount(folio, vma) \ folio_sub_large_mapcount(folio, 1, vma) +#define folio_dec_return_large_mapcount(folio, vma) \ + folio_sub_return_large_mapcount(folio, 1, vma) /* RMAP flags, currently only relevant for some anon rmap operations. 
*/ typedef int __bitwise rmap_t; @@ -538,9 +558,11 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, break; } - do { - atomic_inc(&page->_mapcount); - } while (page++, --nr_pages > 0); + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { + do { + atomic_inc(&page->_mapcount); + } while (page++, --nr_pages > 0); + } folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case RMAP_LEVEL_PMD: @@ -638,7 +660,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, do { if (PageAnonExclusive(page)) ClearPageAnonExclusive(page); - atomic_inc(&page->_mapcount); + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; -- cgit v1.2.3 From f7f0d88b7d6da29d2b073740aa130685f0e18609 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 5 Mar 2025 14:27:29 -0800 Subject: mm/damon/core: expose damos_filter_for_ops() to DAMON kernel API callers damos_filter_for_ops() can be useful to avoid putting wrong type of filters in wrong place. Make it be exposed to DAMON kernel API callers. Link: https://lkml.kernel.org/r/20250305222733.59089-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 52559475dbe7..eed008b64a23 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -894,6 +894,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damos_filter *damos_new_filter(enum damos_filter_type type, bool matching, bool allow); void damos_add_filter(struct damos *s, struct damos_filter *f); +bool damos_filter_for_ops(enum damos_filter_type type); void damos_destroy_filter(struct damos_filter *f); struct damos_quota_goal *damos_new_quota_goal( -- cgit v1.2.3 From ff22f9299d7b2c7874b560993c21543708b7e1b6 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 6 Mar 2025 12:50:10 -0800 Subject: page_io: zswap: do not crash the kernel on decompression failure Currently, we crash the kernel when a decompression failure occurs in zswap (either because of memory corruption, or a bug in the compression algorithm). This is overkill. We should only SIGBUS the unfortunate process asking for the zswap entry on zswap load, and skip the corrupted entry in zswap writeback. See [1] for a recent upstream discussion about this. The zswap writeback case is relatively straightforward to fix. For the zswap_load() case, we change the return behavior: * Return 0 on success. * Return -ENOENT (with the folio locked) if zswap does not own the swapped out content. * Return -EIO if zswap owns the swapped out content, but encounters a decompression failure for some reasons. The folio will be unlocked, but not be marked up-to-date, which will eventually cause the process requesting the page to SIGBUS (see the handling of not-up-to-date folio in do_swap_page() in mm/memory.c), without crashing the kernel. * Return -EINVAL if we encounter a large folio, as large folio should not be swapped in while zswap is being used. Similar to the -EIO case, we also unlock the folio but do not mark it as up-to-date to SIGBUS the faulting process. As a side effect, we require one extra zswap tree traversal in the load and writeback paths. 
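To keep the new contract in one place, here is a sketch of how a caller might classify the zswap_load() return values (illustrative only; this is not the actual page_io.c code):

/*
 * Sketch only: maps the documented zswap_load() return values to the
 * outcomes described in the changelog above.
 */
static const char *example_zswap_load_outcome(struct folio *folio)
{
	int ret = zswap_load(folio);

	if (!ret)
		return "decompressed from zswap";		/* success */
	if (ret == -ENOENT)
		return "not owned by zswap, read from swap";	/* folio still locked */
	/* -EIO or -EINVAL: folio left !uptodate, faulting task gets SIGBUS */
	return "zswap entry present but unusable";
}
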
Quick benchmarking on a kernel build test shows no performance difference: With the new scheme: real: mean: 125.1s, stdev: 0.12s user: mean: 3265.23s, stdev: 9.62s sys: mean: 2156.41s, stdev: 13.98s The old scheme: real: mean: 125.78s, stdev: 0.45s user: mean: 3287.18s, stdev: 5.95s sys: mean: 2177.08s, stdev: 26.52s [nphamcs@gmail.com: fix documentation of zswap_load()] Link: https://lkml.kernel.org/r/20250306222453.1269456-1-nphamcs@gmail.com Link: https://lore.kernel.org/all/ZsiLElTykamcYZ6J@casper.infradead.org/ [1] Link: https://lkml.kernel.org/r/20250306205011.784787-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Matthew Wilcox Suggested-by: Yosry Ahmed Suggested-by: Johannes Weiner Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/zswap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index d961ead91bf1..30c193a1207e 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -26,7 +26,7 @@ struct zswap_lruvec_state { unsigned long zswap_total_pages(void); bool zswap_store(struct folio *folio); -bool zswap_load(struct folio *folio); +int zswap_load(struct folio *folio); void zswap_invalidate(swp_entry_t swp); int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); @@ -44,9 +44,9 @@ static inline bool zswap_store(struct folio *folio) return false; } -static inline bool zswap_load(struct folio *folio) +static inline int zswap_load(struct folio *folio) { - return false; + return -ENOENT; } static inline void zswap_invalidate(swp_entry_t swp) {} -- cgit v1.2.3 From 8268af309d07d1c6279080b4e6fd16ec75cc977c Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:59 +0200 Subject: arch, mm: set max_mapnr when allocating memory map for FLATMEM max_mapnr is essentially the size of the memory map for systems that use FLATMEM. There is no reason to calculate it in each and every architecture when it's anyway calculated in alloc_node_mem_map(). Drop setting of max_mapnr from architecture code and set it once in alloc_node_mem_map(). While on it, move definition of mem_map and max_mapnr to mm/mm_init.c so there won't be two copies for MMU and !MMU variants. Link: https://lkml.kernel.org/r/20250313135003.836600-10-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/memory_model.h | 5 +++-- include/linux/mm.h | 11 ----------- 2 files changed, 3 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 6d1fb6162ac1..a3b5029aebbd 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -19,11 +19,12 @@ #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ARCH_PFN_OFFSET) +/* avoid include hell */ +extern unsigned long max_mapnr; + #ifndef pfn_valid static inline int pfn_valid(unsigned long pfn) { - /* avoid include hell */ - extern unsigned long max_mapnr; unsigned long pfn_offset = ARCH_PFN_OFFSET; return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr; diff --git a/include/linux/mm.h b/include/linux/mm.h index 82776b409391..8c4cb8c28507 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -46,17 +46,6 @@ extern int sysctl_page_lock_unfairness; void mm_core_init(void); void init_mm_internals(void); -#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ -extern unsigned long max_mapnr; - -static inline void set_max_mapnr(unsigned long limit) -{ - max_mapnr = limit; -} -#else -static inline void set_max_mapnr(unsigned long limit) { } -#endif - extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { -- cgit v1.2.3 From 6faea3422e3b4e8de44a55aa3e6e843320da66d2 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:01 +0200 Subject: arch, mm: streamline HIGHMEM freeing All architectures that support HIGHMEM have their code that frees high memory pages to the buddy allocator while __free_memory_core() is limited to freeing only low memory. There is no actual reason for that. The memory map is completely ready by the time memblock_free_all() is called and high pages can be released to the buddy allocator along with low memory. Remove low memory limit from __free_memory_core() and drop per-architecture code that frees high memory pages. Link: https://lkml.kernel.org/r/20250313135003.836600-12-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8c4cb8c28507..6c519a5098d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3275,7 +3275,6 @@ extern void reserve_bootmem_region(phys_addr_t start, /* Free the reserved page into the buddy system, so it gets managed. */ void free_reserved_page(struct page *page); -#define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) { -- cgit v1.2.3 From 0d98484ee3330c35d1dc0da0be0871b3596cbe0d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:02 +0200 Subject: arch, mm: introduce arch_mm_preinit Currently, implementation of mem_init() in every architecture consists of one or more of the following: * initializations that must run before page allocator is active, for instance swiotlb_init() * a call to memblock_free_all() to release all the memory to the buddy allocator * initializations that must run after page allocator is ready and there is no arch-specific hook other than mem_init() for that, like for example register_page_bootmem_info() in x86 and sparc64 or simple setting of mem_init_done = 1 in several architectures * a bunch of semi-related stuff that apparently had no better place to live, for example a ton of BUILD_BUG_ON()s in parisc. Introduce arch_mm_preinit() that will be the first thing called from mm_core_init(). On architectures that have initializations that must happen before the page allocator is ready, move those into arch_mm_preinit() along with the code that does not depend on ordering with page allocator setup. On several architectures this results in reduction of mem_init() to a single call to memblock_free_all() that allows its consolidation next. Link: https://lkml.kernel.org/r/20250313135003.836600-13-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6c519a5098d4..c417e5634a58 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -43,6 +43,7 @@ struct folio_batch; extern int sysctl_page_lock_unfairness; +void arch_mm_preinit(void); void mm_core_init(void); void init_mm_internals(void); -- cgit v1.2.3 From 8afa901c147a41f92e83943cddf154bbb7995ee6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:03 +0200 Subject: arch, mm: make releasing of memory to page allocator more explicit The point where the memory is released from memblock to the buddy allocator is hidden inside arch-specific mem_init()s and the call to memblock_free_all() is needlessly duplicated in every artiste cure and after introduction of arch_mm_preinit() hook, mem_init() implementation on many architecture only contains the call to memblock_free_all(). Pull memblock_free_all() call into mm_core_init() and drop mem_init() on relevant architectures to make it more explicit where the free memory is released from memblock to the buddy allocator and to reduce code duplication in architecture specific code. Link: https://lkml.kernel.org/r/20250313135003.836600-14-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Acked-by: Geert Uytterhoeven [m68k] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memblock.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e79eb6ac516f..ef5a1ecc6e59 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -133,7 +133,6 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size); -void memblock_free_all(void); void memblock_free(void *ptr, size_t size); void reset_all_zones_managed_pages(void); -- cgit v1.2.3 From 53058c762afff714ceaec14b87cf8fdce3b6d33e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:04 -0800 Subject: mm/damon: remove damon_callback->private The field was added to let users keep their personal data to use inside of the callbacks. However, no one is actively using that now. Remove it. 
Link: https://lkml.kernel.org/r/20250306175908.66300-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index eed008b64a23..dab4bb0fe39d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -609,12 +609,10 @@ struct damon_operations { * @after_aggregation: Called after each aggregation. * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. - * @private: User private data. * * The monitoring thread (&damon_ctx.kdamond) calls @before_start and * @before_terminate just before starting and finishing the monitoring, - * respectively. Therefore, those are good places for installing and cleaning - * @private. + * respectively. * * The monitoring thread calls @after_wmarks_check after each DAMON-based * operation schemes' watermarks check. If users need to make changes to the @@ -630,8 +628,6 @@ struct damon_operations { * If any callback returns non-zero, monitoring stops. */ struct damon_callback { - void *private; - int (*before_start)(struct damon_ctx *context); int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); -- cgit v1.2.3 From 07da21855b270c17b2a2d20e644c1419fcaafdd1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:05 -0800 Subject: mm/damon: remove ->before_start of damon_callback The function pointer field was added to be used as a place to do some initialization works just before DAMON starts working. However, nobody is using it now. Remove it. Link: https://lkml.kernel.org/r/20250306175908.66300-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index dab4bb0fe39d..043de2408c65 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -603,16 +603,14 @@ struct damon_operations { /** * struct damon_callback - Monitoring events notification callbacks. * - * @before_start: Called before starting the monitoring. * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. * - * The monitoring thread (&damon_ctx.kdamond) calls @before_start and - * @before_terminate just before starting and finishing the monitoring, - * respectively. + * The monitoring thread (&damon_ctx.kdamond) calls @before_terminate just + * before finishing the monitoring. * * The monitoring thread calls @after_wmarks_check after each DAMON-based * operation schemes' watermarks check. If users need to make changes to the @@ -628,7 +626,6 @@ struct damon_operations { * If any callback returns non-zero, monitoring stops. 
*/ struct damon_callback { - int (*before_start)(struct damon_ctx *context); int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); -- cgit v1.2.3 From cedee98f68875605dad644a95a63eae04de250b2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:06 -0800 Subject: mm/damon: remove damon_callback->after_sampling The callback was used by DAMON sysfs interface for reading DAMON internal data. But it is no more being used, and damon_call() can do similar works in a better way. Remove it. Link: https://lkml.kernel.org/r/20250306175908.66300-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 043de2408c65..5aa277f4c948 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -604,7 +604,6 @@ struct damon_operations { * struct damon_callback - Monitoring events notification callbacks. * * @after_wmarks_check: Called after each schemes' watermarks check. - * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. @@ -617,17 +616,15 @@ struct damon_operations { * attributes of the monitoring context while it's deactivated due to the * watermarks, this is the good place to do. * - * The monitoring thread calls @after_sampling and @after_aggregation for each - * of the sampling intervals and aggregation intervals, respectively. - * Therefore, users can safely access the monitoring results without additional - * protection. For the reason, users are recommended to use these callback for - * the accesses to the results. + * The monitoring thread calls @after_aggregation for each of the aggregation + * intervals. Therefore, users can safely access the monitoring results + * without additional protection. For the reason, users are recommended to use + * these callback for the accesses to the results. * * If any callback returns non-zero, monitoring stops. */ struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); - int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); int (*before_damos_apply)(struct damon_ctx *context, struct damon_target *target, -- cgit v1.2.3 From 99ce7c9c6d855716356f2f839c53905592fd780b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:07 -0800 Subject: mm/damon: remove damon_callback->before_damos_apply The hook was introduced to let DAMON kernel API users access DAMOS schemes-eligible regions in a safe way. Now it is no more used by anyone, and the functionality is provided in a better way by damos_walk(). Remove it. Link: https://lkml.kernel.org/r/20250306175908.66300-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5aa277f4c948..be7b281fb922 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -605,7 +605,6 @@ struct damon_operations { * * @after_wmarks_check: Called after each schemes' watermarks check. * @after_aggregation: Called after each aggregation. 
- * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. * * The monitoring thread (&damon_ctx.kdamond) calls @before_terminate just @@ -626,10 +625,6 @@ struct damon_operations { struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); - int (*before_damos_apply)(struct damon_ctx *context, - struct damon_target *target, - struct damon_region *region, - struct damos *scheme); void (*before_terminate)(struct damon_ctx *context); }; -- cgit v1.2.3 From 105f830fa35c49ada7db785a7f9b70386f193529 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:08 -0800 Subject: mm/damon: remove damon_operations->reset_aggregated The operations layer hook was introduced to let operations set do any aggregation data reset if needed. But it is not really be used now. Remove it. Link: https://lkml.kernel.org/r/20250306175908.66300-14-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index be7b281fb922..3db4f77261f5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -542,7 +542,6 @@ enum damon_ops_id { * @update: Update operations-related data structures. * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. - * @reset_aggregated: Reset aggregated accesses monitoring results. * @get_scheme_score: Get the score of a region for a scheme. * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. @@ -554,8 +553,7 @@ enum damon_ops_id { * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting * the monitoring, @update after each &damon_attrs.ops_update_interval, and * @check_accesses, @target_valid and @prepare_access_checks after each - * &damon_attrs.sample_interval. Finally, @reset_aggregated is called after - * each &damon_attrs.aggr_interval. + * &damon_attrs.sample_interval. * * Each &struct damon_operations instance having valid @id can be registered * via damon_register_ops() and selected by damon_select_ops() later. @@ -570,8 +568,6 @@ enum damon_ops_id { * last preparation and update the number of observed accesses of each region. * It should also return max number of observed accesses that made as a result * of its update. The value will be used for regions adjustment threshold. - * @reset_aggregated should reset the access monitoring results that aggregated - * by @check_accesses. * @get_scheme_score should return the priority score of a region for a scheme * as an integer in [0, &DAMOS_MAX_SCORE]. 
* @apply_scheme is called from @kdamond when a region for user provided @@ -589,7 +585,6 @@ struct damon_operations { void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); - void (*reset_aggregated)(struct damon_ctx *context); int (*get_scheme_score)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); -- cgit v1.2.3 From 9039b9096ea27a20f0349d1537537663c935c8ed Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Thu, 6 Mar 2025 17:44:50 -0500 Subject: mm: page_ext: add an iteration API for page extensions Patch series "mm: page_ext: Introduce new iteration API", v3. Introduction ============ [ Thanks to David Hildenbrand for identifying the root cause of this issue and proving guidance on how to fix it. The new API idea, bugs and misconceptions are all mine though ] Currently, trying to reserve 1G pages with page_owner=on and sparsemem causes a crash. The reproducer is very simple: 1. Build the kernel with CONFIG_SPARSEMEM=y and the table extensions 2. Pass 'default_hugepagesz=1 page_owner=on' in the kernel command-line 3. Reserve one 1G page at run-time, this should crash (see patch 1 for backtrace) [ A crash with page_table_check is also possible, but harder to trigger ] Apparently, starting with commit cf54f310d0d3 ("mm/hugetlb: use __GFP_COMP for gigantic folios") we now pass the full allocation order to page extension clients and the page extension implementation assumes that all PFNs of an allocation range will be stored in the same memory section (which is not true for 1G pages). To fix this, this series introduces a new iteration API for page extension objects. The API checks if the next page extension object can be retrieved from the current section or if it needs to look up for it in another section. Please, find all details in patch 1. I tested this series on arm64 and x86 by reserving 1G pages at run-time and doing kernel builds (always with page_owner=on and page_table_check=on). This patch (of 3): The page extension implementation assumes that all page extensions of a given page order are stored in the same memory section. The function page_ext_next() relies on this assumption by adding an offset to the current object to return the next adjacent page extension. This behavior works as expected for flatmem but fails for sparsemem when using 1G pages. The commit cf54f310d0d3 ("mm/hugetlb: use __GFP_COMP for gigantic folios") exposes this issue, making it possible for a crash when using page_owner or page_table_check page extensions. The problem is that for 1G pages, the page extensions may span memory section boundaries and be stored in different memory sections. This issue was not visible before commit cf54f310d0d3 ("mm/hugetlb: use __GFP_COMP for gigantic folios") because alloc_contig_pages() never passed more than MAX_PAGE_ORDER to post_alloc_hook(). However, the series introducing mentioned commit changed this behavior allowing the full 1G page order to be passed. Reproducer: 1. Build the kernel with CONFIG_SPARSEMEM=y and table extensions support 2. Pass 'default_hugepagesz=1 page_owner=on' in the kernel command-line 3. Reserve one 1G page at run-time, this should crash (backtrace below) To address this issue, this commit introduces a new API for iterating through page extensions. The main iteration macro is for_each_page_ext() and it must be called with the RCU read lock taken. 
Here's an usage example: """ struct page_ext_iter iter; struct page_ext *page_ext; ... rcu_read_lock(); for_each_page_ext(page, 1 << order, page_ext, iter) { struct my_page_ext *obj = get_my_page_ext_obj(page_ext); ... } rcu_read_unlock(); """ The loop construct uses page_ext_iter_next() which checks to see if we have crossed sections in the iteration. In this case, page_ext_iter_next() retrieves the next page_ext object from another section. Thanks to David Hildenbrand for helping identify the root cause and providing suggestions on how to fix and optmize the solution (final implementation and bugs are all mine through). Lastly, here's the backtrace, without kasan you can get random crashes: [ 76.052526] BUG: KASAN: slab-out-of-bounds in __update_page_owner_handle+0x238/0x298 [ 76.060283] Write of size 4 at addr ffff07ff96240038 by task tee/3598 [ 76.066714] [ 76.068203] CPU: 88 UID: 0 PID: 3598 Comm: tee Kdump: loaded Not tainted 6.13.0-rep1 #3 [ 76.076202] Hardware name: WIWYNN Mt.Jade Server System B81.030Z1.0007/Mt.Jade Motherboard, BIOS 2.10.20220810 (SCP: 2.10.20220810) 2022/08/10 [ 76.088972] Call trace: [ 76.091411] show_stack+0x20/0x38 (C) [ 76.095073] dump_stack_lvl+0x80/0xf8 [ 76.098733] print_address_description.constprop.0+0x88/0x398 [ 76.104476] print_report+0xa8/0x278 [ 76.108041] kasan_report+0xa8/0xf8 [ 76.111520] __asan_report_store4_noabort+0x20/0x30 [ 76.116391] __update_page_owner_handle+0x238/0x298 [ 76.121259] __set_page_owner+0xdc/0x140 [ 76.125173] post_alloc_hook+0x190/0x1d8 [ 76.129090] alloc_contig_range_noprof+0x54c/0x890 [ 76.133874] alloc_contig_pages_noprof+0x35c/0x4a8 [ 76.138656] alloc_gigantic_folio.isra.0+0x2c0/0x368 [ 76.143616] only_alloc_fresh_hugetlb_folio.isra.0+0x24/0x150 [ 76.149353] alloc_pool_huge_folio+0x11c/0x1f8 [ 76.153787] set_max_huge_pages+0x364/0xca8 [ 76.157961] __nr_hugepages_store_common+0xb0/0x1a0 [ 76.162829] nr_hugepages_store+0x108/0x118 [ 76.167003] kobj_attr_store+0x3c/0x70 [ 76.170745] sysfs_kf_write+0xfc/0x188 [ 76.174492] kernfs_fop_write_iter+0x274/0x3e0 [ 76.178927] vfs_write+0x64c/0x8e0 [ 76.182323] ksys_write+0xf8/0x1f0 [ 76.185716] __arm64_sys_write+0x74/0xb0 [ 76.189630] invoke_syscall.constprop.0+0xd8/0x1e0 [ 76.194412] do_el0_svc+0x164/0x1e0 [ 76.197891] el0_svc+0x40/0xe0 [ 76.200939] el0t_64_sync_handler+0x144/0x168 [ 76.205287] el0t_64_sync+0x1ac/0x1b0 Link: https://lkml.kernel.org/r/cover.1741301089.git.luizcap@redhat.com Link: https://lkml.kernel.org/r/a45893880b7e1601082d39d2c5c8b50bcc096305.1741301089.git.luizcap@redhat.com Fixes: cf54f310d0d3 ("mm/hugetlb: use __GFP_COMP for gigantic folios") Signed-off-by: Luiz Capitulino Acked-by: David Hildenbrand Cc: Johannes Weiner Cc: Luiz Capitulino Cc: Muchun Song Cc: Pasha Tatashin Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'include') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index e4b48a0dda24..76c817162d2f 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -3,6 +3,7 @@ #define __LINUX_PAGE_EXT_H #include +#include #include struct pglist_data; @@ -69,16 +70,31 @@ extern void page_ext_init(void); static inline void page_ext_init_flatmem_late(void) { } + +static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) +{ + /* + * page_ext is allocated per memory section. Once we cross a + * memory section, we have to fetch the new pointer. 
+ */ + return next_pfn % PAGES_PER_SECTION; +} #else extern void page_ext_init_flatmem(void); extern void page_ext_init_flatmem_late(void); static inline void page_ext_init(void) { } + +static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) +{ + return true; +} #endif extern struct page_ext *page_ext_get(const struct page *page); extern void page_ext_put(struct page_ext *page_ext); +extern struct page_ext *page_ext_lookup(unsigned long pfn); static inline void *page_ext_data(struct page_ext *page_ext, struct page_ext_operations *ops) @@ -93,6 +109,83 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr) return next; } +struct page_ext_iter { + unsigned long index; + unsigned long start_pfn; + struct page_ext *page_ext; +}; + +/** + * page_ext_iter_begin() - Prepare for iterating through page extensions. + * @iter: page extension iterator. + * @pfn: PFN of the page we're interested in. + * + * Must be called with RCU read lock taken. + * + * Return: NULL if no page_ext exists for this page. + */ +static inline struct page_ext *page_ext_iter_begin(struct page_ext_iter *iter, + unsigned long pfn) +{ + iter->index = 0; + iter->start_pfn = pfn; + iter->page_ext = page_ext_lookup(pfn); + + return iter->page_ext; +} + +/** + * page_ext_iter_next() - Get next page extension + * @iter: page extension iterator. + * + * Must be called with RCU read lock taken. + * + * Return: NULL if no next page_ext exists. + */ +static inline struct page_ext *page_ext_iter_next(struct page_ext_iter *iter) +{ + unsigned long pfn; + + if (WARN_ON_ONCE(!iter->page_ext)) + return NULL; + + iter->index++; + pfn = iter->start_pfn + iter->index; + + if (page_ext_iter_next_fast_possible(pfn)) + iter->page_ext = page_ext_next(iter->page_ext); + else + iter->page_ext = page_ext_lookup(pfn); + + return iter->page_ext; +} + +/** + * page_ext_iter_get() - Get current page extension + * @iter: page extension iterator. + * + * Return: NULL if no page_ext exists for this iterator. + */ +static inline struct page_ext *page_ext_iter_get(const struct page_ext_iter *iter) +{ + return iter->page_ext; +} + +/** + * for_each_page_ext(): iterate through page_ext objects. + * @__page: the page we're interested in + * @__pgcount: how many pages to iterate through + * @__page_ext: struct page_ext pointer where the current page_ext + * object is returned + * @__iter: struct page_ext_iter object (defined in the stack) + * + * IMPORTANT: must be called with RCU read lock taken. + */ +#define for_each_page_ext(__page, __pgcount, __page_ext, __iter) \ + for (__page_ext = page_ext_iter_begin(&__iter, page_to_pfn(__page));\ + __page_ext && __iter.index < __pgcount; \ + __page_ext = page_ext_iter_next(&__iter)) + #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; -- cgit v1.2.3 From 3fec86f8aa8c7ae84567a0d1396e84ada96141d8 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 7 Mar 2025 12:39:54 -0500 Subject: xarray: add xas_try_split() to split a multi-index entry Patch series "Buddy allocator like (or non-uniform) folio split", v10. This patchset adds a new buddy allocator like (or non-uniform) large folio split from a order-n folio to order-m with m < n. It reduces 1. the total number of after-split folios from 2^(n-m) to n-m+1; 2. the amount of memory needed for multi-index xarray split from 2^(n/6-m/6) to n/6-m/6, assuming XA_CHUNK_SHIFT=6; 3. keep more large folios after a split from all order-m folios to order-(n-1) to order-m folios. 
For example, to split an order-9 to order-0, folio split generates 10 (or 11 for anonymous memory) folios instead of 512, allocates 1 xa_node instead of 8, and leaves 1 order-8, 1 order-7, ..., 1 order-1 and 2 order-0 folios (or 4 order-0 for anonymous memory) instead of 512 order-0 folios. Instead of duplicating existing split_huge_page*() code, __folio_split() is introduced as the shared backend code for both split_huge_page_to_list_to_order() and folio_split(). __folio_split() can support both uniform split and buddy allocator like (or non-uniform) split. All existing split_huge_page*() users can be gradually converted to use folio_split() if possible. In this patchset, I converted truncate_inode_partial_folio() to use folio_split(). xfstests quick group passed for both tmpfs and xfs. I also semi-replicated Hugh's test[12] and ran it without any issue for almost 24 hours. This patch (of 8): A preparation patch for non-uniform folio split, which always split a folio into half iteratively, and minimal xarray entry split. Currently, xas_split_alloc() and xas_split() always split all slots from a multi-index entry. They cost the same number of xa_node as the to-be-split slots. For example, to split an order-9 entry, which takes 2^(9-6)=8 slots, assuming XA_CHUNK_SHIFT is 6 (!CONFIG_BASE_SMALL), 8 xa_node are needed. Instead xas_try_split() is intended to be used iteratively to split the order-9 entry into 2 order-8 entries, then split one order-8 entry, based on the given index, to 2 order-7 entries, ..., and split one order-1 entry to 2 order-0 entries. When splitting the order-6 entry and a new xa_node is needed, xas_try_split() will try to allocate one if possible. As a result, xas_try_split() would only need 1 xa_node instead of 8. When a new xa_node is needed during the split, xas_try_split() can try to allocate one but no more. -ENOMEM will be return if a node cannot be allocated. -EINVAL will be return if a sibling node is split or cascade split happens, where two or more new nodes are needed, and these are not supported by xas_try_split(). xas_split_alloc() and xas_split() split an order-9 to order-0: --------------------------------- | | | | | | | | | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | | | | | | | | | | --------------------------------- | | | | ------- --- --- ------- | | ... | | V V V V ----------- ----------- ----------- ----------- | xa_node | | xa_node | ... | xa_node | | xa_node | ----------- ----------- ----------- ----------- xas_try_split() splits an order-9 to order-0: --------------------------------- | | | | | | | | | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | | | | | | | | | | --------------------------------- | | V ----------- | xa_node | ----------- Link: https://lkml.kernel.org/r/20250307174001.242794-1-ziy@nvidia.com Link: https://lkml.kernel.org/r/20250307174001.242794-2-ziy@nvidia.com Signed-off-by: Zi Yan Cc: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: John Hubbard Cc: Kefeng Wang Cc: Kirill A. 
Shuemov Cc: Miaohe Lin Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zi Yan Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/xarray.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 0b618ec04115..4010195201c9 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1555,6 +1555,7 @@ int xa_get_order(struct xarray *, unsigned long index); int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); +void xas_try_split(struct xa_state *xas, void *entry, unsigned int order); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { @@ -1576,6 +1577,11 @@ static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } + +static inline void xas_try_split(struct xa_state *xas, void *entry, + unsigned int order) +{ +} #endif /** -- cgit v1.2.3 From 7460b470a131f985a70302a322617121efdd7caa Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 7 Mar 2025 12:40:00 -0500 Subject: mm/truncate: use folio_split() in truncate operation Instead of splitting the large folio uniformly during truncation, try to use buddy allocator like folio_split() at the start and the end of a truncation range to minimize the number of resulting folios if it is supported. try_folio_split() is introduced to use folio_split() if supported and it falls back to uniform split otherwise. For example, to truncate a order-4 folio [0, 1, 2, 3, 4, 5, ..., 15] between [3, 10] (inclusive), folio_split() splits the folio at 3 to [0,1], [2], [3], [4..7], [8..15] and [3], [4..7] can be dropped and [8..15] is kept with zeros in [8..10], then another folio_split() is done at 10, so [8..10] can be dropped. One possible optimization is to make folio_split() to split a folio based on a given range, like [3..10] above. But that complicates folio_split(), so it will be investigated when necessary. Link: https://lkml.kernel.org/r/20250226210032.2044041-8-ziy@nvidia.com Link: https://lkml.kernel.org/r/20250307174001.242794-8-ziy@nvidia.com Signed-off-by: Zi Yan Cc: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: John Hubbard Cc: Kefeng Wang Cc: Kirill A. Shuemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e57e811cfd3c..e893d546a49f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -345,6 +345,36 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); +bool uniform_split_supported(struct folio *folio, unsigned int new_order, + bool warns); +bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, + bool warns); +int folio_split(struct folio *folio, unsigned int new_order, struct page *page, + struct list_head *list); +/* + * try_folio_split - try to split a @folio at @page using non uniform split. 
+ * @folio: folio to be split + * @page: split to order-0 at the given page + * @list: store the after-split folios + * + * Try to split a @folio at @page using non uniform split to order-0, if + * non uniform split is not supported, fall back to uniform split. + * + * Return: 0: split is successful, otherwise split failed. + */ +static inline int try_folio_split(struct folio *folio, struct page *page, + struct list_head *list) +{ + int ret = min_order_for_split(folio); + + if (ret < 0) + return ret; + + if (!non_uniform_split_supported(folio, 0, false)) + return split_huge_page_to_list_to_order(&folio->page, list, + ret); + return folio_split(folio, ret, page, list); +} static inline int split_huge_page(struct page *page) { struct folio *folio = page_folio(page); @@ -537,6 +567,12 @@ static inline int split_folio_to_list(struct folio *folio, struct list_head *lis return 0; } +static inline int try_folio_split(struct folio *folio, struct page *page, + struct list_head *list) +{ + return 0; +} + static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) -- cgit v1.2.3 From 200a89c159a7a416115e6e309183c82183bf98aa Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 14 Mar 2025 18:21:12 -0400 Subject: mm/filemap: use xas_try_split() in __filemap_add_folio() Patch series "Minimize xa_node allocation during xarry split", v3. When splitting a multi-index entry in XArray from order-n to order-m, existing xas_split_alloc()+xas_split() approach requires 2^(n % XA_CHUNK_SHIFT) xa_node allocations. But its callers, __filemap_add_folio() and shmem_split_large_entry(), use at most 1 xa_node. To minimize xa_node allocation and remove the limitation of no split from order-12 (or above) to order-0 (or anything between 0 and 5)[1], xas_try_split() was added[2], which allocates (n / XA_CHUNK_SHIFT - m / XA_CHUNK_SHIFT) xa_node. It is used for non-uniform folio split, but can be used by __filemap_add_folio() and shmem_split_large_entry(). xas_split_alloc() and xas_split() split an order-9 to order-0: --------------------------------- | | | | | | | | | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | | | | | | | | | | --------------------------------- | | | | ------- --- --- ------- | | ... | | V V V V ----------- ----------- ----------- ----------- | xa_node | | xa_node | ... | xa_node | | xa_node | ----------- ----------- ----------- ----------- xas_try_split() splits an order-9 to order-0: --------------------------------- | | | | | | | | | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | | | | | | | | | | --------------------------------- | | V ----------- | xa_node | ----------- xas_try_split() is designed to be called iteratively with n = m + 1. xas_try_split_mini_order() is added to minmize the number of calls to xas_try_split() by telling the caller the next minimal order to split to instead of n - 1. Splitting order-n to order-m when m= l * XA_CHUNK_SHIFT does not require xa_node allocation and requires 1 xa_node when n=l * XA_CHUNK_SHIFT and m = n - 1, so it is OK to use xas_try_split() with n > m + 1 when no new xa_node is needed. xfstests quick group test passed on xfs and tmpfs. [1] https://lore.kernel.org/linux-mm/Z6YX3RznGLUD07Ao@casper.infradead.org/ [2] https://lore.kernel.org/linux-mm/20250226210032.2044041-1-ziy@nvidia.com/ This patch (of 2): During __filemap_add_folio(), a shadow entry is covering n slots and a folio covers m slots with m < n is to be added. 
Instead of splitting all n slots, only the m slots covered by the folio need to be split and the remaining n-m shadow entries can be retained with orders ranging from m to n-1. This method only requires (n/XA_CHUNK_SHIFT) - (m/XA_CHUNK_SHIFT) new xa_nodes instead of (n % XA_CHUNK_SHIFT) * ((n/XA_CHUNK_SHIFT) - (m/XA_CHUNK_SHIFT)) new xa_nodes, compared to the original xas_split_alloc() + xas_split() one. For example, to insert an order-0 folio when an order-9 shadow entry is present (assuming XA_CHUNK_SHIFT is 6), 1 xa_node is needed instead of 8. xas_try_split_min_order() is introduced to reduce the number of calls to xas_try_split() during split. Link: https://lkml.kernel.org/r/20250314222113.711703-1-ziy@nvidia.com Link: https://lkml.kernel.org/r/20250314222113.711703-2-ziy@nvidia.com Signed-off-by: Zi Yan Cc: Baolin Wang Cc: Hugh Dickins Cc: Kairui Song Cc: Miaohe Lin Cc: Mattew Wilcox Cc: David Hildenbrand Cc: John Hubbard Cc: Kefeng Wang Cc: Kirill A. Shuemov Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/xarray.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 4010195201c9..78eede109b1a 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1556,6 +1556,7 @@ int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); void xas_try_split(struct xa_state *xas, void *entry, unsigned int order); +unsigned int xas_try_split_min_order(unsigned int order); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { @@ -1582,6 +1583,12 @@ static inline void xas_try_split(struct xa_state *xas, void *entry, unsigned int order) { } + +static inline unsigned int xas_try_split_min_order(unsigned int order) +{ + return 0; +} + #endif /** -- cgit v1.2.3 From 61659efdb35ce6c6ac7639342098f3c4548b794b Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 12 Mar 2025 09:30:43 +1000 Subject: drivers/base/memory: improve add_boot_memory_block() Patch series "drivers/base/memory: Two cleanups", v3. Two cleanups to drivers/base/memory. This patch (of 2)L It's unnecessary to count the present sections for the specified block since the block will be added if any section in the block is present. Besides, for_each_present_section_nr() can be reused as Andrew Morton suggested. Improve by using for_each_present_section_nr() and dropping the unnecessary @section_count. No functional changes intended. Link: https://lkml.kernel.org/r/20250311233045.148943-1-gshan@redhat.com Link: https://lkml.kernel.org/r/20250311233045.148943-2-gshan@redhat.com Signed-off-by: Gavin Shan Acked-by: David Hildenbrand Acked-by: Oscar Salvador Cc: Danilo Krummrich Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 550dbba92521..dbb0ad69e17f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2140,6 +2140,11 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) return -1; } +#define for_each_present_section_nr(start, section_nr) \ + for (section_nr = next_present_section_nr(start - 1); \ + section_nr != -1; \ + section_nr = next_present_section_nr(section_nr)) + /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate -- cgit v1.2.3 From 1a24776fca39ed79d7f5e0b0592b5addd784981e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 12 Mar 2025 09:30:44 +1000 Subject: drivers/base/memory: correct the field name in the header Replace @blocks with @memory_blocks to match with the definition of struct memory_group. Link: https://lkml.kernel.org/r/20250311233045.148943-3-gshan@redhat.com Signed-off-by: Gavin Shan Acked-by: David Hildenbrand Acked-by: Oscar Salvador Cc: Danilo Krummrich Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton --- include/linux/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index c0afee5d126e..12daa6ec7d09 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -25,7 +25,7 @@ /** * struct memory_group - a logical group of memory blocks * @nid: The node id for all memory blocks inside the memory group. - * @blocks: List of all memory blocks belonging to this memory group. + * @memory_blocks: List of all memory blocks belonging to this memory group. * @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this * memory group. * @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this -- cgit v1.2.3 From 8c02048d1c6126527f15752a5e0849dc49cefeeb Mon Sep 17 00:00:00 2001 From: Martin Liu Date: Sat, 8 Mar 2025 03:46:00 +0000 Subject: mm/page_alloc: add trace event for per-zone watermark setup Patch series "Add tracepoints for lowmem reserves, watermarks and totalreserve_pages", v2. This patchset introduces tracepoints to track changes in the lowmem reserves, watermarks and totalreserve_pages. This helps to track the exact timing of such changes and understand their relation to reclaim activities. The tracepoints added are: mm_setup_per_zone_lowmem_reserve mm_setup_per_zone_wmarks mm_calculate_totalreserve_pagesi This patch (of 3): This commit introduces the `mm_setup_per_zone_wmarks` trace event, which provides detailed insights into the kernel's per-zone watermark configuration, offering precise timing and the ability to correlate watermark changes with specific kernel events. While `/proc/zoneinfo` provides some information about zone watermarks, this trace event offers: 1. The ability to link watermark changes to specific kernel events and logic. 2. The ability to capture rapid or short-lived changes in watermarks that may be missed by user-space polling 3. Diagnosing unexpected kswapd activity or excessive direct reclaim triggered by rapidly changing watermarks. 
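The diff below only adds the event definition to include/trace/events/kmem.h; the emitting call site lives in mm/page_alloc.c. A minimal sketch of how the hook would be invoked once the per-zone watermarks have been written (watermark math and locking omitted):

"""
	struct zone *zone;

	for_each_zone(zone) {
		/* ... recompute zone->_watermark[WMARK_MIN/LOW/HIGH/PROMO] ... */
		trace_mm_setup_per_zone_wmarks(zone);
	}
"""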
Link: https://lkml.kernel.org/r/20250308034606.2036033-1-liumartin@google.com Link: https://lkml.kernel.org/r/20250308034606.2036033-2-liumartin@google.com Signed-off-by: Martin Liu Acked-by: David Rientjes Cc: Steven Rostedt Cc: Martin Liu Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Signed-off-by: Andrew Morton --- include/trace/events/kmem.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index b37eb0a7060f..5fd392dae503 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -342,6 +342,39 @@ TRACE_EVENT(mm_alloc_contig_migrate_range_info, __entry->nr_mapped) ); +TRACE_EVENT(mm_setup_per_zone_wmarks, + + TP_PROTO(struct zone *zone), + + TP_ARGS(zone), + + TP_STRUCT__entry( + __field(int, node_id) + __string(name, zone->name) + __field(unsigned long, watermark_min) + __field(unsigned long, watermark_low) + __field(unsigned long, watermark_high) + __field(unsigned long, watermark_promo) + ), + + TP_fast_assign( + __entry->node_id = zone->zone_pgdat->node_id; + __assign_str(name); + __entry->watermark_min = zone->_watermark[WMARK_MIN]; + __entry->watermark_low = zone->_watermark[WMARK_LOW]; + __entry->watermark_high = zone->_watermark[WMARK_HIGH]; + __entry->watermark_promo = zone->_watermark[WMARK_PROMO]; + ), + + TP_printk("node_id=%d zone name=%s watermark min=%lu low=%lu high=%lu promo=%lu", + __entry->node_id, + __get_str(name), + __entry->watermark_min, + __entry->watermark_low, + __entry->watermark_high, + __entry->watermark_promo) +); + /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ -- cgit v1.2.3 From a293aba4a584709889f77a0ad0c45746aecf1b9f Mon Sep 17 00:00:00 2001 From: Martin Liu Date: Sat, 8 Mar 2025 03:46:01 +0000 Subject: mm/page_alloc: add trace event for per-zone lowmem reserve setup This commit introduces the `mm_setup_per_zone_lowmem_reserve` trace event,which provides detailed insights into the kernel's per-zone lowmem reserve configuration. The trace event provides precise timestamps, allowing developers to 1. Correlate lowmem reserve changes with specific kernel events and able to diagnose unexpected kswapd or direct reclaim behavior triggered by dynamic changes in lowmem reserve. 2. Know memory allocation failures that occur due to insufficient lowmem reserve, by precisely correlating allocation attempts with reserve adjustments. 
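As with the watermark event, only the event definition is part of this include/ diff; the call site would sit in setup_per_zone_lowmem_reserve() in mm/page_alloc.c. A sketch of the inner loop that reports each (zone, upper zone) pair, where pgdat, zone, i and j come from the enclosing loops and the reserve calculation itself is elided:

"""
	for (j = i + 1; j < MAX_NR_ZONES; j++) {
		struct zone *upper_zone = &pgdat->node_zones[j];

		/* ... update zone->lowmem_reserve[j] against upper_zone ... */
		trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone,
						       zone->lowmem_reserve[j]);
	}
"""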
Link: https://lkml.kernel.org/r/20250308034606.2036033-3-liumartin@google.com Signed-off-by: Martin Liu Acked-by: David Rientjes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/trace/events/kmem.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 5fd392dae503..9623e68d4d26 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -375,6 +375,33 @@ TRACE_EVENT(mm_setup_per_zone_wmarks, __entry->watermark_promo) ); +TRACE_EVENT(mm_setup_per_zone_lowmem_reserve, + + TP_PROTO(struct zone *zone, struct zone *upper_zone, long lowmem_reserve), + + TP_ARGS(zone, upper_zone, lowmem_reserve), + + TP_STRUCT__entry( + __field(int, node_id) + __string(name, zone->name) + __string(upper_name, upper_zone->name) + __field(long, lowmem_reserve) + ), + + TP_fast_assign( + __entry->node_id = zone->zone_pgdat->node_id; + __assign_str(name); + __assign_str(upper_name); + __entry->lowmem_reserve = lowmem_reserve; + ), + + TP_printk("node_id=%d zone name=%s upper_zone name=%s lowmem_reserve_pages=%ld", + __entry->node_id, + __get_str(name), + __get_str(upper_name), + __entry->lowmem_reserve) +); + /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ -- cgit v1.2.3 From 15766485e4a51bec2dcce304c089a95550720033 Mon Sep 17 00:00:00 2001 From: Martin Liu Date: Sat, 8 Mar 2025 03:46:02 +0000 Subject: mm/page_alloc: add trace event for totalreserve_pages calculation This commit introduces a new trace event, `mm_calculate_totalreserve_pages`, which reports the new reserve value at the exact time when it takes effect. The `totalreserve_pages` value represents the total amount of memory reserved across all zones and nodes in the system. This reserved memory is crucial for ensuring that critical kernel operations have access to sufficient memory, even under memory pressure. By tracing the `totalreserve_pages` value, developers can gain insights that how the total reserved memory changes over time. Link: https://lkml.kernel.org/r/20250308034606.2036033-4-liumartin@google.com Signed-off-by: Martin Liu Acked-by: David Rientjes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/trace/events/kmem.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 9623e68d4d26..f74925a6cf69 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -402,6 +402,24 @@ TRACE_EVENT(mm_setup_per_zone_lowmem_reserve, __entry->lowmem_reserve) ); +TRACE_EVENT(mm_calculate_totalreserve_pages, + + TP_PROTO(unsigned long totalreserve_pages), + + TP_ARGS(totalreserve_pages), + + TP_STRUCT__entry( + __field(unsigned long, totalreserve_pages) + ), + + TP_fast_assign( + __entry->totalreserve_pages = totalreserve_pages; + ), + + TP_printk("totalreserve_pages=%lu", __entry->totalreserve_pages) +); + + /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ -- cgit v1.2.3 From 67914ac08604345f620566ccf5bac87b40d5881d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Mar 2025 17:05:32 -0400 Subject: mm: compaction: push watermark into compaction_suitable() callers Patch series "mm: reliable huge page allocator". 
This series makes changes to the allocator and reclaim/compaction code to try harder to avoid fragmentation. As a result, this makes huge page allocations cheaper, more reliable and more sustainable. It's a subset of the huge page allocator RFC initially proposed here: https://lore.kernel.org/lkml/20230418191313.268131-1-hannes@cmpxchg.org/ The following results are from a kernel build test, with additional concurrent bursts of THP allocations on a memory-constrained system. Comparing before and after the changes over 15 runs: before after Hugealloc Time mean 52739.45 ( +0.00%) 28904.00 ( -45.19%) Hugealloc Time stddev 56541.26 ( +0.00%) 33464.37 ( -40.81%) Kbuild Real time 197.47 ( +0.00%) 196.59 ( -0.44%) Kbuild User time 1240.49 ( +0.00%) 1231.67 ( -0.71%) Kbuild System time 70.08 ( +0.00%) 59.10 ( -15.45%) THP fault alloc 46727.07 ( +0.00%) 63223.67 ( +35.30%) THP fault fallback 21910.60 ( +0.00%) 5412.47 ( -75.29%) Direct compact fail 195.80 ( +0.00%) 59.07 ( -69.48%) Direct compact success 7.93 ( +0.00%) 2.80 ( -57.46%) Direct compact success rate % 3.51 ( +0.00%) 3.99 ( +10.49%) Compact daemon scanned migrate 3369601.27 ( +0.00%) 2267500.33 ( -32.71%) Compact daemon scanned free 5075474.47 ( +0.00%) 2339773.00 ( -53.90%) Compact direct scanned migrate 161787.27 ( +0.00%) 47659.93 ( -70.54%) Compact direct scanned free 163467.53 ( +0.00%) 40729.67 ( -75.08%) Compact total migrate scanned 3531388.53 ( +0.00%) 2315160.27 ( -34.44%) Compact total free scanned 5238942.00 ( +0.00%) 2380502.67 ( -54.56%) Alloc stall 2371.07 ( +0.00%) 638.87 ( -73.02%) Pages kswapd scanned 2160926.73 ( +0.00%) 4002186.33 ( +85.21%) Pages kswapd reclaimed 533191.07 ( +0.00%) 718577.80 ( +34.77%) Pages direct scanned 400450.33 ( +0.00%) 355172.73 ( -11.31%) Pages direct reclaimed 94441.73 ( +0.00%) 31162.80 ( -67.00%) Pages total scanned 2561377.07 ( +0.00%) 4357359.07 ( +70.12%) Pages total reclaimed 627632.80 ( +0.00%) 749740.60 ( +19.46%) Swap out 47959.53 ( +0.00%) 110084.33 ( +129.53%) Swap in 7276.00 ( +0.00%) 24457.00 ( +236.10%) File refaults 138043.00 ( +0.00%) 188226.93 ( +36.35%) THP latencies are cut in half, and failure rates are cut by 75%. These metrics also hold up over time, while the vanilla kernel sees a steady downward trend in success rates with each subsequent run, owed to the cumulative effects of fragmentation. A more detailed discussion of results is in the patch changelogs. The patches first introduce a vm.defrag_mode sysctl, which enforces the existing ALLOC_NOFRAGMENT alloc flag until after reclaim and compaction have run. They then change kswapd and kcompactd to target pageblocks, which boosts success in the ALLOC_NOFRAGMENT hotpaths. Patches #1 and #2 are somewhat unrelated cleanups, but touch the same code and so are included here to avoid conflicts from re-ordering. This patch (of 5): compaction_suitable() hardcodes the min watermark, with a boost to the low watermark for costly orders. However, compaction_ready() requires order-0 at the high watermark. It currently checks the marks twice. Make the watermark a parameter to compaction_suitable() and have the callers pass in what they require: - compaction_zonelist_suitable() is used by the direct reclaim path, so use the min watermark. - compact_suit_allocation_order() has a watermark in context derived from cc->alloc_flags. The only quirk is that kcompactd doesn't initialize cc->alloc_flags explicitly. 
There is a direct check in kcompactd_do_work() that passes ALLOC_WMARK_MIN, but there is another check downstack in compact_zone() that ends up passing the unset alloc_flags. Since they default to 0, and that coincides with ALLOC_WMARK_MIN, it is correct. But it's subtle. Set cc->alloc_flags explicitly. - should_continue_reclaim() is direct reclaim, use the min watermark. - Finally, consolidate the two checks in compaction_ready() to a single compaction_suitable() call passing the high watermark. There is a tiny change in behavior: before, compaction_suitable() would check order-0 against min or low, depending on costly order. Then there'd be another high watermark check. Now, the high watermark is passed to compaction_suitable(), and the costly order-boost (low - min) is added on top. This means compaction_ready() sets a marginally higher target for free pages. In a kernelbuild + THP pressure test, though, this didn't show any measurable negative effects on memory pressure or reclaim rates. As the comment above the check says, reclaim is usually stopped short on should_continue_reclaim(), and this just defines the worst-case reclaim cutoff in case compaction is not making any headway. [hughd@google.com: stop oops on out-of-range highest_zoneidx] Link: https://lkml.kernel.org/r/005ace8b-07fa-01d4-b54b-394a3e029c07@google.com Link: https://lkml.kernel.org/r/20250313210647.1314586-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20250313210647.1314586-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Hugh Dickins Acked-by: Zi Yan Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/compaction.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 7bf0c521db63..173d9c07a895 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -95,7 +95,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern bool compaction_suitable(struct zone *zone, int order, - int highest_zoneidx); + unsigned long watermark, int highest_zoneidx); extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); @@ -113,7 +113,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) } static inline bool compaction_suitable(struct zone *zone, int order, - int highest_zoneidx) + unsigned long watermark, + int highest_zoneidx) { return false; } -- cgit v1.2.3 From a211c6550efcc87aa2459ca347bda10721c7a46a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Mar 2025 17:05:36 -0400 Subject: mm: page_alloc: defrag_mode kswapd/kcompactd watermarks The previous patch added pageblock_order reclaim to kswapd/kcompactd, which helps, but produces only one block at a time. Allocation stalls and THP failure rates are still higher than they could be. To adequately reflect ALLOC_NOFRAGMENT demand for pageblocks, change the watermarking for kswapd & kcompactd: instead of targeting the high watermark in order-0 pages and checking for one suitable block, simply require that the high watermark is entirely met in pageblocks. To this end, track the number of free pages within contiguous pageblocks, then change pgdat_balanced() and compact_finished() to check watermarks against this new value. 
This further reduces THP latencies and allocation stalls, and improves THP success rates against the previous patch: DEFRAGMODE-ASYNC DEFRAGMODE-ASYNC-WMARKS Hugealloc Time mean 34300.36 ( +0.00%) 28904.00 ( -15.73%) Hugealloc Time stddev 36390.42 ( +0.00%) 33464.37 ( -8.04%) Kbuild Real time 196.13 ( +0.00%) 196.59 ( +0.23%) Kbuild User time 1234.74 ( +0.00%) 1231.67 ( -0.25%) Kbuild System time 62.62 ( +0.00%) 59.10 ( -5.54%) THP fault alloc 57054.53 ( +0.00%) 63223.67 ( +10.81%) THP fault fallback 11581.40 ( +0.00%) 5412.47 ( -53.26%) Direct compact fail 107.80 ( +0.00%) 59.07 ( -44.79%) Direct compact success 4.53 ( +0.00%) 2.80 ( -31.33%) Direct compact success rate % 3.20 ( +0.00%) 3.99 ( +18.66%) Compact daemon scanned migrate 5461033.93 ( +0.00%) 2267500.33 ( -58.48%) Compact daemon scanned free 5824897.93 ( +0.00%) 2339773.00 ( -59.83%) Compact direct scanned migrate 58336.93 ( +0.00%) 47659.93 ( -18.30%) Compact direct scanned free 32791.87 ( +0.00%) 40729.67 ( +24.21%) Compact total migrate scanned 5519370.87 ( +0.00%) 2315160.27 ( -58.05%) Compact total free scanned 5857689.80 ( +0.00%) 2380502.67 ( -59.36%) Alloc stall 2424.60 ( +0.00%) 638.87 ( -73.62%) Pages kswapd scanned 2657018.33 ( +0.00%) 4002186.33 ( +50.63%) Pages kswapd reclaimed 559583.07 ( +0.00%) 718577.80 ( +28.41%) Pages direct scanned 722094.07 ( +0.00%) 355172.73 ( -50.81%) Pages direct reclaimed 107257.80 ( +0.00%) 31162.80 ( -70.95%) Pages total scanned 3379112.40 ( +0.00%) 4357359.07 ( +28.95%) Pages total reclaimed 666840.87 ( +0.00%) 749740.60 ( +12.43%) Swap out 77238.20 ( +0.00%) 110084.33 ( +42.53%) Swap in 11712.80 ( +0.00%) 24457.00 ( +108.80%) File refaults 143438.80 ( +0.00%) 188226.93 ( +31.22%) Also of note is that compaction work overall is reduced. The reason for this is that when free pageblocks are more readily available, allocations are also much more likely to get physically placed in LRU order, instead of being forced to scavenge free space here and there. This means that reclaim by itself has better chances of freeing up whole blocks, and the system relies less on compaction. 
Comparing all changes to the vanilla kernel: VANILLA DEFRAGMODE-ASYNC-WMARKS Hugealloc Time mean 52739.45 ( +0.00%) 28904.00 ( -45.19%) Hugealloc Time stddev 56541.26 ( +0.00%) 33464.37 ( -40.81%) Kbuild Real time 197.47 ( +0.00%) 196.59 ( -0.44%) Kbuild User time 1240.49 ( +0.00%) 1231.67 ( -0.71%) Kbuild System time 70.08 ( +0.00%) 59.10 ( -15.45%) THP fault alloc 46727.07 ( +0.00%) 63223.67 ( +35.30%) THP fault fallback 21910.60 ( +0.00%) 5412.47 ( -75.29%) Direct compact fail 195.80 ( +0.00%) 59.07 ( -69.48%) Direct compact success 7.93 ( +0.00%) 2.80 ( -57.46%) Direct compact success rate % 3.51 ( +0.00%) 3.99 ( +10.49%) Compact daemon scanned migrate 3369601.27 ( +0.00%) 2267500.33 ( -32.71%) Compact daemon scanned free 5075474.47 ( +0.00%) 2339773.00 ( -53.90%) Compact direct scanned migrate 161787.27 ( +0.00%) 47659.93 ( -70.54%) Compact direct scanned free 163467.53 ( +0.00%) 40729.67 ( -75.08%) Compact total migrate scanned 3531388.53 ( +0.00%) 2315160.27 ( -34.44%) Compact total free scanned 5238942.00 ( +0.00%) 2380502.67 ( -54.56%) Alloc stall 2371.07 ( +0.00%) 638.87 ( -73.02%) Pages kswapd scanned 2160926.73 ( +0.00%) 4002186.33 ( +85.21%) Pages kswapd reclaimed 533191.07 ( +0.00%) 718577.80 ( +34.77%) Pages direct scanned 400450.33 ( +0.00%) 355172.73 ( -11.31%) Pages direct reclaimed 94441.73 ( +0.00%) 31162.80 ( -67.00%) Pages total scanned 2561377.07 ( +0.00%) 4357359.07 ( +70.12%) Pages total reclaimed 627632.80 ( +0.00%) 749740.60 ( +19.46%) Swap out 47959.53 ( +0.00%) 110084.33 ( +129.53%) Swap in 7276.00 ( +0.00%) 24457.00 ( +236.10%) File refaults 138043.00 ( +0.00%) 188226.93 ( +36.35%) THP allocation latencies and %sys time are down dramatically. THP allocation failures are down from nearly 50% to 8.5%. And to recall previous data points, the success rates are steady and reliable without the cumulative deterioration of fragmentation events. Compaction work is down overall. Direct compaction work especially is drastically reduced. As an aside, its success rate of 4% indicates there is room for improvement. For now it's good to rely on it less. Reclaim work is up overall, however direct reclaim work is down. Part of the increase can be attributed to a higher use of THPs, which due to internal fragmentation increase the memory footprint. This is not necessarily an unexpected side-effect for users of THP. However, taken both points together, there may well be some opportunities for fine tuning in the reclaim/compaction coordination. 
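As a rough, hedged sketch only (the placement and surrounding function are assumed here, since the mm/ hunks of this patch fall outside the include/ view), the new balance check amounts to comparing the high watermark against whole free pageblocks rather than order-0 free pages:

        /* hedged sketch: zone and caller context are assumed; the helpers are existing ones */
        unsigned long free_block_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);

        if (free_block_pages < high_wmark_pages(zone))
                return false;   /* not balanced yet: keep reclaiming/compacting */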
[hannes@cmpxchg.org: fix squawks from rebasing] Link: https://lkml.kernel.org/r/20250314210558.GD1316033@cmpxchg.org Link: https://lkml.kernel.org/r/20250313210647.1314586-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dbb0ad69e17f..37c29f3fbca8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -138,6 +138,7 @@ enum numa_stat_item { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, + NR_FREE_PAGES_BLOCKS, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, -- cgit v1.2.3 From e36ba5ab808ef6237c3148d469c8238674230e2b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:23 -0700 Subject: iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC Introduce a new IOMMUFD_OBJ_VEVENTQ object for vIOMMU Event Queue that provides user space (VMM) another FD to read the vIOMMU Events. Allow a vIOMMU object to allocate vEVENTQs, with a condition that each vIOMMU can only have one single vEVENTQ per type. Add iommufd_veventq_alloc() with iommufd_veventq_ops for the new ioctl. Link: https://patch.msgid.link/r/21acf0751dd5c93846935ee06f93b9c65eff5e04.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 3 ++ include/uapi/linux/iommufd.h | 82 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) (limited to 'include') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 11110c749200..8948b1836940 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -34,6 +34,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_FAULT, IOMMUFD_OBJ_VIOMMU, IOMMUFD_OBJ_VDEVICE, + IOMMUFD_OBJ_VEVENTQ, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -93,6 +94,8 @@ struct iommufd_viommu { const struct iommufd_viommu_ops *ops; struct xarray vdevs; + struct list_head veventqs; + struct rw_semaphore veventqs_rwsem; unsigned int type; }; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 78747b24bd0f..dbb8787d9c63 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -55,6 +55,7 @@ enum { IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92, + IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93, }; /** @@ -1014,4 +1015,85 @@ struct iommu_ioas_change_process { #define IOMMU_IOAS_CHANGE_PROCESS \ _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS) +/** + * enum iommu_veventq_flag - flag for struct iommufd_vevent_header + * @IOMMU_VEVENTQ_FLAG_LOST_EVENTS: vEVENTQ has lost vEVENTs + */ +enum iommu_veventq_flag { + IOMMU_VEVENTQ_FLAG_LOST_EVENTS = (1U << 0), +}; + +/** + * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status + * @flags: Combination of enum iommu_veventq_flag + * @sequence: The sequence index of a vEVENT in the vEVENTQ, with a range of + * [0, INT_MAX] where the following index of INT_MAX is 0 + * + * Each iommufd_vevent_header reports a sequence index of the following vEVENT: + * ------------------------------------------------------------------------- + * | header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| 
dataN | + * ------------------------------------------------------------------------- + * And this sequence index is expected to be monotonic to the sequence index of + * the previous vEVENT. If two adjacent sequence indexes has a delta larger than + * 1, it means that delta - 1 number of vEVENTs has lost, e.g. two lost vEVENTs: + * ------------------------------------------------------------------------- + * | ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... | + * ------------------------------------------------------------------------- + * If a vEVENT lost at the tail of the vEVENTQ and there is no following vEVENT + * providing the next sequence index, an IOMMU_VEVENTQ_FLAG_LOST_EVENTS header + * would be added to the tail, and no data would follow this header: + * --------------------------------------------------------------------------- + * |..| header3 {sequence=3} | data3 | header4 {flags=LOST_EVENTS, sequence=4} | + * --------------------------------------------------------------------------- + */ +struct iommufd_vevent_header { + __u32 flags; + __u32 sequence; +}; + +/** + * enum iommu_veventq_type - Virtual Event Queue Type + * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use + */ +enum iommu_veventq_type { + IOMMU_VEVENTQ_TYPE_DEFAULT = 0, +}; + +/** + * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC) + * @size: sizeof(struct iommu_veventq_alloc) + * @flags: Must be 0 + * @viommu_id: virtual IOMMU ID to associate the vEVENTQ with + * @type: Type of the vEVENTQ. Must be defined in enum iommu_veventq_type + * @veventq_depth: Maximum number of events in the vEVENTQ + * @out_veventq_id: The ID of the new vEVENTQ + * @out_veventq_fd: The fd of the new vEVENTQ. User space must close the + * successfully returned fd after using it + * @__reserved: Must be 0 + * + * Explicitly allocate a virtual event queue interface for a vIOMMU. A vIOMMU + * can have multiple FDs for different types, but is confined to one per @type. + * User space should open the @out_veventq_fd to read vEVENTs out of a vEVENTQ, + * if there are vEVENTs available. A vEVENTQ will lose events due to overflow, + * if the number of the vEVENTs hits @veventq_depth. + * + * Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by + * a type-specific data structure, in a normal case: + * ------------------------------------------------------------- + * || header0 | data0 | header1 | data1 | ... | headerN | dataN || + * ------------------------------------------------------------- + * unless a tailing IOMMU_VEVENTQ_FLAG_LOST_EVENTS header is logged (refer to + * struct iommufd_vevent_header). + */ +struct iommu_veventq_alloc { + __u32 size; + __u32 flags; + __u32 viommu_id; + __u32 type; + __u32 veventq_depth; + __u32 out_veventq_id; + __u32 out_veventq_fd; + __u32 __reserved; +}; +#define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC) #endif -- cgit v1.2.3 From ea94b211c5483080b749c142090f4c4de4926e51 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:24 -0700 Subject: iommufd/viommu: Add iommufd_viommu_get_vdev_id helper This is a reverse search v.s. iommufd_viommu_find_dev, as drivers may want to convert a struct device pointer (physical) to its virtual device ID for an event injection to the user space VM. Again, this avoids exposing more core structures to the drivers, than the iommufd_viommu alone. 
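As a hedged illustration (the calling driver, its viommu and dev pointers, and the error handling are assumed rather than taken from this patch), the reverse lookup would be used roughly like this before forwarding an event:

        unsigned long vdev_id;
        int rc;

        /* translate the physical device into the ID the VMM knows it by */
        rc = iommufd_viommu_get_vdev_id(viommu, dev, &vdev_id);
        if (rc)
                return rc;      /* no vDEVICE registered for this device */
        /* vdev_id can now be placed into the event payload forwarded to user space */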
Link: https://patch.msgid.link/r/18b8e8bc1b8104d43b205d21602c036fd0804e56.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 8948b1836940..05cb393aff0a 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -190,6 +190,8 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, enum iommufd_object_type type); struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id); +int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, unsigned long *vdev_id); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ static inline struct iommufd_object * _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, @@ -203,6 +205,13 @@ iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id) { return NULL; } + +static inline int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, + unsigned long *vdev_id) +{ + return -ENOENT; +} #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ /* -- cgit v1.2.3 From e8e1ef9b77a7a09b7809890a52229f24d3c8b532 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:25 -0700 Subject: iommufd/viommu: Add iommufd_viommu_report_event helper Similar to iommu_report_device_fault, this allows IOMMU drivers to report vIOMMU events from threaded IRQ handlers to user space hypervisors. Link: https://patch.msgid.link/r/44be825042c8255e75d0151b338ffd8ba0e4920b.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 05cb393aff0a..60eff9272551 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -11,6 +11,7 @@ #include #include #include +#include struct device; struct file; @@ -192,6 +193,9 @@ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id); int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, struct device *dev, unsigned long *vdev_id); +int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, void *event_data, + size_t data_len); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ static inline struct iommufd_object * _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, @@ -212,6 +216,13 @@ static inline int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, { return -ENOENT; } + +static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, + void *event_data, size_t data_len) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ /* -- cgit v1.2.3 From e7d3fa3d29d5b2ed12d247cf57a0a34fffe89eb8 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:31 -0700 Subject: iommu/arm-smmu-v3: Report events that belong to devices attached to vIOMMU Aside from the IOPF framework, iommufd provides an additional pathway to report hardware events, via the vEVENTQ of vIOMMU infrastructure. Define an iommu_vevent_arm_smmuv3 uAPI structure, and report stage-1 events in the threaded IRQ handler. 
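In rough, hedged form (the viommu lookup and the rewrite of the StreamID field are assumed here, not quoted from the driver change), that reporting call from the threaded IRQ handler might look like:

        struct iommu_vevent_arm_smmuv3 vevt;

        memcpy(vevt.evt, evt, sizeof(vevt.evt));
        /* assumed step: replace the physical StreamID with the virtual device ID */
        iommufd_viommu_report_event(viommu, IOMMU_VEVENTQ_TYPE_ARM_SMMUV3,
                                    &vevt, sizeof(vevt));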
Also, add another four event record types that can be forwarded to a VM. Link: https://patch.msgid.link/r/5cf6719682fdfdabffdb08374cdf31ad2466d75a.1741719725.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Acked-by: Will Deacon Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index dbb8787d9c63..8719d4f5d618 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -1054,9 +1054,32 @@ struct iommufd_vevent_header { /** * enum iommu_veventq_type - Virtual Event Queue Type * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use + * @IOMMU_VEVENTQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event Queue */ enum iommu_veventq_type { IOMMU_VEVENTQ_TYPE_DEFAULT = 0, + IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 = 1, +}; + +/** + * struct iommu_vevent_arm_smmuv3 - ARM SMMUv3 Virtual Event + * (IOMMU_VEVENTQ_TYPE_ARM_SMMUV3) + * @evt: 256-bit ARM SMMUv3 Event record, little-endian. + * Reported event records: (Refer to "7.3 Event records" in SMMUv3 HW Spec) + * - 0x04 C_BAD_STE + * - 0x06 F_STREAM_DISABLED + * - 0x08 C_BAD_SUBSTREAMID + * - 0x0a C_BAD_CD + * - 0x10 F_TRANSLATION + * - 0x11 F_ADDR_SIZE + * - 0x12 F_ACCESS + * - 0x13 F_PERMISSION + * + * StreamID field reports a virtual device ID. To receive a virtual event for a + * device, a vDEVICE must be allocated via IOMMU_VDEVICE_ALLOC. + */ +struct iommu_vevent_arm_smmuv3 { + __aligned_le64 evt[4]; }; /** -- cgit v1.2.3 From e794dc30be8b69e44646a162db09b43f719525b7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 13 Feb 2025 16:07:15 +0200 Subject: i2c: Introduce i2c_10bit_addr_*_from_msg() helpers There are already a lot of drivers that have been using i2c_8bit_addr_from_msg() for 7-bit addresses, now it's time to have the similar for 10-bit addresses. Signed-off-by: Andy Shevchenko Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250213141045.2716943-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andi Shyti --- include/linux/i2c.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 2b2af24d2a43..1e35b1e23165 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -952,6 +952,21 @@ static inline u8 i2c_8bit_addr_from_msg(const struct i2c_msg *msg) return (msg->addr << 1) | (msg->flags & I2C_M_RD); } +/* + * 10-bit address + * addr_1: 5'b11110 | addr[9:8] | (R/nW) + * addr_2: addr[7:0] + */ +static inline u8 i2c_10bit_addr_hi_from_msg(const struct i2c_msg *msg) +{ + return 0xf0 | ((msg->addr & GENMASK(9, 8)) >> 7) | (msg->flags & I2C_M_RD); +} + +static inline u8 i2c_10bit_addr_lo_from_msg(const struct i2c_msg *msg) +{ + return msg->addr & GENMASK(7, 0); +} + u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold); void i2c_put_dma_safe_msg_buf(u8 *buf, struct i2c_msg *msg, bool xferred); -- cgit v1.2.3 From a72f418703b55072d837b72ac87091e18049260c Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:16 +0100 Subject: tty: convert "TTY Struct Flags" to an enum Convert TTY_* macros (flags) to an enum. This allows for easier kernel-doc (the comment needed fine tuning), grouping of these nicely, and proper checking. Note that these are bit positions. So they are used such as test_bit(TTY_THROTTLED, ...). 
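For instance, a minimal hedged sketch of that pattern, with the surrounding caller assumed:

        if (test_bit(TTY_IO_ERROR, &tty->flags))
                return -EIO;            /* documented semantic of TTY_IO_ERROR */
        set_bit(TTY_THROTTLED, &tty->flags);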
Given these are not the user API (only in-kernel API/ABI), the bit positions are NOT preserved in this patch. All are renumbered naturally using the enum-auto-numbering. Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-2-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/tty.h b/include/linux/tty.h index 2372f9357240..6bb4fb3845f0 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -251,7 +251,7 @@ struct tty_file_private { }; /** - * DOC: TTY Struct Flags + * enum tty_struct_flags - TTY Struct Flags * * These bits are used in the :c:member:`tty_struct.flags` field. * @@ -260,62 +260,64 @@ struct tty_file_private { * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. * - * TTY_THROTTLED + * @TTY_THROTTLED: * Driver input is throttled. The ldisc should call * :c:member:`tty_driver.unthrottle()` in order to resume reception when * it is ready to process more data (at threshold min). * - * TTY_IO_ERROR + * @TTY_IO_ERROR: * If set, causes all subsequent userspace read/write calls on the tty to * fail, returning -%EIO. (May be no ldisc too.) * - * TTY_OTHER_CLOSED + * @TTY_OTHER_CLOSED: * Device is a pty and the other side has closed. * - * TTY_EXCLUSIVE + * @TTY_EXCLUSIVE: * Exclusive open mode (a single opener). * - * TTY_DO_WRITE_WAKEUP + * @TTY_DO_WRITE_WAKEUP: * If set, causes the driver to call the * :c:member:`tty_ldisc_ops.write_wakeup()` method in order to resume * transmission when it can accept more data to transmit. * - * TTY_LDISC_OPEN + * @TTY_LDISC_OPEN: * Indicates that a line discipline is open. For debugging purposes only. * - * TTY_PTY_LOCK + * @TTY_PTY_LOCK: * A flag private to pty code to implement %TIOCSPTLCK/%TIOCGPTLCK logic. * - * TTY_NO_WRITE_SPLIT + * @TTY_NO_WRITE_SPLIT: * Prevent driver from splitting up writes into smaller chunks (preserve * write boundaries to driver). * - * TTY_HUPPED + * @TTY_HUPPED: * The TTY was hung up. This is set post :c:member:`tty_driver.hangup()`. * - * TTY_HUPPING + * @TTY_HUPPING: * The TTY is in the process of hanging up to abort potential readers. * - * TTY_LDISC_CHANGING + * @TTY_LDISC_CHANGING: * Line discipline for this TTY is being changed. I/O should not block * when this is set. Use tty_io_nonblock() to check. * - * TTY_LDISC_HALTED + * @TTY_LDISC_HALTED: * Line discipline for this TTY was stopped. No work should be queued to * this ldisc. */ -#define TTY_THROTTLED 0 -#define TTY_IO_ERROR 1 -#define TTY_OTHER_CLOSED 2 -#define TTY_EXCLUSIVE 3 -#define TTY_DO_WRITE_WAKEUP 5 -#define TTY_LDISC_OPEN 11 -#define TTY_PTY_LOCK 16 -#define TTY_NO_WRITE_SPLIT 17 -#define TTY_HUPPED 18 -#define TTY_HUPPING 19 -#define TTY_LDISC_CHANGING 20 -#define TTY_LDISC_HALTED 22 +enum tty_struct_flags { + TTY_THROTTLED, + TTY_IO_ERROR, + TTY_OTHER_CLOSED, + TTY_EXCLUSIVE, + TTY_DO_WRITE_WAKEUP, + TTY_LDISC_OPEN, + TTY_PTY_LOCK, + TTY_NO_WRITE_SPLIT, + TTY_HUPPED, + TTY_HUPPING, + TTY_LDISC_CHANGING, + TTY_LDISC_HALTED, +}; static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { -- cgit v1.2.3 From d2e38e713bada24539b2b049f9be8d40dea79f41 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:19 +0100 Subject: tty: move N_TTY_BUF_SIZE to n_tty "N_TTY_BUF_SIZE" is private to n_tty and shall not be exposed to the world. 
Definitely not in tty.h somewhere in the middle of "struct tty_struct". This is a remnant of moving "read_flags" to "struct n_tty_data" in commit 3fe780b379fa ("TTY: move ldisc data from tty_struct: bitmaps"). But some cleanup was needed first (in previous patches). Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-5-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/tty.h b/include/linux/tty.h index 6bb4fb3845f0..0a46e4054dec 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -239,7 +239,6 @@ struct tty_struct { struct list_head tty_files; -#define N_TTY_BUF_SIZE 4096 struct work_struct SAK_work; } __randomize_layout; -- cgit v1.2.3 From 67acbcc324272ed06cd1c8f360afdeceb919af89 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:27 +0100 Subject: tty: tty_driver: move TTY macros to the top So that they can be referenced in structs once converted to enums (in the next patches). Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-13-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 156 ++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index d4cdc089f6c3..f3be6d56e9e5 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -16,6 +16,84 @@ struct tty_driver; struct serial_icounter_struct; struct serial_struct; +/** + * DOC: TTY Driver Flags + * + * TTY_DRIVER_RESET_TERMIOS + * Requests the tty layer to reset the termios setting when the last + * process has closed the device. Used for PTYs, in particular. + * + * TTY_DRIVER_REAL_RAW + * Indicates that the driver will guarantee not to set any special + * character handling flags if this is set for the tty: + * + * ``(IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || !INPCK)`` + * + * That is, if there is no reason for the driver to + * send notifications of parity and break characters up to the line + * driver, it won't do so. This allows the line driver to optimize for + * this case if this flag is set. (Note that there is also a promise, if + * the above case is true, not to signal overruns, either.) + * + * TTY_DRIVER_DYNAMIC_DEV + * The individual tty devices need to be registered with a call to + * tty_register_device() when the device is found in the system and + * unregistered with a call to tty_unregister_device() so the devices will + * be show up properly in sysfs. If not set, all &tty_driver.num entries + * will be created by the tty core in sysfs when tty_register_driver() is + * called. This is to be used by drivers that have tty devices that can + * appear and disappear while the main tty driver is registered with the + * tty core. + * + * TTY_DRIVER_DEVPTS_MEM + * Don't use the standard arrays (&tty_driver.ttys and + * &tty_driver.termios), instead use dynamic memory keyed through the + * devpts filesystem. This is only applicable to the PTY driver. + * + * TTY_DRIVER_HARDWARE_BREAK + * Hardware handles break signals. Pass the requested timeout to the + * &tty_operations.break_ctl instead of using a simple on/off interface. + * + * TTY_DRIVER_DYNAMIC_ALLOC + * Do not allocate structures which are needed per line for this driver + * (&tty_driver.ports) as it would waste memory. The driver will take + * care. 
This is only applicable to the PTY driver. + * + * TTY_DRIVER_UNNUMBERED_NODE + * Do not create numbered ``/dev`` nodes. For example, create + * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a + * driver for a single tty device is being allocated. + */ +#define TTY_DRIVER_INSTALLED 0x0001 +#define TTY_DRIVER_RESET_TERMIOS 0x0002 +#define TTY_DRIVER_REAL_RAW 0x0004 +#define TTY_DRIVER_DYNAMIC_DEV 0x0008 +#define TTY_DRIVER_DEVPTS_MEM 0x0010 +#define TTY_DRIVER_HARDWARE_BREAK 0x0020 +#define TTY_DRIVER_DYNAMIC_ALLOC 0x0040 +#define TTY_DRIVER_UNNUMBERED_NODE 0x0080 + +/* tty driver types */ +#define TTY_DRIVER_TYPE_SYSTEM 0x0001 +#define TTY_DRIVER_TYPE_CONSOLE 0x0002 +#define TTY_DRIVER_TYPE_SERIAL 0x0003 +#define TTY_DRIVER_TYPE_PTY 0x0004 +#define TTY_DRIVER_TYPE_SCC 0x0005 /* scc driver */ +#define TTY_DRIVER_TYPE_SYSCONS 0x0006 + +/* system subtypes (magic, used by tty_io.c) */ +#define SYSTEM_TYPE_TTY 0x0001 +#define SYSTEM_TYPE_CONSOLE 0x0002 +#define SYSTEM_TYPE_SYSCONS 0x0003 +#define SYSTEM_TYPE_SYSPTMX 0x0004 + +/* pty subtypes (magic, used by tty_io.c) */ +#define PTY_TYPE_MASTER 0x0001 +#define PTY_TYPE_SLAVE 0x0002 + +/* serial subtype definitions */ +#define SERIAL_TYPE_NORMAL 1 + /** * struct tty_operations -- interface between driver and tty * @@ -494,84 +572,6 @@ static inline void tty_set_operations(struct tty_driver *driver, driver->ops = op; } -/** - * DOC: TTY Driver Flags - * - * TTY_DRIVER_RESET_TERMIOS - * Requests the tty layer to reset the termios setting when the last - * process has closed the device. Used for PTYs, in particular. - * - * TTY_DRIVER_REAL_RAW - * Indicates that the driver will guarantee not to set any special - * character handling flags if this is set for the tty: - * - * ``(IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || !INPCK)`` - * - * That is, if there is no reason for the driver to - * send notifications of parity and break characters up to the line - * driver, it won't do so. This allows the line driver to optimize for - * this case if this flag is set. (Note that there is also a promise, if - * the above case is true, not to signal overruns, either.) - * - * TTY_DRIVER_DYNAMIC_DEV - * The individual tty devices need to be registered with a call to - * tty_register_device() when the device is found in the system and - * unregistered with a call to tty_unregister_device() so the devices will - * be show up properly in sysfs. If not set, all &tty_driver.num entries - * will be created by the tty core in sysfs when tty_register_driver() is - * called. This is to be used by drivers that have tty devices that can - * appear and disappear while the main tty driver is registered with the - * tty core. - * - * TTY_DRIVER_DEVPTS_MEM - * Don't use the standard arrays (&tty_driver.ttys and - * &tty_driver.termios), instead use dynamic memory keyed through the - * devpts filesystem. This is only applicable to the PTY driver. - * - * TTY_DRIVER_HARDWARE_BREAK - * Hardware handles break signals. Pass the requested timeout to the - * &tty_operations.break_ctl instead of using a simple on/off interface. - * - * TTY_DRIVER_DYNAMIC_ALLOC - * Do not allocate structures which are needed per line for this driver - * (&tty_driver.ports) as it would waste memory. The driver will take - * care. This is only applicable to the PTY driver. - * - * TTY_DRIVER_UNNUMBERED_NODE - * Do not create numbered ``/dev`` nodes. For example, create - * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. 
Applicable only when a - * driver for a single tty device is being allocated. - */ -#define TTY_DRIVER_INSTALLED 0x0001 -#define TTY_DRIVER_RESET_TERMIOS 0x0002 -#define TTY_DRIVER_REAL_RAW 0x0004 -#define TTY_DRIVER_DYNAMIC_DEV 0x0008 -#define TTY_DRIVER_DEVPTS_MEM 0x0010 -#define TTY_DRIVER_HARDWARE_BREAK 0x0020 -#define TTY_DRIVER_DYNAMIC_ALLOC 0x0040 -#define TTY_DRIVER_UNNUMBERED_NODE 0x0080 - -/* tty driver types */ -#define TTY_DRIVER_TYPE_SYSTEM 0x0001 -#define TTY_DRIVER_TYPE_CONSOLE 0x0002 -#define TTY_DRIVER_TYPE_SERIAL 0x0003 -#define TTY_DRIVER_TYPE_PTY 0x0004 -#define TTY_DRIVER_TYPE_SCC 0x0005 /* scc driver */ -#define TTY_DRIVER_TYPE_SYSCONS 0x0006 - -/* system subtypes (magic, used by tty_io.c) */ -#define SYSTEM_TYPE_TTY 0x0001 -#define SYSTEM_TYPE_CONSOLE 0x0002 -#define SYSTEM_TYPE_SYSCONS 0x0003 -#define SYSTEM_TYPE_SYSPTMX 0x0004 - -/* pty subtypes (magic, used by tty_io.c) */ -#define PTY_TYPE_MASTER 0x0001 -#define PTY_TYPE_SLAVE 0x0002 - -/* serial subtype definitions */ -#define SERIAL_TYPE_NORMAL 1 - int tty_register_driver(struct tty_driver *driver); void tty_unregister_driver(struct tty_driver *driver); struct device *tty_register_device(struct tty_driver *driver, unsigned index, -- cgit v1.2.3 From 109e06ae1dcf41d470705c54a67b123792424279 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:28 +0100 Subject: tty: tty_driver: convert "TTY Driver Flags" to an enum Convert TTY_DRIVER_* macros (flags) to an enum. This allows for easier kernel-doc (the comment needed fine tuning), grouping of these nicely, and proper checking. Given these are flags, define them using modern BIT() instead of hex constants. It turns out (thanks, kernel-doc checker) that internal TTY_DRIVER_INSTALLED was undocumented. Fix that too. Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-14-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index f3be6d56e9e5..d0d940236580 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -17,13 +17,19 @@ struct serial_icounter_struct; struct serial_struct; /** - * DOC: TTY Driver Flags + * enum tty_driver_flag -- TTY Driver Flags * - * TTY_DRIVER_RESET_TERMIOS + * These are flags passed to tty_alloc_driver(). + * + * @TTY_DRIVER_INSTALLED: + * Whether this driver was succesfully installed. This is a tty internal + * flag. Do not touch. + * + * @TTY_DRIVER_RESET_TERMIOS: * Requests the tty layer to reset the termios setting when the last * process has closed the device. Used for PTYs, in particular. * - * TTY_DRIVER_REAL_RAW + * @TTY_DRIVER_REAL_RAW: * Indicates that the driver will guarantee not to set any special * character handling flags if this is set for the tty: * @@ -35,7 +41,7 @@ struct serial_struct; * this case if this flag is set. (Note that there is also a promise, if * the above case is true, not to signal overruns, either.) * - * TTY_DRIVER_DYNAMIC_DEV + * @TTY_DRIVER_DYNAMIC_DEV: * The individual tty devices need to be registered with a call to * tty_register_device() when the device is found in the system and * unregistered with a call to tty_unregister_device() so the devices will @@ -45,33 +51,35 @@ struct serial_struct; * appear and disappear while the main tty driver is registered with the * tty core. 
* - * TTY_DRIVER_DEVPTS_MEM + * @TTY_DRIVER_DEVPTS_MEM: * Don't use the standard arrays (&tty_driver.ttys and * &tty_driver.termios), instead use dynamic memory keyed through the * devpts filesystem. This is only applicable to the PTY driver. * - * TTY_DRIVER_HARDWARE_BREAK + * @TTY_DRIVER_HARDWARE_BREAK: * Hardware handles break signals. Pass the requested timeout to the * &tty_operations.break_ctl instead of using a simple on/off interface. * - * TTY_DRIVER_DYNAMIC_ALLOC + * @TTY_DRIVER_DYNAMIC_ALLOC: * Do not allocate structures which are needed per line for this driver * (&tty_driver.ports) as it would waste memory. The driver will take * care. This is only applicable to the PTY driver. * - * TTY_DRIVER_UNNUMBERED_NODE + * @TTY_DRIVER_UNNUMBERED_NODE: * Do not create numbered ``/dev`` nodes. For example, create * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a * driver for a single tty device is being allocated. */ -#define TTY_DRIVER_INSTALLED 0x0001 -#define TTY_DRIVER_RESET_TERMIOS 0x0002 -#define TTY_DRIVER_REAL_RAW 0x0004 -#define TTY_DRIVER_DYNAMIC_DEV 0x0008 -#define TTY_DRIVER_DEVPTS_MEM 0x0010 -#define TTY_DRIVER_HARDWARE_BREAK 0x0020 -#define TTY_DRIVER_DYNAMIC_ALLOC 0x0040 -#define TTY_DRIVER_UNNUMBERED_NODE 0x0080 +enum tty_driver_flag { + TTY_DRIVER_INSTALLED = BIT(0), + TTY_DRIVER_RESET_TERMIOS = BIT(1), + TTY_DRIVER_REAL_RAW = BIT(2), + TTY_DRIVER_DYNAMIC_DEV = BIT(3), + TTY_DRIVER_DEVPTS_MEM = BIT(4), + TTY_DRIVER_HARDWARE_BREAK = BIT(5), + TTY_DRIVER_DYNAMIC_ALLOC = BIT(6), + TTY_DRIVER_UNNUMBERED_NODE = BIT(7), +}; /* tty driver types */ #define TTY_DRIVER_TYPE_SYSTEM 0x0001 -- cgit v1.2.3 From 63f3cd5d80d720fc6c9fd321138bbbf322ade392 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:29 +0100 Subject: tty: tty_driver: document both {,__}tty_alloc_driver() properly __tty_alloc_driver()'s kernel-doc needed some care: describe the return value using the standard "Returns:", and use the new enum tty_driver_flag for @flags. Then, the tty_alloc_driver() macro was undocumented, but referenced many times in the docs. Copy the docs from the above (except the @owner parameter, obviously). Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-15-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index d0d940236580..0fe38befa1b8 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -564,7 +564,13 @@ struct tty_driver *tty_find_polling_driver(char *name, int *line); void tty_driver_kref_put(struct tty_driver *driver); -/* Use TTY_DRIVER_* flags below */ +/** + * tty_alloc_driver - allocate tty driver + * @lines: count of lines this driver can handle at most + * @flags: some of enum tty_driver_flag, will be set in driver->flags + * + * Returns: struct tty_driver or a PTR-encoded error (use IS_ERR() and friends). + */ #define tty_alloc_driver(lines, flags) \ __tty_alloc_driver(lines, THIS_MODULE, flags) -- cgit v1.2.3 From 52443558adcdb11cff76beec34bf75f6779e1a08 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:30 +0100 Subject: tty: tty_driver: introduce TTY driver sub/types enums Convert TTY_DRIVER_TYPE_*, and subtype macros to two enums: tty_driver_type and tty_driver_subtype. 
This allows for easier kernel-doc (later), grouping of these nicely, and proper checking. The tty_driver's ::type and ::subtype now use these enums instead of bare "short". Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-16-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 0fe38befa1b8..188ee9b768eb 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -81,26 +81,26 @@ enum tty_driver_flag { TTY_DRIVER_UNNUMBERED_NODE = BIT(7), }; -/* tty driver types */ -#define TTY_DRIVER_TYPE_SYSTEM 0x0001 -#define TTY_DRIVER_TYPE_CONSOLE 0x0002 -#define TTY_DRIVER_TYPE_SERIAL 0x0003 -#define TTY_DRIVER_TYPE_PTY 0x0004 -#define TTY_DRIVER_TYPE_SCC 0x0005 /* scc driver */ -#define TTY_DRIVER_TYPE_SYSCONS 0x0006 +enum tty_driver_type { + TTY_DRIVER_TYPE_SYSTEM, + TTY_DRIVER_TYPE_CONSOLE, + TTY_DRIVER_TYPE_SERIAL, + TTY_DRIVER_TYPE_PTY, + TTY_DRIVER_TYPE_SCC, + TTY_DRIVER_TYPE_SYSCONS, +}; -/* system subtypes (magic, used by tty_io.c) */ -#define SYSTEM_TYPE_TTY 0x0001 -#define SYSTEM_TYPE_CONSOLE 0x0002 -#define SYSTEM_TYPE_SYSCONS 0x0003 -#define SYSTEM_TYPE_SYSPTMX 0x0004 +enum tty_driver_subtype { + SYSTEM_TYPE_TTY = 1, + SYSTEM_TYPE_CONSOLE, + SYSTEM_TYPE_SYSCONS, + SYSTEM_TYPE_SYSPTMX, -/* pty subtypes (magic, used by tty_io.c) */ -#define PTY_TYPE_MASTER 0x0001 -#define PTY_TYPE_SLAVE 0x0002 + PTY_TYPE_MASTER = 1, + PTY_TYPE_SLAVE, -/* serial subtype definitions */ -#define SERIAL_TYPE_NORMAL 1 + SERIAL_TYPE_NORMAL = 1, +}; /** * struct tty_operations -- interface between driver and tty @@ -500,8 +500,8 @@ struct tty_operations { * @major: major /dev device number (zero for autoassignment) * @minor_start: the first minor /dev device number * @num: number of devices allocated - * @type: type of tty driver (%TTY_DRIVER_TYPE_) - * @subtype: subtype of tty driver (%SYSTEM_TYPE_, %PTY_TYPE_, %SERIAL_TYPE_) + * @type: type of tty driver (enum tty_driver_type) + * @subtype: subtype of tty driver (enum tty_driver_subtype) * @init_termios: termios to set to each tty initially (e.g. %tty_std_termios) * @flags: tty driver flags (%TTY_DRIVER_) * @proc_entry: proc fs entry, used internally @@ -533,8 +533,8 @@ struct tty_driver { int major; int minor_start; unsigned int num; - short type; - short subtype; + enum tty_driver_type type; + enum tty_driver_subtype subtype; struct ktermios init_termios; unsigned long flags; struct proc_dir_entry *proc_entry; -- cgit v1.2.3 From 5af89030960fd987a6c25e33405bbaaff3c7298c Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:31 +0100 Subject: tty: serdev: drop serdev_controller_ops::write_room() In particular, serdev_device_write_room() is not called, so the whole serdev's write_room() can go. 
Signed-off-by: Jiri Slaby (SUSE) Cc: Rob Herring Link: https://lore.kernel.org/r/20250317070046.24386-17-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index ff78efc1f60d..34562eb99931 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -84,7 +84,6 @@ enum serdev_parity { struct serdev_controller_ops { ssize_t (*write_buf)(struct serdev_controller *, const u8 *, size_t); void (*write_flush)(struct serdev_controller *); - int (*write_room)(struct serdev_controller *); int (*open)(struct serdev_controller *); void (*close)(struct serdev_controller *); void (*set_flow_control)(struct serdev_controller *, bool); @@ -212,7 +211,6 @@ int serdev_device_break_ctl(struct serdev_device *serdev, int break_state); void serdev_device_write_wakeup(struct serdev_device *); ssize_t serdev_device_write(struct serdev_device *, const u8 *, size_t, long); void serdev_device_write_flush(struct serdev_device *); -int serdev_device_write_room(struct serdev_device *); /* * serdev device driver functions */ @@ -273,10 +271,6 @@ static inline ssize_t serdev_device_write(struct serdev_device *sdev, return -ENODEV; } static inline void serdev_device_write_flush(struct serdev_device *sdev) {} -static inline int serdev_device_write_room(struct serdev_device *sdev) -{ - return 0; -} #define serdev_device_driver_register(x) #define serdev_device_driver_unregister(x) -- cgit v1.2.3 From c25951eb7518844fcb7fc9ec58e888731e8c46d0 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 15 Nov 2024 15:20:55 +0000 Subject: bus: fsl-mc: Remove deadcode fsl_mc_allocator_driver_exit() was added explicitly by commit 1e8ac83b6caf ("bus: fsl-mc: add fsl_mc_allocator cleanup function") but was never used. Remove it. fsl_mc_portal_reset() was added in 2015 by commit 197f4d6a4a00 ("staging: fsl-mc: fsl-mc object allocator driver") but was never used. Remove it. fsl_mc_portal_reset() was the only caller of dpmcp_reset(). Remove it. Signed-off-by: Dr. David Alan Gilbert Acked-by: Ioana Ciornei Acked-by: Christophe Leroy Link: https://lore.kernel.org/r/20241115152055.279732-1-linux@treblig.org Signed-off-by: Christophe Leroy --- include/linux/fsl/mc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 99f30c7d6208..897d6211c163 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -417,8 +417,6 @@ int __must_check fsl_mc_portal_allocate(struct fsl_mc_device *mc_dev, void fsl_mc_portal_free(struct fsl_mc_io *mc_io); -int fsl_mc_portal_reset(struct fsl_mc_io *mc_io); - int __must_check fsl_mc_object_allocate(struct fsl_mc_device *mc_dev, enum fsl_mc_pool_type pool_type, struct fsl_mc_device **new_mc_adev); -- cgit v1.2.3 From cfe1f8776f23fd6dfe4ba9fcb695b129f0761187 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 13 Mar 2025 13:01:22 -0400 Subject: NFS: Extend rdirplus mount option with "force|none" There are certain users that wish to force the NFS client to choose READDIRPLUS over READDIR for a particular mount. Update the "rdirplus" mount option to optionally accept values. For "rdirplus=force", the NFS client will always attempt to use READDIRPLUS. The setting of "rdirplus=none" is aliased to the existing "nordirplus".
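As a hedged usage example (the server and export path are invented for illustration), such a mount could be requested with "mount -t nfs -o rdirplus=force server:/export /mnt", while "rdirplus=none" behaves like the existing "nordirplus".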
Signed-off-by: Benjamin Coddington Link: https://lore.kernel.org/r/c4cf0de4c8be0930b91bc74bee310d289781cd3b.1741885071.git.bcodding@redhat.com Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 1711eefcf24f..b83d16a42afc 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -167,6 +167,7 @@ struct nfs_server { #define NFS_MOUNT_TRUNK_DISCOVERY 0x04000000 #define NFS_MOUNT_SHUTDOWN 0x08000000 #define NFS_MOUNT_NO_ALIGNWRITE 0x10000000 +#define NFS_MOUNT_FORCE_RDIRPLUS 0x20000000 unsigned int fattr_valid; /* Valid attributes */ unsigned int caps; /* server capabilities */ -- cgit v1.2.3 From df210d9b0951d714c1668c511ca5c8ff38cf6916 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 7 Feb 2025 15:42:24 -0500 Subject: sunrpc: Add a sysfs file for adding a new xprt Writing to this file will clone the 'main' xprt of an xprt_switch and add it to be used as an additional connection. -- Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker v3: Replace call to xprt_iter_get_xprt() with xprt_iter_get_next() Link: https://lore.kernel.org/r/20250207204225.594002-5-anna@kernel.org Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtmultipath.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index e411368cdacf..e4db5022fe92 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -56,6 +56,7 @@ extern void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt); extern void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, bool offline); +extern struct rpc_xprt *rpc_xprt_switch_get_main_xprt(struct rpc_xprt_switch *xps); extern void xprt_iter_init(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps); -- cgit v1.2.3 From 487fae09d7e266a58a13784983cc1ef6a2076526 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Mar 2025 11:45:06 -0400 Subject: NFS: Add a mount option to make ENETUNREACH errors fatal If the NFS client was initially created in a container, and that container is torn down, there is usually no possibility to go back and destroy any NFS clients that are hung because their virtual network devices have been unlinked. Add a flag that tells the NFS client that in these circumstances, it should treat ENETDOWN and ENETUNREACH errors as fatal to the NFS client. The option defaults to being on when the mount happens from inside a net namespace that is not "init_net".
Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Tested-by: Jeff Layton Acked-by: Chuck Lever --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b83d16a42afc..a6ce8590eaaf 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -168,6 +168,7 @@ struct nfs_server { #define NFS_MOUNT_SHUTDOWN 0x08000000 #define NFS_MOUNT_NO_ALIGNWRITE 0x10000000 #define NFS_MOUNT_FORCE_RDIRPLUS 0x20000000 +#define NFS_MOUNT_NETUNREACH_FATAL 0x40000000 unsigned int fattr_valid; /* Valid attributes */ unsigned int caps; /* server capabilities */ -- cgit v1.2.3 From 9827144bfb2b1baf1e78600da73dcc9e92e28415 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Mar 2025 10:50:26 -0400 Subject: NFS: Treat ENETUNREACH errors as fatal in containers Propagate the NFS_MOUNT_NETUNREACH_FATAL flag to work with the generic NFS client. If the flag is set, the client will receive ENETDOWN and ENETUNREACH errors from the RPC layer, and is expected to treat them as being fatal. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Tested-by: Jeff Layton Acked-by: Chuck Lever --- include/linux/nfs_fs_sb.h | 1 + include/linux/sunrpc/clnt.h | 5 ++++- include/linux/sunrpc/sched.h | 1 + include/trace/events/sunrpc.h | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index a6ce8590eaaf..71319637a84e 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -50,6 +50,7 @@ struct nfs_client { #define NFS_CS_DS 7 /* - Server is a DS */ #define NFS_CS_REUSEPORT 8 /* - reuse src port on reconnect */ #define NFS_CS_PNFS 9 /* - Server used for pnfs */ +#define NFS_CS_NETUNREACH_FATAL 10 /* - ENETUNREACH errors are fatal */ struct sockaddr_storage cl_addr; /* server identifier */ size_t cl_addrlen; char * cl_hostname; /* hostname of server */ diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index fec976e58174..f8b406b0a1af 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -64,7 +64,9 @@ struct rpc_clnt { cl_noretranstimeo: 1,/* No retransmit timeouts */ cl_autobind : 1,/* use getport() */ cl_chatty : 1,/* be verbose */ - cl_shutdown : 1;/* rpc immediate -EIO */ + cl_shutdown : 1,/* rpc immediate -EIO */ + cl_netunreach_fatal : 1; + /* Treat ENETUNREACH errors as fatal */ struct xprtsec_parms cl_xprtsec; /* transport security policy */ struct rpc_rtt * cl_rtt; /* RTO estimator data */ @@ -175,6 +177,7 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_SOFTERR (1UL << 10) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) #define RPC_CLNT_CREATE_CONNECTED (1UL << 12) +#define RPC_CLNT_CREATE_NETUNREACH_FATAL (1UL << 13) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index eac57914dcf3..ccba79ebf893 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -134,6 +134,7 @@ struct rpc_task_setup { #define RPC_TASK_MOVEABLE 0x0004 /* nfs4.1+ rpc tasks */ #define RPC_TASK_NULLCREDS 0x0010 /* Use AUTH_NULL credential */ #define RPC_CALL_MAJORSEEN 0x0020 /* major timeout seen */ +#define RPC_TASK_NETUNREACH_FATAL 0x0040 /* ENETUNREACH is fatal */ #define RPC_TASK_DYNAMIC 0x0080 /* task was kmalloc'ed */ #define RPC_TASK_NO_ROUND_ROBIN 0x0100 /* send requests on "main" xprt */ #define 
RPC_TASK_SOFT 0x0200 /* Use soft timeouts */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 851841336ee6..5d331383047b 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -343,6 +343,7 @@ TRACE_EVENT(rpc_request, { RPC_TASK_MOVEABLE, "MOVEABLE" }, \ { RPC_TASK_NULLCREDS, "NULLCREDS" }, \ { RPC_CALL_MAJORSEEN, "MAJORSEEN" }, \ + { RPC_TASK_NETUNREACH_FATAL, "NETUNREACH_FATAL"}, \ { RPC_TASK_DYNAMIC, "DYNAMIC" }, \ { RPC_TASK_NO_ROUND_ROBIN, "NO_ROUND_ROBIN" }, \ { RPC_TASK_SOFT, "SOFT" }, \ -- cgit v1.2.3 From 8c9f0df7a82a9f3bbdab4c796d302b20a33c1cc6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Mar 2025 08:04:35 -0400 Subject: pNFS/flexfiles: Treat ENETUNREACH errors as fatal in containers Propagate the NFS_MOUNT_NETUNREACH_FATAL flag to work with the pNFS flexfiles client. In these circumstances, the client needs to treat the ENETDOWN and ENETUNREACH errors as fatal, and should abandon the attempted I/O. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Tested-by: Jeff Layton Acked-by: Chuck Lever --- include/linux/nfs4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 5fa60fe441b5..d8cad844870a 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -300,6 +300,7 @@ enum nfsstat4 { /* error codes for internal client use */ #define NFS4ERR_RESET_TO_MDS 12001 #define NFS4ERR_RESET_TO_PNFS 12002 +#define NFS4ERR_FATAL_IOERROR 12003 static inline bool seqid_mutating_err(u32 err) { -- cgit v1.2.3 From fa23a338de93aa03eb0b6146a0440f5762309f85 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Mar 2025 13:36:11 +0000 Subject: mm: separate folio_split_memcg_refs() from split_page_memcg() Patch series "Minor memcg cleanups & prep for memdescs", v2. Separate the handling of accounted folios and GFP_ACCOUNT pages for easier to understand code. For more detail, see https://lore.kernel.org/linux-mm/Z9LwTOudOlCGny3f@casper.infradead.org/ This patch (of 5): Folios always use memcg_data to refer to the mem_cgroup while pages allocated with GFP_ACCOUNT have a pointer to the obj_cgroup. Since the caller already knows what it has, split the function into two and then we don't need to check. Move the assignment of split folio memcg_data to the point where we set up the other parts of the new folio. That leaves folio_split_memcg_refs() just handling the memcg accounting. 
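A hedged sketch of the intended call site (the split loop and the new_folio setup live in mm/ and are assumed here, not shown by this include/ hunk):

        /* adjust only the memcg/objcg reference counts for the split */
        folio_split_memcg_refs(folio, old_order, new_order);
        ...
        /* assumed: memcg_data is now copied where each new folio is initialized */
        new_folio->memcg_data = folio->memcg_data;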
Link: https://lkml.kernel.org/r/20250314133617.138071-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250314133617.138071-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Zi Yan Acked-by: Roman Gushchin Cc: David Hildenbrand Cc: Matthew Wilcow (Oracle) Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 57664e2a8fb7..d090089c5497 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1039,6 +1039,8 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, } void split_page_memcg(struct page *head, int old_order, int new_order); +void folio_split_memcg_refs(struct folio *folio, unsigned old_order, + unsigned new_order); static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { @@ -1463,6 +1465,11 @@ static inline void split_page_memcg(struct page *head, int old_order, int new_or { } +static inline void folio_split_memcg_refs(struct folio *folio, + unsigned old_order, unsigned new_order) +{ +} + static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { return 0; -- cgit v1.2.3 From 1506c25508acd740ced5e92c539ed3d12f622c5b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Mar 2025 13:36:12 +0000 Subject: mm: simplify split_page_memcg() The last argument to split_page_memcg() is now always 0, so remove it, effectively reverting commit b8791381d7ed. Link: https://lkml.kernel.org/r/20250314133617.138071-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Zi Yan Acked-by: Roman Gushchin Cc: David Hildenbrand Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d090089c5497..ea28cacfb0d2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1038,7 +1038,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -void split_page_memcg(struct page *head, int old_order, int new_order); +void split_page_memcg(struct page *first, unsigned order); void folio_split_memcg_refs(struct folio *folio, unsigned old_order, unsigned new_order); @@ -1461,7 +1461,7 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void split_page_memcg(struct page *head, int old_order, int new_order) +static inline void split_page_memcg(struct page *first, unsigned order) { } -- cgit v1.2.3 From 8492936abb49eda26e00eab01e58d7e69f2355ba Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Mar 2025 13:36:14 +0000 Subject: mm: simplify folio_memcg_charged() There's no need to check which kind of pointer is in the memcg_data field, all we actually care about is whether it's zero or not. Saves 70 bytes in workingset_activation() with the Debian config. 
Link: https://lkml.kernel.org/r/20250314133617.138071-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Roman Gushchin Cc: David Hildenbrand Cc: Michal Hocko Cc: Muchun Song Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ea28cacfb0d2..53364526d877 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -438,9 +438,7 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) */ static inline bool folio_memcg_charged(struct folio *folio) { - if (folio_memcg_kmem(folio)) - return __folio_objcg(folio) != NULL; - return __folio_memcg(folio) != NULL; + return folio->memcg_data != 0; } /* -- cgit v1.2.3 From 835de37603ef6412949df0a607128f5fffed4576 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Fri, 14 Mar 2025 15:37:54 -0600 Subject: meminfo: add a per node counter for balloon drivers Patch series "track memory used by balloon drivers", v2. This series introduces a way to track memory used by balloon drivers. Add a NR_BALLOON_PAGES counter to track how many pages are reclaimed by the balloon drivers. First add the accounting, then updates the balloon drivers (virtio, Hyper-V, VMware, Pseries-cmm, and Xen) to maintain this counter. The virtio, Vmware, and pseries-cmm balloon drivers utilize the balloon_compaction interface to allocate and free balloon pages. Other balloon drivers will have to maintain this counter manually. This makes the information visible in memory reporting interfaces like /proc/meminfo, show_mem, and OOM reporting. This provides admins visibility into their VM balloon sizes without requiring different virtualization tooling. Furthermore, this information is helpful when debugging an OOM inside a VM. This patch (of 4): Add NR_BALLOON_PAGES counter to track memory used by balloon drivers and expose it through /proc/meminfo and other memory reporting interfaces. [npache@redhat.com: document Balloon Meminfo entry] Link: https://lkml.kernel.org/r/a0315ccf-f244-460e-8643-fd7388724fe5@redhat.com Link: https://lkml.kernel.org/r/20250314213757.244258-1-npache@redhat.com Link: https://lkml.kernel.org/r/20250314213757.244258-2-npache@redhat.com Signed-off-by: Nico Pache Cc: Alexander Atanasov Cc: Chengming Zhou Cc: David Hildenbrand Cc: Dexuan Cui Cc: Haiyang Zhang Cc: Johannes Weiner Cc: Juegren Gross Cc: Kanchana P Sridhar Cc: K. Y. Srinivasan Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Muchun Song Cc: Nhat Pham Cc: Oleksandr Tyshchenko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Stefano Stabellini Cc: Wei Liu Cc: Michael Kelley Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 37c29f3fbca8..a9db0fbd2b94 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -224,6 +224,7 @@ enum node_stat_item { #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif + NR_BALLOON_PAGES, NR_VM_NODE_STAT_ITEMS }; -- cgit v1.2.3 From 3b23a44f1f196741596616082e759f7f3a400e78 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 18 Mar 2025 11:30:28 -0700 Subject: mm/damon: implement a new DAMOS filter type for active pages Patch series "mm/damon: introduce DAMOS filter type for active pages". 
The memory reclaim algorithm categorizes pages into active and inactive lists, separately for file and anon pages. The system's performance relies heavily on the (relative and absolute) accuracy of this categorization. This patch series add a new DAMOS filter for pages' activeness, giving us visibility into the access frequency of the pages on each list. This insight can help us diagnose issues with the active-inactive balancing dynamics, and make decisions to optimize reclaim efficiency and memory utilization. For instance, we might decide to enable DAMON_LRU_SORT, if we find that there are pages on the active list that are infrequently accessed, or less frequently accessed than pages on the inactive list. This patch (of 2): Implement a DAMOS filter type for active pages on DAMON kernel API, and add support of it from the physical address space DAMON operations set (paddr). Link: https://lkml.kernel.org/r/20250318183029.2062917-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20250318183029.2062917-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 3db4f77261f5..47e36e6ea203 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -334,6 +334,7 @@ struct damos_stat { /** * enum damos_filter_type - Type of memory for &struct damos_filter * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. + * @DAMOS_FILTER_TYPE_ACTIVE: Active pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_YOUNG: Recently accessed pages. * @DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: Page is part of a hugepage. @@ -355,6 +356,7 @@ struct damos_stat { */ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, + DAMOS_FILTER_TYPE_ACTIVE, DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_YOUNG, DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, -- cgit v1.2.3 From e452872b40e3f1fb92adf0d573a0a6a7c9f6ce22 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Tue, 18 Mar 2025 15:58:32 +0800 Subject: mm: vmscan: split proactive reclaim statistics from direct reclaim statistics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Adding Proactive Memory Reclaim Statistics". These two patches are related to proactive memory reclaim. Patch 1 Split proactive reclaim statistics from direct reclaim counters and introduces new counters: pgsteal_proactive, pgdemote_proactive, and pgscan_proactive. Patch 2 Adds pswpin and pswpout items to the cgroup-v2 documentation. This patch (of 2): In proactive memory reclaim scenarios, it is necessary to accurately track proactive reclaim statistics to dynamically adjust the frequency and amount of memory being reclaimed proactively. Currently, proactive reclaim is included in direct reclaim statistics, which can make these direct reclaim statistics misleading. Therefore, separate proactive reclaim memory from the direct reclaim counters by introducing new counters: pgsteal_proactive, pgdemote_proactive, and pgscan_proactive, to avoid confusion with direct reclaim. 
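Since the point of the split is to let a proactive reclaim controller observe its own statistics, a minimal userspace sketch follows. It assumes the new counters are exported through /proc/vmstat under the names given above (pgsteal_proactive, pgscan_proactive, pgdemote_proactive), in line with how the existing reclaim counters are exposed.

    /* Minimal userspace sketch: print the proactive-reclaim counters. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            static const char *keys[] = {
                    "pgsteal_proactive", "pgscan_proactive", "pgdemote_proactive",
            };
            char name[64];
            unsigned long long val;
            FILE *f = fopen("/proc/vmstat", "r");

            if (!f)
                    return 1;
            while (fscanf(f, "%63s %llu", name, &val) == 2) {
                    for (unsigned i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
                            if (!strcmp(name, keys[i]))
                                    printf("%s %llu\n", name, val);
            }
            fclose(f);
            return 0;
    }
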
Link: https://lkml.kernel.org/r/20250318075833.90615-1-jiahao.kernel@gmail.com Link: https://lkml.kernel.org/r/20250318075833.90615-2-jiahao.kernel@gmail.com Signed-off-by: Hao Jia Acked-by: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + include/linux/vm_event_item.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a9db0fbd2b94..f7fe1126dc75 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -221,6 +221,7 @@ enum node_stat_item { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, + PGDEMOTE_PROACTIVE, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index f70d0958095c..f11b6fa9c5b3 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -41,9 +41,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSTEAL_KHUGEPAGED, + PGSTEAL_PROACTIVE, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, + PGSCAN_PROACTIVE, PGSCAN_DIRECT_THROTTLE, PGSCAN_ANON, PGSCAN_FILE, -- cgit v1.2.3 From 5f5ee52d4f58605330b09851273d6e56aaadd29e Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Tue, 18 Mar 2025 16:39:38 +0800 Subject: mm/hwpoison: introduce folio_contain_hwpoisoned_page() helper Patch series "mm/vmscan: don't try to reclaim hwpoison folio". Fix a bug during memory reclaim if folio is hwpoisoned. This patch (of 2): Introduce helper folio_contain_hwpoisoned_page() to check if the entire folio is hwpoisoned or it contains hwpoisoned pages. Link: https://lkml.kernel.org/r/20250318083939.987651-1-tujinjiang@huawei.com Link: https://lkml.kernel.org/r/20250318083939.987651-2-tujinjiang@huawei.com Signed-off-by: Jinjiang Tu Acked-by: Miaohe Lin Cc: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f26ce54c7aa4..68f4b188fc6b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1098,6 +1098,12 @@ static inline bool is_page_hwpoison(const struct page *page) return folio_test_hugetlb(folio) && PageHWPoison(&folio->page); } +static inline bool folio_contain_hwpoisoned_page(struct folio *folio) +{ + return folio_test_hwpoison(folio) || + (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)); +} + bool is_free_buddy_page(const struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); -- cgit v1.2.3 From 3cf67d61ff98672120f6ad07528afa165df12588 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 25 Feb 2025 16:02:34 +0900 Subject: hung_task: show the blocker task if the task is hung on mutex Patch series "hung_task: Dump the blocking task stacktrace", v4. The hung_task detector is very useful for detecting the lockup. However, since it only dumps the blocked (uninterruptible sleep) processes, it is not enough to identify the root cause of that lockup. For example, if a process holds a mutex and sleep an event in interruptible state long time, the other processes will wait on the mutex in uninterruptible state. In this case, the waiter processes are dumped, but the blocker process is not shown because it is sleep in interruptible state. 
This adds a feature to dump the blocker task which holds a mutex when detecting a hung task. e.g. INFO: task cat:115 blocked for more than 122 seconds. Not tainted 6.14.0-rc3-00003-ga8946be3de00 #156 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:cat state:D stack:13432 pid:115 tgid:115 ppid:106 task_flags:0x400100 flags:0x00000002 Call Trace: __schedule+0x731/0x960 ? schedule_preempt_disabled+0x54/0xa0 schedule+0xb7/0x140 ? __mutex_lock+0x51b/0xa60 ? __mutex_lock+0x51b/0xa60 schedule_preempt_disabled+0x54/0xa0 __mutex_lock+0x51b/0xa60 read_dummy+0x23/0x70 full_proxy_read+0x6a/0xc0 vfs_read+0xc2/0x340 ? __pfx_direct_file_splice_eof+0x10/0x10 ? do_sendfile+0x1bd/0x2e0 ksys_read+0x76/0xe0 do_syscall_64+0xe3/0x1c0 ? exc_page_fault+0xa9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4840cd RSP: 002b:00007ffe99071828 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd RDX: 0000000000001000 RSI: 00007ffe99071870 RDI: 0000000000000003 RBP: 00007ffe99071870 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000 R13: 00000000132fd3a0 R14: 0000000000000001 R15: ffffffffffffffff INFO: task cat:115 is blocked on a mutex likely owned by task cat:114. task:cat state:S stack:13432 pid:114 tgid:114 ppid:106 task_flags:0x400100 flags:0x00000002 Call Trace: __schedule+0x731/0x960 ? schedule_timeout+0xa8/0x120 schedule+0xb7/0x140 schedule_timeout+0xa8/0x120 ? __pfx_process_timeout+0x10/0x10 msleep_interruptible+0x3e/0x60 read_dummy+0x2d/0x70 full_proxy_read+0x6a/0xc0 vfs_read+0xc2/0x340 ? __pfx_direct_file_splice_eof+0x10/0x10 ? do_sendfile+0x1bd/0x2e0 ksys_read+0x76/0xe0 do_syscall_64+0xe3/0x1c0 ? exc_page_fault+0xa9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4840cd RSP: 002b:00007ffe3e0147b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd RDX: 0000000000001000 RSI: 00007ffe3e014800 RDI: 0000000000000003 RBP: 00007ffe3e014800 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000 R13: 000000001a0a93a0 R14: 0000000000000001 R15: ffffffffffffffff TBD: We can extend this feature to cover other locks like rwsem and rt_mutex, but rwsem requires to dump all the tasks which acquire and wait that rwsem. We can follow the waiter link but the output will be a bit different compared with mutex case. This patch (of 2): The "hung_task" shows a long-time uninterruptible slept task, but most often, it's blocked on a mutex acquired by another task. Without dumping such a task, investigating the root cause of the hung task problem is very difficult. This introduce task_struct::blocker_mutex to point the mutex lock which this task is waiting for. Since the mutex has "owner" information, we can find the owner task and dump it with hung tasks. Note: the owner can be changed while dumping the owner task, so this is "likely" the owner of the mutex. With this change, the hung task shows blocker task's info like below; INFO: task cat:115 blocked for more than 122 seconds. Not tainted 6.14.0-rc3-00003-ga8946be3de00 #156 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:cat state:D stack:13432 pid:115 tgid:115 ppid:106 task_flags:0x400100 flags:0x00000002 Call Trace: __schedule+0x731/0x960 ? schedule_preempt_disabled+0x54/0xa0 schedule+0xb7/0x140 ? __mutex_lock+0x51b/0xa60 ? 
__mutex_lock+0x51b/0xa60 schedule_preempt_disabled+0x54/0xa0 __mutex_lock+0x51b/0xa60 read_dummy+0x23/0x70 full_proxy_read+0x6a/0xc0 vfs_read+0xc2/0x340 ? __pfx_direct_file_splice_eof+0x10/0x10 ? do_sendfile+0x1bd/0x2e0 ksys_read+0x76/0xe0 do_syscall_64+0xe3/0x1c0 ? exc_page_fault+0xa9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4840cd RSP: 002b:00007ffe99071828 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd RDX: 0000000000001000 RSI: 00007ffe99071870 RDI: 0000000000000003 RBP: 00007ffe99071870 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000 R13: 00000000132fd3a0 R14: 0000000000000001 R15: ffffffffffffffff INFO: task cat:115 is blocked on a mutex likely owned by task cat:114. task:cat state:S stack:13432 pid:114 tgid:114 ppid:106 task_flags:0x400100 flags:0x00000002 Call Trace: __schedule+0x731/0x960 ? schedule_timeout+0xa8/0x120 schedule+0xb7/0x140 schedule_timeout+0xa8/0x120 ? __pfx_process_timeout+0x10/0x10 msleep_interruptible+0x3e/0x60 read_dummy+0x2d/0x70 full_proxy_read+0x6a/0xc0 vfs_read+0xc2/0x340 ? __pfx_direct_file_splice_eof+0x10/0x10 ? do_sendfile+0x1bd/0x2e0 ksys_read+0x76/0xe0 do_syscall_64+0xe3/0x1c0 ? exc_page_fault+0xa9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4840cd RSP: 002b:00007ffe3e0147b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd RDX: 0000000000001000 RSI: 00007ffe3e014800 RDI: 0000000000000003 RBP: 00007ffe3e014800 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000 R13: 000000001a0a93a0 R14: 0000000000000001 R15: ffffffffffffffff [akpm@linux-foundation.org: implement debug_show_blocker() in C rather than in CPP] Link: https://lkml.kernel.org/r/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com Link: https://lkml.kernel.org/r/174046695384.2194069.16796289525958195643.stgit@mhiramat.tok.corp.google.com Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Waiman Long Reviewed-by: Lance Yang Reviewed-by: Sergey Senozhatsky Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: Kent Overstreet Cc: Steven Rostedt Cc: Tomasz Figa Cc: Will Deacon Cc: Yongliang Gao Signed-off-by: Andrew Morton --- include/linux/mutex.h | 2 ++ include/linux/sched.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 2bf91b57591b..2143d05116be 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -202,4 +202,6 @@ DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T)) DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T)) DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0) +extern unsigned long mutex_get_owner(struct mutex *lock); + #endif /* __LINUX_MUTEX_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c15365a30c0..1419d94c8e87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1217,6 +1217,10 @@ struct task_struct { struct mutex_waiter *blocked_on; #endif +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + struct mutex *blocker_mutex; +#endif + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif -- cgit v1.2.3 From 8ee065a6fdd285387d0eca5e1139ffb182a6e626 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 17 Mar 2025 20:11:10 +0200 Subject: resource: split DEFINE_RES_NAMED_DESC() out of DEFINE_RES_NAMED() 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "resource: Split and use DEFINE_RES*() macros", v2. Replace open coded variants of DEFINE_RES*() macros. Note, there are many more possibilities over the kernel and even in reources.c, however the latter contains not so trivial leftovers. That's why the examples cover only straightforward conversions. This patch (of 4): In some cases it would be useful to supply predefined descriptor of the resource. For this, introduce DEFINE_RES_NAMED_DESC() macro. While at it, provide DEFINE_RES() that takes only start, size, and flags. Link: https://lkml.kernel.org/r/20250317181412.1560630-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20250317181412.1560630-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Andrew Morton --- include/linux/ioport.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5385349f0b8a..e8b2d6aa4013 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -154,15 +154,20 @@ enum { }; /* helpers to define resources */ -#define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ +#define DEFINE_RES_NAMED_DESC(_start, _size, _name, _flags, _desc) \ (struct resource) { \ .start = (_start), \ .end = (_start) + (_size) - 1, \ .name = (_name), \ .flags = (_flags), \ - .desc = IORES_DESC_NONE, \ + .desc = (_desc), \ } +#define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ + DEFINE_RES_NAMED_DESC(_start, _size, _name, _flags, IORES_DESC_NONE) +#define DEFINE_RES(_start, _size, _flags) \ + DEFINE_RES_NAMED(_start, _size, NULL, _flags) + #define DEFINE_RES_IO_NAMED(_start, _size, _name) \ DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_IO) #define DEFINE_RES_IO(_start, _size) \ -- cgit v1.2.3 From 24fe172b50b5749c315349e740e4a09e3a0165d5 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 24 Mar 2025 14:56:01 -0700 Subject: objtool: Fix up some outdated references to ENTRY/ENDPROC ENTRY and ENDPROC were deprecated years ago and replaced with SYM_FUNC_{START,END}. Fix up a few outdated references in the objtool documentation and comments. Also fix a few typos. Suggested-by: Brendan Jackman Suggested-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/5eb7e06e1a0e87aaeda8d583ab060e7638a6ea8e.1742852846.git.jpoimboe@kernel.org --- include/linux/linkage.h | 4 ---- include/linux/objtool.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 5c8865bb59d9..b11660b706c5 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -134,10 +134,6 @@ .size name, .-name #endif -/* If symbol 'name' is treated as a subroutine (gets called, and returns) - * then please use ENDPROC to mark 'name' as STT_FUNC for the benefit of - * static analysis tools such as stack depth analyzer. - */ #ifndef ENDPROC /* deprecated, use SYM_FUNC_END */ #define ENDPROC(name) \ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 3ca965a2ddc8..366ad004d794 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -69,7 +69,7 @@ * In asm, there are two kinds of code: normal C-type callable functions and * the rest. The normal callable functions can be called by other code, and * don't do anything unusual with the stack. 
Such normal callable functions - * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * are annotated with SYM_FUNC_{START,END}. Most asm code falls in this * category. In this case, no special debugging annotations are needed because * objtool can automatically generate the ORC data for the ORC unwinder to read * at runtime. -- cgit v1.2.3 From 6aa63a4ec947f350d1a2f9f6aba8591a2455d192 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 24 Mar 2025 21:05:15 -0700 Subject: iommu: Sort out domain user data When DMA/MSI cookies were made first-class citizens back in commit 46983fcd67ac ("iommu: Pull IOVA cookie management into the core"), there was no real need to further expose the two different cookie types. However, now that IOMMUFD wants to add a third type of MSI-mapping cookie, we do have a nicely compelling reason to properly dismabiguate things at the domain level beyond just vaguely guessing from the domain type. Meanwhile, we also effectively have another "cookie" in the form of the anonymous union for other user data, which isn't much better in terms of being vague and unenforced. The fact is that all these cookie types are mutually exclusive, in the sense that combining them makes zero sense and/or would be catastrophic (iommu_set_fault_handler() on an SVA domain, anyone?) - the only combination which *might* be reasonable is perhaps a fault handler and an MSI cookie, but nobody's doing that at the moment, so let's rule it out as well for the sake of being clear and robust. To that end, we pull DMA and MSI cookies apart a little more, mostly to clear up the ambiguity at domain teardown, then for clarity (and to save a little space), move them into the union, whose ownership we can then properly describe and enforce entirely unambiguously. [nicolinc: rebase on latest tree; use prefix IOMMU_COOKIE_; merge unions in iommu_domain; add IOMMU_COOKIE_IOMMUFD for iommufd_hwpt] Link: https://patch.msgid.link/r/1ace9076c95204bbe193ee77499d395f15f44b23.1742871535.git.nicolinc@nvidia.com Signed-off-by: Robin Murphy Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e93d2e918599..06cc14e9993d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -41,6 +41,7 @@ struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; +struct iommu_dma_msi_cookie; struct iommu_fault_param; struct iommufd_ctx; struct iommufd_viommu; @@ -165,6 +166,15 @@ struct iommu_domain_geometry { bool force_aperture; /* DMA only allowed in mappable range? 
*/ }; +enum iommu_domain_cookie_type { + IOMMU_COOKIE_NONE, + IOMMU_COOKIE_DMA_IOVA, + IOMMU_COOKIE_DMA_MSI, + IOMMU_COOKIE_FAULT_HANDLER, + IOMMU_COOKIE_SVA, + IOMMU_COOKIE_IOMMUFD, +}; + /* Domain feature flags */ #define __IOMMU_DOMAIN_PAGING (1U << 0) /* Support for iommu_map/unmap */ #define __IOMMU_DOMAIN_DMA_API (1U << 1) /* Domain for use in DMA-API @@ -211,12 +221,12 @@ struct iommu_domain_geometry { struct iommu_domain { unsigned type; + enum iommu_domain_cookie_type cookie_type; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; const struct iommu_ops *owner; /* Whose domain_alloc we came from */ unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; - struct iommu_dma_cookie *iova_cookie; int (*iopf_handler)(struct iopf_group *group); #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) @@ -224,10 +234,10 @@ struct iommu_domain { phys_addr_t msi_addr); #endif - union { /* Pointer usable by owner of the domain */ - struct iommufd_hw_pagetable *iommufd_hwpt; /* iommufd */ - }; - union { /* Fault handler */ + union { /* cookie */ + struct iommu_dma_cookie *iova_cookie; + struct iommu_dma_msi_cookie *msi_cookie; + struct iommufd_hw_pagetable *iommufd_hwpt; struct { iommu_fault_handler_t handler; void *handler_token; -- cgit v1.2.3 From 06d54f00f3f5a29cbf43410ac93ee2dd89e3b711 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 24 Mar 2025 21:05:17 -0700 Subject: iommu: Drop sw_msi from iommu_domain There are only two sw_msi implementations in the entire system, thus it's not very necessary to have an sw_msi pointer. Instead, check domain->cookie_type to call the two sw_msi implementations directly from the core code. Link: https://patch.msgid.link/r/7ded87c871afcbaac665b71354de0a335087bf0f.1742871535.git.nicolinc@nvidia.com Suggested-by: Robin Murphy Reviewed-by: Robin Murphy Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 06cc14e9993d..e01c855ae8a7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -229,11 +229,6 @@ struct iommu_domain { struct iommu_domain_geometry geometry; int (*iopf_handler)(struct iopf_group *group); -#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) - int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, - phys_addr_t msi_addr); -#endif - union { /* cookie */ struct iommu_dma_cookie *iova_cookie; struct iommu_dma_msi_cookie *msi_cookie; @@ -254,16 +249,6 @@ struct iommu_domain { }; }; -static inline void iommu_domain_set_sw_msi( - struct iommu_domain *domain, - int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, - phys_addr_t msi_addr)) -{ -#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) - domain->sw_msi = sw_msi; -#endif -} - static inline bool iommu_is_dma_domain(struct iommu_domain *domain) { return domain->type & __IOMMU_DOMAIN_DMA_API; -- cgit v1.2.3 From 2fb69c602d57f77483b8dcdd12d17408a09f76fe Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:33 -0700 Subject: iommufd: Support pasid attach/replace This extends the below APIs to support PASID. Device drivers to manage pasid attach/replace/detach. 
int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid, u32 *pt_id); int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid, u32 *pt_id); void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid); The pasid operations share underlying attach/replace/detach infrastructure with the device operations, but still have some different implications: - no reserved region per pasid otherwise SVA architecture is already broken (CPU address space doesn't count device reserved regions); - accordingly no sw_msi trick; Cache coherency enforcement is still applied to pasid operations since it is about memory accesses post page table walking (no matter the walk is per RID or per PASID). Link: https://patch.msgid.link/r/20250321171940.7213-12-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Signed-off-by: Kevin Tian Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 60eff9272551..34b6e6ca4bfa 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -54,9 +55,11 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id); void iommufd_device_unbind(struct iommufd_device *idev); -int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); -int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id); -void iommufd_device_detach(struct iommufd_device *idev); +int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id); +int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id); +void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid); struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev); u32 iommufd_device_to_id(struct iommufd_device *idev); -- cgit v1.2.3 From dbc5f37b4f8ad833132f77c1f67e68bb11ca9b9e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:36 -0700 Subject: iommufd: Allow allocating PASID-compatible domain The underlying infrastructure has supported the PASID attach and related enforcement per the requirement of the IOMMU_HWPT_ALLOC_PASID flag. This extends iommufd to support PASID compatible domain requested by userspace. Link: https://patch.msgid.link/r/20250321171940.7213-15-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 8719d4f5d618..6901804ec736 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -393,6 +393,9 @@ struct iommu_vfio_ioas { * Any domain attached to the non-PASID part of the * device must also be flagged, otherwise attaching a * PASID will blocked. + * For the user that wants to attach PASID, ioas is + * not recommended for both the non-PASID part + * and PASID part of the device. * If IOMMU does not support PASID it will return * error (-EOPNOTSUPP). 
*/ -- cgit v1.2.3 From 7fe6b987166b901efc5c6fce5fe853c9ebb835be Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:39 -0700 Subject: ida: Add ida_find_first_range() There is no helpers for user to check if a given ID is allocated or not, neither a helper to loop all the allocated IDs in an IDA and do something for cleanup. With the two needs, a helper to get the lowest allocated ID of a range and two variants based on it. Caller can check if a given ID is allocated or not by: bool ida_exists(struct ida *ida, unsigned int id) Caller can iterate all allocated IDs by: int id; while ((id = ida_find_first(&pasid_ida)) >= 0) { //anything to do with the allocated ID ida_free(pasid_ida, pasid); } Link: https://patch.msgid.link/r/20250321180143.8468-2-yi.l.liu@intel.com Cc: Matthew Wilcox (Oracle) Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: Matthew Wilcox (Oracle) Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/idr.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/idr.h b/include/linux/idr.h index da5f5fa4a3a6..718f9b1b91af 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -257,6 +257,7 @@ struct ida { int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); void ida_free(struct ida *, unsigned int id); void ida_destroy(struct ida *ida); +int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max); /** * ida_alloc() - Allocate an unused ID. @@ -328,4 +329,14 @@ static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); } + +static inline bool ida_exists(struct ida *ida, unsigned int id) +{ + return ida_find_first_range(ida, id, id) == id; +} + +static inline int ida_find_first(struct ida *ida) +{ + return ida_find_first_range(ida, 0, ~0); +} #endif /* __IDR_H__ */ -- cgit v1.2.3 From 290641346d0d1eaf400c4f968d5b2cd91f483733 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:40 -0700 Subject: vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices This adds pasid_at|de]tach_ioas ops for attaching hwpt to pasid of a device and the helpers for it. For now, only vfio-pci supports pasid attach/detach. Link: https://patch.msgid.link/r/20250321180143.8468-3-yi.l.liu@intel.com Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/vfio.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 000a6cab2d31..707b00772ce1 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -67,6 +67,7 @@ struct vfio_device { struct inode *inode; #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; + struct ida pasids; u8 iommufd_attached:1; #endif u8 cdev_opened:1; @@ -91,6 +92,8 @@ struct vfio_device { * bound iommufd. Undo in unbind_iommufd if @detach_ioas is not * called. 
* @detach_ioas: Opposite of attach_ioas + * @pasid_attach_ioas: The pasid variation of attach_ioas + * @pasid_detach_ioas: Opposite of pasid_attach_ioas * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device * @read: Perform read(2) on device file descriptor @@ -115,6 +118,9 @@ struct vfio_device_ops { void (*unbind_iommufd)(struct vfio_device *vdev); int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); void (*detach_ioas)(struct vfio_device *vdev); + int (*pasid_attach_ioas)(struct vfio_device *vdev, u32 pasid, + u32 *pt_id); + void (*pasid_detach_ioas)(struct vfio_device *vdev, u32 pasid); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -139,6 +145,10 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev, void vfio_iommufd_physical_unbind(struct vfio_device *vdev); int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev); +int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev, + u32 pasid, u32 *pt_id); +void vfio_iommufd_physical_pasid_detach_ioas(struct vfio_device *vdev, + u32 pasid); int vfio_iommufd_emulated_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx, u32 *out_device_id); void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); @@ -166,6 +176,10 @@ vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx) ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) #define vfio_iommufd_physical_detach_ioas \ ((void (*)(struct vfio_device *vdev)) NULL) +#define vfio_iommufd_physical_pasid_attach_ioas \ + ((int (*)(struct vfio_device *vdev, u32 pasid, u32 *pt_id)) NULL) +#define vfio_iommufd_physical_pasid_detach_ioas \ + ((void (*)(struct vfio_device *vdev, u32 pasid)) NULL) #define vfio_iommufd_emulated_bind \ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ u32 *out_device_id)) NULL) -- cgit v1.2.3 From ad744ed5dd8b70e9256fc1ff18aaaffeedf5f21e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:41 -0700 Subject: vfio: VFIO_DEVICE_[AT|DE]TACH_IOMMUFD_PT support pasid This extends the VFIO_DEVICE_[AT|DE]TACH_IOMMUFD_PT ioctls to attach/detach a given pasid of a vfio device to/from an IOAS/HWPT. Link: https://patch.msgid.link/r/20250321180143.8468-4-yi.l.liu@intel.com Reviewed-by: Alex Williamson Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/uapi/linux/vfio.h | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index c8dbf8219c4f..6899da70b929 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -931,29 +931,34 @@ struct vfio_device_bind_iommufd { * VFIO_DEVICE_ATTACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 19, * struct vfio_device_attach_iommufd_pt) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Flags for attach. * @pt_id: Input the target id which can represent an ioas or a hwpt * allocated via iommufd subsystem. * Output the input ioas id or the attached hwpt id which could * be the specified hwpt itself or a hwpt automatically created * for the specified ioas by kernel during the attachment. 
+ * @pasid: The pasid to be attached, only meaningful when + * VFIO_DEVICE_ATTACH_PASID is set in @flags * * Associate the device with an address space within the bound iommufd. * Undo by VFIO_DEVICE_DETACH_IOMMUFD_PT or device fd close. This is only * allowed on cdev fds. * - * If a vfio device is currently attached to a valid hw_pagetable, without doing - * a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl - * passing in another hw_pagetable (hwpt) id is allowed. This action, also known - * as a hw_pagetable replacement, will replace the device's currently attached - * hw_pagetable with a new hw_pagetable corresponding to the given pt_id. + * If a vfio device or a pasid of this device is currently attached to a valid + * hw_pagetable (hwpt), without doing a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second + * VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl passing in another hwpt id is allowed. + * This action, also known as a hw_pagetable replacement, will replace the + * currently attached hwpt of the device or the pasid of this device with a new + * hwpt corresponding to the given pt_id. * * Return: 0 on success, -errno on failure. */ struct vfio_device_attach_iommufd_pt { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_ATTACH_PASID (1 << 0) __u32 pt_id; + __u32 pasid; }; #define VFIO_DEVICE_ATTACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 19) @@ -962,17 +967,21 @@ struct vfio_device_attach_iommufd_pt { * VFIO_DEVICE_DETACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 20, * struct vfio_device_detach_iommufd_pt) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Flags for detach. + * @pasid: The pasid to be detached, only meaningful when + * VFIO_DEVICE_DETACH_PASID is set in @flags * - * Remove the association of the device and its current associated address - * space. After it, the device should be in a blocking DMA state. This is only - * allowed on cdev fds. + * Remove the association of the device or a pasid of the device and its current + * associated address space. After it, the device or the pasid should be in a + * blocking DMA state. This is only allowed on cdev fds. * * Return: 0 on success, -errno on failure. */ struct vfio_device_detach_iommufd_pt { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_DETACH_PASID (1 << 0) + __u32 pasid; }; #define VFIO_DEVICE_DETACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 20) -- cgit v1.2.3 From 9e6ec8cf64e2973f0ec74f09023988cabd218426 Mon Sep 17 00:00:00 2001 From: xueqin Luo Date: Thu, 6 Feb 2025 16:14:36 +0800 Subject: thermal: core: Remove duplicate struct declaration The struct thermal_zone_device is already declared on line 32, so the duplicate declaration has been removed. 
Fixes: b1ae92dcfa8e ("thermal: core: Make struct thermal_zone_device definition internal") Signed-off-by: xueqin Luo Link: https://lore.kernel.org/r/20250206081436.51785-1-luoxueqin@kylinos.cn Signed-off-by: Daniel Lezcano --- include/linux/thermal.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 69f9bedd0ee8..0b5ed6821080 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -86,8 +86,6 @@ struct thermal_trip { #define THERMAL_TRIP_PRIV_TO_INT(_val_) (uintptr_t)(_val_) #define THERMAL_INT_TO_TRIP_PRIV(_val_) (void *)(uintptr_t)(_val_) -struct thermal_zone_device; - struct cooling_spec { unsigned long upper; /* Highest cooling state */ unsigned long lower; /* Lowest cooling state */ -- cgit v1.2.3 From d073a571e68f42414f8f06f01b59f52224538a83 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sat, 26 Oct 2024 10:13:56 -0700 Subject: asm-generic: Always define Elf_Rel and Elf_Rela These definitions are useful for relocating the kernel image as well, regardless of the type of relocations used for modules. Signed-off-by: Samuel Holland Link: https://lore.kernel.org/r/20241026171441.3047904-5-samuel.holland@sifive.com Signed-off-by: Palmer Dabbelt --- include/asm-generic/module.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/asm-generic/module.h b/include/asm-generic/module.h index 98e1541b72b7..a8622501b975 100644 --- a/include/asm-generic/module.h +++ b/include/asm-generic/module.h @@ -19,12 +19,8 @@ struct mod_arch_specific #define Elf_Dyn Elf64_Dyn #define Elf_Ehdr Elf64_Ehdr #define Elf_Addr Elf64_Addr -#ifdef CONFIG_MODULES_USE_ELF_REL #define Elf_Rel Elf64_Rel -#endif -#ifdef CONFIG_MODULES_USE_ELF_RELA #define Elf_Rela Elf64_Rela -#endif #define ELF_R_TYPE(X) ELF64_R_TYPE(X) #define ELF_R_SYM(X) ELF64_R_SYM(X) @@ -36,12 +32,8 @@ struct mod_arch_specific #define Elf_Dyn Elf32_Dyn #define Elf_Ehdr Elf32_Ehdr #define Elf_Addr Elf32_Addr -#ifdef CONFIG_MODULES_USE_ELF_REL #define Elf_Rel Elf32_Rel -#endif -#ifdef CONFIG_MODULES_USE_ELF_RELA #define Elf_Rela Elf32_Rela -#endif #define ELF_R_TYPE(X) ELF32_R_TYPE(X) #define ELF_R_SYM(X) ELF32_R_SYM(X) #endif -- cgit v1.2.3 From 923936efeb74b3f42e5ad283a0b9110bda102601 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 28 Mar 2025 01:01:19 +0800 Subject: iomap: Fix conflicting values of iomap flags IOMAP_F_ATOMIC_BIO mistakenly took the same value as of IOMAP_F_SIZE_CHANGED in patch '370a6de7651b ("iomap: rework IOMAP atomic flags")'. Let's fix this and let's also create some more space for filesystem reported flags to avoid this in future. This patch makes the core iomap flags to start from bit 15, moving downwards. Note that "flags" member within struct iomap is of type u16. 
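A stand-alone sanity check illustrates the fixed layout. The values are copied from the hunk below; the asserts are a host-side illustration only, not kernel code, and the first one is exactly the condition the buggy layout violated (both flags were 1U << 8).

    #include <assert.h>
    #include <stdint.h>

    #define IOMAP_F_ATOMIC_BIO      (1U << 8)
    #define IOMAP_F_PRIVATE         (1U << 12)
    #define IOMAP_F_SIZE_CHANGED    (1U << 14)
    #define IOMAP_F_STALE           (1U << 15)

    static_assert((IOMAP_F_ATOMIC_BIO & IOMAP_F_SIZE_CHANGED) == 0,
                  "fs-reported and core flags must not overlap");
    static_assert((IOMAP_F_SIZE_CHANGED | IOMAP_F_STALE | IOMAP_F_PRIVATE |
                   IOMAP_F_ATOMIC_BIO) <= UINT16_MAX,
                  "all iomap flags must fit the u16 'flags' member");

    int main(void) { return 0; }
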
Fixes: 370a6de7651b ("iomap: rework IOMAP atomic flags") Signed-off-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20250327170119.61045-1-ritesh.list@gmail.com Reviewed-by: John Garry Signed-off-by: Christian Brauner --- include/linux/iomap.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 02fe001feebb..68416b135151 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -78,6 +78,11 @@ struct vm_fault; #define IOMAP_F_ANON_WRITE (1U << 7) #define IOMAP_F_ATOMIC_BIO (1U << 8) +/* + * Flag reserved for file system specific usage + */ +#define IOMAP_F_PRIVATE (1U << 12) + /* * Flags set by the core iomap code during operations: * @@ -88,14 +93,8 @@ struct vm_fault; * range it covers needs to be remapped by the high level before the operation * can proceed. */ -#define IOMAP_F_SIZE_CHANGED (1U << 8) -#define IOMAP_F_STALE (1U << 9) - -/* - * Flags from 0x1000 up are for file system specific usage: - */ -#define IOMAP_F_PRIVATE (1U << 12) - +#define IOMAP_F_SIZE_CHANGED (1U << 14) +#define IOMAP_F_STALE (1U << 15) /* * Magic value for addr: -- cgit v1.2.3 From 079a206f51f0c183be96a2f78f183dee42df1d4a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 21 Mar 2025 16:40:47 +0200 Subject: seq_buf: Mark binary printing functions with __printf() attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Binary printing functions are using printf() type of format, and compiler is not happy about them as is: lib/seq_buf.c:162:17: error: function ‘seq_buf_bprintf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] Fix the compilation errors by adding __printf() attribute. Signed-off-by: Andy Shevchenko Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250321144822.324050-2-andriy.shevchenko@linux.intel.com Signed-off-by: Petr Mladek --- include/linux/seq_buf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index fe41da005970..52791e070506 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -167,8 +167,8 @@ extern int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, const void *buf, size_t len, bool ascii); #ifdef CONFIG_BINARY_PRINTF -extern int -seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); +__printf(2, 0) +int seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); #endif void seq_buf_do_printk(struct seq_buf *s, const char *lvl); -- cgit v1.2.3 From 6b2c1e30ad6846624d935a7ea98dae60458126b8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 21 Mar 2025 16:40:48 +0200 Subject: seq_file: Mark binary printing functions with __printf() attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Binary printing functions are using printf() type of format, and compiler is not happy about them as is: fs/seq_file.c:418:35: error: function ‘seq_bprintf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] Fix the compilation errors by adding __printf() attribute. 
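For reference, the annotation pattern itself can be shown stand-alone. This is a generic userspace illustration of the __printf(fmt, 0) form used for functions that do not take varargs directly; the kernel's binary printf helpers take a pre-captured u32 argument block rather than a va_list, but the attribute form is the same.

    #include <stdarg.h>
    #include <stdio.h>

    #define __printf(a, b) __attribute__((format(printf, a, b)))

    /* fmt is argument 2, no varargs to check: __printf(2, 0) */
    __printf(2, 0)
    static void log_vprintf(int level, const char *fmt, va_list args)
    {
            (void)level;
            vprintf(fmt, args);
    }

    /* variadic front end: __printf(2, 3) makes call sites checked */
    __printf(2, 3)
    static void log_printf(int level, const char *fmt, ...)
    {
            va_list ap;

            va_start(ap, fmt);
            log_vprintf(level, fmt, ap);
            va_end(ap);
    }

    int main(void)
    {
            log_printf(0, "%s has %d entries\n", "table", 3); /* -Wformat checked */
            return 0;
    }
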
Signed-off-by: Andy Shevchenko Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250321144822.324050-3-andriy.shevchenko@linux.intel.com Signed-off-by: Petr Mladek --- include/linux/seq_file.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 2fb266ea69fa..d6ebf0596510 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -181,6 +181,7 @@ int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); #ifdef CONFIG_BINARY_PRINTF +__printf(2, 0) void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary); #endif -- cgit v1.2.3 From 196a062641fe68d9bfe0ad36b6cd7628c99ad22c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 21 Mar 2025 16:40:49 +0200 Subject: tracing: Mark binary printing functions with __printf() attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Binary printing functions are using printf() type of format, and compiler is not happy about them as is: kernel/trace/trace.c:3292:9: error: function ‘trace_vbprintk’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] kernel/trace/trace_seq.c:182:9: error: function ‘trace_seq_bprintf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] Fix the compilation errors by adding __printf() attribute. While at it, move existing __printf() attributes from the implementations to the declarations. IT also fixes incorrect attribute parameters that are used for trace_array_printk(). Signed-off-by: Andy Shevchenko Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250321144822.324050-4-andriy.shevchenko@linux.intel.com Signed-off-by: Petr Mladek --- include/linux/trace.h | 4 ++-- include/linux/trace_seq.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/trace.h b/include/linux/trace.h index fdcd76b7be83..7eaad857dee0 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -72,8 +72,8 @@ static inline int unregister_ftrace_export(struct trace_export *export) static inline void trace_printk_init_buffers(void) { } -static inline int trace_array_printk(struct trace_array *tr, unsigned long ip, - const char *fmt, ...) +static inline __printf(3, 4) +int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { return 0; } diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 1ef95c0287f0..a93ed5ac3226 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -88,8 +88,8 @@ extern __printf(2, 3) void trace_seq_printf(struct trace_seq *s, const char *fmt, ...); extern __printf(2, 0) void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args); -extern void -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern __printf(2, 0) +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int trace_print_seq(struct seq_file *m, struct trace_seq *s); extern int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt); @@ -113,8 +113,8 @@ static inline __printf(2, 3) void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
{ } -static inline void -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +static inline __printf(2, 0) +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { } -- cgit v1.2.3 From 7bf819aa992faee980610e9021aec0c38aac53be Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 21 Mar 2025 16:40:50 +0200 Subject: vsnprintf: Mark binary printing functions with __printf() attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Binary printf() functions are using printf() type of format, and compiler is not happy about them as is: lib/vsprintf.c:3130:47: error: function ‘vbin_printf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] lib/vsprintf.c:3298:33: error: function ‘bstr_printf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] Fix the compilation errors by adding __printf() attribute. Signed-off-by: Andy Shevchenko Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250321144822.324050-5-andriy.shevchenko@linux.intel.com Signed-off-by: Petr Mladek --- include/linux/string.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index 0403a4ca4c11..01621ad0f598 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -336,8 +336,8 @@ int __sysfs_match_string(const char * const *array, size_t n, const char *s); #define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s) #ifdef CONFIG_BINARY_PRINTF -int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); -int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); +__printf(3, 0) int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); +__printf(3, 0) int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, -- cgit v1.2.3 From 803f97298e7de9242eb677a1351dcafbbcc9117e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:42 -0700 Subject: iommufd: Extend IOMMU_GET_HW_INFO to report PASID capability PASID usage requires PASID support in both device and IOMMU. Since the iommu drivers always enable the PASID capability for the device if it is supported, this extends the IOMMU_GET_HW_INFO to report the PASID capability to userspace. Also, enhances the selftest accordingly. 
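A minimal userspace sketch of consuming the new field follows. Obtaining the iommufd file descriptor and the bound device id (for example via the VFIO cdev bind flow) is assumed and not shown; the structure usage follows the usual iommufd convention of passing the structure size, and the capability and field names come from the uapi header changed below.

    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/iommufd.h>

    /* Sketch: report PASID width and PCI PASID capabilities for dev_id. */
    static int print_pasid_info(int iommufd_fd, __u32 dev_id)
    {
            struct iommu_hw_info info;

            memset(&info, 0, sizeof(info));
            info.size = sizeof(info);
            info.dev_id = dev_id;

            if (ioctl(iommufd_fd, IOMMU_GET_HW_INFO, &info))
                    return -1;

            if (!info.out_max_pasid_log2) {
                    printf("no PASID support\n");
                    return 0;
            }
            printf("PASID width: %u bits, exec: %s, priv: %s\n",
                   (unsigned)info.out_max_pasid_log2,
                   (info.out_capabilities & IOMMU_HW_CAP_PCI_PASID_EXEC) ? "yes" : "no",
                   (info.out_capabilities & IOMMU_HW_CAP_PCI_PASID_PRIV) ? "yes" : "no");
            return 0;
    }
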
Link: https://patch.msgid.link/r/20250321180143.8468-5-yi.l.liu@intel.com Cc: Bjorn Helgaas Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao #aarch64 platform Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/pci-ats.h | 3 +++ include/uapi/linux/iommufd.h | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index 0e8b74e63767..75c6c86cf09d 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -42,6 +42,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features); void pci_disable_pasid(struct pci_dev *pdev); int pci_pasid_features(struct pci_dev *pdev); int pci_max_pasids(struct pci_dev *pdev); +int pci_pasid_status(struct pci_dev *pdev); #else /* CONFIG_PCI_PASID */ static inline int pci_enable_pasid(struct pci_dev *pdev, int features) { return -EINVAL; } @@ -50,6 +51,8 @@ static inline int pci_pasid_features(struct pci_dev *pdev) { return -EINVAL; } static inline int pci_max_pasids(struct pci_dev *pdev) { return -EINVAL; } +static inline int pci_pasid_status(struct pci_dev *pdev) +{ return -EINVAL; } #endif /* CONFIG_PCI_PASID */ #endif /* LINUX_PCI_ATS_H */ diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 6901804ec736..e2c04e58a997 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -612,9 +612,17 @@ enum iommu_hw_info_type { * IOMMU_HWPT_GET_DIRTY_BITMAP * IOMMU_HWPT_SET_DIRTY_TRACKING * + * @IOMMU_HW_CAP_PCI_PASID_EXEC: Execute Permission Supported, user ignores it + * when the struct + * iommu_hw_info::out_max_pasid_log2 is zero. + * @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it + * when the struct + * iommu_hw_info::out_max_pasid_log2 is zero. */ enum iommufd_hw_capabilities { IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, + IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, + IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, }; /** @@ -630,6 +638,9 @@ enum iommufd_hw_capabilities { * iommu_hw_info_type. * @out_capabilities: Output the generic iommu capability info type as defined * in the enum iommu_hw_capabilities. + * @out_max_pasid_log2: Output the width of PASIDs. 0 means no PASID support. + * PCI devices turn to out_capabilities to check if the + * specific capabilities is supported or not. * @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -653,7 +664,8 @@ struct iommu_hw_info { __u32 data_len; __aligned_u64 data_uptr; __u32 out_data_type; - __u32 __reserved; + __u8 out_max_pasid_log2; + __u8 __reserved[3]; __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) -- cgit v1.2.3 From 858c9c10c123b7b04bba12c689db675c18d48bda Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 28 Mar 2025 18:46:54 +0700 Subject: iommufd: Fix iommu_vevent_header tables markup Stephen Rothwell reports htmldocs warnings on iommufd_vevent_header tables: Documentation/userspace-api/iommufd:323: ./include/uapi/linux/iommufd.h:1048: CRITICAL: Unexpected section title or transition. ------------------------------------------------------------------------- [docutils] WARNING: kernel-doc './scripts/kernel-doc -rst -enable-lineno -sphinx-version 8.1.3 ./include/uapi/linux/iommufd.h' processing failed with: Documentation/userspace-api/iommufd:323: ./include/uapi/linux/iommufd.h:1048: (SEVERE/4) Unexpected section title or transition. 
------------------------------------------------------------------------- These are because Sphinx confuses the tables for section headings. Fix the table markup to squash away above warnings. Fixes: e36ba5ab808e ("iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC") Link: https://patch.msgid.link/r/20250328114654.55840-1-bagasdotme@gmail.com Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250318213359.5dc56fd1@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index e2c04e58a997..f29b6c44655e 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -1045,21 +1045,26 @@ enum iommu_veventq_flag { * [0, INT_MAX] where the following index of INT_MAX is 0 * * Each iommufd_vevent_header reports a sequence index of the following vEVENT: - * ------------------------------------------------------------------------- + * + * +----------------------+-------+----------------------+-------+---+-------+ * | header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| dataN | - * ------------------------------------------------------------------------- + * +----------------------+-------+----------------------+-------+---+-------+ + * * And this sequence index is expected to be monotonic to the sequence index of * the previous vEVENT. If two adjacent sequence indexes has a delta larger than * 1, it means that delta - 1 number of vEVENTs has lost, e.g. two lost vEVENTs: - * ------------------------------------------------------------------------- + * + * +-----+----------------------+-------+----------------------+-------+-----+ * | ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... | - * ------------------------------------------------------------------------- + * +-----+----------------------+-------+----------------------+-------+-----+ + * * If a vEVENT lost at the tail of the vEVENTQ and there is no following vEVENT * providing the next sequence index, an IOMMU_VEVENTQ_FLAG_LOST_EVENTS header * would be added to the tail, and no data would follow this header: - * --------------------------------------------------------------------------- + * + * +--+----------------------+-------+-----------------------------------------+ * |..| header3 {sequence=3} | data3 | header4 {flags=LOST_EVENTS, sequence=4} | - * --------------------------------------------------------------------------- + * +--+----------------------+-------+-----------------------------------------+ */ struct iommufd_vevent_header { __u32 flags; @@ -1117,9 +1122,11 @@ struct iommu_vevent_arm_smmuv3 { * * Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by * a type-specific data structure, in a normal case: - * ------------------------------------------------------------- - * || header0 | data0 | header1 | data1 | ... | headerN | dataN || - * ------------------------------------------------------------- + * + * +-+---------+-------+---------+-------+-----+---------+-------+-+ + * | | header0 | data0 | header1 | data1 | ... | headerN | dataN | | + * +-+---------+-------+---------+-------+-----+---------+-------+-+ + * * unless a tailing IOMMU_VEVENTQ_FLAG_LOST_EVENTS header is logged (refer to * struct iommufd_vevent_header). 
*/ -- cgit v1.2.3 From a30d4ff8193ef768dbb524824c7aa07c5486a63a Mon Sep 17 00:00:00 2001 From: Nir Lichtman Date: Tue, 4 Feb 2025 05:47:41 +0000 Subject: kdb: remove usage of static environment buffer Problem: The set environment variable logic uses a static "heap" like buffer to store the values of the variables, and they are never freed, on top of that this is redundant since the kernel supplies allocation facilities which are even used also in this file. Solution: Remove the weird static buffer logic and use kmalloc instead, call kfree when overriding an existing variable. Signed-off-by: Nir Lichtman Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20250204054741.GB1219827@lichtman.org Signed-off-by: Daniel Thompson --- include/linux/kdb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 905a2e2f45f6..ecbf819deeca 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -104,7 +104,7 @@ extern int kdb_initial_cpu; #define KDB_NOENVVALUE (-6) #define KDB_NOTIMP (-7) #define KDB_ENVFULL (-8) -#define KDB_ENVBUFFULL (-9) +#define KDB_KMALLOCFAILED (-9) #define KDB_TOOMANYBPT (-10) #define KDB_TOOMANYDBREGS (-11) #define KDB_DUPBPT (-12) -- cgit v1.2.3 From ebf695f129367ed4b26df6baec2ea7fc50c9e6f0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:15 +0800 Subject: ublk: add segment parameter IO split is usually bad in io_uring world, since -EAGAIN is caused and IO handling may have to fallback to io-wq, this way does hurt performance. ublk starts to support zero copy recently, for avoiding unnecessary IO split, ublk driver's segment limit should be aligned with backend device's segment limit. Another reason is that io_buffer_register_bvec() needs to allocate bvecs, which number is aligned with ublk request segment number, so that big memory allocation can be avoided by setting reasonable max_segments limit. So add segment parameter for providing ublk server chance to align segment limit with backend, and keep it reasonable from implementation viewpoint. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250327095123.179113-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 7255b36b5cf6..583b86681c93 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -410,6 +410,29 @@ struct ublk_param_dma_align { __u8 pad[4]; }; +#define UBLK_MIN_SEGMENT_SIZE 4096 +/* + * If any one of the three segment parameter is set as 0, the behavior is + * undefined. + */ +struct ublk_param_segment { + /* + * seg_boundary_mask + 1 needs to be power_of_2(), and the sum has + * to be >= UBLK_MIN_SEGMENT_SIZE(4096) + */ + __u64 seg_boundary_mask; + + /* + * max_segment_size could be override by virt_boundary_mask, so be + * careful when setting both. 
+ * + * max_segment_size has to be >= UBLK_MIN_SEGMENT_SIZE(4096) + */ + __u32 max_segment_size; + __u16 max_segments; + __u8 pad[2]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -423,6 +446,7 @@ struct ublk_params { #define UBLK_PARAM_TYPE_DEVT (1 << 2) #define UBLK_PARAM_TYPE_ZONED (1 << 3) #define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4) +#define UBLK_PARAM_TYPE_SEGMENT (1 << 5) __u32 types; /* types of parameter included */ struct ublk_param_basic basic; @@ -430,6 +454,7 @@ struct ublk_params { struct ublk_param_devt devt; struct ublk_param_zoned zoned; struct ublk_param_dma_align dma; + struct ublk_param_segment seg; }; #endif -- cgit v1.2.3 From 0f6439f61a6e2ddc92b98362c6d1afc210f56a90 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 22 Jan 2025 13:55:27 -0800 Subject: fuse: add kernel-enforced timeout option for requests There are situations where fuse servers can become unresponsive or stuck, for example if the server is deadlocked. Currently, there's no good way to detect if a server is stuck and needs to be killed manually. This commit adds an option for enforcing a timeout (in seconds) for requests where if the timeout elapses without the server responding to the request, the connection will be automatically aborted. Please note that these timeouts are not 100% precise. For example, the request may take roughly an extra FUSE_TIMEOUT_TIMER_FREQ seconds beyond the requested timeout due to internal implementation, in order to mitigate overhead. [SzM: Bump the API version number] Signed-off-by: Joanne Koong Reviewed-by: Jeff Layton Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 5e0eb41d967e..5ec43ecbceb7 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -229,6 +229,9 @@ * - FUSE_URING_IN_OUT_HEADER_SZ * - FUSE_URING_OP_IN_OUT_SZ * - enum fuse_uring_cmd + * + * 7.43 + * - add FUSE_REQUEST_TIMEOUT */ #ifndef _LINUX_FUSE_H @@ -264,7 +267,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 42 +#define FUSE_KERNEL_MINOR_VERSION 43 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -435,6 +438,8 @@ struct fuse_file_lock { * of the request ID indicates resend requests * FUSE_ALLOW_IDMAP: allow creation of idmapped mounts * FUSE_OVER_IO_URING: Indicate that client supports io-uring + * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests. 
+ * init_out.request_timeout contains the timeout (in secs) */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -477,11 +482,11 @@ struct fuse_file_lock { #define FUSE_PASSTHROUGH (1ULL << 37) #define FUSE_NO_EXPORT_SUPPORT (1ULL << 38) #define FUSE_HAS_RESEND (1ULL << 39) - /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP #define FUSE_ALLOW_IDMAP (1ULL << 40) #define FUSE_OVER_IO_URING (1ULL << 41) +#define FUSE_REQUEST_TIMEOUT (1ULL << 42) /** * CUSE INIT request/reply flags @@ -909,7 +914,8 @@ struct fuse_init_out { uint16_t map_alignment; uint32_t flags2; uint32_t max_stack_depth; - uint32_t unused[6]; + uint16_t request_timeout; + uint16_t unused[11]; }; #define CUSE_INIT_INFO_MAX 4096 -- cgit v1.2.3 From 296e16961817e8e5f574661febc608ac0c0c0108 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 08:55:47 +0100 Subject: io_uring: hide caches sqes from drivers There is now an io_uring private part of cmd async_data, move saved sqe into it. Drivers are accessing it via struct io_uring_cmd::cmd. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ecbe078dd57acefdbc4366d083327086c0879378.1743357121.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index e6723fa95160..0634a3de1782 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -21,7 +21,6 @@ struct io_uring_cmd { struct io_uring_cmd_data { void *op_data; - struct io_uring_sqe sqes[2]; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) -- cgit v1.2.3 From f278b6d5bb465c7fd66f3d103812947e55b376ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Mar 2025 07:59:46 +0000 Subject: Revert "tcp: avoid atomic operations on sk->sk_rmem_alloc" This reverts commit 0de2a5c4b824da2205658ebebb99a55c43cdf60f. I forgot that a TCP socket could receive messages in its error queue. sock_queue_err_skb() can be called without socket lock being held, and changes sk->sk_rmem_alloc. The fact that skbs in error queue are limited by sk->sk_rcvbuf means that error messages can be dropped if socket receive queues are full, which is an orthogonal issue. In future kernels, we could use a separate sk->sk_error_mem_alloc counter specifically for the error queue. 
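To make the race concrete, a simplified sketch (not the exact kernel code) of the two updates to sk->sk_rmem_alloc that can now run concurrently; the reverted helper did a plain read-modify-write under the socket lock, while the error-queue path charges the socket atomically and without that lock:

	/* Path A: receive path, socket lock held (the reverted tcp_skb_set_owner_r()) */
	atomic_set(&sk->sk_rmem_alloc,
		   atomic_read(&sk->sk_rmem_alloc) + skb->truesize);

	/* Path B: error queue via sock_queue_err_skb(), may run with no socket lock */
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);

	/* If B executes between A's read and A's set, B's charge is lost and
	 * the receive-memory accounting goes inconsistent. */
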
Fixes: 0de2a5c4b824 ("tcp: avoid atomic operations on sk->sk_rmem_alloc") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250331075946.31960-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index df04dc09c519..4450c384ef17 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -779,7 +779,6 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); -void tcp_sock_rfree(struct sk_buff *skb); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, @@ -2899,18 +2898,4 @@ enum skb_drop_reason tcp_inbound_hash(struct sock *sk, const void *saddr, const void *daddr, int family, int dif, int sdif); -/* version of skb_set_owner_r() avoiding one atomic_add() */ -static inline void tcp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) -{ - skb_orphan(skb); - skb->sk = sk; - skb->destructor = tcp_sock_rfree; - - sock_owned_by_me(sk); - atomic_set(&sk->sk_rmem_alloc, - atomic_read(&sk->sk_rmem_alloc) + skb->truesize); - - sk_forward_alloc_add(sk, -skb->truesize); -} - #endif /* _TCP_H */ -- cgit v1.2.3 From 09f37f2d7b21ff35b8b533f9ab8cfad2fe8f72f6 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 31 Mar 2025 21:26:44 -0700 Subject: sched/smt: Always inline sched_smt_active() sched_smt_active() can be called from noinstr code, so it should always be inlined. The CONFIG_SCHED_SMT version already has __always_inline. Do the same for its !CONFIG_SCHED_SMT counterpart. Fixes the following warning: vmlinux.o: error: objtool: intel_idle_ibrs+0x13: call to sched_smt_active() leaves .noinstr.text section Fixes: 321a874a7ef8 ("sched/smt: Expose sched_smt_present static key") Reported-by: kernel test robot Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/1d03907b0a247cf7fb5c1d518de378864f603060.1743481539.git.jpoimboe@kernel.org Closes: https://lore.kernel.org/r/202503311434.lyw2Tveh-lkp@intel.com/ --- include/linux/sched/smt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index fb1e295e7e63..166b19af956f 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -12,7 +12,7 @@ static __always_inline bool sched_smt_active(void) return static_branch_likely(&sched_smt_present); } #else -static inline bool sched_smt_active(void) { return false; } +static __always_inline bool sched_smt_active(void) { return false; } #endif void arch_smt_update(void); -- cgit v1.2.3 From 9ac50f7311dc8b39e355582f14c1e82da47a8196 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 31 Mar 2025 21:26:45 -0700 Subject: context_tracking: Always inline ct_{nmi,irq}_{enter,exit}() Thanks to CONFIG_DEBUG_SECTION_MISMATCH, empty functions can be generated out of line. These can be called from noinstr code, so make sure they're always inlined. 
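The constraint being enforced in these always-inline patches is that anything reachable from noinstr code must either live in .noinstr.text or be fully inlined. A hedged sketch of the pattern (example function names, not real kernel symbols):

	/* With CONFIG_DEBUG_SECTION_MISMATCH the compiler may emit even an
	 * empty "static inline" stub out of line, producing a call from
	 * .noinstr.text into normal .text, which objtool flags. Forcing the
	 * stub inline removes the call instruction entirely. */
	static __always_inline void example_stub(void) { }

	noinstr void example_entry(void)
	{
		example_stub();		/* no out-of-line call is generated */
	}
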
Fixes the following warnings: vmlinux.o: warning: objtool: irqentry_nmi_enter+0xa2: call to ct_nmi_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_nmi_exit+0x16: call to ct_nmi_exit() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_exit+0x78: call to ct_irq_exit() leaves .noinstr.text section Fixes: 6f0e6c1598b1 ("context_tracking: Take IRQ eqs entrypoints over RCU") Reported-by: Randy Dunlap Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Frederic Weisbecker Cc: Paul E. McKenney Cc: Linus Torvalds Link: https://lore.kernel.org/r/8509bce3f536bcd4ae7af3a2cf6930d48c5e631a.1743481539.git.jpoimboe@kernel.org Closes: https://lore.kernel.org/d1eca076-fdde-484a-b33e-70e0d167c36d@infradead.org --- include/linux/context_tracking_irq.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/context_tracking_irq.h b/include/linux/context_tracking_irq.h index c50b5670c4a5..197916ee91a4 100644 --- a/include/linux/context_tracking_irq.h +++ b/include/linux/context_tracking_irq.h @@ -10,12 +10,12 @@ void ct_irq_exit_irqson(void); void ct_nmi_enter(void); void ct_nmi_exit(void); #else -static inline void ct_irq_enter(void) { } -static inline void ct_irq_exit(void) { } +static __always_inline void ct_irq_enter(void) { } +static __always_inline void ct_irq_exit(void) { } static inline void ct_irq_enter_irqson(void) { } static inline void ct_irq_exit_irqson(void) { } -static inline void ct_nmi_enter(void) { } -static inline void ct_nmi_exit(void) { } +static __always_inline void ct_nmi_enter(void) { } +static __always_inline void ct_nmi_exit(void) { } #endif #endif -- cgit v1.2.3 From 6309a5c43b0dc629851f25b2e5ef8beff61d08e5 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 31 Mar 2025 21:26:46 -0700 Subject: rcu-tasks: Always inline rcu_irq_work_resched() Thanks to CONFIG_DEBUG_SECTION_MISMATCH, empty functions can be generated out of line. rcu_irq_work_resched() can be called from noinstr code, so make sure it's always inlined. Fixes: 564506495ca9 ("rcu/context-tracking: Move deferred nocb resched to context tracking") Reported-by: Randy Dunlap Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Frederic Weisbecker Cc: Paul E. 
McKenney Cc: Linus Torvalds Link: https://lore.kernel.org/r/e84f15f013c07e4c410d972e75620c53b62c1b3e.1743481539.git.jpoimboe@kernel.org Closes: https://lore.kernel.org/d1eca076-fdde-484a-b33e-70e0d167c36d@infradead.org --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f8159f8a7d73..120536f4c6eb 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -132,7 +132,7 @@ static inline void rcu_sysrq_end(void) { } #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) void rcu_irq_work_resched(void); #else -static inline void rcu_irq_work_resched(void) { } +static __always_inline void rcu_irq_work_resched(void) { } #endif #ifdef CONFIG_RCU_NOCB_CPU -- cgit v1.2.3 From c2004b6efb1c891b80bef186771a3668fc25f6b3 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Mon, 3 Mar 2025 23:36:37 +0100 Subject: rtc: mt6397: drop unused defines RTC_NUM_YEARS has never been used, the other defines are not used since commit 34bbdc12d04e ("rtc: mt6359: Add RTC hardware range and add support for start-year") Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250303223637.1135362-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- include/linux/mfd/mt6397/rtc.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/mfd/mt6397/rtc.h b/include/linux/mfd/mt6397/rtc.h index 068ae1c0f0e8..27883af44f87 100644 --- a/include/linux/mfd/mt6397/rtc.h +++ b/include/linux/mfd/mt6397/rtc.h @@ -60,11 +60,6 @@ #define RTC_PDN2 0x002e #define RTC_PDN2_PWRON_ALARM BIT(4) -#define RTC_MIN_YEAR 1968 -#define RTC_BASE_YEAR 1900 -#define RTC_NUM_YEARS 128 -#define RTC_MIN_YEAR_OFFSET (RTC_MIN_YEAR - RTC_BASE_YEAR) - #define MTK_RTC_POLL_DELAY_US 10 #define MTK_RTC_POLL_TIMEOUT (jiffies_to_usecs(HZ)) -- cgit v1.2.3 From b98072af60a7ce19214c80f10a7daab924c69182 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 8 Jan 2025 00:48:21 -0700 Subject: mm/hugetlb_vmemmap: fix memory loads ordering Using x86_64 as an example, for a 32KB struct page[] area describing a 2MB hugeTLB, HVO reduces the area to 4KB by the following steps: 1. Split the (r/w vmemmap) PMD mapping the area into 512 (r/w) PTEs; 2. For the 8 PTEs mapping the area, remap PTE 1-7 to the page mapped by PTE 0, and at the same time change the permission from r/w to r/o; 3. Free the pages PTE 1-7 used to map, hence the reduction from 32KB to 4KB. However, the following race can happen due to improperly memory loads ordering: CPU 1 (HVO) CPU 2 (speculative PFN walker) page_ref_freeze() synchronize_rcu() rcu_read_lock() page_is_fake_head() is false vmemmap_remap_pte() XXX: struct page[] becomes r/o page_ref_unfreeze() page_ref_count() is not zero atomic_add_unless(&page->_refcount) XXX: try to modify r/o struct page[] Specifically, page_is_fake_head() must be ordered after page_ref_count() on CPU 2 so that it can only return true for this case, to avoid the later attempt to modify r/o struct page[]. This patch adds the missing memory barrier and makes the tests on page_is_fake_head() and page_ref_count() done in the proper order. 
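As a worked sketch of the pairing (simplified, not compilable as-is; it assumes page_ref_unfreeze() publishes the thawed refcount with release semantics via atomic_set_release(), which is what the new acquire load pairs with):

	/* CPU 1 (HVO), simplified */
	page_ref_freeze(head, 1);		/* _refcount -> 0 */
	/* ... split the PMD, remap the PTEs read-only, free the tail pages ... */
	page_ref_unfreeze(head, 1);		/* release: r/o remap visible before _refcount != 0 */

	/* CPU 2 (speculative PFN walker), simplified page_count_writable() */
	if (atomic_read_acquire(&page->_refcount) == u)	/* acquire: pairs with the release above */
		return false;				/* frozen (u is typically 0): leave r/o pages alone */
	return page_fixed_fake_head(page) == page;	/* fake-head test now ordered after the refcount test */
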
Link: https://lkml.kernel.org/r/20250108074822.722696-1-yuzhao@google.com Fixes: bd225530a4c7 ("mm/hugetlb_vmemmap: fix race with speculative PFN walkers") Signed-off-by: Yu Zhao Reported-by: Will Deacon Closes: https://lore.kernel.org/20241128142028.GA3506@willie-the-truck/ Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Acked-by: Will Deacon Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 37 +++++++++++++++++++++++++++++++++++++ include/linux/page_ref.h | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5bd9492a66ee..e6a21b62dcce 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -226,11 +226,48 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page } return page; } + +static __always_inline bool page_count_writable(const struct page *page, int u) +{ + if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) + return true; + + /* + * The refcount check is ordered before the fake-head check to prevent + * the following race: + * CPU 1 (HVO) CPU 2 (speculative PFN walker) + * + * page_ref_freeze() + * synchronize_rcu() + * rcu_read_lock() + * page_is_fake_head() is false + * vmemmap_remap_pte() + * XXX: struct page[] becomes r/o + * + * page_ref_unfreeze() + * page_ref_count() is not zero + * + * atomic_add_unless(&page->_refcount) + * XXX: try to modify r/o struct page[] + * + * The refcount check also prevents modification attempts to other (r/o) + * tail pages that are not fake heads. + */ + if (atomic_read_acquire(&page->_refcount) == u) + return false; + + return page_fixed_fake_head(page) == page; +} #else static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } + +static inline bool page_count_writable(const struct page *page, int u) +{ + return true; +} #endif static __always_inline int page_is_fake_head(const struct page *page) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 8c236c651d1d..544150d1d5fd 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -234,7 +234,7 @@ static inline bool page_ref_add_unless(struct page *page, int nr, int u) rcu_read_lock(); /* avoid writing to the vmemmap area being remapped */ - if (!page_is_fake_head(page) && page_ref_count(page) != u) + if (page_count_writable(page, u)) ret = atomic_add_unless(&page->_refcount, nr, u); rcu_read_unlock(); -- cgit v1.2.3 From f21bb37afbba0878c8d417cd861e43d014119845 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Tue, 25 Feb 2025 11:45:51 +0800 Subject: mm: pgtable: make generic tlb_remove_table() use struct ptdesc Patch series "remove tlb_remove_page_ptdesc()", v2. As suggested by Peter Zijlstra below [1], this series aims to remove tlb_remove_page_ptdesc(). : Fundamentally tlb_remove_page() is about removing *pages* as from a PTE, : there should not be a page-table anywhere near here *ever*. : : Yes, some architectures use tlb_remove_page() for page-tables too, but : that is more or less an implementation detail that can be fixed. After this series, all architectures use tlb_remove_table() or tlb_remove_ptdesc() to remove the page table pages. In the future, once all architectures using tlb_remove_table() have also converted to using struct ptdesc (eg. powerpc), it may be possible to use only tlb_remove_ptdesc(). 
[1] https://lore.kernel.org/linux-mm/20250103111457.GC22934@noisy.programming.kicks-ass.net/ This patch (of 6): Now only arm will call tlb_remove_ptdesc()/tlb_remove_table() when CONFIG_MMU_GATHER_TABLE_FREE is disabled. In this case, the type of the table parameter is actually struct ptdesc * instead of struct page *. Since struct ptdesc still overlaps with struct page and has not been separated from it, forcing the table parameter to struct page * will not cause any problems at this time. But this is definitely incorrect and needs to be fixed. So just like the generic __tlb_remove_table(), let generic tlb_remove_table() use struct ptdesc by default when CONFIG_MMU_GATHER_TABLE_FREE is disabled. Link: https://lkml.kernel.org/r/cover.1740454179.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/5be8c3ab7bd68510bf0db4cf84010f4dfe372917.1740454179.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Kevin Brodsky Cc: Alexandre Ghiti Cc: "Aneesh Kumar K.V" Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: Hugh Dickens Cc: Jann Horn Cc: Matthew Wilcow (Oracle) Cc: "Mike Rapoport (IBM)" Cc: Muchun Song Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Rik van Riel Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index d1adfba8387e..e27d24dd8d5e 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -227,10 +227,10 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); */ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) { - struct page *page = (struct page *)table; + struct ptdesc *ptdesc = (struct ptdesc *)table; - pagetable_dtor(page_ptdesc(page)); - tlb_remove_page(tlb, page); + pagetable_dtor(ptdesc); + tlb_remove_page(tlb, ptdesc_page(ptdesc)); } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ -- cgit v1.2.3 From 1a03c275a3ad4e47d479a63037b72f2b305f0c13 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Tue, 25 Feb 2025 11:45:52 +0800 Subject: mm: pgtable: change pt parameter of tlb_remove_ptdesc() to struct ptdesc* All callers of tlb_remove_ptdesc() pass it a pointer of struct ptdesc, so let's change the pt parameter from void * to struct ptdesc * to perform a type safety check. 
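As an illustration of what a caller looks like after this change, a hedged sketch of an architecture's PTE-table free helper (hypothetical function name; it assumes pgtable_t is a struct page * on that architecture):

	static inline void example_pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte)
	{
		/* Convert the page to its ptdesc and hand it to the table-aware
		 * path, which runs the page-table destructor and frees the page
		 * once the TLB flush has completed. */
		tlb_remove_ptdesc(tlb, page_ptdesc(pte));
	}
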
Link: https://lkml.kernel.org/r/60bb44299cf2d731df6592e446e7f694054d0dbe.1740454179.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Originally-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Cc: Alexandre Ghiti Cc: "Aneesh Kumar K.V" Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: Hugh Dickens Cc: Jann Horn Cc: Matthew Wilcow (Oracle) Cc: "Mike Rapoport (IBM)" Cc: Muchun Song Cc: Nicholas Piggin Cc: Rik van Riel Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index e27d24dd8d5e..bf845019af36 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -493,7 +493,7 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) return tlb_remove_page_size(tlb, page, PAGE_SIZE); } -static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt) +static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) { tlb_remove_table(tlb, pt); } -- cgit v1.2.3 From 02d9e1a2048e47d39733f0ced71ce8e8fee3e56d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Tue, 25 Feb 2025 11:45:56 +0800 Subject: mm: pgtable: remove tlb_remove_page_ptdesc() The tlb_remove_ptdesc()/tlb_remove_table() is specially designed for page table pages, and now all architectures have been converted to use it to remove page table pages. So let's remove tlb_remove_page_ptdesc(), it currently has no users and should not be used for page table pages. Link: https://lkml.kernel.org/r/3df04c8494339073b71be4acb2d92e108ecd1b60.1740454179.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Suggested-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Cc: Alexandre Ghiti Cc: "Aneesh Kumar K.V" Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: Hugh Dickens Cc: Jann Horn Cc: Matthew Wilcow (Oracle) Cc: "Mike Rapoport (IBM)" Cc: Muchun Song Cc: Nicholas Piggin Cc: Rik van Riel Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index bf845019af36..88a42973fa47 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -498,12 +498,6 @@ static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) tlb_remove_table(tlb, pt); } -/* Like tlb_remove_ptdesc, but for page-like page directories. */ -static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) -{ - tlb_remove_page(tlb, ptdesc_page(pt)); -} - static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { -- cgit v1.2.3 From 5796d3967c0956734bd1249f76989ca80da0225b Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Wed, 5 Mar 2025 02:17:05 +0000 Subject: mseal sysmap: kernel config and header change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mseal system mappings", v9. As discussed during mseal() upstream process [1], mseal() protects the VMAs of a given virtual memory range against modifications, such as the read/write (RW) and no-execute (NX) bits. For complete descriptions of memory sealing, please see mseal.rst [2]. The mseal() is useful to mitigate memory corruption issues where a corrupted pointer is passed to a memory management system. 
For example, such an attacker primitive can break control-flow integrity guarantees since read-only memory that is supposed to be trusted can become writable or .text pages can get remapped. The system mappings are readonly only, memory sealing can protect them from ever changing to writable or unmmap/remapped as different attributes. System mappings such as vdso, vvar, vvar_vclock, vectors (arm compat-mode), sigpage (arm compat-mode), are created by the kernel during program initialization, and could be sealed after creation. Unlike the aforementioned mappings, the uprobe mapping is not established during program startup. However, its lifetime is the same as the process's lifetime [3]. It could be sealed from creation. The vsyscall on x86-64 uses a special address (0xffffffffff600000), which is outside the mm managed range. This means mprotect, munmap, and mremap won't work on the vsyscall. Since sealing doesn't enhance the vsyscall's security, it is skipped in this patch. If we ever seal the vsyscall, it is probably only for decorative purpose, i.e. showing the 'sl' flag in the /proc/pid/smaps. For this patch, it is ignored. It is important to note that the CHECKPOINT_RESTORE feature (CRIU) may alter the system mappings during restore operations. UML(User Mode Linux) and gVisor, rr are also known to change the vdso/vvar mappings. Consequently, this feature cannot be universally enabled across all systems. As such, CONFIG_MSEAL_SYSTEM_MAPPINGS is disabled by default. To support mseal of system mappings, architectures must define CONFIG_ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS and update their special mappings calls to pass mseal flag. Additionally, architectures must confirm they do not unmap/remap system mappings during the process lifetime. The existence of this flag for an architecture implies that it does not require the remapping of thest system mappings during process lifetime, so sealing these mappings is safe from a kernel perspective. This version covers x86-64 and arm64 archiecture as minimum viable feature. While no specific CPU hardware features are required for enable this feature on an archiecture, memory sealing requires a 64-bit kernel. Other architectures can choose whether or not to adopt this feature. Currently, I'm not aware of any instances in the kernel code that actively munmap/mremap a system mapping without a request from userspace. The PPC does call munmap when _install_special_mapping fails for vdso; however, it's uncertain if this will ever fail for PPC - this needs to be investigated by PPC in the future [4]. The UML kernel can add this support when KUnit tests require it [5]. In this version, we've improved the handling of system mapping sealing from previous versions, instead of modifying the _install_special_mapping function itself, which would affect all architectures, we now call _install_special_mapping with a sealing flag only within the specific architecture that requires it. This targeted approach offers two key advantages: 1) It limits the code change's impact to the necessary architectures, and 2) It aligns with the software architecture by keeping the core memory management within the mm layer, while delegating the decision of sealing system mappings to the individual architecture, which is particularly relevant since 32-bit architectures never require sealing. Prior to this patch series, we explored sealing special mappings from userspace using glibc's dynamic linker. 
This approach revealed several issues: - The PT_LOAD header may report an incorrect length for vdso, (smaller than its actual size). The dynamic linker, which relies on PT_LOAD information to determine mapping size, would then split and partially seal the vdso mapping. Since each architecture has its own vdso/vvar code, fixing this in the kernel would require going through each archiecture. Our initial goal was to enable sealing readonly mappings, e.g. .text, across all architectures, sealing vdso from kernel since creation appears to be simpler than sealing vdso at glibc. - The [vvar] mapping header only contains address information, not length information. Similar issues might exist for other special mappings. - Mappings like uprobe are not covered by the dynamic linker, and there is no effective solution for them. This feature's security enhancements will benefit ChromeOS, Android, and other high security systems. Testing: This feature was tested on ChromeOS and Android for both x86-64 and ARM64. - Enable sealing and verify vdso/vvar, sigpage, vector are sealed properly, i.e. "sl" shown in the smaps for those mappings, and mremap is blocked. - Passing various automation tests (e.g. pre-checkin) on ChromeOS and Android to ensure the sealing doesn't affect the functionality of Chromebook and Android phone. I also tested the feature on Ubuntu on x86-64: - With config disabled, vdso/vvar is not sealed, - with config enabled, vdso/vvar is sealed, and booting up Ubuntu is OK, normal operations such as browsing the web, open/edit doc are OK. Link: https://lore.kernel.org/all/20240415163527.626541-1-jeffxu@chromium.org/ [1] Link: Documentation/userspace-api/mseal.rst [2] Link: https://lore.kernel.org/all/CABi2SkU9BRUnqf70-nksuMCQ+yyiWjo3fM4XkRkL-NrCZxYAyg@mail.gmail.com/ [3] Link: https://lore.kernel.org/all/CABi2SkV6JJwJeviDLsq9N4ONvQ=EFANsiWkgiEOjyT9TQSt+HA@mail.gmail.com/ [4] Link: https://lore.kernel.org/all/202502251035.239B85A93@keescook/ [5] This patch (of 7): Provide infrastructure to mseal system mappings. Establish two kernel configs (CONFIG_MSEAL_SYSTEM_MAPPINGS, ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS) and VM_SEALED_SYSMAP macro for future patches. Link: https://lkml.kernel.org/r/20250305021711.3867874-1-jeffxu@google.com Link: https://lkml.kernel.org/r/20250305021711.3867874-2-jeffxu@google.com Signed-off-by: Jeff Xu Reviewed-by: Kees Cook Reviewed-by: Liam R. Howlett Reviewed-by: Lorenzo Stoakes Cc: Adhemerval Zanella Cc: Alexander Mikhalitsyn Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Anna-Maria Behnsen Cc: Ard Biesheuvel Cc: Benjamin Berg Cc: Christoph Hellwig Cc: Dave Hansen Cc: David Rientjes Cc: David S. Miller Cc: Elliot Hughes Cc: Florian Faineli Cc: Greg Ungerer Cc: Guenter Roeck Cc: Heiko Carstens Cc: Helge Deller Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Ingo Molnar Cc: Jann Horn Cc: Jason A. 
Donenfeld Cc: Johannes Berg Cc: Jorge Lucangeli Obes Cc: Linus Waleij Cc: Mark Rutland Cc: Matthew Wilcow (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Oleg Nesterov Cc: Pedro Falcato Cc: Peter Xu Cc: Randy Dunlap Cc: Stephen Röttger Cc: Thomas Weißschuh Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 32ba0e33422b..778f5de6a12e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4236,4 +4236,14 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); + +/* + * mseal of userspace process's system mappings. + */ +#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS +#define VM_SEALED_SYSMAP VM_SEALED +#else +#define VM_SEALED_SYSMAP VM_NONE +#endif + #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From e2a33a2a3258794891cdd6ca4b1318da6594d157 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Mar 2025 11:26:06 -0400 Subject: lib/sort.c: add _nonatomic() variants with cond_resched() bcachefs calls sort() during recovery to sort all keys it found in the journal, and this may be very large - gigabytes on large machines. This has been causing "task blocked" warnings, so needs a cond_resched(). [kent.overstreet@linux.dev: fix kerneldoc] Link: https://lkml.kernel.org/r/cgsr5a447pxqomc4gvznsp5yroqmif4omd7o5lsr2swifjhoic@yzjjrx2bvrq7 Link: https://lkml.kernel.org/r/20250326152606.2594920-1-kent.overstreet@linux.dev Signed-off-by: Kent Overstreet Cc: Kuan-Wei Chiu Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/sort.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/sort.h b/include/linux/sort.h index e163287ac6c1..8e5603b10941 100644 --- a/include/linux/sort.h +++ b/include/linux/sort.h @@ -13,4 +13,15 @@ void sort(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func); +/* Versions that periodically call cond_resched(): */ + +void sort_r_nonatomic(void *base, size_t num, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv); + +void sort_nonatomic(void *base, size_t num, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func); + #endif -- cgit v1.2.3 From 149974fdb8e186a1c72b87cee806373624a8a375 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 25 Mar 2025 21:51:51 +0800 Subject: block: add for_each_mp_bvec() Add helper of for_each_mp_bvec() for io_uring to import fixed kernel buffer. 
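A hedged usage sketch (hypothetical helper) showing how for_each_mp_bvec() differs from for_each_bvec(): the multi-page variant yields whole bvec segments rather than splitting them into single pages:

	static size_t example_count_segments(const struct bio_vec *bvec,
					     unsigned int nr_bytes)
	{
		struct bvec_iter start = { .bi_size = nr_bytes };
		struct bvec_iter iter;
		struct bio_vec bv;
		size_t segs = 0;

		for_each_mp_bvec(bv, bvec, iter, start)
			segs++;		/* one iteration per multi-page segment */

		return segs;
	}
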
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250325135155.935398-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/bvec.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ba8f52d48b94..204b22a99c4b 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -184,6 +184,12 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv, ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) +#define for_each_mp_bvec(bvl, bio_vec, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1); \ + bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) + /* for iterating one bio from start to end */ #define BVEC_ITER_ALL_INIT (struct bvec_iter) \ { \ -- cgit v1.2.3 From 4c975fd700022c90e61a46326e3444e08317876e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:43 -0700 Subject: net: hold instance lock during NETDEV_REGISTER/UP Callers of inetdev_init can come from several places with inconsistent expectation about netdev instance lock. Grab instance lock during REGISTER (plus UP). Also solve the inconsistency with UNREGISTER where it was locked only during move netns path. WARNING: CPU: 10 PID: 1479 at ./include/net/netdev_lock.h:54 __netdev_update_features+0x65f/0xca0 __warn+0x81/0x180 __netdev_update_features+0x65f/0xca0 report_bug+0x156/0x180 handle_bug+0x4f/0x90 exc_invalid_op+0x13/0x60 asm_exc_invalid_op+0x16/0x20 __netdev_update_features+0x65f/0xca0 netif_disable_lro+0x30/0x1d0 inetdev_init+0x12f/0x1f0 inetdev_event+0x48b/0x870 notifier_call_chain+0x38/0xf0 register_netdevice+0x741/0x8b0 register_netdev+0x1f/0x40 mlx5e_probe+0x4e3/0x8e0 [mlx5_core] auxiliary_bus_probe+0x3f/0x90 really_probe+0xc3/0x3a0 __driver_probe_device+0x80/0x150 driver_probe_device+0x1f/0x90 __device_attach_driver+0x7d/0x100 bus_for_each_drv+0x80/0xd0 __device_attach+0xb4/0x1c0 bus_probe_device+0x91/0xa0 device_add+0x657/0x870 Reviewed-by: Jakub Kicinski Reported-by: Cosmin Ratiu Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations") Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-3-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fa79145518d1..cf3b6445817b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4192,7 +4192,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, int netif_set_alias(struct net_device *dev, const char *alias, size_t len); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); -int netif_change_net_namespace(struct net_device *dev, struct net *net, +int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex, struct netlink_ext_ack *extack); int dev_change_net_namespace(struct net_device *dev, struct net *net, -- cgit v1.2.3 From 8965c160b8f7333df895321c8aa6bad4a7175f2b Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:44 -0700 Subject: net: use netif_disable_lro in ipv6_add_dev ipv6_add_dev might call dev_disable_lro which unconditionally grabs instance lock, so it will deadlock during NETDEV_REGISTER. 
Switch to netif_disable_lro. Make sure all callers hold the instance lock as well. Cc: Cosmin Ratiu Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations") Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-4-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/net/ip.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 8a48ade24620..47ed6d23853d 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -667,14 +667,6 @@ static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, memcpy(buf, &naddr, sizeof(naddr)); } -#if IS_MODULE(CONFIG_IPV6) -#define EXPORT_IPV6_MOD(X) EXPORT_SYMBOL(X) -#define EXPORT_IPV6_MOD_GPL(X) EXPORT_SYMBOL_GPL(X) -#else -#define EXPORT_IPV6_MOD(X) -#define EXPORT_IPV6_MOD_GPL(X) -#endif - #if IS_ENABLED(CONFIG_IPV6) #include #endif @@ -694,6 +686,14 @@ static __inline__ void inet_reset_saddr(struct sock *sk) #endif +#if IS_MODULE(CONFIG_IPV6) +#define EXPORT_IPV6_MOD(X) EXPORT_SYMBOL(X) +#define EXPORT_IPV6_MOD_GPL(X) EXPORT_SYMBOL_GPL(X) +#else +#define EXPORT_IPV6_MOD(X) +#define EXPORT_IPV6_MOD_GPL(X) +#endif + static inline unsigned int ipv4_addr_hash(__be32 ip) { return (__force unsigned int) ip; -- cgit v1.2.3 From 1901066aab7654f4a225ac29354a564d891d0c1a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:46 -0700 Subject: netdevsim: add dummy device notifiers In order to exercise and verify notifiers' locking assumptions, register dummy notifiers (via register_netdevice_notifier_dev_net). Share notifier event handler that enforces the assumptions with lock_debug.c (rename and export rtnl_net_debug_event as netdev_debug_event). Add ops lock asserts to netdev_debug_event. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-6-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/net/netdev_lock.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index 1c0c9a94cc22..c316b551df8d 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -98,4 +98,7 @@ static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, &qdisc_xmit_lock_key); \ } +int netdev_debug_event(struct notifier_block *nb, unsigned long event, + void *ptr); + #endif -- cgit v1.2.3 From ec304b70d46bd2ed66541c5b57b63276529e05b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:34:04 -0700 Subject: net: move mp dev config validation to __net_mp_open_rxq() devmem code performs a number of safety checks to avoid having to reimplement all of them in the drivers. Move those to __net_mp_open_rxq() and reuse that function for binding to make sure that io_uring ZC also benefits from them. While at it rename the queue ID variable to rxq_idx in __net_mp_open_rxq(), we touch most of the relevant lines. The XArray insertion is reordered after the netdev_rx_queue_restart() call, otherwise we'd need to duplicate the queue index check or risk inserting an invalid pointer. The XArray allocation failures should be extremely rare. 
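A rough sketch of the shared open path described above (assumed shape only, not the actual implementation; the signature is from the header change below, the body just illustrates the ordering: validate first, restart the queue, and let callers publish their binding only on success):

	int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
			      const struct pp_memory_provider_params *p,
			      struct netlink_ext_ack *extack)
	{
		struct netdev_rx_queue *rxq;
		int err;

		if (rxq_idx >= dev->real_num_rx_queues) {
			NL_SET_ERR_MSG(extack, "rx queue index out of range");
			return -EINVAL;
		}

		rxq = __netif_get_rx_queue(dev, rxq_idx);
		if (rxq->mp_params.mp_ops)	/* a provider is already bound */
			return -EEXIST;

		rxq->mp_params = *p;
		err = netdev_rx_queue_restart(dev, rxq_idx);
		if (err)
			memset(&rxq->mp_params, 0, sizeof(rxq->mp_params));
		/* callers (devmem XArray insert, io_uring) record the binding
		 * only when this returns 0, so a failed restart leaves no stale entry */
		return err;
	}
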
Reviewed-by: Mina Almasry Acked-by: Stanislav Fomichev Fixes: 6e18ed929d3b ("net: add helpers for setting a memory provider on an rx queue") Link: https://patch.msgid.link/20250403013405.2827250-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index b3e665897767..ada4f968960a 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -6,6 +6,7 @@ #include struct netdev_rx_queue; +struct netlink_ext_ack; struct sk_buff; struct memory_provider_ops { @@ -24,8 +25,13 @@ void net_mp_niov_clear_page_pool(struct net_iov *niov); int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *p); +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack); void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *old_p); +void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *old_p); /** * net_mp_netmem_place_in_cache() - give a netmem to a page pool -- cgit v1.2.3 From 825dfab23bca520629a9e5a21ba5b03aaccc75f2 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:28:55 +0100 Subject: irqdomain: Rename irq_set_default_host() to irq_set_default_domain() Naming interrupt domains host is confusing at best and the irqdomain code uses both domain and host inconsistently. Therefore rename irq_set_default_host() to irq_set_default_domain(). Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-3-jirislaby@kernel.org --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 33ff41eef8f7..4b5c495b5710 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -352,7 +352,7 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, void *host_data); struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); -void irq_set_default_host(struct irq_domain *host); +void irq_set_default_domain(struct irq_domain *domain); struct irq_domain *irq_get_default_host(void); int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node, -- cgit v1.2.3 From 0a27ea384c82e70d16e40adbaebeb3725f7e6342 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:28:56 +0100 Subject: irqdomain: Rename irq_get_default_host() to irq_get_default_domain() Naming interrupt domains host is confusing at best and the irqdomain code uses both domain and host inconsistently. Therefore rename irq_get_default_host() to irq_get_default_domain(). 
Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-4-jirislaby@kernel.org --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 4b5c495b5710..e9ab95fbc5a9 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -353,7 +353,7 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); void irq_set_default_domain(struct irq_domain *domain); -struct irq_domain *irq_get_default_host(void); +struct irq_domain *irq_get_default_domain(void); int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity); -- cgit v1.2.3 From d2705d33885e3a19f727dff2521fb7d5b1fc5cda Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:28:57 +0100 Subject: irqdomain: Stop using 'host' for domain It is confusing to see 'host' and 'domain' to be used as 'domain'. Given this header is all about domains, switch the remaining 'host' uses to 'domain'. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-5-jirislaby@kernel.org --- include/linux/irqdomain.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e9ab95fbc5a9..bb7111105296 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -72,7 +72,7 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, /** * struct irq_domain_ops - Methods for irq_domain objects - * @match: Match an interrupt controller device node to a host, returns + * @match: Match an interrupt controller device node to a domain, returns * 1 on a match * @select: Match an interrupt controller fw specification. It is more generic * than @match as it receives a complete struct irq_fwspec. Therefore, @@ -454,7 +454,7 @@ static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod return IS_ERR(d) ? NULL : d; } -unsigned int irq_create_direct_mapping(struct irq_domain *host); +unsigned int irq_create_direct_mapping(struct irq_domain *domain); #endif static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, @@ -507,7 +507,7 @@ static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fw return IS_ERR(d) ? 
NULL : d; } -void irq_domain_remove(struct irq_domain *host); +void irq_domain_remove(struct irq_domain *domain); int irq_domain_associate(struct irq_domain *domain, unsigned int irq, irq_hw_number_t hwirq); @@ -515,16 +515,16 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count); -unsigned int irq_create_mapping_affinity(struct irq_domain *host, +unsigned int irq_create_mapping_affinity(struct irq_domain *domain, irq_hw_number_t hwirq, const struct irq_affinity_desc *affinity); unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); void irq_dispose_mapping(unsigned int virq); -static inline unsigned int irq_create_mapping(struct irq_domain *host, +static inline unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - return irq_create_mapping_affinity(host, hwirq, NULL); + return irq_create_mapping_affinity(domain, hwirq, NULL); } struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, -- cgit v1.2.3 From 8fa7292fee5c5240402371ea89ab285ec856c916 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 Apr 2025 10:17:26 +0200 Subject: treewide: Switch/rename to timer_delete[_sync]() timer_delete[_sync]() replaces del_timer[_sync](). Convert the whole tree over and remove the historical wrapper inlines. Conversion was done with coccinelle plus manual fixups where necessary. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/timer.h | 36 +----------------------------------- include/net/sctp/sctp.h | 2 +- 2 files changed, 2 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/linux/timer.h b/include/linux/timer.h index e67ecd1cbc97..10596d7c3a34 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -30,7 +30,7 @@ * * @TIMER_IRQSAFE: An irqsafe timer is executed with IRQ disabled and * it's safe to wait for the completion of the running instance from - * IRQ handlers, for example, by calling del_timer_sync(). + * IRQ handlers, for example, by calling timer_delete_sync(). * * Note: The irq disabled callback execution is a special case for * workqueue locking issues. It's not meant for executing random crap @@ -168,40 +168,6 @@ extern int timer_delete(struct timer_list *timer); extern int timer_shutdown_sync(struct timer_list *timer); extern int timer_shutdown(struct timer_list *timer); -/** - * del_timer_sync - Delete a pending timer and wait for a running callback - * @timer: The timer to be deleted - * - * See timer_delete_sync() for detailed explanation. - * - * Do not use in new code. Use timer_delete_sync() instead. - * - * Returns: - * * %0 - The timer was not pending - * * %1 - The timer was pending and deactivated - */ -static inline int del_timer_sync(struct timer_list *timer) -{ - return timer_delete_sync(timer); -} - -/** - * del_timer - Delete a pending timer - * @timer: The timer to be deleted - * - * See timer_delete() for detailed explanation. - * - * Do not use in new code. Use timer_delete() instead. 
- * - * Returns: - * * %0 - The timer was not pending - * * %1 - The timer was pending and deactivated - */ -static inline int del_timer(struct timer_list *timer) -{ - return timer_delete(timer); -} - extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 84e6b9fd5610..d8da764cf6de 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -636,7 +636,7 @@ static inline void sctp_transport_pl_reset(struct sctp_transport *t) } } else { if (t->pl.state != SCTP_PL_DISABLED) { - if (del_timer(&t->probe_timer)) + if (timer_delete(&t->probe_timer)) sctp_transport_put(t); t->pl.state = SCTP_PL_DISABLED; } -- cgit v1.2.3 From 9779489a31d77a7b9cb6f20d2d2caced4e29dbe6 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:10 +0100 Subject: hrtimers: Delete hrtimer_init() hrtimer_init() is now unused. Delete it. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/003722f60c7a2a4f8d4ed24fb741aa313b7e5136.1738746927.git.namcao@linutronix.de --- include/linux/hrtimer.h | 2 -- include/linux/hrtimer_types.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 88e078871158..1adcba3ddd76 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -231,8 +231,6 @@ static inline enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) /* Exported timer functions: */ /* Initialize timers: */ -extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, - enum hrtimer_mode mode); extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_setup_on_stack(struct hrtimer *timer, diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index ad66a3081735..7c5b27daa89d 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -34,7 +34,7 @@ enum hrtimer_restart { * @is_hard: Set if hrtimer will be expired in hard interrupt context * even on RT. * - * The hrtimer structure must be initialized by hrtimer_init() + * The hrtimer structure must be initialized by hrtimer_setup() */ struct hrtimer { struct timerqueue_node node; -- cgit v1.2.3 From 04257da0c99c9d4ff7c5bb93046482e1f7d34938 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:16 +0100 Subject: hrtimers: Make callback function pointer private Make the struct hrtimer::function field private, to prevent users from changing this field in an unsafe way. hrtimer_update_function() should be used if the callback function needs to be changed. 
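For driver authors adapting to the private field, a short hedged sketch of the intended pattern (hypothetical callbacks; hrtimer_setup() supplies the initial callback and hrtimer_update_function() is the accessor for changing it later, since timer->function can no longer be written directly):

	static struct hrtimer mytimer;

	static enum hrtimer_restart example_cb(struct hrtimer *t)
	{
		return HRTIMER_NORESTART;
	}

	static enum hrtimer_restart example_other_cb(struct hrtimer *t)
	{
		return HRTIMER_NORESTART;
	}

	static void example_init_and_retarget(void)
	{
		/* callback supplied at setup time */
		hrtimer_setup(&mytimer, example_cb, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

		/* later changes go through the accessor, not mytimer.function = ... */
		hrtimer_update_function(&mytimer, example_other_cb);
	}
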
Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/7d0e6e0c5c59a64a9bea940051aac05d750bc0c2.1738746927.git.namcao@linutronix.de --- include/linux/hrtimer_types.h | 2 +- include/trace/events/timer.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 7c5b27daa89d..8fbbb6bdf7a1 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -39,7 +39,7 @@ enum hrtimer_restart { struct hrtimer { struct timerqueue_node node; ktime_t _softexpires; - enum hrtimer_restart (*function)(struct hrtimer *); + enum hrtimer_restart (*__private function)(struct hrtimer *); struct hrtimer_clock_base *base; u8 state; u8 is_rel; diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 1ef58a04fc57..f8c906be4cd0 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -235,7 +235,7 @@ TRACE_EVENT(hrtimer_start, TP_fast_assign( __entry->hrtimer = hrtimer; - __entry->function = hrtimer->function; + __entry->function = ACCESS_PRIVATE(hrtimer, function); __entry->expires = hrtimer_get_expires(hrtimer); __entry->softexpires = hrtimer_get_softexpires(hrtimer); __entry->mode = mode; @@ -271,7 +271,7 @@ TRACE_EVENT(hrtimer_expire_entry, TP_fast_assign( __entry->hrtimer = hrtimer; __entry->now = *now; - __entry->function = hrtimer->function; + __entry->function = ACCESS_PRIVATE(hrtimer, function); ), TP_printk("hrtimer=%p function=%ps now=%llu", -- cgit v1.2.3 From 244132c4e5777fe0a4544ef23afba0d9a50e5ec5 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:21 +0100 Subject: tracing/timers: Rename the hrtimer_init event to hrtimer_setup The function hrtimer_init() doesn't exist anymore. It was replaced by hrtimer_setup(). Thus, rename the hrtimer_init trace event to hrtimer_setup to keep it consistent. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/cba84c3d853c5258aa3a262363a6eac08e2c7afc.1738746927.git.namcao@linutronix.de --- include/trace/events/timer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index f8c906be4cd0..1641ae3e6ca0 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -185,12 +185,12 @@ TRACE_EVENT(timer_base_idle, { HRTIMER_MODE_REL_PINNED_HARD, "REL|PINNED|HARD" }) /** - * hrtimer_init - called when the hrtimer is initialized + * hrtimer_setup - called when the hrtimer is initialized * @hrtimer: pointer to struct hrtimer * @clockid: the hrtimers clock * @mode: the hrtimers mode */ -TRACE_EVENT(hrtimer_init, +TRACE_EVENT(hrtimer_setup, TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid, enum hrtimer_mode mode), -- cgit v1.2.3