From 51333bfbf18f730a78bbb85895f9c9e4c994d883 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 18 Jan 2025 22:10:17 +0100 Subject: usb: musb: Constify struct musb_fifo_cfg 'struct musb_fifo_cfg' are not modified in these drivers. Constifying these structures moves some data to a read-only section, so increase overall security. On a x86_64, with allmodconfig, as an example: Before: ====== text data bss dec hex filename 64381 5537 202 70120 111e8 drivers/usb/musb/musb_core.o After: ===== text data bss dec hex filename 64957 4929 202 70088 111c8 drivers/usb/musb/musb_core.o Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/26e6f3e25dc8e785d0034dd7e59918e455563e60.1737234596.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/musb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/usb/musb.h b/include/linux/usb/musb.h index 3963e55e88a3..fbdef950f06c 100644 --- a/include/linux/usb/musb.h +++ b/include/linux/usb/musb.h @@ -61,7 +61,7 @@ struct musb_hdrc_eps_bits { }; struct musb_hdrc_config { - struct musb_fifo_cfg *fifo_cfg; /* board fifo configuration */ + const struct musb_fifo_cfg *fifo_cfg; /* board fifo configuration */ unsigned fifo_cfg_size; /* size of the fifo configuration */ /* MUSB configuration-specific details */ -- cgit v1.2.3 From 470cb490d1b75cf25f3139dcf0226967bcc6e217 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 13 Jan 2025 15:43:18 -0600 Subject: iio: adc: ad7173: move fwnode_irq_get_byname() call site Move the call to fwnode_irq_get_byname() from the driver-specific ad7173_fw_parse_device_config() to the shared ad_sd_init() function. The main reason for this is that we want struct ad_sigma_delta_info to be static const data that describes the actual ADC chip, not the application-specific configuration or any runtime state. Previously, this struct was being used to pass the IRQ number to the shared ad_sd_init() function. Now, this is replaced by a boolean flag that is set at compile time and the ad_sd_init() function handles looking up the IRQ number instead. This also has the added benefit that if any other drivers need to make use of this in the future, they just have to set the flag and the shared code will take care of the rest rather than duplicating the code in each driver. Signed-off-by: David Lechner Link: https://patch.msgid.link/20250113-iio-adc-ad7313-fix-non-const-info-struct-v4-1-b63be3ecac4a@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 417073c52380..f242b285081b 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -46,6 +46,7 @@ struct iio_dev; * modify or drop the sample data, it, may be NULL. * @has_registers: true if the device has writable and readable registers, false * if there is just one read-only sample data shift register. + * @has_named_irqs: Set to true if there is more than one IRQ line. * @addr_shift: Shift of the register address in the communications register. * @read_mask: Mask for the communications register having the read bit set. * @status_ch_mask: Mask for the channel number stored in status register. @@ -53,7 +54,6 @@ struct iio_dev; * be used. 
* @irq_flags: flags for the interrupt used by the triggered buffer * @num_slots: Number of sequencer slots - * @irq_line: IRQ for reading conversions. If 0, spi->irq will be used * @num_resetclks: Number of SPI clk cycles with MOSI=1 to reset the chip. */ struct ad_sigma_delta_info { @@ -64,13 +64,13 @@ struct ad_sigma_delta_info { int (*disable_one)(struct ad_sigma_delta *, unsigned int chan); int (*postprocess_sample)(struct ad_sigma_delta *, unsigned int raw_sample); bool has_registers; + bool has_named_irqs; unsigned int addr_shift; unsigned int read_mask; unsigned int status_ch_mask; unsigned int data_reg; unsigned long irq_flags; unsigned int num_slots; - int irq_line; unsigned int num_resetclks; }; -- cgit v1.2.3 From 42e128c9d5c5b6632c3b182afb7fab0957b6c337 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 22 Jan 2025 01:25:59 +0000 Subject: tty/ldsem: Remove unused ldsem_down_write_trylock ldsem_down_write_trylock() was added in 2013 by commit 4898e640caf0 ("tty: Add timed, writer-prioritized rw semaphore") but hasn't been used. Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/r/20250122012559.441006-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_ldisc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index af01e89074b2..c5cccc3fc1e8 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -39,7 +39,6 @@ do { \ int ldsem_down_read(struct ld_semaphore *sem, long timeout); int ldsem_down_read_trylock(struct ld_semaphore *sem); int ldsem_down_write(struct ld_semaphore *sem, long timeout); -int ldsem_down_write_trylock(struct ld_semaphore *sem); void ldsem_up_read(struct ld_semaphore *sem); void ldsem_up_write(struct ld_semaphore *sem); -- cgit v1.2.3 From a1cd339599a8cff197805c9c71c9cab83cec59c2 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 10 Jan 2025 18:22:05 -0600 Subject: counter: add direction change event Add COUNTER_EVENT_DIRECTION_CHANGE to be used by drivers to emit events when a counter detects a change in direction. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20250110-counter-ti-eqep-add-direction-support-v2-2-c6b6f96d2db9@baylibre.com Signed-off-by: William Breathitt Gray --- include/uapi/linux/counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/counter.h b/include/uapi/linux/counter.h index 008a691c254b..350b45d616bb 100644 --- a/include/uapi/linux/counter.h +++ b/include/uapi/linux/counter.h @@ -65,6 +65,8 @@ enum counter_event_type { COUNTER_EVENT_CHANGE_OF_STATE, /* Count value captured */ COUNTER_EVENT_CAPTURE, + /* Direction change detected */ + COUNTER_EVENT_DIRECTION_CHANGE, }; /** -- cgit v1.2.3 From 958c3a6706863f9f77b21c90d2428473441cd8a1 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 23 Jan 2025 08:44:17 +0000 Subject: efi/cper, cxl: Make definitions and structures global In preparation to add tracepoint support, move protocol error UUID definition to a common location, Also, make struct CXL RAS capability, cxl_cper_sec_prot_err and CPER validation flags global for use across different modules. 
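For illustration only (not part of this commit): a minimal sketch of how a consumer such as a GHES-side handler might decode the agent address once these definitions are shared. The function name and the pr_info() reporting are assumptions; only the field layout and the PROT_ERR_VALID_* flags come from this series.

    #include <linux/printk.h>
    #include <linux/cper.h>
    #include <cxl/event.h>

    static void foo_decode_prot_err_agent(const struct cxl_cper_sec_prot_err *prot_err)
    {
        if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE))
            return;

        if (prot_err->agent_type == RCH_DP) {
            /* an RCH Downstream Port is identified by its RCRB base address */
            pr_info("CXL agent RCRB base: %#llx\n",
                (unsigned long long)prot_err->agent_addr.rcrb_base_addr);
        } else if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS) {
            /* all other agent types carry a PCIe-compatible SBDF number */
            pr_info("CXL agent %04x:%02x:%02x.%x\n",
                prot_err->agent_addr.segment, prot_err->agent_addr.bus,
                prot_err->agent_addr.device, prot_err->agent_addr.function);
        }
    }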
Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Fan Ni Reviewed-by: Gregory Price Reviewed-by: Dan Williams Link: https://patch.msgid.link/20250123084421.127697-3-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- include/cxl/event.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/cper.h | 4 +++ 2 files changed, 84 insertions(+) (limited to 'include') diff --git a/include/cxl/event.h b/include/cxl/event.h index 04edd44bd26f..7a6c14c05215 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -164,6 +164,86 @@ struct cxl_cper_work_data { struct cxl_cper_event_rec rec; }; +#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0) +#define PROT_ERR_VALID_AGENT_ADDRESS BIT_ULL(1) +#define PROT_ERR_VALID_DEVICE_ID BIT_ULL(2) +#define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3) +#define PROT_ERR_VALID_CAPABILITY BIT_ULL(4) +#define PROT_ERR_VALID_DVSEC BIT_ULL(5) +#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6) + +/* + * The layout of the enumeration and the values matches CXL Agent Type + * field in the UEFI 2.10 Section N.2.13, + */ +enum { + RCD, /* Restricted CXL Device */ + RCH_DP, /* Restricted CXL Host Downstream Port */ + DEVICE, /* CXL Device */ + LD, /* CXL Logical Device */ + FMLD, /* CXL Fabric Manager managed Logical Device */ + RP, /* CXL Root Port */ + DSP, /* CXL Downstream Switch Port */ + USP, /* CXL Upstream Switch Port */ +}; + +#pragma pack(1) + +/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */ +struct cxl_cper_sec_prot_err { + u64 valid_bits; + u8 agent_type; + u8 reserved[7]; + + /* + * Except for RCH Downstream Port, all the remaining CXL Agent + * types are uniquely identified by the PCIe compatible SBDF number. + */ + union { + u64 rcrb_base_addr; + struct { + u8 function; + u8 device; + u8 bus; + u16 segment; + u8 reserved_1[3]; + }; + } agent_addr; + + struct { + u16 vendor_id; + u16 device_id; + u16 subsystem_vendor_id; + u16 subsystem_id; + u8 class_code[2]; + u16 slot; + u8 reserved_1[4]; + } device_id; + + struct { + u32 lower_dw; + u32 upper_dw; + } dev_serial_num; + + u8 capability[60]; + u16 dvsec_len; + u16 err_len; + u8 reserved_2[4]; +}; + +#pragma pack() + +/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */ +struct cxl_ras_capability_regs { + u32 uncor_status; + u32 uncor_mask; + u32 uncor_severity; + u32 cor_status; + u32 cor_mask; + u32 cap_control; + u32 header_log[16]; +}; + #ifdef CONFIG_ACPI_APEI_GHES int cxl_cper_register_work(struct work_struct *work); int cxl_cper_unregister_work(struct work_struct *work); diff --git a/include/linux/cper.h b/include/linux/cper.h index 265b0f8fc0b3..5c6d4d5b9975 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -89,6 +89,10 @@ enum { #define CPER_NOTIFY_DMAR \ GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ 0x72, 0x2D, 0xEB, 0x41) +/* CXL Protocol Error Section */ +#define CPER_SEC_CXL_PROT_ERR \ + GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \ + 0x4B, 0x77, 0x10, 0x48) /* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ /* -- cgit v1.2.3 From 61eac5f7f6439e8fe99b5fb29406acb0fd7b83c6 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 23 Jan 2025 08:44:18 +0000 Subject: efi/cper, cxl: Remove cper_cxl.h Move the declaration of cxl_cper_print_prot_err() to include/linux/cper.h to avoid maintaining a separate header file just for this function declaration. 
Remove drivers/firmware/efi/cper_cxl.h as its contents have been reorganized. No functional changes. Signed-off-by: Smita Koralahalli Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Fan Ni Reviewed-by: Gregory Price Reviewed-by: Dan Williams Link: https://patch.msgid.link/20250123084421.127697-4-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- include/linux/cper.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/cper.h b/include/linux/cper.h index 5c6d4d5b9975..0ed60a91eca9 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -605,4 +605,8 @@ void cper_estatus_print(const char *pfx, int cper_estatus_check_header(const struct acpi_hest_generic_status *estatus); int cper_estatus_check(const struct acpi_hest_generic_status *estatus); +struct cxl_cper_sec_prot_err; +void cxl_cper_print_prot_err(const char *pfx, + const struct cxl_cper_sec_prot_err *prot_err); + #endif -- cgit v1.2.3 From 315c2f0b53ba2645062627443a12cea73f3dad9c Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 23 Jan 2025 08:44:19 +0000 Subject: acpi/ghes, cper: Recognize and cache CXL Protocol errors Add support in GHES to detect and process CXL CPER Protocol errors, as defined in UEFI v2.10, section N.2.13. Define struct cxl_cper_prot_err_work_data to cache CXL protocol error information, including RAS capabilities and severity, for further handling. These cached CXL CPER records will later be processed by workqueues within the CXL subsystem. Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Ira Weiny Reviewed-by: Tony Luck Reviewed-by: Gregory Price Reviewed-by: Dan Williams Link: https://patch.msgid.link/20250123084421.127697-5-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- include/cxl/event.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/cxl/event.h b/include/cxl/event.h index 7a6c14c05215..8381a07052d0 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -244,6 +244,12 @@ struct cxl_ras_capability_regs { u32 header_log[16]; }; +struct cxl_cper_prot_err_work_data { + struct cxl_cper_sec_prot_err prot_err; + struct cxl_ras_capability_regs ras_cap; + int severity; +}; + #ifdef CONFIG_ACPI_APEI_GHES int cxl_cper_register_work(struct work_struct *work); int cxl_cper_unregister_work(struct work_struct *work); -- cgit v1.2.3 From c8006fbd0f4fd15fa990786ff9a097b45eef616c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 27 Jan 2025 21:58:59 +0000 Subject: bus: mhi: host: Remove unused functions mhi_device_get() and mhi_queue_dma() haven't been used since 2020's commit 189ff97cca53 ("bus: mhi: core: Add support for data transfer") added them. Remove them. Note that mhi_queue_dma_sync() is used and has been left. Signed-off-by: Dr. 
David Alan Gilbert Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20250127215859.261105-1-linux@treblig.org Signed-off-by: Manivannan Sadhasivam --- include/linux/mhi.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 059dc94d20bb..dd372b0123a6 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -720,12 +720,6 @@ enum mhi_state mhi_get_mhi_state(struct mhi_controller *mhi_cntrl); */ void mhi_soc_reset(struct mhi_controller *mhi_cntrl); -/** - * mhi_device_get - Disable device low power mode - * @mhi_dev: Device associated with the channel - */ -void mhi_device_get(struct mhi_device *mhi_dev); - /** * mhi_device_get_sync - Disable device low power mode. Synchronously * take the controller out of suspended state @@ -776,18 +770,6 @@ int mhi_prepare_for_transfer_autoqueue(struct mhi_device *mhi_dev); */ void mhi_unprepare_from_transfer(struct mhi_device *mhi_dev); -/** - * mhi_queue_dma - Send or receive DMA mapped buffers from client device - * over MHI channel - * @mhi_dev: Device associated with the channels - * @dir: DMA direction for the channel - * @mhi_buf: Buffer for holding the DMA mapped data - * @len: Buffer length - * @mflags: MHI transfer flags used for the transfer - */ -int mhi_queue_dma(struct mhi_device *mhi_dev, enum dma_data_direction dir, - struct mhi_buf *mhi_buf, size_t len, enum mhi_flags mflags); - /** * mhi_queue_buf - Send or receive raw buffers from client device over MHI * channel -- cgit v1.2.3 From dbd2e08ff09fd6bd51215b44474899cc1b7b7a16 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 27 Jan 2025 20:30:22 +0100 Subject: iio: gts-helper: export iio_gts_get_total_gain() Export this function in preparation for the fix in veml6030.c, where the total gain can be used to ease the calculation of the processed value of the IIO_LIGHT channel compared to acquiring the scale in NANO. Suggested-by: Matti Vaittinen Signed-off-by: Javier Carrasco Reviewed-by: Matti Vaittinen Link: https://patch.msgid.link/20250127-veml6030-scale-v3-1-4f32ba03df94@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio-gts-helper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/iio/iio-gts-helper.h b/include/linux/iio/iio-gts-helper.h index e5de7a124bad..66f830ab9b49 100644 --- a/include/linux/iio/iio-gts-helper.h +++ b/include/linux/iio/iio-gts-helper.h @@ -208,5 +208,6 @@ int iio_gts_all_avail_scales(struct iio_gts *gts, const int **vals, int *type, int *length); int iio_gts_avail_scales_for_time(struct iio_gts *gts, int time, const int **vals, int *type, int *length); +int iio_gts_get_total_gain(struct iio_gts *gts, int gain, int time); #endif -- cgit v1.2.3 From 34934d79965518548d33c0513a9572ae936d8c29 Mon Sep 17 00:00:00 2001 From: Guillaume Ranquet Date: Mon, 27 Jan 2025 14:59:32 +0100 Subject: iio: introduce the FAULT event type Add a new event type to describe an hardware failure. 
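For illustration only (not part of this commit): a minimal sketch of how a driver that detects an open-wire condition might report it with the new event type and direction. The function name and the choice of an IIO_VOLTAGE channel are assumptions.

    #include <linux/iio/iio.h>
    #include <linux/iio/events.h>

    static void foo_report_open_wire(struct iio_dev *indio_dev, unsigned int chan,
                     s64 timestamp)
    {
        /* signal a hardware fault: the wire on this input is open */
        iio_push_event(indio_dev,
                   IIO_UNMOD_EVENT_CODE(IIO_VOLTAGE, chan,
                            IIO_EV_TYPE_FAULT,
                            IIO_EV_DIR_FAULT_OPENWIRE),
                   timestamp);
    }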
Reviewed-by: Nuno Sa Signed-off-by: Guillaume Ranquet Reviewed-by: David Lechner Link: https://patch.msgid.link/20250127-ad4111_openwire-v5-1-ef2db05c384f@baylibre.com Signed-off-by: Jonathan Cameron --- include/uapi/linux/iio/types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 12886d4465e4..3eb0821af7a4 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -119,6 +119,7 @@ enum iio_event_type { IIO_EV_TYPE_CHANGE, IIO_EV_TYPE_MAG_REFERENCED, IIO_EV_TYPE_GESTURE, + IIO_EV_TYPE_FAULT, }; enum iio_event_direction { @@ -128,6 +129,7 @@ enum iio_event_direction { IIO_EV_DIR_NONE, IIO_EV_DIR_SINGLETAP, IIO_EV_DIR_DOUBLETAP, + IIO_EV_DIR_FAULT_OPENWIRE, }; #endif /* _UAPI_IIO_TYPES_H_ */ -- cgit v1.2.3 From 31d43141d13aa63587f140884b1f667800ce4e1d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Feb 2025 16:57:09 +0200 Subject: dmaengine: Replace dma_request_slave_channel() by dma_request_chan() Replace dma_request_slave_channel() by dma_request_chan() as suggested since the former is deprecated. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250205145757.889247-2-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 346251bf1026..83cbf7197a76 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1639,8 +1639,8 @@ static inline struct dma_chan { struct dma_chan *chan; - chan = dma_request_slave_channel(dev, name); - if (chan) + chan = dma_request_chan(dev, name); + if (!IS_ERR(chan)) return chan; if (!fn || !fn_param) -- cgit v1.2.3 From 1c83d3dfa0905590408595560629627cba4f9261 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Feb 2025 16:57:10 +0200 Subject: dmaengine: Use dma_request_channel() instead of __dma_request_channel() Reduce use of internal __dma_request_channel() function in public dmaengine.h. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250205145757.889247-3-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 83cbf7197a76..a360e0330436 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1646,7 +1646,7 @@ static inline struct dma_chan if (!fn || !fn_param) return NULL; - return __dma_request_channel(&mask, fn, fn_param, NULL); + return dma_request_channel(mask, fn, fn_param); } static inline char * -- cgit v1.2.3 From 4fe7fd17fe66a5e243c1de7ff32c29fac7039b8d Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 7 Feb 2025 14:09:05 -0600 Subject: iio: buffer-dmaengine: split requesting DMA channel from allocating buffer Refactor the IIO dmaengine buffer code to split requesting the DMA channel from allocating the buffer. We want to be able to add a new function where the IIO device driver manages the DMA channel, so these two actions need to be separate. To do this, calling dma_request_chan() is moved from iio_dmaengine_buffer_alloc() to iio_dmaengine_buffer_setup_ext(). A new __iio_dmaengine_buffer_setup_ext() helper function is added to simplify error unwinding and will also be used by a new function in a later patch. 
iio_dmaengine_buffer_free() now only frees the buffer and does not release the DMA channel. A new iio_dmaengine_buffer_teardown() function is added to unwind everything done in iio_dmaengine_buffer_setup_ext(). This keeps things more symmetrical with obvious pairs alloc/free and setup/teardown. Calling dma_get_slave_caps() in iio_dmaengine_buffer_alloc() is moved so that we can avoid any gotos for error unwinding. Reviewed-by: Nuno Sa Signed-off-by: David Lechner Link: https://patch.msgid.link/20250207-dlech-mainline-spi-engine-offload-2-v8-8-e48a489be48c@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer-dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iio/buffer-dmaengine.h b/include/linux/iio/buffer-dmaengine.h index 81d9a19aeb91..72a2e3fd8a5b 100644 --- a/include/linux/iio/buffer-dmaengine.h +++ b/include/linux/iio/buffer-dmaengine.h @@ -12,7 +12,7 @@ struct iio_dev; struct device; -void iio_dmaengine_buffer_free(struct iio_buffer *buffer); +void iio_dmaengine_buffer_teardown(struct iio_buffer *buffer); struct iio_buffer *iio_dmaengine_buffer_setup_ext(struct device *dev, struct iio_dev *indio_dev, const char *channel, -- cgit v1.2.3 From 79f24971b4ffbdba9733365e298982c45338e6b1 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 7 Feb 2025 14:09:06 -0600 Subject: iio: buffer-dmaengine: add devm_iio_dmaengine_buffer_setup_with_handle() Add a new devm_iio_dmaengine_buffer_setup_with_handle() function to handle cases where the DMA channel is managed by the caller rather than being requested and released by the iio_dmaengine module. Reviewed-by: Nuno Sa Signed-off-by: David Lechner Link: https://patch.msgid.link/20250207-dlech-mainline-spi-engine-offload-2-v8-9-e48a489be48c@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer-dmaengine.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/iio/buffer-dmaengine.h b/include/linux/iio/buffer-dmaengine.h index 72a2e3fd8a5b..37f27545f69f 100644 --- a/include/linux/iio/buffer-dmaengine.h +++ b/include/linux/iio/buffer-dmaengine.h @@ -11,6 +11,7 @@ struct iio_dev; struct device; +struct dma_chan; void iio_dmaengine_buffer_teardown(struct iio_buffer *buffer); struct iio_buffer *iio_dmaengine_buffer_setup_ext(struct device *dev, @@ -26,6 +27,10 @@ int devm_iio_dmaengine_buffer_setup_ext(struct device *dev, struct iio_dev *indio_dev, const char *channel, enum iio_buffer_direction dir); +int devm_iio_dmaengine_buffer_setup_with_handle(struct device *dev, + struct iio_dev *indio_dev, + struct dma_chan *chan, + enum iio_buffer_direction dir); #define devm_iio_dmaengine_buffer_setup(dev, indio_dev, channel) \ devm_iio_dmaengine_buffer_setup_ext(dev, indio_dev, channel, \ -- cgit v1.2.3 From b7c1e069f54668461856f03696812ab7176c400d Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 7 Feb 2025 14:09:10 -0600 Subject: dt-bindings: iio: adc: adi,ad4695: add SPI offload properties Add a pwms property to the adi,ad4695 binding to specify an optional PWM output connected to the CNV pin on the ADC. Also add #trigger-source-cells property to allow the BUSY output to be used as a SPI offload trigger source to indicate when a sample is ready to be read. Macros are added to adi,ad4695.h for the cell values to help with readability since they are arbitrary values. 
Reviewed-by: Rob Herring (Arm) Signed-off-by: David Lechner Link: https://patch.msgid.link/20250207-dlech-mainline-spi-engine-offload-2-v8-13-e48a489be48c@baylibre.com Signed-off-by: Jonathan Cameron --- include/dt-bindings/iio/adc/adi,ad4695.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/iio/adc/adi,ad4695.h b/include/dt-bindings/iio/adc/adi,ad4695.h index 9fbef542bf67..fea4525d2710 100644 --- a/include/dt-bindings/iio/adc/adi,ad4695.h +++ b/include/dt-bindings/iio/adc/adi,ad4695.h @@ -6,4 +6,11 @@ #define AD4695_COMMON_MODE_REFGND 0xFF #define AD4695_COMMON_MODE_COM 0xFE +#define AD4695_TRIGGER_EVENT_BUSY 0 +#define AD4695_TRIGGER_EVENT_ALERT 1 + +#define AD4695_TRIGGER_PIN_GP0 0 +#define AD4695_TRIGGER_PIN_GP2 2 +#define AD4695_TRIGGER_PIN_GP3 3 + #endif /* _DT_BINDINGS_ADI_AD4695_H */ -- cgit v1.2.3 From fbc54ae4f8d7cef3b11f70945bf8df8f952814b6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 13 Feb 2025 13:45:05 +0200 Subject: i2c: Unexport i2c_of_match_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit i2c_of_match_device() is not used anymore outside of I²C framework, unexport it. Signed-off-by: Andy Shevchenko Reviewed-by: Vladimir Zapolskiy Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 2b2af24d2a43..997e80649889 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1029,10 +1029,6 @@ static inline struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node return i2c_get_adapter_by_fwnode(of_fwnode_handle(node)); } -const struct of_device_id -*i2c_of_match_device(const struct of_device_id *matches, - struct i2c_client *client); - int of_i2c_get_board_info(struct device *dev, struct device_node *node, struct i2c_board_info *info); @@ -1053,13 +1049,6 @@ static inline struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node return NULL; } -static inline const struct of_device_id -*i2c_of_match_device(const struct of_device_id *matches, - struct i2c_client *client) -{ - return NULL; -} - static inline int of_i2c_get_board_info(struct device *dev, struct device_node *node, struct i2c_board_info *info) -- cgit v1.2.3 From 2c0ae8ef1e5edfd0e42727fba4617694f3aac2eb Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Fri, 7 Feb 2025 12:28:38 +0530 Subject: soundwire: amd: add support for ACP7.0 & ACP7.1 platforms Add SoundWire support for ACP7.0 and ACP7.1 platforms. 
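For illustration only (not part of this commit): a sketch of the kind of revision check a probe path might perform with the new IDs; the helper name is an assumption.

    #include <linux/types.h>
    #include <linux/soundwire/sdw_amd.h>

    static bool foo_acp_rev_has_sdw(u32 rev)
    {
        /* ACP revisions that expose SoundWire managers */
        switch (rev) {
        case ACP63_PCI_REV_ID:
        case ACP70_PCI_REV_ID:
        case ACP71_PCI_REV_ID:
            return true;
        default:
            return false;
        }
    }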
Signed-off-by: Vijendar Mukunda Link: https://lore.kernel.org/r/20250207065841.4718-4-Vijendar.Mukunda@amd.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_amd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index 799f8578137b..6b839987f14c 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -28,6 +28,8 @@ #define ACP_SDW1 1 #define AMD_SDW_MAX_MANAGER_COUNT 2 #define ACP63_PCI_REV_ID 0x63 +#define ACP70_PCI_REV_ID 0x70 +#define ACP71_PCI_REV_ID 0x71 struct acp_sdw_pdata { u16 instance; -- cgit v1.2.3 From dcba69711fff8af4370cb4c00460db0bf47c7093 Mon Sep 17 00:00:00 2001 From: Jameson Thies Date: Tue, 4 Feb 2025 02:45:58 +0000 Subject: platform/chrome: add PD_EVENT_INIT bit definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update cros_ec_commands.h to include a definition for PD_EVENT_INIT. On platforms supporting UCSI, this host event type is sent when the PPM initializes. Signed-off-by: Jameson Thies Reviewed-by: Benson Leung Acked-by: Tzung-Bi Shih Reviewed-by: Łukasz Bartosik Link: https://lore.kernel.org/r/20250204024600.4138776-2-jthies@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/cros_ec_commands.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index ecf290a0c98f..1f4e4f2b89bb 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -5046,6 +5046,7 @@ struct ec_response_pd_status { #define PD_EVENT_DATA_SWAP BIT(3) #define PD_EVENT_TYPEC BIT(4) #define PD_EVENT_PPM BIT(5) +#define PD_EVENT_INIT BIT(6) struct ec_response_host_event_status { uint32_t status; /* PD MCU host event status */ -- cgit v1.2.3 From c08b0f2c317223fa702616dfa77f10a92b7b34b2 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 29 Jan 2025 08:25:50 -0800 Subject: Revert "tty/serial: Add kgdb_nmi driver" This reverts commit 0c57dfcc6c1d037243c2f8fbf62eab3633326ec0. The functionality was supoosed to be used by a later patch in the series that never landed [1]. Drop it. NOTE: part of functionality was already reverted by commit 39d0be87438a ("serial: kgdb_nmi: Remove unused knock code"). Also note that this revert is not a clean revert given code changes that have happened in the meantime. It's obvious that nobody is using this code since the two exposed functions (kgdb_register_nmi_console() and kgdb_unregister_nmi_console()) are both no-ops if "arch_kgdb_ops.enable_nmi" is not defined. No architectures define it. 
[1] https://lore.kernel.org/lkml/1348522080-32629-9-git-send-email-anton.vorontsov@linaro.org/ Signed-off-by: Douglas Anderson Acked-by: Andy Shevchenko Link: URL [1] Link: https://lore.kernel.org/r/20250129082535.1.Ia095eac1ae357f87d23e7af2206741f5d40788f1@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/kgdb.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 51ef131e66b7..14739952698b 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -306,14 +306,6 @@ extern const struct kgdb_arch arch_kgdb_ops; extern unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs); -#ifdef CONFIG_SERIAL_KGDB_NMI -extern int kgdb_register_nmi_console(void); -extern int kgdb_unregister_nmi_console(void); -#else -static inline int kgdb_register_nmi_console(void) { return 0; } -static inline int kgdb_unregister_nmi_console(void) { return 0; } -#endif - extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops); extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops); extern struct kgdb_io *dbg_io_ops; -- cgit v1.2.3 From a029a219385cfdf9c43fb1ef9eb49d173773388f Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 29 Jan 2025 08:25:52 -0800 Subject: Revert "kernel/debug: Mask KGDB NMI upon entry" This reverts commit 5a14fead07bcf4e0acc877a8d9e1d1f40a441153. No architectures ever implemented `enable_nmi` since the later patches in the series adding it never landed. It's been a long time. Drop it. NOTE: this is not a clean revert due to changes in the file in the meantime. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20250129082535.3.I2254953cd852f31f354456689d68b2d910de3fbe@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/kgdb.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 14739952698b..5eebbe7a3545 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -257,7 +257,6 @@ extern void kgdb_arch_late(void); * hardware breakpoints. * @correct_hw_break: Allow an architecture to specify how to correct the * hardware debug registers. - * @enable_nmi: Manage NMI-triggered entry to KGDB */ struct kgdb_arch { unsigned char gdb_bpt_instr[BREAK_INSTR_SIZE]; @@ -270,8 +269,6 @@ struct kgdb_arch { void (*disable_hw_break)(struct pt_regs *regs); void (*remove_all_hw_break)(void); void (*correct_hw_break)(void); - - void (*enable_nmi)(bool on); }; /** -- cgit v1.2.3 From 633488947ef66b194377411322dc9e12aab79b65 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 13 Feb 2025 15:50:22 +0100 Subject: kernfs: Use RCU to access kernfs_node::parent. kernfs_rename_lock is used to obtain stable kernfs_node::{name|parent} pointer. This is a preparation to access kernfs_node::parent under RCU and ensure that the pointer remains stable under the RCU lifetime guarantees. For a complete path, as it is done in kernfs_path_from_node(), the kernfs_rename_lock is still required in order to obtain a stable parent relationship while computing the relevant node depth. This must not change while the nodes are inspected in order to build the path. If the kernfs user never moves the nodes (changes the parent) then the kernfs_rename_lock is not required and the RCU guarantees are sufficient. This "restriction" can be set with KERNFS_ROOT_INVARIANT_PARENT. Otherwise the lock is required. 
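For illustration only (not part of this commit): a minimal sketch of a kernfs user whose nodes never change parent opting in at root creation; foo_kf_syscall_ops and the use of KERNFS_ROOT_CREATE_DEACTIVATED are assumptions.

    #include <linux/kernfs.h>

    static struct kernfs_syscall_ops foo_kf_syscall_ops;

    static struct kernfs_root *foo_create_root(void)
    {
        /*
         * Nodes in this hierarchy are never moved to a different parent,
         * so path lookups may rely on RCU alone. Callers check the result
         * with IS_ERR() as usual.
         */
        return kernfs_create_root(&foo_kf_syscall_ops,
                      KERNFS_ROOT_CREATE_DEACTIVATED |
                      KERNFS_ROOT_INVARIANT_PARENT, NULL);
    }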
Rename kernfs_node::parent to kernfs_node::__parent to denote the RCU access and use RCU accessor while accessing the node. Make cgroup use KERNFS_ROOT_INVARIANT_PARENT since the parent here can not change. Acked-by: Tejun Heo Cc: Yonghong Song Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20250213145023.2820193-6-bigeasy@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 87c79d076d6d..5dda9a268e44 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -147,6 +147,11 @@ enum kernfs_root_flag { * Support user xattrs to be written to nodes rooted at this root. */ KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008, + + /* + * Renames must not change the parent node. + */ + KERNFS_ROOT_INVARIANT_PARENT = 0x0010, }; /* type-specific structures for kernfs_node union members */ @@ -199,8 +204,8 @@ struct kernfs_node { * never moved to a different parent, it is safe to access the * parent directly. */ - struct kernfs_node *parent; const char *name; + struct kernfs_node __rcu *__parent; struct rb_node rb; @@ -416,6 +421,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); void kernfs_destroy_root(struct kernfs_root *root); +unsigned int kernfs_root_flags(struct kernfs_node *kn); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, @@ -514,6 +520,8 @@ kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, { return ERR_PTR(-ENOSYS); } static inline void kernfs_destroy_root(struct kernfs_root *root) { } +static inline unsigned int kernfs_root_flags(struct kernfs_node *kn) +{ return 0; } static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, -- cgit v1.2.3 From 741c10b096bc4dd79cd9f215b6ef173bb953e75c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 13 Feb 2025 15:50:23 +0100 Subject: kernfs: Use RCU to access kernfs_node::name. Using RCU lifetime rules to access kernfs_node::name can avoid the trouble with kernfs_rename_lock in kernfs_name() and kernfs_path_from_node() if the fs was created with KERNFS_ROOT_INVARIANT_PARENT. This is usefull as it allows to implement kernfs_path_from_node() only with RCU protection and avoiding kernfs_rename_lock. The lock is only required if the __parent node can be changed and the function requires an unchanged hierarchy while it iterates from the node to its parent. The change is needed to allow the lookup of the node's path (kernfs_path_from_node()) from context which runs always with disabled preemption and or interrutps even on PREEMPT_RT. The problem is that kernfs_rename_lock becomes a sleeping lock on PREEMPT_RT. I went through all ::name users and added the required access for the lookup with a few extensions: - rdtgroup_pseudo_lock_create() drops all locks and then uses the name later on. resctrl supports rename with different parents. Here I made a temporal copy of the name while it is used outside of the lock. - kernfs_rename_ns() accepts NULL as new_parent. This simplifies sysfs_move_dir_ns() where it can set NULL in order to reuse the current name. - kernfs_rename_ns() is only using kernfs_rename_lock if the parents are different. 
All users use either kernfs_rwsem (for stable path view) or just RCU for the lookup. The ::name uses always RCU free. Use RCU lifetime guarantees to access kernfs_node::name. Suggested-by: Tejun Heo Acked-by: Tejun Heo Reported-by: syzbot+6ea37e2e6ffccf41a7e6@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/67251dc6.050a0220.529b6.015e.GAE@google.com/ Reported-by: Hillf Danton Closes: https://lore.kernel.org/20241102001224.2789-1-hdanton@sina.com Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20250213145023.2820193-7-bigeasy@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5dda9a268e44..b5a5f32fdfd1 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -204,8 +204,8 @@ struct kernfs_node { * never moved to a different parent, it is safe to access the * parent directly. */ - const char *name; struct kernfs_node __rcu *__parent; + const char __rcu *name; struct rb_node rb; @@ -400,7 +400,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); -int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, +int kernfs_path_from_node(struct kernfs_node *kn_to, struct kernfs_node *kn_from, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); void pr_cont_kernfs_path(struct kernfs_node *kn); -- cgit v1.2.3 From 4018ab42636cbea1bcf4fd69afc31d51cd905ba0 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Fri, 14 Feb 2025 15:19:47 +0200 Subject: iio: backend: add API for interface get Add backend support for obtaining the interface type used. Reviewed-by: Nuno Sa Reviewed-by: David Lechner Signed-off-by: Antoniu Miclaus Link: https://patch.msgid.link/20250214131955.31973-2-antoniu.miclaus@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 10be00f3b120..a0ea6c29d7ba 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -70,6 +70,12 @@ enum iio_backend_sample_trigger { IIO_BACKEND_SAMPLE_TRIGGER_MAX }; +enum iio_backend_interface_type { + IIO_BACKEND_INTERFACE_SERIAL_LVDS, + IIO_BACKEND_INTERFACE_SERIAL_CMOS, + IIO_BACKEND_INTERFACE_MAX +}; + /** * struct iio_backend_ops - operations structure for an iio_backend * @enable: Enable backend. @@ -88,6 +94,7 @@ enum iio_backend_sample_trigger { * @extend_chan_spec: Extend an IIO channel. * @ext_info_set: Extended info setter. * @ext_info_get: Extended info getter. + * @interface_type_get: Interface type. * @read_raw: Read a channel attribute from a backend device * @debugfs_print_chan_status: Print channel status into a buffer. * @debugfs_reg_access: Read or write register value of backend. 
@@ -128,6 +135,8 @@ struct iio_backend_ops { const char *buf, size_t len); int (*ext_info_get)(struct iio_backend *back, uintptr_t private, const struct iio_chan_spec *chan, char *buf); + int (*interface_type_get)(struct iio_backend *back, + enum iio_backend_interface_type *type); int (*read_raw)(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); @@ -186,6 +195,8 @@ ssize_t iio_backend_ext_info_set(struct iio_dev *indio_dev, uintptr_t private, const char *buf, size_t len); ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private, const struct iio_chan_spec *chan, char *buf); +int iio_backend_interface_type_get(struct iio_backend *back, + enum iio_backend_interface_type *type); int iio_backend_read_raw(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); -- cgit v1.2.3 From fc3fdb835eebb2ba88c6fdbf2c8b9eae1ceab106 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Fri, 14 Feb 2025 15:19:48 +0200 Subject: iio: backend: add support for data size set Add backend support for setting the data size used. This setting can be adjusted within the IP cores interfacing devices. Reviewed-by: Nuno Sa Reviewed-by: David Lechner Signed-off-by: Antoniu Miclaus Link: https://patch.msgid.link/20250214131955.31973-3-antoniu.miclaus@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index a0ea6c29d7ba..9ae861a21472 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -95,6 +95,7 @@ enum iio_backend_interface_type { * @ext_info_set: Extended info setter. * @ext_info_get: Extended info getter. * @interface_type_get: Interface type. + * @data_size_set: Data size. * @read_raw: Read a channel attribute from a backend device * @debugfs_print_chan_status: Print channel status into a buffer. * @debugfs_reg_access: Read or write register value of backend. @@ -137,6 +138,7 @@ struct iio_backend_ops { const struct iio_chan_spec *chan, char *buf); int (*interface_type_get)(struct iio_backend *back, enum iio_backend_interface_type *type); + int (*data_size_set)(struct iio_backend *back, unsigned int size); int (*read_raw)(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); @@ -197,6 +199,7 @@ ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private, const struct iio_chan_spec *chan, char *buf); int iio_backend_interface_type_get(struct iio_backend *back, enum iio_backend_interface_type *type); +int iio_backend_data_size_set(struct iio_backend *back, unsigned int size); int iio_backend_read_raw(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); -- cgit v1.2.3 From 22894e0be908791e3df011cdfac02589c2f340bd Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Fri, 14 Feb 2025 15:19:49 +0200 Subject: iio: backend: add API for oversampling Add backend support for setting oversampling ratio. 
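For illustration only (not part of this commit): a minimal sketch of a backend wiring up the new op; the axi_adc_* names are assumptions and the actual register programming is elided.

    #include <linux/iio/backend.h>

    static int axi_adc_oversampling_ratio_set(struct iio_backend *back,
                          unsigned int ratio)
    {
        /* program the oversampling/decimation ratio into the IP core */
        return 0;
    }

    static const struct iio_backend_ops axi_adc_ops = {
        .oversampling_ratio_set = axi_adc_oversampling_ratio_set,
    };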
Reviewed-by: David Lechner Signed-off-by: Antoniu Miclaus Link: https://patch.msgid.link/20250214131955.31973-4-antoniu.miclaus@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 9ae861a21472..e45b7dfbec35 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -96,6 +96,7 @@ enum iio_backend_interface_type { * @ext_info_get: Extended info getter. * @interface_type_get: Interface type. * @data_size_set: Data size. + * @oversampling_ratio_set: Set Oversampling ratio. * @read_raw: Read a channel attribute from a backend device * @debugfs_print_chan_status: Print channel status into a buffer. * @debugfs_reg_access: Read or write register value of backend. @@ -139,6 +140,8 @@ struct iio_backend_ops { int (*interface_type_get)(struct iio_backend *back, enum iio_backend_interface_type *type); int (*data_size_set)(struct iio_backend *back, unsigned int size); + int (*oversampling_ratio_set)(struct iio_backend *back, + unsigned int ratio); int (*read_raw)(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); @@ -200,6 +203,8 @@ ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private, int iio_backend_interface_type_get(struct iio_backend *back, enum iio_backend_interface_type *type); int iio_backend_data_size_set(struct iio_backend *back, unsigned int size); +int iio_backend_oversampling_ratio_set(struct iio_backend *back, + unsigned int ratio); int iio_backend_read_raw(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); -- cgit v1.2.3 From d795e38df4b7ebac1072bbf7d8a5500c1ea83332 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 9 Feb 2025 18:05:58 +0000 Subject: iio: core: Rework claim and release of direct mode to work with sparse. Initial thought was to do something similar to __cond_lock() do_iio_device_claim_direct_mode(iio_dev) ? : ({ __acquire(iio_dev); 0; }) + Appropriate static inline iio_device_release_direct_mode() However with that, sparse generates false positives. E.g. drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c:1811:17: warning: context imbalance in 'st_lsm6dsx_read_raw' - unexpected unlock So instead, this patch rethinks the return type and makes it more 'conditional lock like' (which is part of what is going on under the hood anyway) and return a boolean - true for successfully acquired, false for did not acquire. To allow a migration path given the rework is now non trivial, take a leaf out of the naming of the conditional guard we currently have for IIO device direct mode and drop the _mode postfix from the new functions giving iio_device_claim_direct() and iio_device_release_direct() Whilst the kernel supports __cond_acquires() upstream sparse does not yet do so. Hence rely on sparse expanding a static inline wrapper to explicitly see whether __acquire() is called. Note that even with the solution here, sparse sometimes gives false positives. However in the few cases seen they were complex code structures that benefited from simplification anyway. 
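For illustration only (not part of this commit): a minimal sketch of the driver-side pattern after conversion; foo_read_channel() is a stand-in for the actual device access.

    #include <linux/iio/iio.h>

    /* stand-in for the actual device access */
    static int foo_read_channel(struct iio_dev *indio_dev, int channel, int *val);

    static int foo_read_raw(struct iio_dev *indio_dev,
                struct iio_chan_spec const *chan,
                int *val, int *val2, long mask)
    {
        int ret;

        /* true means direct mode was claimed and must be released */
        if (!iio_device_claim_direct(indio_dev))
            return -EBUSY;

        ret = foo_read_channel(indio_dev, chan->channel, val);

        iio_device_release_direct(indio_dev);

        return ret ?: IIO_VAL_INT;
    }

Since the boolean helper discards the underlying error code, the caller picks the errno to return on failure (-EBUSY in this sketch) itself.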
Reviewed-by: David Lechner Reviewed-by: Nuno Sa Link: https://patch.msgid.link/20250209180624.701140-2-jic23@kernel.org Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 56161e02f002..5ed03e36178f 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include /* IIO TODO LIST */ @@ -662,6 +663,31 @@ int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp); int iio_device_claim_direct_mode(struct iio_dev *indio_dev); void iio_device_release_direct_mode(struct iio_dev *indio_dev); +/* + * Helper functions that allow claim and release of direct mode + * in a fashion that doesn't generate many false positives from sparse. + * Note this must remain static inline in the header so that sparse + * can see the __acquire() marking. Revisit when sparse supports + * __cond_acquires() + */ +static inline bool iio_device_claim_direct(struct iio_dev *indio_dev) +{ + int ret = iio_device_claim_direct_mode(indio_dev); + + if (ret) + return false; + + __acquire(iio_dev); + + return true; +} + +static inline void iio_device_release_direct(struct iio_dev *indio_dev) +{ + iio_device_release_direct_mode(indio_dev); + __release(indio_dev); +} + /* * This autocleanup logic is normally used via * iio_device_claim_direct_scoped(). -- cgit v1.2.3 From 4c571885898c5c98934d086f2ab11b5e27e4f41f Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 9 Feb 2025 18:06:24 +0000 Subject: iio: Drop iio_device_claim_direct_scoped() and related infrastructure Scoped conditional automated cleanup turned out to be harder to work with than expected. Despite several attempts to find a better solution non have surfaced. As such rip it out of the IIO code. Reviewed-by: David Lechner Reviewed-by: Nuno Sa Link: https://patch.msgid.link/20250209180624.701140-28-jic23@kernel.org Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'include') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 5ed03e36178f..07a0e8132e88 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -9,7 +9,6 @@ #include #include -#include #include #include #include @@ -688,32 +687,6 @@ static inline void iio_device_release_direct(struct iio_dev *indio_dev) __release(indio_dev); } -/* - * This autocleanup logic is normally used via - * iio_device_claim_direct_scoped(). - */ -DEFINE_GUARD(iio_claim_direct, struct iio_dev *, iio_device_claim_direct_mode(_T), - iio_device_release_direct_mode(_T)) - -DEFINE_GUARD_COND(iio_claim_direct, _try, ({ - struct iio_dev *dev; - int d = iio_device_claim_direct_mode(_T); - - if (d < 0) - dev = NULL; - else - dev = _T; - dev; - })) - -/** - * iio_device_claim_direct_scoped() - Scoped call to iio_device_claim_direct. - * @fail: What to do on failure to claim device. 
- * @iio_dev: Pointer to the IIO devices structure - */ -#define iio_device_claim_direct_scoped(fail, iio_dev) \ - scoped_cond_guard(iio_claim_direct_try, fail, iio_dev) - int iio_device_claim_buffer_mode(struct iio_dev *indio_dev); void iio_device_release_buffer_mode(struct iio_dev *indio_dev); -- cgit v1.2.3 From 8fd74a31eaf3def0d57264ada57fc981902aeadf Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sat, 8 Feb 2025 22:15:26 +0800 Subject: driver core: class: Remove needless return in void API class_remove_file() Remove return since both class_remove_file() and class_remove_file_ns() are void functions. Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250208-cls_rmv_return-v1-1-091b37945aac@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 45ee3a634999..65880e60c720 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -193,7 +193,7 @@ static inline int __must_check class_create_file(const struct class *class, static inline void class_remove_file(const struct class *class, const struct class_attribute *attr) { - return class_remove_file_ns(class, attr, NULL); + class_remove_file_ns(class, attr, NULL); } /* Simple class attribute that is just a static string */ -- cgit v1.2.3 From a44073c28bc6d4118891d61e31c9fa9dc4333dc0 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sat, 8 Feb 2025 23:18:39 +0800 Subject: driver core: Remove needless return in void API device_remove_group() Remove return since both device_remove_group() and device_remove_groups() are void functions. Fixes: e323b2dddc1c ("driver core: add device_{add|remove}_group() helpers") Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250208-fix_device_remove_group-v1-1-8a5b0ac0ce5c@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 80a5b3268986..605b60254f6d 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1268,7 +1268,7 @@ static inline void device_remove_group(struct device *dev, { const struct attribute_group *groups[] = { grp, NULL }; - return device_remove_groups(dev, groups); + device_remove_groups(dev, groups); } int __must_check devm_device_add_group(struct device *dev, -- cgit v1.2.3 From 10d43ecbb0121fee8ddbc1e9755edc0402e458ca Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 30 Jan 2025 01:26:54 +0000 Subject: mei: Remove unused functions The following functions have been in the mei code for a long time but have never been used. mei_txe_setup_satt2() was added in 2014 by commit 32e2b59fca2c ("mei: txe: add hw-txe.c") mei_me_cl_rm_by_uuid_id() was added in 2015 by commit 79563db9ddd3 ("mei: add reference counting for me clients") mei_cldev_uuid() was added in 2015 by commit baeacd037697 ("mei: bus: export uuid and protocol version to mei_cl bus drivers") mei_cldev_recv_nonblock() was added in 2016 by commit 076802d00615 ("mei: bus: enable non-blocking RX") it is the only user of mei_cldev_recv_nonblock_vtag(). Remove them. Signed-off-by: Dr. 
David Alan Gilbert Acked-by: Arnd Bergmann Reviewed-by: Alexander Usyskin Link: https://lore.kernel.org/r/20250130012654.255119-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mei_cl_bus.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index b38a56a13f39..725fd7727422 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -97,8 +97,6 @@ ssize_t mei_cldev_send(struct mei_cl_device *cldev, const u8 *buf, ssize_t mei_cldev_send_timeout(struct mei_cl_device *cldev, const u8 *buf, size_t length, unsigned long timeout); ssize_t mei_cldev_recv(struct mei_cl_device *cldev, u8 *buf, size_t length); -ssize_t mei_cldev_recv_nonblock(struct mei_cl_device *cldev, u8 *buf, - size_t length); ssize_t mei_cldev_recv_timeout(struct mei_cl_device *cldev, u8 *buf, size_t length, unsigned long timeout); ssize_t mei_cldev_send_vtag(struct mei_cl_device *cldev, const u8 *buf, @@ -107,8 +105,6 @@ ssize_t mei_cldev_send_vtag_timeout(struct mei_cl_device *cldev, const u8 *buf, size_t length, u8 vtag, unsigned long timeout); ssize_t mei_cldev_recv_vtag(struct mei_cl_device *cldev, u8 *buf, size_t length, u8 *vtag); -ssize_t mei_cldev_recv_nonblock_vtag(struct mei_cl_device *cldev, u8 *buf, - size_t length, u8 *vtag); ssize_t mei_cldev_recv_vtag_timeout(struct mei_cl_device *cldev, u8 *buf, size_t length, u8 *vtag, unsigned long timeout); @@ -116,7 +112,6 @@ int mei_cldev_register_rx_cb(struct mei_cl_device *cldev, mei_cldev_cb_t rx_cb); int mei_cldev_register_notif_cb(struct mei_cl_device *cldev, mei_cldev_cb_t notif_cb); -const uuid_le *mei_cldev_uuid(const struct mei_cl_device *cldev); u8 mei_cldev_ver(const struct mei_cl_device *cldev); void *mei_cldev_get_drvdata(const struct mei_cl_device *cldev); -- cgit v1.2.3 From 6866c91f8c2327546d850d5a85025fb81e2089bf Mon Sep 17 00:00:00 2001 From: Billy Tsai Date: Tue, 4 Feb 2025 17:17:01 +0800 Subject: i3c: Remove the const qualifier from i2c_msg pointer in i2c_xfers API The change is necessary to enable the use of the `i2c_get_dma_safe_msg_buf()` API, which requires a non-const `struct i2c_msg *` to operate. The `i2c_get_dma_safe_msg_buf()` function ensures safe handling of I2C messages when using DMA, making it essential for scenarios where DMA transfers are involved. By removing the `const` qualifier, this patch allows drivers to prepare and manage DMA-safe buffers directly. 
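For illustration only (not part of this commit): a sketch of the pattern the change enables inside a controller's ->i2c_xfers() path; foo_xfer_dma()/foo_xfer_pio() and the 8-byte threshold are assumptions.

    #include <linux/i2c.h>
    #include <linux/i3c/master.h>

    /* stand-ins for the controller's actual transfer paths */
    static int foo_xfer_dma(struct i2c_dev_desc *dev, struct i2c_msg *msg, u8 *buf);
    static int foo_xfer_pio(struct i2c_dev_desc *dev, struct i2c_msg *msg);

    static int foo_i2c_xfer_one(struct i2c_dev_desc *dev, struct i2c_msg *msg)
    {
        /* bounce messages of 8 bytes or more into a DMA-safe buffer */
        u8 *dma_buf = i2c_get_dma_safe_msg_buf(msg, 8);
        int ret;

        if (dma_buf)
            ret = foo_xfer_dma(dev, msg, dma_buf);
        else
            ret = foo_xfer_pio(dev, msg);

        /* copies read data back (on success) and frees the bounce buffer */
        i2c_put_dma_safe_msg_buf(dma_buf, msg, ret == 0);

        return ret;
    }

i2c_put_dma_safe_msg_buf() is a no-op when no bounce buffer was allocated, so the cleanup can stay unconditional.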
Signed-off-by: Billy Tsai Reviewed-by: Frank Li Reviewed-by: Wolfram Sang Acked-by: Mukesh Kumar Savaliya Link: https://lore.kernel.org/r/20250204091702.4014466-1-billy_tsai@aspeedtech.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 12d532b012c5..c67922ece617 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -475,7 +475,7 @@ struct i3c_master_controller_ops { int (*attach_i2c_dev)(struct i2c_dev_desc *dev); void (*detach_i2c_dev)(struct i2c_dev_desc *dev); int (*i2c_xfers)(struct i2c_dev_desc *dev, - const struct i2c_msg *xfers, int nxfers); + struct i2c_msg *xfers, int nxfers); int (*request_ibi)(struct i3c_dev_desc *dev, const struct i3c_ibi_setup *req); void (*free_ibi)(struct i3c_dev_desc *dev); -- cgit v1.2.3 From c749f058b4371430a8338e1ca72b9ae38fef613b Mon Sep 17 00:00:00 2001 From: Kannappan R Date: Thu, 20 Feb 2025 16:13:39 +0200 Subject: USB: core: Add eUSB2 descriptor and parsing in USB core Add support for the 'eUSB2 Isochronous Endpoint Companion Descriptor' introduced in the recent USB 2.0 specification 'USB 2.0 Double Isochronous IN Bandwidth' ECN. It allows embedded USB2 (eUSB2) devices to report and use higher bandwidths for isochronous IN transfers in order to support higher camera resolutions on the lid of laptops and tablets with minimal change to the USB2 protocol. The motivation for expanding USB 2.0 is further clarified in an additional Embedded USB2 version 2.0 (eUSB2v2) supplement to the USB 2.0 specification. It points out this is optimized for performance, power and cost by using the USB 2.0 low-voltage, power efficient PHY and half-duplex link for the asymmetric camera bandwidth needs, avoiding the costly and complex full-duplex USB 3.x symmetric link and gigabit receivers. eUSB2 devices that support the higher isochronous IN bandwidth and the new descriptor can be identified by their device descriptor bcdUSB value of 0x0220 Co-developed-by: Amardeep Rai Signed-off-by: Amardeep Rai Signed-off-by: Kannappan R Co-developed-by: Mathias Nyman Signed-off-by: Mathias Nyman Acked-by: Alan Stern Link: https://lore.kernel.org/r/20250220141339.1939448-1-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 8 +++++--- include/uapi/linux/usb/ch9.h | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/usb.h b/include/linux/usb.h index cfa8005e24f9..b46738701f8d 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -51,6 +51,7 @@ struct ep_device; * @desc: descriptor for this endpoint, wMaxPacketSize in native byteorder * @ss_ep_comp: SuperSpeed companion descriptor for this endpoint * @ssp_isoc_ep_comp: SuperSpeedPlus isoc companion descriptor for this endpoint + * @eusb2_isoc_ep_comp: eUSB2 isoc companion descriptor for this endpoint * @urb_list: urbs queued to this endpoint; maintained by usbcore * @hcpriv: for use by HCD; typically holds hardware dma queue head (QH) * with one or more transfer descriptors (TDs) per urb @@ -64,9 +65,10 @@ struct ep_device; * descriptor within an active interface in a given USB configuration. 
*/ struct usb_host_endpoint { - struct usb_endpoint_descriptor desc; - struct usb_ss_ep_comp_descriptor ss_ep_comp; - struct usb_ssp_isoc_ep_comp_descriptor ssp_isoc_ep_comp; + struct usb_endpoint_descriptor desc; + struct usb_ss_ep_comp_descriptor ss_ep_comp; + struct usb_ssp_isoc_ep_comp_descriptor ssp_isoc_ep_comp; + struct usb_eusb2_isoc_ep_comp_descriptor eusb2_isoc_ep_comp; struct list_head urb_list; void *hcpriv; struct ep_device *ep_dev; /* For sysfs info */ diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 91f0f7e214a5..475af9358173 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -253,6 +253,9 @@ struct usb_ctrlrequest { #define USB_DT_BOS 0x0f #define USB_DT_DEVICE_CAPABILITY 0x10 #define USB_DT_WIRELESS_ENDPOINT_COMP 0x11 +/* From the eUSB2 spec */ +#define USB_DT_EUSB2_ISOC_ENDPOINT_COMP 0x12 +/* From Wireless USB spec */ #define USB_DT_WIRE_ADAPTER 0x21 /* From USB Device Firmware Upgrade Specification, Revision 1.1 */ #define USB_DT_DFU_FUNCTIONAL 0x21 @@ -675,6 +678,18 @@ static inline int usb_endpoint_interrupt_type( /*-------------------------------------------------------------------------*/ +/* USB_DT_EUSB2_ISOC_ENDPOINT_COMP: eUSB2 Isoch Endpoint Companion descriptor */ +struct usb_eusb2_isoc_ep_comp_descriptor { + __u8 bLength; + __u8 bDescriptorType; + __le16 wMaxPacketSize; + __le32 dwBytesPerInterval; +} __attribute__ ((packed)); + +#define USB_DT_EUSB2_ISOC_EP_COMP_SIZE 8 + +/*-------------------------------------------------------------------------*/ + /* USB_DT_SSP_ISOC_ENDPOINT_COMP: SuperSpeedPlus Isochronous Endpoint Companion * descriptor */ -- cgit v1.2.3 From ac9c5170a18162d45c6edd1f0fa2d2b2504bc2cb Mon Sep 17 00:00:00 2001 From: Subramanian Mohan Date: Wed, 19 Feb 2025 09:36:15 +0530 Subject: pps: generators: replace copy of pps-gen info struct with const pointer Some PPS generator drivers may need to retrieve a pointer to their internal data while executing the PPS generator enable() method. During the driver registration the pps_gen_device pointer is returned from the framework, and for that reason, there is difficulty in getting generator driver data back in the enable function. We won't be able to use container_of macro as it results in static assert, and we might end up in using static pointer. To solve the issue and to get back the generator driver data back, we should not copy the struct pps_gen_source_info within the struct pps_gen_device during the registration stage, but simply save the pointer of the driver one. In this manner, driver may get a pointer to their internal data as shown below: struct pps_gen_foo_data_s { ... struct pps_gen_source_info gen_info; struct pps_gen_device *pps_gen; ... }; static int __init pps_gen_foo_init(void) { struct pps_gen_foo_data_s *foo; ... foo->pps_gen = pps_gen_register_source(&foo->gen_info); ... } Then, in the enable() method, we can retrieve the pointer to the main struct by using the code below: static int pps_gen_foo_enable(struct pps_gen_device *pps_gen, bool enable) { struct pps_gen_foo_data_s *foo = container_of(pps_gen->info, struct pps_gen_foo_data_s, gen_info); ... 
} Signed-off-by: Rodolfo Giometti Tested-by: Subramanian Mohan Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Signed-off-by: Subramanian Mohan Link: https://lore.kernel.org/r/20250219040618.70962-2-subramanian.mohan@intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/pps_gen_kernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pps_gen_kernel.h b/include/linux/pps_gen_kernel.h index 022ea0ac4440..6214c8aa2e02 100644 --- a/include/linux/pps_gen_kernel.h +++ b/include/linux/pps_gen_kernel.h @@ -43,7 +43,7 @@ struct pps_gen_source_info { /* The main struct */ struct pps_gen_device { - struct pps_gen_source_info info; /* PSS generator info */ + const struct pps_gen_source_info *info; /* PSS generator info */ bool enabled; /* PSS generator status */ unsigned int event; @@ -70,7 +70,7 @@ extern const struct attribute_group *pps_gen_groups[]; */ extern struct pps_gen_device *pps_gen_register_source( - struct pps_gen_source_info *info); + const struct pps_gen_source_info *info); extern void pps_gen_unregister_source(struct pps_gen_device *pps_gen); extern void pps_gen_event(struct pps_gen_device *pps_gen, unsigned int event, void *data); -- cgit v1.2.3 From 46006ceb5d029f92df405db15c9a31f0ee41628c Mon Sep 17 00:00:00 2001 From: Linu Cherian Date: Wed, 12 Feb 2025 17:19:13 +0530 Subject: coresight: core: Add provision for panic callbacks Panic callback handlers allow coresight device drivers to sync relevant trace data and trace metadata to reserved memory regions so that they can be retrieved later in the subsequent boot or in the crashdump kernel. Signed-off-by: Linu Cherian Reviewed-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250212114918.548431-4-lcherian@marvell.com --- include/linux/coresight.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 17276965ff1d..3eef4e91df3f 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -340,6 +340,7 @@ enum cs_mode { #define link_ops(csdev) csdev->ops->link_ops #define helper_ops(csdev) csdev->ops->helper_ops #define ect_ops(csdev) csdev->ops->ect_ops +#define panic_ops(csdev) csdev->ops->panic_ops /** * struct coresight_ops_sink - basic operations for a sink @@ -409,11 +410,22 @@ struct coresight_ops_helper { int (*disable)(struct coresight_device *csdev, void *data); }; + +/** + * struct coresight_ops_panic - Generic device ops for panic handing + * + * @sync : Sync the device register state/trace data + */ +struct coresight_ops_panic { + int (*sync)(struct coresight_device *csdev); +}; + struct coresight_ops { const struct coresight_ops_sink *sink_ops; const struct coresight_ops_link *link_ops; const struct coresight_ops_source *source_ops; const struct coresight_ops_helper *helper_ops; + const struct coresight_ops_panic *panic_ops; }; static inline u32 csdev_access_relaxed_read32(struct csdev_access *csa, -- cgit v1.2.3 From 3b29bcee8f6f703a5952b85fc2ffcbcfb0862db4 Mon Sep 17 00:00:00 2001 From: Robert Budai Date: Mon, 17 Feb 2025 12:57:45 +0200 Subject: iio: imu: adis: Add custom ops struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a custom ops struct letting users define custom read and write functions. Some adis devices might define a completely different spi protocol from the one used in the default implementation.
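As an illustration of how such a hook could be consumed (a sketch only, not code from this patch; the adis16xxx_* names are invented), a driver with a non-standard SPI protocol could supply its own accessors:

static int adis16xxx_read(struct adis *adis, unsigned int reg,
                          unsigned int *value, unsigned int size)
{
        /* device-specific SPI framing would go here */
        return 0;
}

static int adis16xxx_write(struct adis *adis, unsigned int reg,
                           unsigned int value, unsigned int size)
{
        /* device-specific SPI framing would go here */
        return 0;
}

static const struct adis_ops adis16xxx_ops = {
        .read = adis16xxx_read,
        .write = adis16xxx_write,
};

with adis->ops pointed at adis16xxx_ops during probe. Drivers that keep the default protocol are presumably left on an ops struct wrapping the existing __adis_read_reg()/__adis_write_reg() helpers, so the inline register accessors touched in the diff behave as before.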
Co-developed-by: Ramona Gradinariu Signed-off-by: Ramona Gradinariu Co-developed-by: Antoniu Miclaus Signed-off-by: Antoniu Miclaus Co-developed-by: Nuno Sá Signed-off-by: Nuno Sá Signed-off-by: Robert Budai Link: https://patch.msgid.link/20250217105753.605465-2-robert.budai@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 4bb98d9731de..89cfa75ae9ea 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -94,6 +94,18 @@ struct adis_data { unsigned int burst_max_speed_hz; }; +/** + * struct adis_ops: Custom ops for adis devices. + * @write: Custom spi write implementation. + * @read: Custom spi read implementation. + */ +struct adis_ops { + int (*write)(struct adis *adis, unsigned int reg, unsigned int value, + unsigned int size); + int (*read)(struct adis *adis, unsigned int reg, unsigned int *value, + unsigned int size); +}; + /** * struct adis - ADIS device instance data * @spi: Reference to SPI device which owns this ADIS IIO device @@ -101,6 +113,7 @@ struct adis_data { * @data: ADIS chip variant specific data * @burst_extra_len: Burst extra length. Should only be used by devices that can * dynamically change their burst mode length. + * @ops: ops struct for custom read and write functions * @state_lock: Lock used by the device to protect state * @msg: SPI message object * @xfer: SPI transfer objects to be used for a @msg @@ -116,6 +129,7 @@ struct adis { const struct adis_data *data; unsigned int burst_extra_len; + const struct adis_ops *ops; /** * The state_lock is meant to be used during operations that require * a sequence of SPI R/W in order to protect the SPI transfer @@ -168,7 +182,7 @@ int __adis_read_reg(struct adis *adis, unsigned int reg, static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg, u8 val) { - return __adis_write_reg(adis, reg, val, 1); + return adis->ops->write(adis, reg, val, 1); } /** @@ -180,7 +194,7 @@ static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg, static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg, u16 val) { - return __adis_write_reg(adis, reg, val, 2); + return adis->ops->write(adis, reg, val, 2); } /** @@ -192,7 +206,7 @@ static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg, static inline int __adis_write_reg_32(struct adis *adis, unsigned int reg, u32 val) { - return __adis_write_reg(adis, reg, val, 4); + return adis->ops->write(adis, reg, val, 4); } /** @@ -207,7 +221,7 @@ static inline int __adis_read_reg_16(struct adis *adis, unsigned int reg, unsigned int tmp; int ret; - ret = __adis_read_reg(adis, reg, &tmp, 2); + ret = adis->ops->read(adis, reg, &tmp, 2); if (ret == 0) *val = tmp; @@ -226,7 +240,7 @@ static inline int __adis_read_reg_32(struct adis *adis, unsigned int reg, unsigned int tmp; int ret; - ret = __adis_read_reg(adis, reg, &tmp, 4); + ret = adis->ops->read(adis, reg, &tmp, 4); if (ret == 0) *val = tmp; @@ -244,7 +258,7 @@ static inline int adis_write_reg(struct adis *adis, unsigned int reg, unsigned int val, unsigned int size) { guard(mutex)(&adis->state_lock); - return __adis_write_reg(adis, reg, val, size); + return adis->ops->write(adis, reg, val, size); } /** @@ -258,7 +272,7 @@ static int adis_read_reg(struct adis *adis, unsigned int reg, unsigned int *val, unsigned int size) { guard(mutex)(&adis->state_lock); - 
return __adis_read_reg(adis, reg, val, size); + return adis->ops->read(adis, reg, val, size); } /** -- cgit v1.2.3 From 7f15d7a7d12dbcb4a846ca19d9565e0c11ea6694 Mon Sep 17 00:00:00 2001 From: Robert Budai Date: Mon, 17 Feb 2025 12:57:46 +0200 Subject: iio: imu: adis: Add reset to custom ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch allows the custom definition of reset functionality for adis object. It is useful in cases where the driver does not need to sleep after the reset since it is handled by the library. Co-developed-by: Ramona Gradinariu Signed-off-by: Ramona Gradinariu Co-developed-by: Antoniu Miclaus Signed-off-by: Antoniu Miclaus Co-developed-by: Nuno Sá Signed-off-by: Nuno Sá Signed-off-by: Robert Budai Link: https://patch.msgid.link/20250217105753.605465-3-robert.budai@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 89cfa75ae9ea..52652f51db2e 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -98,12 +98,15 @@ struct adis_data { * struct adis_ops: Custom ops for adis devices. * @write: Custom spi write implementation. * @read: Custom spi read implementation. + * @reset: Custom sw reset implementation. The custom implementation does not + * need to sleep after the reset. It's done by the library already. */ struct adis_ops { int (*write)(struct adis *adis, unsigned int reg, unsigned int value, unsigned int size); int (*read)(struct adis *adis, unsigned int reg, unsigned int *value, unsigned int size); + int (*reset)(struct adis *adis); }; /** -- cgit v1.2.3 From 9fa98d94131806cae0441d05213d36da480d7389 Mon Sep 17 00:00:00 2001 From: Robert Budai Date: Mon, 17 Feb 2025 12:57:47 +0200 Subject: iio: imu: adis: Add DIAG_STAT register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some devices may have more than 16 bits of status. This patch allows the user to specify the size of the DIAG_STAT register. It defaults to 2 if not specified. This is mainly for backward compatibility. Co-developed-by: Ramona Gradinariu Signed-off-by: Ramona Gradinariu Co-developed-by: Antoniu Miclaus Signed-off-by: Antoniu Miclaus Co-developed-by: Nuno Sá Signed-off-by: Nuno Sá Signed-off-by: Robert Budai Link: https://patch.msgid.link/20250217105753.605465-4-robert.budai@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 52652f51db2e..aa160511e265 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -44,6 +44,8 @@ struct adis_timeout { * @glob_cmd_reg: Register address of the GLOB_CMD register * @msc_ctrl_reg: Register address of the MSC_CTRL register * @diag_stat_reg: Register address of the DIAG_STAT register + * @diag_stat_size: Length (in bytes) of the DIAG_STAT register. If 0 the + * default length is 2 bytes long. 
* @prod_id_reg: Register address of the PROD_ID register * @prod_id: Product ID code that should be expected when reading @prod_id_reg * @self_test_mask: Bitmask of supported self-test operations @@ -70,6 +72,7 @@ struct adis_data { unsigned int glob_cmd_reg; unsigned int msc_ctrl_reg; unsigned int diag_stat_reg; + unsigned int diag_stat_size; unsigned int prod_id_reg; unsigned int prod_id; -- cgit v1.2.3 From 4ff6039ffb79a4a8a44b63810a8a2f2b43264856 Mon Sep 17 00:00:00 2001 From: Yuanfang Zhang Date: Thu, 16 Jan 2025 17:04:20 +0800 Subject: coresight-etm4x: add isb() before reading the TRCSTATR As recommended by section 4.3.7 ("Synchronization when using system instructions to program the trace unit") of ARM IHI 0064H.b, the self-hosted trace analyzer must perform a Context synchronization event between writing to the TRCPRGCTLR and reading the TRCSTATR. Additionally, add an ISB between each read of TRCSTATR on coresight_timeout() when using system instructions to program the trace unit. Fixes: 1ab3bb9df5e3 ("coresight: etm4x: Add necessary synchronization for sysreg access") Signed-off-by: Yuanfang Zhang Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250116-etm_sync-v4-1-39f2b05e9514@quicinc.com --- include/linux/coresight.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 3eef4e91df3f..c7614ccb3232 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -661,6 +661,10 @@ extern int coresight_enable_sysfs(struct coresight_device *csdev); extern void coresight_disable_sysfs(struct coresight_device *csdev); extern int coresight_timeout(struct csdev_access *csa, u32 offset, int position, int value); +typedef void (*coresight_timeout_cb_t) (struct csdev_access *, u32, int, int); +extern int coresight_timeout_action(struct csdev_access *csa, u32 offset, + int position, int value, + coresight_timeout_cb_t cb); extern int coresight_claim_device(struct coresight_device *csdev); extern int coresight_claim_device_unlocked(struct coresight_device *csdev); -- cgit v1.2.3 From 84b25926fa7abb5b634d4af9544f47ab0aabc399 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 26 Feb 2025 09:21:18 -0700 Subject: acpi: numa: Add support to enumerate and store extended linear address mode Store the address mode as part of the cache attributes. Export the mode attribute to sysfs like all other cache attributes.
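As a rough illustration of where the new field would be filled in (a sketch under assumptions, not code from this patch; the parameter names are made up and the existing node_add_cache() registration helper is assumed), the HMAT memory-side-cache parser would do something along these lines:

/* Illustrative only: propagate parsed HMAT cache info, including the
 * new address mode, into the node cache attributes. */
static void hmat_register_cache(unsigned int nid, u64 size, u16 line_size,
                                u8 level, u16 address_mode)
{
        struct node_cache_attrs cache_attrs = {
                .size = size,
                .line_size = line_size,
                .level = level,
                /* e.g. NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR */
                .address_mode = address_mode,
        };

        node_add_cache(nid, &cache_attrs);
}

The sysfs side then only needs one more read-only attribute so that address_mode is reported alongside size, line_size and level.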
Link: https://lore.kernel.org/linux-cxl/668333b17e4b2_5639294fd@dwillia2-xfh.jf.intel.com.notmuch/ Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20250226162224.3633792-2-dave.jiang@intel.com Signed-off-by: Dave Jiang --- include/linux/node.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/node.h b/include/linux/node.h index 9a881c2208b3..2b7517892230 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -57,6 +57,11 @@ enum cache_write_policy { NODE_CACHE_WRITE_OTHER, }; +enum cache_mode { + NODE_CACHE_ADDR_MODE_RESERVED, + NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR, +}; + /** * struct node_cache_attrs - system memory caching attributes * * @indexing: The ways memory blocks may be placed in cache * @write_policy: Write back or write through policy * @size: Total size of cache in bytes * @line_size: Number of bytes fetched on a cache miss * @level: The cache hierarchy level + * @address_mode: The address mode */ struct node_cache_attrs { enum cache_indexing indexing; @@ -72,6 +78,7 @@ enum cache_write_policy { u64 size; u16 line_size; u8 level; + u16 address_mode; }; #ifdef CONFIG_HMEM_REPORTING -- cgit v1.2.3 From 0ec9849b63338da7883440ed3f52757cd8c847b1 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 26 Feb 2025 09:21:19 -0700 Subject: acpi/hmat / cxl: Add extended linear cache support for CXL The current cxl region size only indicates the size of the CXL memory region without accounting for the extended linear cache size. Retrieve the cache size from HMAT and append that to the cxl region size for the cxl region range that matches the SRAT range that has extended linear cache enabled. The SRAT defines the whole memory range that includes the extended linear cache and the CXL memory region. The new HMAT ECN/ECR to the Memory Side Cache Information Structure defines the size of the extended linear cache and matches it to the SRAT Memory Affinity Structure by the memory proximity domain. Add a helper to match the cxl range to the SRAT memory range in order to retrieve the cache size. There are several places that check the cxl region range against the decoder range. Use the new helper to check between the two ranges and address the new cache size. Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20250226162224.3633792-3-dave.jiang@intel.com Signed-off-by: Dave Jiang --- include/linux/acpi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4e495b29c640..cbd933504dbf 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1095,6 +1095,17 @@ static inline acpi_handle acpi_get_processor_handle(int cpu) #endif /* !CONFIG_ACPI */ +#ifdef CONFIG_ACPI_HMAT +int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, + resource_size_t *size); +#else +static inline int hmat_get_extended_linear_cache_size(struct resource *backing_res, + int nid, resource_size_t *size) +{ + return -EOPNOTSUPP; +} +#endif + extern void arch_post_acpi_subsys_init(void); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC -- cgit v1.2.3 From 0da30874729baeb01889b0eca16cfda122687503 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 24 Feb 2025 16:04:17 +0200 Subject: dmaengine: ti: k3-udma-glue: Drop skip_fdq argument from k3_udma_glue_reset_rx_chn The user of k3_udma_glue_reset_rx_chn() e.g. ti_am65_cpsw_nuss can run on multiple platforms having different DMA architectures.
On some platforms there can be one FDQ for all flows in the RX channel while for others there is a separate FDQ for each flow in the RX channel. So far we have been relying on the skip_fdq argument of k3_udma_glue_reset_rx_chn(). Instead of relying on the user to provide this information, infer it based on DMA architecture during k3_udma_glue_request_rx_chn() and save it in an internal flag 'single_fdq'. Use that flag at k3_udma_glue_reset_rx_chn() to deicide if the FDQ needs to be cleared for every flow or just for flow 0. Fixes the below issue on ti_am65_cpsw_nuss driver on AM62-SK. > ip link set eth1 down > ip link set eth0 down > ethtool -L eth0 rx 8 > ip link set eth0 up > modprobe -r ti_am65_cpsw_nuss [ 103.045726] ------------[ cut here ]------------ [ 103.050505] k3_knav_desc_pool size 512000 != avail 64000 [ 103.050703] WARNING: CPU: 1 PID: 450 at drivers/net/ethernet/ti/k3-cppi-desc-pool.c:33 k3_cppi_desc_pool_destroy+0xa0/0xa8 [k3_cppi_desc_pool] [ 103.068810] Modules linked in: ti_am65_cpsw_nuss(-) k3_cppi_desc_pool snd_soc_hdmi_codec crct10dif_ce snd_soc_simple_card snd_soc_simple_card_utils display_connector rtc_ti_k3 k3_j72xx_bandgap tidss drm_client_lib snd_soc_davinci_mcas p drm_dma_helper tps6598x phylink snd_soc_ti_udma rti_wdt drm_display_helper snd_soc_tlv320aic3x_i2c typec at24 phy_gmii_sel snd_soc_ti_edma snd_soc_tlv320aic3x sii902x snd_soc_ti_sdma sa2ul omap_mailbox drm_kms_helper authenc cfg80211 r fkill fuse drm drm_panel_orientation_quirks backlight ip_tables x_tables ipv6 [last unloaded: k3_cppi_desc_pool] [ 103.119950] CPU: 1 UID: 0 PID: 450 Comm: modprobe Not tainted 6.13.0-rc7-00001-g9c5e3435fa66 #1011 [ 103.119968] Hardware name: Texas Instruments AM625 SK (DT) [ 103.119974] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 103.119983] pc : k3_cppi_desc_pool_destroy+0xa0/0xa8 [k3_cppi_desc_pool] [ 103.148007] lr : k3_cppi_desc_pool_destroy+0xa0/0xa8 [k3_cppi_desc_pool] [ 103.154709] sp : ffff8000826ebbc0 [ 103.158015] x29: ffff8000826ebbc0 x28: ffff0000090b6300 x27: 0000000000000000 [ 103.165145] x26: 0000000000000000 x25: 0000000000000000 x24: ffff0000019df6b0 [ 103.172271] x23: ffff0000019df6b8 x22: ffff0000019df410 x21: ffff8000826ebc88 [ 103.179397] x20: 000000000007d000 x19: ffff00000a3b3000 x18: 0000000000000000 [ 103.186522] x17: 0000000000000000 x16: 0000000000000000 x15: 000001e8c35e1cde [ 103.193647] x14: 0000000000000396 x13: 000000000000035c x12: 0000000000000000 [ 103.200772] x11: 000000000000003a x10: 00000000000009c0 x9 : ffff8000826eba20 [ 103.207897] x8 : ffff0000090b6d20 x7 : ffff00007728c180 x6 : ffff00007728c100 [ 103.215022] x5 : 0000000000000001 x4 : ffff000000508a50 x3 : ffff7ffff6146000 [ 103.222147] x2 : 0000000000000000 x1 : e300b4173ee6b200 x0 : 0000000000000000 [ 103.229274] Call trace: [ 103.231714] k3_cppi_desc_pool_destroy+0xa0/0xa8 [k3_cppi_desc_pool] (P) [ 103.238408] am65_cpsw_nuss_free_rx_chns+0x28/0x4c [ti_am65_cpsw_nuss] [ 103.244942] devm_action_release+0x14/0x20 [ 103.249040] release_nodes+0x3c/0x68 [ 103.252610] devres_release_all+0x8c/0xdc [ 103.256614] device_unbind_cleanup+0x18/0x60 [ 103.260876] device_release_driver_internal+0xf8/0x178 [ 103.266004] driver_detach+0x50/0x9c [ 103.269571] bus_remove_driver+0x6c/0xbc [ 103.273485] driver_unregister+0x30/0x60 [ 103.277401] platform_driver_unregister+0x14/0x20 [ 103.282096] am65_cpsw_nuss_driver_exit+0x18/0xff4 [ti_am65_cpsw_nuss] [ 103.288620] __arm64_sys_delete_module+0x17c/0x25c [ 103.293404] invoke_syscall+0x44/0x100 [ 103.297149] 
el0_svc_common.constprop.0+0xc0/0xe0 [ 103.301845] do_el0_svc+0x1c/0x28 [ 103.305155] el0_svc+0x28/0x98 [ 103.308207] el0t_64_sync_handler+0xc8/0xcc [ 103.312384] el0t_64_sync+0x198/0x19c [ 103.316040] ---[ end trace 0000000000000000 ]--- Signed-off-by: Roger Quadros Acked-by: Jakub Kicinski Acked-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20250224-k3-udma-glue-single-fdq-v2-1-cbe7621f2507@kernel.org Signed-off-by: Vinod Koul --- include/linux/dma/k3-udma-glue.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h index 2dea217629d0..5d43881e6fb7 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -138,8 +138,7 @@ int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_num); void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_num, void *data, - void (*cleanup)(void *data, dma_addr_t desc_dma), - bool skip_fdq); + void (*cleanup)(void *data, dma_addr_t desc_dma)); int k3_udma_glue_rx_flow_enable(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_idx); int k3_udma_glue_rx_flow_disable(struct k3_udma_glue_rx_channel *rx_chn, -- cgit v1.2.3 From 62fb8adc43afad5fa1c9cadc6f3a8e9fb72af194 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 18 Feb 2025 15:22:05 -0700 Subject: mm: Provide address mask in struct follow_pfnmap_args follow_pfnmap_start() walks the page table for a given address and fills out the struct follow_pfnmap_args in pfnmap_args_setup(). The address mask of the page table level is already provided to this latter function for calculating the pfn. This address mask can also be useful for the caller to determine the extent of the contiguous mapping. For example, vfio-pci now supports huge_fault for pfnmaps and is able to insert pud and pmd mappings. When we DMA map these pfnmaps, ex. PCI MMIO BARs, we iterate follow_pfnmap_start() to get each pfn to test for a contiguous pfn range. Providing the mapping address mask allows us to skip the extent of the mapping level. Assuming a 1GB pud level and 4KB page size, iterations are reduced by a factor of 256K. In wall clock time, mapping a 32GB PCI BAR is reduced from ~1s to <1ms. Cc: Andrew Morton Cc: David Hildenbrand Cc: linux-mm@kvack.org Reviewed-by: Peter Xu Reviewed-by: Mitchell Augustin Tested-by: Mitchell Augustin Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Link: https://lore.kernel.org/r/20250218222209.1382449-6-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..92b30dba7e38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2417,11 +2417,13 @@ struct follow_pfnmap_args { * Outputs: * * @pfn: the PFN of the address + * @addr_mask: address mask covering pfn * @pgprot: the pgprot_t of the mapping * @writable: whether the mapping is writable * @special: whether the mapping is a special mapping (real PFN maps) */ unsigned long pfn; + unsigned long addr_mask; pgprot_t pgprot; bool writable; bool special; -- cgit v1.2.3 From 9fab91e0eae3a9ec1861119877a61d571f21536a Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 23 Feb 2025 16:06:02 +0000 Subject: usb: ulpi: Remove unused otg_ulpi_create otg_ulpi_create() has been unused since 2022's commit 8ca79aaad8be ("ARM: pxa: remove unused pxa3xx-ulpi") Remove it. 
The devm_ variant is still used. Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20250223160602.91916-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/ulpi.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/usb/ulpi.h b/include/linux/usb/ulpi.h index 5050f502c1ed..4b651065738a 100644 --- a/include/linux/usb/ulpi.h +++ b/include/linux/usb/ulpi.h @@ -49,19 +49,10 @@ /*-------------------------------------------------------------------------*/ #if IS_ENABLED(CONFIG_USB_ULPI) -struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops, - unsigned int flags); - struct usb_phy *devm_otg_ulpi_create(struct device *dev, struct usb_phy_io_ops *ops, unsigned int flags); #else -static inline struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops, - unsigned int flags) -{ - return NULL; -} - static inline struct usb_phy *devm_otg_ulpi_create(struct device *dev, struct usb_phy_io_ops *ops, unsigned int flags) -- cgit v1.2.3 From dc872c5f527a07124b2ad92d8c70d11829f17ac9 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:22 +0800 Subject: Coresight: Add support for new APB clock name Add support for new APB clock-name. If the function fails to obtain the clock with the name "apb_pclk", it will attempt to acquire the clock with the name "apb". Reviewed-by: James Clark Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-2-quic_jiegan@quicinc.com --- include/linux/coresight.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index c7614ccb3232..2e493d5288d8 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -471,8 +471,11 @@ static inline struct clk *coresight_get_enable_apb_pclk(struct device *dev) int ret; pclk = clk_get(dev, "apb_pclk"); - if (IS_ERR(pclk)) - return NULL; + if (IS_ERR(pclk)) { + pclk = clk_get(dev, "apb"); + if (IS_ERR(pclk)) + return NULL; + } ret = clk_prepare_enable(pclk); if (ret) { -- cgit v1.2.3 From c367a89dec267c65b556ace5d6aafd07fb1d66b1 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:23 +0800 Subject: Coresight: Add trace_id function to retrieving the trace ID Add 'trace_id' function pointer in coresight_ops. It's responsible for retrieving the device's trace ID. 
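To make the shape of the new callback concrete (an illustrative sketch, not code lifted from a driver; the etmX_* names are placeholders), a source driver would wire it up roughly as:

static int etmX_trace_id(struct coresight_device *csdev, enum cs_mode mode,
                         struct coresight_device *sink)
{
        /* return the trace ID this source will emit for the given mode/sink */
        return coresight_etm_get_trace_id(csdev, mode, sink);
}

static const struct coresight_ops etmX_cs_ops = {
        .trace_id = etmX_trace_id,
};

allowing the core to query any source through coresight_ops(csdev)->trace_id() without knowing which driver it is dealing with.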
Co-developed-by: James Clark Signed-off-by: James Clark Reviewed-by: James Clark Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-3-quic_jiegan@quicinc.com --- include/linux/coresight.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2e493d5288d8..1396bda77f3f 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -335,6 +335,7 @@ enum cs_mode { CS_MODE_PERF, }; +#define coresight_ops(csdev) csdev->ops #define source_ops(csdev) csdev->ops->source_ops #define sink_ops(csdev) csdev->ops->sink_ops #define link_ops(csdev) csdev->ops->link_ops @@ -421,6 +422,8 @@ struct coresight_ops_panic { }; struct coresight_ops { + int (*trace_id)(struct coresight_device *csdev, enum cs_mode mode, + struct coresight_device *sink); const struct coresight_ops_sink *sink_ops; const struct coresight_ops_link *link_ops; const struct coresight_ops_source *source_ops; @@ -713,4 +716,6 @@ int coresight_init_driver(const char *drv, struct amba_driver *amba_drv, void coresight_remove_driver(struct amba_driver *amba_drv, struct platform_driver *pdev_drv); +int coresight_etm_get_trace_id(struct coresight_device *csdev, enum cs_mode mode, + struct coresight_device *sink); #endif /* _LINUX_COREISGHT_H */ -- cgit v1.2.3 From 3c03c49b2fa59c4d456e2d673fe5847fa6cbcdf2 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:25 +0800 Subject: Coresight: Introduce a new struct coresight_path Introduce a new structure, 'struct coresight_path', to store the data that is utilized by the devices in the path. The coresight_path will be built/released by coresight_build_path/coresight_release_path functions. Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-5-quic_jiegan@quicinc.com --- include/linux/coresight.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 1396bda77f3f..509e1f256412 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -329,6 +329,16 @@ static struct coresight_dev_list (var) = { \ #define to_coresight_device(d) container_of(d, struct coresight_device, dev) +/** + * struct coresight_path - data needed by enable/disable path + * @path_list: path from source to sink. + * @trace_id: trace_id of the whole path. + */ +struct coresight_path { + struct list_head path_list; + u8 trace_id; +}; + enum cs_mode { CS_MODE_DISABLED, CS_MODE_SYSFS, -- cgit v1.2.3 From 7b365f056d8e02fc70c823bdf736e41a7236a54b Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:27 +0800 Subject: Coresight: Change to read the trace ID from coresight_path The source device can directly read the trace ID from the coresight_path which results in etm_read_alloc_trace_id and etm4_read_alloc_trace_id being deleted.
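For illustration (a sketch only, not taken from the etm drivers; etmX_enable is a placeholder name), the updated source enable() path can now do:

static int etmX_enable(struct coresight_device *csdev, struct perf_event *event,
                       enum cs_mode mode, struct coresight_path *path)
{
        /* the ID allocated once for the whole path is simply read back */
        u8 trace_id = path->trace_id;

        /* ... program trace_id into the source and start tracing ... */
        return trace_id ? 0 : -EINVAL;
}

which is what makes the per-driver etm_read_alloc_trace_id()/etm4_read_alloc_trace_id() helpers redundant.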
Co-developed-by: James Clark Signed-off-by: James Clark Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-7-quic_jiegan@quicinc.com --- include/linux/coresight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 509e1f256412..bc0c853ffa6d 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -401,7 +401,7 @@ struct coresight_ops_link { struct coresight_ops_source { int (*cpu_id)(struct coresight_device *csdev); int (*enable)(struct coresight_device *csdev, struct perf_event *event, - enum cs_mode mode, struct coresight_trace_id_map *id_map); + enum cs_mode mode, struct coresight_path *path); void (*disable)(struct coresight_device *csdev, struct perf_event *event); }; -- cgit v1.2.3 From f78d206f3d73446e4c3568946a26f903da2149b4 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:30 +0800 Subject: Coresight: Add Coresight TMC Control Unit driver The Coresight TMC Control Unit hosts miscellaneous configuration registers which control various features related to TMC ETR sink. Based on the trace ID, which is programmed in the related CTCU ATID register of a specific ETR, trace data with that trace ID gets into the ETR buffer, while other trace data gets dropped. Enabling source device sets one bit of the ATID register based on source device's trace ID. Disabling source device resets the bit according to the source device's trace ID. Reviewed-by: James Clark Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-10-quic_jiegan@quicinc.com --- include/linux/coresight.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index bc0c853ffa6d..89b781e70fe7 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -71,7 +71,8 @@ enum coresight_dev_subtype_source { enum coresight_dev_subtype_helper { CORESIGHT_DEV_SUBTYPE_HELPER_CATU, - CORESIGHT_DEV_SUBTYPE_HELPER_ECT_CTI + CORESIGHT_DEV_SUBTYPE_HELPER_ECT_CTI, + CORESIGHT_DEV_SUBTYPE_HELPER_CTCU, }; /** -- cgit v1.2.3 From e5d5813968217b99ef2b83f13353967b218e3841 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bence=20Cs=C3=B3k=C3=A1s?= Date: Thu, 6 Mar 2025 14:44:36 +0100 Subject: counter: microchip-tcb-capture: Add IRQ handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add interrupt servicing to allow userspace to wait for the following: * Change-of-state caused by external trigger * Capture of timer value into RA/RB * Compare to RC register * Overflow Signed-off-by: Bence Csókás Link: https://lore.kernel.org/r/20250306134441.582819-2-csokas.bence@prolan.hu Signed-off-by: William Breathitt Gray --- include/uapi/linux/counter/microchip-tcb-capture.h | 34 ++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 include/uapi/linux/counter/microchip-tcb-capture.h (limited to 'include') diff --git a/include/uapi/linux/counter/microchip-tcb-capture.h b/include/uapi/linux/counter/microchip-tcb-capture.h new file mode 100644 index 000000000000..f3ef315fe9f6 --- /dev/null +++ b/include/uapi/linux/counter/microchip-tcb-capture.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Channel numbers used by the microchip-tcb-capture driver + * Copyright (C) 2025 Bence Csókás + */ +#ifndef _UAPI_COUNTER_MCHP_TCB_H_ 
+#define _UAPI_COUNTER_MCHP_TCB_H_ + +/* + * The driver defines the following components: + * + * Count 0 + * \__ Synapse 0 -- Signal 0 (Channel A, i.e. TIOA) + * \__ Synapse 1 -- Signal 1 (Channel B, i.e. TIOB) + * + * It also supports the following events: + * + * Channel 0: + * - CV register changed + * - CV overflowed + * - RA captured + * Channel 1: + * - RB captured + * Channel 2: + * - RC compare triggered + */ + +/* Event channels */ +#define COUNTER_MCHP_EVCHN_CV 0 +#define COUNTER_MCHP_EVCHN_RA 0 +#define COUNTER_MCHP_EVCHN_RB 1 +#define COUNTER_MCHP_EVCHN_RC 2 + +#endif /* _UAPI_COUNTER_MCHP_TCB_H_ */ -- cgit v1.2.3 From 1adc6240a80278c613f655b71c6c0d447b2d5932 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bence=20Cs=C3=B3k=C3=A1s?= Date: Thu, 6 Mar 2025 14:44:37 +0100 Subject: counter: microchip-tcb-capture: Add capture extensions for registers RA/RB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TCB hardware is capable of capturing the timer value to registers RA and RB. Add these registers as capture extensions. Signed-off-by: Bence Csókás Link: https://lore.kernel.org/r/20250306134441.582819-3-csokas.bence@prolan.hu Signed-off-by: William Breathitt Gray --- include/uapi/linux/counter/microchip-tcb-capture.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/counter/microchip-tcb-capture.h b/include/uapi/linux/counter/microchip-tcb-capture.h index f3ef315fe9f6..136e2faa7730 100644 --- a/include/uapi/linux/counter/microchip-tcb-capture.h +++ b/include/uapi/linux/counter/microchip-tcb-capture.h @@ -12,6 +12,8 @@ * Count 0 * \__ Synapse 0 -- Signal 0 (Channel A, i.e. TIOA) * \__ Synapse 1 -- Signal 1 (Channel B, i.e. TIOB) + * \__ Extension capture0 (RA register) + * \__ Extension capture1 (RB register) * * It also supports the following events: * @@ -25,6 +27,10 @@ * - RC compare triggered */ +/* Capture extensions */ +#define COUNTER_MCHP_EXCAP_RA 0 +#define COUNTER_MCHP_EXCAP_RB 1 + /* Event channels */ #define COUNTER_MCHP_EVCHN_CV 0 #define COUNTER_MCHP_EVCHN_RA 0 -- cgit v1.2.3 From df896e4f7cf5cc3abffb186e2b6815b785500b57 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:02 +0800 Subject: soundwire: extend sdw_stream_type to BPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPT/BRA need to be special cased, i.e. there's no point in using the bandwidth allocation since the entire frame can be used. 
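To illustrate what the special casing amounts to (a hedged sketch, not code from this series; the helper name below is invented), the parameter/bandwidth computation reached from sdw_compute_params() can simply bypass BPT streams:

static int sdw_compute_stream_params(struct sdw_stream_runtime *stream)
{
        /* BPT/BRA owns the whole frame: no bandwidth bookkeeping needed */
        if (stream->type == SDW_STREAM_BPT)
                return 0;

        /* ... regular PCM/PDM bandwidth allocation ... */
        return 0;
}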
Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-4-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 2d6c30317792..857b670eb9a5 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -150,12 +150,14 @@ enum sdw_dpn_pkg_mode { * * @SDW_STREAM_PCM: PCM data stream * @SDW_STREAM_PDM: PDM data stream + * @SDW_STREAM_BPT: BPT data stream * * spec doesn't define this, but is used in implementation */ enum sdw_stream_type { SDW_STREAM_PCM = 0, SDW_STREAM_PDM = 1, + SDW_STREAM_BPT = 2, }; /** @@ -879,7 +881,7 @@ struct sdw_port_config { * @ch_count: Channel count of the stream * @bps: Number of bits per audio sample * @direction: Data direction - * @type: Stream type PCM or PDM + * @type: Stream type PCM, PDM or BPT */ struct sdw_stream_config { unsigned int frame_rate; @@ -929,7 +931,7 @@ struct sdw_stream_params { * @name: SoundWire stream name * @params: Stream parameters * @state: Current state of the stream - * @type: Stream type PCM or PDM + * @type: Stream type PCM, PDM or BPT * @m_rt_count: Count of Master runtime(s) in this stream * @master_list: List of Master runtime(s) in this stream. * master_list can contain only one m_rt per Master instance -- cgit v1.2.3 From dc90bbefa792031d89fe2af9ad4a6febd6be96a9 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:03 +0800 Subject: soundwire: stream: extend sdw_alloc_stream() to take 'type' parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the existing definition of sdw_stream_runtime, the 'type' member is never set and defaults to PCM. To prepare for the BPT/BRA support, we need to special-case streams and make use of the 'type'. No functional change for now, the implicit PCM type is now explicit. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-5-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 857b670eb9a5..ba58e2a52322 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -1019,7 +1019,7 @@ struct sdw_bus { unsigned int lane_used_bandwidth[SDW_MAX_LANES]; }; -struct sdw_stream_runtime *sdw_alloc_stream(const char *stream_name); +struct sdw_stream_runtime *sdw_alloc_stream(const char *stream_name, enum sdw_stream_type type); void sdw_release_stream(struct sdw_stream_runtime *stream); int sdw_compute_params(struct sdw_bus *bus, struct sdw_stream_runtime *stream); -- cgit v1.2.3 From 9a756289ac5a8517dc643555d784d830b61576ad Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:06 +0800 Subject: soundwire: bus: add send_async/wait APIs for BPT protocol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add definitions and helpers for the BPT/BRA protocol. 
Peripheral drivers (aka ASoC codec drivers) can use this API to send bulk data such as firmware or tables. The design intent is however NOT to directly use this API but to rely on an intermediate regmap layer. The API is only available when no other audio streams have been allocated, and only one BPT/BRA stream is allowed per link. To avoid the addition of yet another lock, the refcount tests are handled in the stream master_runtime alloc/free routines where the bus_lock is already held. Another benefit of this approach is that the same bus_lock is used to handle runtime and port linked lists, which reduces the potential for misaligned configurations. In addition to exclusion with audio streams, BPT transfers have a lot of overhead, specifically register writes are needed to enable transport in DP0. Most DMAs don't handle very small data sets well, and they may have alignment limitations. The size and alignment requirements are for now not handled by the core but must be checked by platform-specific drivers. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-8-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index ba58e2a52322..69f3e700796d 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -824,6 +824,15 @@ struct sdw_defer { struct completion complete; }; +/* + * Add a practical limit to BPT transfer sizes. BPT is typically used + * to transfer firmware, and larger firmware transfers will increase + * the cold latency beyond typical OS or user requirements. + */ +#define SDW_BPT_MSG_MAX_BYTES (1024 * 1024) + +struct sdw_bpt_msg; + /** * struct sdw_master_ops - Master driver ops * @read_prop: Read Master properties @@ -839,6 +848,10 @@ struct sdw_defer { * @get_device_num: Callback for vendor-specific device_number allocation * @put_device_num: Callback for vendor-specific device_number release * @new_peripheral_assigned: Callback to handle enumeration of new peripheral. + * @bpt_send_async: reserve resources for BPT stream and send message + * using BTP protocol + * @bpt_wait: wait for message completion using BTP protocol + * and release resources */ struct sdw_master_ops { int (*read_prop)(struct sdw_bus *bus); @@ -855,6 +868,9 @@ struct sdw_master_ops { void (*new_peripheral_assigned)(struct sdw_bus *bus, struct sdw_slave *slave, int dev_num); + int (*bpt_send_async)(struct sdw_bus *bus, struct sdw_slave *slave, + struct sdw_bpt_msg *msg); + int (*bpt_wait)(struct sdw_bus *bus, struct sdw_slave *slave, struct sdw_bpt_msg *msg); }; int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, @@ -961,6 +977,8 @@ struct sdw_stream_runtime { * @defer_msg: Defer message * @params: Current bus parameters * @stream_refcount: number of streams currently using this bus + * @btp_stream_refcount: number of BTP streams currently using this bus (should + * be zero or one, multiple streams per link is not supported).
* @ops: Master callback ops * @port_ops: Master port callback ops * @prop: Master properties @@ -998,6 +1016,7 @@ struct sdw_bus { struct sdw_defer defer_msg; struct sdw_bus_params params; int stream_refcount; + int bpt_stream_refcount; const struct sdw_master_ops *ops; const struct sdw_master_port_ops *port_ops; struct sdw_master_prop prop; @@ -1045,6 +1064,10 @@ int sdw_compare_devid(struct sdw_slave *slave, struct sdw_slave_id id); void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id); bool is_clock_scaling_supported_by_slave(struct sdw_slave *slave); +int sdw_bpt_send_async(struct sdw_bus *bus, struct sdw_slave *slave, struct sdw_bpt_msg *msg); +int sdw_bpt_wait(struct sdw_bus *bus, struct sdw_slave *slave, struct sdw_bpt_msg *msg); +int sdw_bpt_send_sync(struct sdw_bus *bus, struct sdw_slave *slave, struct sdw_bpt_msg *msg); + #if IS_ENABLED(CONFIG_SOUNDWIRE) int sdw_stream_add_slave(struct sdw_slave *slave, -- cgit v1.2.3 From 8e4a239b403bd8aed8787798de8e4e42f79246c2 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:07 +0800 Subject: soundwire: bus: add bpt_stream pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a convenience pointer to the 'sdw_bus' structure. BPT is a dedicated stream which will typically not be handled by DAIs or dailinks. Since there's only one BPT stream per link, storing the pointer at the link level seems rather natural. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-9-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 69f3e700796d..2362f621d94c 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -979,6 +979,7 @@ struct sdw_stream_runtime { * @stream_refcount: number of streams currently using this bus * @btp_stream_refcount: number of BTP streams currently using this bus (should * be zero or one, multiple streams per link is not supported). + * @bpt_stream: pointer stored to handle BTP streams. * @ops: Master callback ops * @port_ops: Master port callback ops * @prop: Master properties @@ -1017,6 +1018,7 @@ struct sdw_bus { struct sdw_bus_params params; int stream_refcount; int bpt_stream_refcount; + struct sdw_stream_runtime *bpt_stream; const struct sdw_master_ops *ops; const struct sdw_master_port_ops *port_ops; struct sdw_master_prop prop; -- cgit v1.2.3 From 7f17a73a7dd8252aa88c6f5e23310861de3d5423 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:09 +0800 Subject: soundwire: intel_auxdevice: add indirection for BPT send_async/wait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror abstraction added for master ops. 
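A hedged sketch of what that mirroring looks like (the names below are invented for illustration and are not the actual symbols in intel_auxdevice.c; in particular intel_bus_to_sdw() and intel_get_hw_ops() are made-up accessors standing in for however the driver recovers its per-link state):

/* bus-level sdw_master_ops callback forwarding to the hw-specific op */
static int intel_bpt_send_async(struct sdw_bus *bus, struct sdw_slave *slave,
                                struct sdw_bpt_msg *msg)
{
        struct sdw_intel *sdw = intel_bus_to_sdw(bus);          /* made-up */
        const struct sdw_intel_hw_ops *hw_ops = intel_get_hw_ops(sdw); /* made-up */

        if (!hw_ops->bpt_send_async)
                return -EOPNOTSUPP;

        return hw_ops->bpt_send_async(sdw, slave, msg);
}

bpt_wait() would be forwarded the same way, so the generic bus code keeps calling only the sdw_master_ops entry points.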
Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-11-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 580086417e4b..493d9de4e472 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -436,6 +436,10 @@ struct sdw_intel_hw_ops { bool (*sync_check_cmdsync_unlocked)(struct sdw_intel *sdw); void (*program_sdi)(struct sdw_intel *sdw, int dev_num); + + int (*bpt_send_async)(struct sdw_intel *sdw, struct sdw_slave *slave, + struct sdw_bpt_msg *msg); + int (*bpt_wait)(struct sdw_intel *sdw, struct sdw_slave *slave, struct sdw_bpt_msg *msg); }; extern const struct sdw_intel_hw_ops sdw_intel_cnl_hw_ops; -- cgit v1.2.3 From 5d5cb86fb46ea1c7efd3b894f63fe364e5847043 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:10 +0800 Subject: ASoC: SOF: Intel: hda-sdw-bpt: add helpers for SoundWire BPT DMA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SoundWire BPT DMA helpers as a separate module to avoid circular dependencies. For now this assumes no link DMA, only coupled mode. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Acked-by: Mark Brown Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-12-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/sound/hda-sdw-bpt.h | 69 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 include/sound/hda-sdw-bpt.h (limited to 'include') diff --git a/include/sound/hda-sdw-bpt.h b/include/sound/hda-sdw-bpt.h new file mode 100644 index 000000000000..f649549b75d5 --- /dev/null +++ b/include/sound/hda-sdw-bpt.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */ +/* + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * Copyright(c) 2025 Intel Corporation. 
+ */ + +#ifndef __HDA_SDW_BPT_H +#define __HDA_SDW_BPT_H + +#include + +struct hdac_ext_stream; +struct snd_dma_buffer; + +#if IS_ENABLED(CONFIG_SND_SOF_SOF_HDA_SDW_BPT) +int hda_sdw_bpt_open(struct device *dev, int link_id, struct hdac_ext_stream **bpt_tx_stream, + struct snd_dma_buffer *dmab_tx_bdl, u32 bpt_tx_num_bytes, + u32 tx_dma_bandwidth, struct hdac_ext_stream **bpt_rx_stream, + struct snd_dma_buffer *dmab_rx_bdl, u32 bpt_rx_num_bytes, + u32 rx_dma_bandwidth); + +int hda_sdw_bpt_send_async(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, + struct hdac_ext_stream *bpt_rx_stream); + +int hda_sdw_bpt_wait(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, + struct hdac_ext_stream *bpt_rx_stream); + +int hda_sdw_bpt_close(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, + struct snd_dma_buffer *dmab_tx_bdl, struct hdac_ext_stream *bpt_rx_stream, + struct snd_dma_buffer *dmab_rx_bdl); +#else +static inline int hda_sdw_bpt_open(struct device *dev, int link_id, + struct hdac_ext_stream **bpt_tx_stream, + struct snd_dma_buffer *dmab_tx_bdl, u32 bpt_tx_num_bytes, + u32 tx_dma_bandwidth, struct hdac_ext_stream **bpt_rx_stream, + struct snd_dma_buffer *dmab_rx_bdl, u32 bpt_rx_num_bytes, + u32 rx_dma_bandwidth) +{ + WARN_ONCE(1, "SoundWire BPT is disabled"); + return -EOPNOTSUPP; +} + +static inline int hda_sdw_bpt_send_async(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, + struct hdac_ext_stream *bpt_rx_stream) +{ + WARN_ONCE(1, "SoundWire BPT is disabled"); + return -EOPNOTSUPP; +} + +static inline int hda_sdw_bpt_wait(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, + struct hdac_ext_stream *bpt_rx_stream) +{ + WARN_ONCE(1, "SoundWire BPT is disabled"); + return -EOPNOTSUPP; +} + +static inline int hda_sdw_bpt_close(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, + struct snd_dma_buffer *dmab_tx_bdl, + struct hdac_ext_stream *bpt_rx_stream, + struct snd_dma_buffer *dmab_rx_bdl) +{ + WARN_ONCE(1, "SoundWire BPT is disabled"); + return -EOPNOTSUPP; +} +#endif + +#endif /* __HDA_SDW_BPT_H */ -- cgit v1.2.3 From b02d41d884f64d22d5d5a2a4b35550f5e80bdb3d Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 6 Mar 2025 01:54:08 +0000 Subject: phy: core: Remove unused phy_pm_runtime_(allow|forbid) phy_pm_runtime_allow() and phy_pm_runtime_forbid() were added in 2013 as part of commit ff764963479a ("drivers: phy: add generic PHY framework") but have remained unused. Remove them. Fix up the (English) docs - I've left the Chinese translation. Signed-off-by: Dr. 
David Alan Gilbert Link: https://lore.kernel.org/r/20250306015408.277729-1-linux@treblig.org Signed-off-by: Vinod Koul --- include/linux/phy/phy.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index 03cd5bae92d3..e63e6e70e860 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -227,8 +227,6 @@ int phy_pm_runtime_get(struct phy *phy); int phy_pm_runtime_get_sync(struct phy *phy); int phy_pm_runtime_put(struct phy *phy); int phy_pm_runtime_put_sync(struct phy *phy); -void phy_pm_runtime_allow(struct phy *phy); -void phy_pm_runtime_forbid(struct phy *phy); int phy_init(struct phy *phy); int phy_exit(struct phy *phy); int phy_power_on(struct phy *phy); @@ -321,16 +319,6 @@ static inline int phy_pm_runtime_put_sync(struct phy *phy) return -ENOSYS; } -static inline void phy_pm_runtime_allow(struct phy *phy) -{ - return; -} - -static inline void phy_pm_runtime_forbid(struct phy *phy) -{ - return; -} - static inline int phy_init(struct phy *phy) { if (!phy) -- cgit v1.2.3 From b5198201932635e19068cc2e83a99adf38f32c43 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Thu, 6 Mar 2025 16:05:43 +0900 Subject: counter: Introduce the compare component Compare registers are used in devices to compare a counter channel against a particular count value (e.g. to check if a threshold has been reached). A macro COUNTER_COMP_COMPARE() is introduced to facilitate the creation of compare components as Count extensions. Link: https://lore.kernel.org/r/20250306-introduce-compare-component-v1-1-93993b3dca9c@kernel.org Signed-off-by: William Breathitt Gray --- include/linux/counter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/counter.h b/include/linux/counter.h index 426b7d58a438..f208e867dd0f 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -580,6 +580,9 @@ struct counter_array { #define COUNTER_COMP_CEILING(_read, _write) \ COUNTER_COMP_COUNT_U64("ceiling", _read, _write) +#define COUNTER_COMP_COMPARE(_read, _write) \ + COUNTER_COMP_COUNT_U64("compare", _read, _write) + #define COUNTER_COMP_COUNT_MODE(_read, _write, _available) \ { \ .type = COUNTER_COMP_COUNT_MODE, \ -- cgit v1.2.3 From 26f060c106f630e34876c096c1c8997a9e68c371 Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Thu, 6 Mar 2025 12:11:02 +0000 Subject: coresight: change coresight_device lock type to raw_spinlock_t coresight_device->cscfg_csdev_lock can be held during __schedule() by perf_event_task_sched_out()/in(). Since coresight->cscfg_csdev_lock type is spinlock_t and perf_event_task_sched_out()/in() is called after acquiring rq_lock, which is raw_spinlock_t (an unsleepable lock), this poses an issue in PREEMPT_RT kernel where spinlock_t is sleepable. To address this, change type of coresight_device->cscfg_csdev_lock from spinlock_t to raw_spinlock_t. 
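For context, an illustrative fragment (not lifted from the patch; csdev_cscfg_update is a made-up name) showing the resulting locking pattern at the call sites:

static void csdev_cscfg_update(struct coresight_device *csdev)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&csdev->cscfg_csdev_lock, flags);
        /* walk/update csdev->feature_csdev_list and csdev->config_csdev_list */
        raw_spin_unlock_irqrestore(&csdev->cscfg_csdev_lock, flags);
}

A raw_spinlock_t never becomes a sleeping lock on PREEMPT_RT, so the critical section above remains valid even when it runs under the scheduler's rq_lock.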
Reviewed-by: James Clark Reviewed-by: Mike Leach Signed-off-by: Yeoreum Yun Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250306121110.1647948-2-yeoreum.yun@arm.com --- include/linux/coresight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 89b781e70fe7..4541bfc1cc6b 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -302,7 +302,7 @@ struct coresight_device { /* system configuration and feature lists */ struct list_head feature_csdev_list; struct list_head config_csdev_list; - spinlock_t cscfg_csdev_lock; + raw_spinlock_t cscfg_csdev_lock; void *active_cscfg_ctxt; }; -- cgit v1.2.3 From 4cf364ca57d851e192ce02e98d314d22fa514895 Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Thu, 6 Mar 2025 12:11:04 +0000 Subject: coresight: change coresight_trace_id_map's lock type to raw_spinlock_t coresight_trace_id_map->lock can be acquired while holding a coresight device's drvdata_lock. But the drvdata_lock can be a raw_spinlock_t, e.g. in coresight-etm4x. To address this, change the type of coresight_trace_id_map->lock to raw_spinlock_t. Signed-off-by: Yeoreum Yun Reviewed-by: James Clark Reviewed-by: Mike Leach Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250306121110.1647948-4-yeoreum.yun@arm.com --- include/linux/coresight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 4541bfc1cc6b..d79a242b271d 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -239,7 +239,7 @@ struct coresight_trace_id_map { DECLARE_BITMAP(used_ids, CORESIGHT_TRACE_IDS_MAX); atomic_t __percpu *cpu_map; atomic_t perf_cs_etm_session_active; - spinlock_t lock; + raw_spinlock_t lock; }; -- cgit v1.2.3 From b9014a10bdb8e967d85819f5ef61e6f80d0b46c9 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 10 Feb 2025 17:29:11 -0600 Subject: dmaengine: Remove device_prep_dma_imm_data from struct dma_device The device_prep_dma_imm_data() method isn't implemented or invoked by any code since commit 80ade22c06ca ("misc: mic: remove the MIC drivers"). Remove it, shrinking struct dma_device by a few bytes. Signed-off-by: Nathan Lynch Link: https://lore.kernel.org/r/20250210-dmaengine-drop-imm-data-v1-1-e017766da2fa@amd.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index a360e0330436..bb146c5ac3e4 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -839,7 +839,6 @@ struct dma_filter { * The function takes a buffer of size buf_len. The callback function will * be called after period_len bytes have been transferred. * @device_prep_interleaved_dma: Transfer expression in a generic way.
- * @device_prep_dma_imm_data: DMA's 8 byte immediate data to the dst address * @device_caps: May be used to override the generic DMA slave capabilities * with per-channel specific ones * @device_config: Pushes a new configuration to a channel, return 0 or an error @@ -942,9 +941,6 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)( struct dma_chan *chan, struct dma_interleaved_template *xt, unsigned long flags); - struct dma_async_tx_descriptor *(*device_prep_dma_imm_data)( - struct dma_chan *chan, dma_addr_t dst, u64 data, - unsigned long flags); void (*device_caps)(struct dma_chan *chan, struct dma_slave_caps *caps); int (*device_config)(struct dma_chan *chan, struct dma_slave_config *config); -- cgit v1.2.3 From 36f257e3b0ba904f5a4e7fa8dafaa60e88cdd28c Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Mon, 10 Mar 2025 22:38:38 +0000 Subject: acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors When PCIe AER is in FW-First, OS should process CXL Protocol errors from CPER records. Introduce support for handling and logging CXL Protocol errors. The defined trace events cxl_aer_uncorrectable_error and cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them to trace FW-First Protocol errors. Since the CXL code is required to be called from process context and GHES is in interrupt context, use workqueues for processing. Similar to CXL CPER event handling, use kfifo to handle errors as it simplifies queue processing by providing lock free fifo operations. Add the ability for the CXL sub-system to register a workqueue to process CXL CPER protocol errors. [DJ: return cxl_cper_register_prot_err_work() directly in cxl_ras_init()] Signed-off-by: Smita Koralahalli Reviewed-by: Li Ming Reviewed-by: Alison Schofield Reviewed-by: Ira Weiny Reviewed-by: Tony Luck Link: https://patch.msgid.link/20250310223839.31342-2-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- include/cxl/event.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/cxl/event.h b/include/cxl/event.h index 8381a07052d0..f9ae1796da85 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -254,6 +254,9 @@ struct cxl_cper_prot_err_work_data { int cxl_cper_register_work(struct work_struct *work); int cxl_cper_unregister_work(struct work_struct *work); int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd); +int cxl_cper_register_prot_err_work(struct work_struct *work); +int cxl_cper_unregister_prot_err_work(struct work_struct *work); +int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd); #else static inline int cxl_cper_register_work(struct work_struct *work) { @@ -268,6 +271,18 @@ static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd) { return 0; } +static inline int cxl_cper_register_prot_err_work(struct work_struct *work) +{ + return 0; +} +static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work) +{ + return 0; +} +static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd) +{ + return 0; +} #endif #endif /* _LINUX_CXL_EVENT_H */ -- cgit v1.2.3 From 5a0fcb0ef5584caf7da3f31896e08650c532e4c1 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Wed, 19 Feb 2025 18:00:06 +1100 Subject: cxl: Remove driver Remove the cxl driver that provides support for the IBM Coherent Accelerator Processor Interface. Revert or clean up associated code in arch/powerpc that is no longer necessary. 
cxl has received minimal maintenance for several years, and is not supported on the Power10 processor. We aren't aware of any users who are likely to be using recent kernels. Thanks to Mikey Neuling, Ian Munsie, Daniel Axtens, Frederic Barrat, Christophe Lombard, Philippe Bergheaud, Vaibhav Jain and Alastair D'Silva for their work on this driver over the years. Signed-off-by: Andrew Donnellan Acked-by: Frederic Barrat Acked-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://patch.msgid.link/20250219070007.177725-2-ajd@linux.ibm.com --- include/misc/cxl-base.h | 48 --------- include/misc/cxl.h | 265 ------------------------------------------------ include/misc/cxllib.h | 129 ----------------------- include/uapi/misc/cxl.h | 156 ---------------------------- 4 files changed, 598 deletions(-) delete mode 100644 include/misc/cxl-base.h delete mode 100644 include/misc/cxl.h delete mode 100644 include/misc/cxllib.h delete mode 100644 include/uapi/misc/cxl.h (limited to 'include') diff --git a/include/misc/cxl-base.h b/include/misc/cxl-base.h deleted file mode 100644 index 2251da7f32d9..000000000000 --- a/include/misc/cxl-base.h +++ /dev/null @@ -1,48 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2014 IBM Corp. - */ - -#ifndef _MISC_CXL_BASE_H -#define _MISC_CXL_BASE_H - -#ifdef CONFIG_CXL_BASE - -#define CXL_IRQ_RANGES 4 - -struct cxl_irq_ranges { - irq_hw_number_t offset[CXL_IRQ_RANGES]; - irq_hw_number_t range[CXL_IRQ_RANGES]; -}; - -extern atomic_t cxl_use_count; - -static inline bool cxl_ctx_in_use(void) -{ - return (atomic_read(&cxl_use_count) != 0); -} - -static inline void cxl_ctx_get(void) -{ - atomic_inc(&cxl_use_count); -} - -static inline void cxl_ctx_put(void) -{ - atomic_dec(&cxl_use_count); -} - -struct cxl_afu *cxl_afu_get(struct cxl_afu *afu); -void cxl_afu_put(struct cxl_afu *afu); -void cxl_slbia(struct mm_struct *mm); - -#else /* CONFIG_CXL_BASE */ - -static inline bool cxl_ctx_in_use(void) { return false; } -static inline struct cxl_afu *cxl_afu_get(struct cxl_afu *afu) { return NULL; } -static inline void cxl_afu_put(struct cxl_afu *afu) {} -static inline void cxl_slbia(struct mm_struct *mm) {} - -#endif /* CONFIG_CXL_BASE */ - -#endif diff --git a/include/misc/cxl.h b/include/misc/cxl.h deleted file mode 100644 index d8044299d654..000000000000 --- a/include/misc/cxl.h +++ /dev/null @@ -1,265 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2015 IBM Corp. - */ - -#ifndef _MISC_CXL_H -#define _MISC_CXL_H - -#include -#include -#include -#include - -/* - * This documents the in kernel API for driver to use CXL. It allows kernel - * drivers to bind to AFUs using an AFU configuration record exposed as a PCI - * configuration record. - * - * This API enables control over AFU and contexts which can't be part of the - * generic PCI API. This API is agnostic to the actual AFU. - */ - -/* Get the AFU associated with a pci_dev */ -struct cxl_afu *cxl_pci_to_afu(struct pci_dev *dev); - -/* Get the AFU conf record number associated with a pci_dev */ -unsigned int cxl_pci_to_cfg_record(struct pci_dev *dev); - - -/* - * Context lifetime overview: - * - * An AFU context may be inited and then started and stopped multiple times - * before it's released. ie. - * - cxl_dev_context_init() - * - cxl_start_context() - * - cxl_stop_context() - * - cxl_start_context() - * - cxl_stop_context() - * ...repeat... - * - cxl_release_context() - * Once released, a context can't be started again. 
- * - * One context is inited by the cxl driver for every pci_dev. This is to be - * used as a default kernel context. cxl_get_context() will get this - * context. This context will be released by PCI hot unplug, so doesn't need to - * be released explicitly by drivers. - * - * Additional kernel contexts may be inited using cxl_dev_context_init(). - * These must be released using cxl_context_detach(). - * - * Once a context has been inited, IRQs may be configured. Firstly these IRQs - * must be allocated (cxl_allocate_afu_irqs()), then individually mapped to - * specific handlers (cxl_map_afu_irq()). - * - * These IRQs can be unmapped (cxl_unmap_afu_irq()) and finally released - * (cxl_free_afu_irqs()). - * - * The AFU can be reset (cxl_afu_reset()). This will cause the PSL/AFU - * hardware to lose track of all contexts. It's upto the caller of - * cxl_afu_reset() to restart these contexts. - */ - -/* - * On pci_enabled_device(), the cxl driver will init a single cxl context for - * use by the driver. It doesn't start this context (as that will likely - * generate DMA traffic for most AFUs). - * - * This gets the default context associated with this pci_dev. This context - * doesn't need to be released as this will be done by the PCI subsystem on hot - * unplug. - */ -struct cxl_context *cxl_get_context(struct pci_dev *dev); -/* - * Allocate and initalise a context associated with a AFU PCI device. This - * doesn't start the context in the AFU. - */ -struct cxl_context *cxl_dev_context_init(struct pci_dev *dev); -/* - * Release and free a context. Context should be stopped before calling. - */ -int cxl_release_context(struct cxl_context *ctx); - -/* - * Set and get private data associated with a context. Allows drivers to have a - * back pointer to some useful structure. - */ -int cxl_set_priv(struct cxl_context *ctx, void *priv); -void *cxl_get_priv(struct cxl_context *ctx); - -/* - * Allocate AFU interrupts for this context. num=0 will allocate the default - * for this AFU as given in the AFU descriptor. This number doesn't include the - * interrupt 0 (CAIA defines AFU IRQ 0 for page faults). Each interrupt to be - * used must map a handler with cxl_map_afu_irq. - */ -int cxl_allocate_afu_irqs(struct cxl_context *cxl, int num); -/* Free allocated interrupts */ -void cxl_free_afu_irqs(struct cxl_context *cxl); - -/* - * Map a handler for an AFU interrupt associated with a particular context. AFU - * IRQS numbers start from 1 (CAIA defines AFU IRQ 0 for page faults). cookie - * is private data is that will be provided to the interrupt handler. - */ -int cxl_map_afu_irq(struct cxl_context *cxl, int num, - irq_handler_t handler, void *cookie, char *name); -/* unmap mapped IRQ handlers */ -void cxl_unmap_afu_irq(struct cxl_context *cxl, int num, void *cookie); - -/* - * Start work on the AFU. This starts an cxl context and associates it with a - * task. task == NULL will make it a kernel context. - */ -int cxl_start_context(struct cxl_context *ctx, u64 wed, - struct task_struct *task); -/* - * Stop a context and remove it from the PSL - */ -int cxl_stop_context(struct cxl_context *ctx); - -/* Reset the AFU */ -int cxl_afu_reset(struct cxl_context *ctx); - -/* - * Set a context as a master context. - * This sets the default problem space area mapped as the full space, rather - * than just the per context area (for slaves). - */ -void cxl_set_master(struct cxl_context *ctx); - -/* - * Map and unmap the AFU Problem Space area. 
The amount and location mapped - * depends on if this context is a master or slave. - */ -void __iomem *cxl_psa_map(struct cxl_context *ctx); -void cxl_psa_unmap(void __iomem *addr); - -/* Get the process element for this context */ -int cxl_process_element(struct cxl_context *ctx); - -/* - * These calls allow drivers to create their own file descriptors and make them - * identical to the cxl file descriptor user API. An example use case: - * - * struct file_operations cxl_my_fops = {}; - * ...... - * // Init the context - * ctx = cxl_dev_context_init(dev); - * if (IS_ERR(ctx)) - * return PTR_ERR(ctx); - * // Create and attach a new file descriptor to my file ops - * file = cxl_get_fd(ctx, &cxl_my_fops, &fd); - * // Start context - * rc = cxl_start_work(ctx, &work.work); - * if (rc) { - * fput(file); - * put_unused_fd(fd); - * return -ENODEV; - * } - * // No error paths after installing the fd - * fd_install(fd, file); - * return fd; - * - * This inits a context, and gets a file descriptor and associates some file - * ops to that file descriptor. If the file ops are blank, the cxl driver will - * fill them in with the default ones that mimic the standard user API. Once - * completed, the file descriptor can be installed. Once the file descriptor is - * installed, it's visible to the user so no errors must occur past this point. - * - * If cxl_fd_release() file op call is installed, the context will be stopped - * and released when the fd is released. Hence the driver won't need to manage - * this itself. - */ - -/* - * Take a context and associate it with my file ops. Returns the associated - * file and file descriptor. Any file ops which are blank are filled in by the - * cxl driver with the default ops to mimic the standard API. - */ -struct file *cxl_get_fd(struct cxl_context *ctx, struct file_operations *fops, - int *fd); -/* Get the context associated with this file */ -struct cxl_context *cxl_fops_get_context(struct file *file); -/* - * Start a context associated a struct cxl_ioctl_start_work used by the - * standard cxl user API. - */ -int cxl_start_work(struct cxl_context *ctx, - struct cxl_ioctl_start_work *work); -/* - * Export all the existing fops so drivers can use them - */ -int cxl_fd_open(struct inode *inode, struct file *file); -int cxl_fd_release(struct inode *inode, struct file *file); -long cxl_fd_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int cxl_fd_mmap(struct file *file, struct vm_area_struct *vm); -__poll_t cxl_fd_poll(struct file *file, struct poll_table_struct *poll); -ssize_t cxl_fd_read(struct file *file, char __user *buf, size_t count, - loff_t *off); - -/* - * For EEH, a driver may want to assert a PERST will reload the same image - * from flash into the FPGA. - * - * This is a property of the entire adapter, not a single AFU, so drivers - * should set this property with care! - */ -void cxl_perst_reloads_same_image(struct cxl_afu *afu, - bool perst_reloads_same_image); - -/* - * Read the VPD for the card where the AFU resides - */ -ssize_t cxl_read_adapter_vpd(struct pci_dev *dev, void *buf, size_t count); - -/* - * AFU driver ops allow an AFU driver to create their own events to pass to - * userspace through the file descriptor as a simpler alternative to overriding - * the read() and poll() calls that works with the generic cxl events. These - * events are given priority over the generic cxl events, so they will be - * delivered first if multiple types of events are pending. 
- * - * The AFU driver must call cxl_context_events_pending() to notify the cxl - * driver that new events are ready to be delivered for a specific context. - * cxl_context_events_pending() will adjust the current count of AFU driver - * events for this context, and wake up anyone waiting on the context wait - * queue. - * - * The cxl driver will then call fetch_event() to get a structure defining - * the size and address of the driver specific event data. The cxl driver - * will build a cxl header with type and process_element fields filled in, - * and header.size set to sizeof(struct cxl_event_header) + data_size. - * The total size of the event is limited to CXL_READ_MIN_SIZE (4K). - * - * fetch_event() is called with a spin lock held, so it must not sleep. - * - * The cxl driver will then deliver the event to userspace, and finally - * call event_delivered() to return the status of the operation, identified - * by cxl context and AFU driver event data pointers. - * 0 Success - * -EFAULT copy_to_user() has failed - * -EINVAL Event data pointer is NULL, or event size is greater than - * CXL_READ_MIN_SIZE. - */ -struct cxl_afu_driver_ops { - struct cxl_event_afu_driver_reserved *(*fetch_event) ( - struct cxl_context *ctx); - void (*event_delivered) (struct cxl_context *ctx, - struct cxl_event_afu_driver_reserved *event, - int rc); -}; - -/* - * Associate the above driver ops with a specific context. - * Reset the current count of AFU driver events. - */ -void cxl_set_driver_ops(struct cxl_context *ctx, - struct cxl_afu_driver_ops *ops); - -/* Notify cxl driver that new events are ready to be delivered for context */ -void cxl_context_events_pending(struct cxl_context *ctx, - unsigned int new_events); - -#endif /* _MISC_CXL_H */ diff --git a/include/misc/cxllib.h b/include/misc/cxllib.h deleted file mode 100644 index eacc417288fc..000000000000 --- a/include/misc/cxllib.h +++ /dev/null @@ -1,129 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2017 IBM Corp. - */ - -#ifndef _MISC_CXLLIB_H -#define _MISC_CXLLIB_H - -#include -#include - -/* - * cxl driver exports a in-kernel 'library' API which can be called by - * other drivers to help interacting with an IBM XSL. - */ - -/* - * tells whether capi is supported on the PCIe slot where the - * device is seated - * - * Input: - * dev: device whose slot needs to be checked - * flags: 0 for the time being - */ -bool cxllib_slot_is_supported(struct pci_dev *dev, unsigned long flags); - - -/* - * Returns the configuration parameters to be used by the XSL or device - * - * Input: - * dev: device, used to find PHB - * Output: - * struct cxllib_xsl_config: - * version - * capi BAR address, i.e. 0x2000000000000-0x2FFFFFFFFFFFF - * capi BAR size - * data send control (XSL_DSNCTL) - * dummy read address (XSL_DRA) - */ -#define CXL_XSL_CONFIG_VERSION1 1 -struct cxllib_xsl_config { - u32 version; /* format version for register encoding */ - u32 log_bar_size;/* log size of the capi_window */ - u64 bar_addr; /* address of the start of capi window */ - u64 dsnctl; /* matches definition of XSL_DSNCTL */ - u64 dra; /* real address that can be used for dummy read */ -}; - -int cxllib_get_xsl_config(struct pci_dev *dev, struct cxllib_xsl_config *cfg); - - -/* - * Activate capi for the pci host bridge associated with the device. - * Can be extended to deactivate once we know how to do it. - * Device must be ready to accept messages from the CAPP unit and - * respond accordingly (TLB invalidates, ...) 
- * - * PHB is switched to capi mode through calls to skiboot. - * CAPP snooping is activated - * - * Input: - * dev: device whose PHB should switch mode - * mode: mode to switch to i.e. CAPI or PCI - * flags: options related to the mode - */ -enum cxllib_mode { - CXL_MODE_CXL, - CXL_MODE_PCI, -}; - -#define CXL_MODE_NO_DMA 0 -#define CXL_MODE_DMA_TVT0 1 -#define CXL_MODE_DMA_TVT1 2 - -int cxllib_switch_phb_mode(struct pci_dev *dev, enum cxllib_mode mode, - unsigned long flags); - - -/* - * Set the device for capi DMA. - * Define its dma_ops and dma offset so that allocations will be using TVT#1 - * - * Input: - * dev: device to set - * flags: options. CXL_MODE_DMA_TVT1 should be used - */ -int cxllib_set_device_dma(struct pci_dev *dev, unsigned long flags); - - -/* - * Get the Process Element structure for the given thread - * - * Input: - * task: task_struct for the context of the translation - * translation_mode: whether addresses should be translated - * Output: - * attr: attributes to fill up the Process Element structure from CAIA - */ -struct cxllib_pe_attributes { - u64 sr; - u32 lpid; - u32 tid; - u32 pid; -}; -#define CXL_TRANSLATED_MODE 0 -#define CXL_REAL_MODE 1 - -int cxllib_get_PE_attributes(struct task_struct *task, - unsigned long translation_mode, struct cxllib_pe_attributes *attr); - - -/* - * Handle memory fault. - * Fault in all the pages of the specified buffer for the permissions - * provided in ‘flags’ - * - * Shouldn't be called from interrupt context - * - * Input: - * mm: struct mm for the thread faulting the pages - * addr: base address of the buffer to page in - * size: size of the buffer to page in - * flags: permission requested (DSISR_ISSTORE...) - */ -int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags); - - -#endif /* _MISC_CXLLIB_H */ diff --git a/include/uapi/misc/cxl.h b/include/uapi/misc/cxl.h deleted file mode 100644 index 56376d3907d8..000000000000 --- a/include/uapi/misc/cxl.h +++ /dev/null @@ -1,156 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -/* - * Copyright 2014 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#ifndef _UAPI_MISC_CXL_H -#define _UAPI_MISC_CXL_H - -#include -#include - - -struct cxl_ioctl_start_work { - __u64 flags; - __u64 work_element_descriptor; - __u64 amr; - __s16 num_interrupts; - __u16 tid; - __s32 reserved1; - __u64 reserved2; - __u64 reserved3; - __u64 reserved4; - __u64 reserved5; -}; - -#define CXL_START_WORK_AMR 0x0000000000000001ULL -#define CXL_START_WORK_NUM_IRQS 0x0000000000000002ULL -#define CXL_START_WORK_ERR_FF 0x0000000000000004ULL -#define CXL_START_WORK_TID 0x0000000000000008ULL -#define CXL_START_WORK_ALL (CXL_START_WORK_AMR |\ - CXL_START_WORK_NUM_IRQS |\ - CXL_START_WORK_ERR_FF |\ - CXL_START_WORK_TID) - - -/* Possible modes that an afu can be in */ -#define CXL_MODE_DEDICATED 0x1 -#define CXL_MODE_DIRECTED 0x2 - -/* possible flags for the cxl_afu_id flags field */ -#define CXL_AFUID_FLAG_SLAVE 0x1 /* In directed-mode afu is in slave mode */ - -struct cxl_afu_id { - __u64 flags; /* One of CXL_AFUID_FLAG_X */ - __u32 card_id; - __u32 afu_offset; - __u32 afu_mode; /* one of the CXL_MODE_X */ - __u32 reserved1; - __u64 reserved2; - __u64 reserved3; - __u64 reserved4; - __u64 reserved5; - __u64 reserved6; -}; - -/* base adapter image header is included in the image */ -#define CXL_AI_NEED_HEADER 0x0000000000000001ULL -#define CXL_AI_ALL CXL_AI_NEED_HEADER - -#define CXL_AI_HEADER_SIZE 128 -#define CXL_AI_BUFFER_SIZE 4096 -#define CXL_AI_MAX_ENTRIES 256 -#define CXL_AI_MAX_CHUNK_SIZE (CXL_AI_BUFFER_SIZE * CXL_AI_MAX_ENTRIES) - -struct cxl_adapter_image { - __u64 flags; - __u64 data; - __u64 len_data; - __u64 len_image; - __u64 reserved1; - __u64 reserved2; - __u64 reserved3; - __u64 reserved4; -}; - -/* ioctl numbers */ -#define CXL_MAGIC 0xCA -/* AFU devices */ -#define CXL_IOCTL_START_WORK _IOW(CXL_MAGIC, 0x00, struct cxl_ioctl_start_work) -#define CXL_IOCTL_GET_PROCESS_ELEMENT _IOR(CXL_MAGIC, 0x01, __u32) -#define CXL_IOCTL_GET_AFU_ID _IOR(CXL_MAGIC, 0x02, struct cxl_afu_id) -/* adapter devices */ -#define CXL_IOCTL_DOWNLOAD_IMAGE _IOW(CXL_MAGIC, 0x0A, struct cxl_adapter_image) -#define CXL_IOCTL_VALIDATE_IMAGE _IOW(CXL_MAGIC, 0x0B, struct cxl_adapter_image) - -#define CXL_READ_MIN_SIZE 0x1000 /* 4K */ - -/* Events from read() */ -enum cxl_event_type { - CXL_EVENT_RESERVED = 0, - CXL_EVENT_AFU_INTERRUPT = 1, - CXL_EVENT_DATA_STORAGE = 2, - CXL_EVENT_AFU_ERROR = 3, - CXL_EVENT_AFU_DRIVER = 4, -}; - -struct cxl_event_header { - __u16 type; - __u16 size; - __u16 process_element; - __u16 reserved1; -}; - -struct cxl_event_afu_interrupt { - __u16 flags; - __u16 irq; /* Raised AFU interrupt number */ - __u32 reserved1; -}; - -struct cxl_event_data_storage { - __u16 flags; - __u16 reserved1; - __u32 reserved2; - __u64 addr; - __u64 dsisr; - __u64 reserved3; -}; - -struct cxl_event_afu_error { - __u16 flags; - __u16 reserved1; - __u32 reserved2; - __u64 error; -}; - -struct cxl_event_afu_driver_reserved { - /* - * Defines the buffer passed to the cxl driver by the AFU driver. - * - * This is not ABI since the event header.size passed to the user for - * existing events is set in the read call to sizeof(cxl_event_header) - * + sizeof(whatever event is being dispatched) and the user is already - * required to use a 4K buffer on the read call. - * - * Of course the contents will be ABI, but that's up the AFU driver. 
- */ - __u32 data_size; - __u8 data[]; -}; - -struct cxl_event { - struct cxl_event_header header; - union { - struct cxl_event_afu_interrupt irq; - struct cxl_event_data_storage fault; - struct cxl_event_afu_error afu_error; - struct cxl_event_afu_driver_reserved afu_driver_event; - }; -}; - -#endif /* _UAPI_MISC_CXL_H */ -- cgit v1.2.3 From ac053946f5c40ce90ca7ccb75ed687612d9eccf9 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 27 Jan 2025 17:05:06 +0100 Subject: compiler.h: introduce TYPEOF_UNQUAL() macro Define TYPEOF_UNQUAL() to use __typeof_unqual__() as typeof operator when available, to return unqualified type of the expression. Current version of sparse doesn't know anything about __typeof_unqual__() operator. Avoid the usage of __typeof_unqual__() when sparse checking is active to prevent sparse errors with unknowing keyword. Link: https://lkml.kernel.org/r/20250127160709.80604-3-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Thomas Gleixner Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Brian Gerst Cc: Denys Vlasenko Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Arnd Bergmann Cc: Boqun Feng Cc: Borislav Petkov Cc: Dave Hansen Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Kent Overstreet Cc: Nadav Amit Cc: Paolo Abeni Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 8 ++++++++ include/linux/compiler-gcc.h | 8 ++++++++ include/linux/compiler.h | 20 ++++++++++++++++++++ 3 files changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 2e7c2c282f3a..4fc8e26914ad 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -128,3 +128,11 @@ */ #define ASM_INPUT_G "ir" #define ASM_INPUT_RM "r" + +/* + * Declare compiler support for __typeof_unqual__() operator. + * + * Bindgen uses LLVM even if our C compiler is GCC, so we cannot + * rely on the auto-detected CONFIG_CC_HAS_TYPEOF_UNQUAL. + */ +#define CC_HAS_TYPEOF_UNQUAL (__clang_major__ >= 19) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index c9b58188ec61..32048052c64a 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -137,3 +137,11 @@ #if GCC_VERSION < 90100 #undef __alloc_size__ #endif + +/* + * Declare compiler support for __typeof_unqual__() operator. + * + * Bindgen uses LLVM even if our C compiler is GCC, so we cannot + * rely on the auto-detected CONFIG_CC_HAS_TYPEOF_UNQUAL. + */ +#define CC_HAS_TYPEOF_UNQUAL (__GNUC__ >= 14) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 155385754824..3a7a537e13a3 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -210,6 +210,26 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __must_be_cstr(p) \ __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") +/* + * Use __typeof_unqual__() when available. + * + * XXX: Remove test for __CHECKER__ once + * sparse learns about __typeof_unqual__(). + */ +#if CC_HAS_TYPEOF_UNQUAL && !defined(__CHECKER__) +# define USE_TYPEOF_UNQUAL 1 +#endif + +/* + * Define TYPEOF_UNQUAL() to use __typeof_unqual__() as typeof + * operator when available, to return an unqualified type of the exp. 
+ */ +#if defined(USE_TYPEOF_UNQUAL) +# define TYPEOF_UNQUAL(exp) __typeof_unqual__(exp) +#else +# define TYPEOF_UNQUAL(exp) __typeof__(exp) +#endif + #endif /* __KERNEL__ */ /** -- cgit v1.2.3 From 8a3c392388c6a6e0c8937a24712b630ec9ac7016 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 27 Jan 2025 17:05:07 +0100 Subject: percpu: use TYPEOF_UNQUAL() in variable declarations Use TYPEOF_UNQUAL() to declare variables as a corresponding type without named address space qualifier to avoid "`__seg_gs' specified for auto variable `var'" errors. Link: https://lkml.kernel.org/r/20250127160709.80604-4-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Nadav Amit Acked-by: Christoph Lameter Cc: Dennis Zhou Cc: Tejun Heo Cc: Andy Lutomirski Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Kent Overstreet Cc: Arnd Bergmann Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Peter Zijlstra Cc: Will Deacon Cc: Waiman Long Cc: Boqun Feng Cc: Linus Torvalds Cc: Brian Gerst Cc: Denys Vlasenko Signed-off-by: Andrew Morton --- include/asm-generic/percpu.h | 26 +++++++++++++------------- include/linux/part_stat.h | 2 +- include/linux/percpu-defs.h | 4 ++-- include/net/snmp.h | 5 ++--- 4 files changed, 18 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 94cbd50cc870..50597b975a49 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -74,7 +74,7 @@ do { \ #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ \ *__p += val; \ *__p; \ @@ -82,8 +82,8 @@ do { \ #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) __ret; \ __ret = *__p; \ *__p = nval; \ __ret; \ @@ -91,7 +91,7 @@ do { \ #define __cpu_fallback_try_cmpxchg(pcp, ovalp, nval, _cmpxchg) \ ({ \ - typeof(pcp) __val, __old = *(ovalp); \ + TYPEOF_UNQUAL(pcp) __val, __old = *(ovalp); \ __val = _cmpxchg(pcp, __old, nval); \ if (__val != __old) \ *(ovalp) = __val; \ @@ -100,8 +100,8 @@ do { \ #define raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ - typeof(pcp) __val = *__p, ___old = *(ovalp); \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) __val = *__p, ___old = *(ovalp); \ bool __ret; \ if (__val == ___old) { \ *__p = nval; \ @@ -115,14 +115,14 @@ do { \ #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ - typeof(pcp) __old = (oval); \ + TYPEOF_UNQUAL(pcp) __old = (oval); \ raw_cpu_generic_try_cmpxchg(pcp, &__old, nval); \ __old; \ }) #define __this_cpu_generic_read_nopreempt(pcp) \ ({ \ - typeof(pcp) ___ret; \ + TYPEOF_UNQUAL(pcp) ___ret; \ preempt_disable_notrace(); \ ___ret = READ_ONCE(*raw_cpu_ptr(&(pcp))); \ preempt_enable_notrace(); \ @@ -131,7 +131,7 @@ do { \ #define __this_cpu_generic_read_noirq(pcp) \ ({ \ - typeof(pcp) ___ret; \ + TYPEOF_UNQUAL(pcp) ___ret; \ unsigned long ___flags; \ raw_local_irq_save(___flags); \ ___ret = raw_cpu_generic_read(pcp); \ @@ -141,7 +141,7 @@ do { \ #define this_cpu_generic_read(pcp) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ if (__native_word(pcp)) \ __ret = __this_cpu_generic_read_nopreempt(pcp); \ else \ @@ -160,7 +160,7 @@ do { \ #define this_cpu_generic_add_return(pcp, val) \ ({ \ - typeof(pcp) __ret; \ + 
TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_add_return(pcp, val); \ @@ -170,7 +170,7 @@ do { \ #define this_cpu_generic_xchg(pcp, nval) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_xchg(pcp, nval); \ @@ -190,7 +190,7 @@ do { \ #define this_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \ diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index ac8c44dd8237..c5e9cac0575e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -33,7 +33,7 @@ struct disk_stats { #define part_stat_read(part, field) \ ({ \ - typeof((part)->bd_stats->field) res = 0; \ + TYPEOF_UNQUAL((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5b520fe86b60..79b9402404f1 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -317,7 +317,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { } #define __pcpu_size_call_return(stem, variable) \ ({ \ - typeof(variable) pscr_ret__; \ + TYPEOF_UNQUAL(variable) pscr_ret__; \ __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr_ret__ = stem##1(variable); break; \ @@ -332,7 +332,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { } #define __pcpu_size_call_return2(stem, variable, ...) \ ({ \ - typeof(variable) pscr2_ret__; \ + TYPEOF_UNQUAL(variable) pscr2_ret__; \ __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \ diff --git a/include/net/snmp.h b/include/net/snmp.h index 468a67836e2f..4cb4326dfebe 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -159,7 +159,7 @@ struct linux_tls_mib { #define __SNMP_ADD_STATS64(mib, field, addend) \ do { \ - __typeof__(*mib) *ptr = raw_cpu_ptr(mib); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[field] += addend; \ u64_stats_update_end(&ptr->syncp); \ @@ -176,8 +176,7 @@ struct linux_tls_mib { #define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1) #define __SNMP_UPD_PO_STATS64(mib, basefield, addend) \ do { \ - __typeof__(*mib) *ptr; \ - ptr = raw_cpu_ptr((mib)); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[basefield##PKTS]++; \ ptr->mibs[basefield##OCTETS] += addend; \ -- cgit v1.2.3 From 6a39fe05ecaa3946ed0af9efd3e56689d519e420 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 27 Jan 2025 17:05:08 +0100 Subject: percpu: use TYPEOF_UNQUAL() in *_cpu_ptr() accessors Use TYPEOF_UNQUAL() macro to declare the return type of *_cpu_ptr() accessors in the generic named address space to avoid access to data from pointer to non-enclosed address space type of errors. Link: https://lkml.kernel.org/r/20250127160709.80604-5-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Nadav Amit Acked-by: Christoph Lameter Cc: Dennis Zhou Cc: Tejun Heo Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Brian Gerst Cc: Peter Zijlstra Cc: Arnd Bergmann Cc: Boqun Feng Cc: "David S. 
Miller" Cc: Denys Vlasenko Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Kent Overstreet Cc: Paolo Abeni Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 79b9402404f1..a7cf954ea99d 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,7 +221,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ - (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) + (TYPEOF_UNQUAL(*(__p)) __force __kernel *)((__force unsigned long)(__p)) #ifdef CONFIG_SMP -- cgit v1.2.3 From 6cea5ae714ba47ea4807d15903baca9857a450e6 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 27 Jan 2025 17:05:09 +0100 Subject: percpu: repurpose __percpu tag as a named address space qualifier The patch introduces __percpu_qual define and repurposes __percpu tag as a named address space qualifier using the new define. Arches can now conditionally define __percpu_qual as their named address space qualifier for percpu variables. Link: https://lkml.kernel.org/r/20250127160709.80604-6-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Nadav Amit Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Brian Gerst Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Boqun Feng Cc: Borislav Petkov Cc: Dave Hansen Cc: "David S. Miller" Cc: Denys Vlasenko Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Kent Overstreet Cc: Paolo Abeni Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/percpu.h | 13 +++++++++++++ include/linux/compiler_types.h | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 50597b975a49..02aeca21479a 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -6,6 +6,19 @@ #include #include +/* + * __percpu_qual is the qualifier for the percpu named address space. + * + * Most arches use generic named address space for percpu variables but + * some arches define percpu variables in different named address space + * (on the x86 arch, percpu variable may be declared as being relative + * to the %fs or %gs segments using __seg_fs or __seg_gs named address + * space qualifier). + */ +#ifndef __percpu_qual +# define __percpu_qual +#endif + #ifdef CONFIG_SMP /* diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 981cc3d7e3aa..5d6544545658 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -57,7 +57,7 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __user BTF_TYPE_TAG(user) # endif # define __iomem -# define __percpu BTF_TYPE_TAG(percpu) +# define __percpu __percpu_qual BTF_TYPE_TAG(percpu) # define __rcu BTF_TYPE_TAG(rcu) # define __chk_user_ptr(x) (void)0 -- cgit v1.2.3 From 89ce924f0bd447eb52a5f224d879dbf8f09451db Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 24 Jan 2025 00:41:32 -0500 Subject: mm: memcontrol: move memsw charge callbacks to v1 The interweaving of two entirely different swap accounting strategies has been one of the more confusing parts of the memcg code. Split out the v1 code to clarify the implementation and a handful of callsites, and to avoid building the v1 bits when !CONFIG_MEMCG_V1. 
text data bss dec hex filename 39253 6446 4160 49859 c2c3 mm/memcontrol.o.old 38877 6382 4160 49419 c10b mm/memcontrol.o Link: https://lkml.kernel.org/r/20250124054132.45643-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Balbir Singh Acked-by: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 17 +++++++++++------ include/linux/swap.h | 5 ----- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6e74b8254d9b..57664e2a8fb7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -649,8 +649,6 @@ int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); - void __mem_cgroup_uncharge(struct folio *folio); /** @@ -1165,10 +1163,6 @@ static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, return 0; } -static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr) -{ -} - static inline void mem_cgroup_uncharge(struct folio *folio) { } @@ -1848,6 +1842,9 @@ static inline void mem_cgroup_exit_user_fault(void) current->in_user_fault = 0; } +void memcg1_swapout(struct folio *folio, swp_entry_t entry); +void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages); + #else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -1875,6 +1872,14 @@ static inline void mem_cgroup_exit_user_fault(void) { } +static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry) +{ +} + +static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) +{ +} + #endif /* CONFIG_MEMCG_V1 */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index b13b72645db3..91b30701274e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -659,7 +659,6 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) @@ -680,10 +679,6 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else -static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) -{ -} - static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { -- cgit v1.2.3 From 599b684a7854e51a51358fe59bbdfea281f0b461 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 10 Feb 2025 20:37:45 +0100 Subject: mm/rmap: convert make_device_exclusive_range() to make_device_exclusive() The single "real" user in the tree of make_device_exclusive_range() always requests making only a single address exclusive. The current implementation is hard to fix for properly supporting anonymous THP / large folios and for avoiding messing with rmap walks in weird ways. So let's always process a single address/page and return folio + page to minimize page -> folio lookups. This is a preparation for further changes. 
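For driver authors following this interface change, a minimal before/after sketch of a call site (the surrounding variables and the error-handling convention are illustrative assumptions; only the two prototypes themselves come from the rmap.h hunk further below):

        /* Before: the range interface, even when only one page was wanted. */
        struct page *page;
        make_device_exclusive_range(mm, addr, addr + PAGE_SIZE, &page, owner);

        /* After: one address in, the page plus its folio out. */
        struct folio *folio;
        page = make_device_exclusive(mm, addr, owner, &folio);
        if (IS_ERR(page))               /* ERR_PTR-style failure reporting is assumed here */
                return PTR_ERR(page);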
Reject any non-anonymous or hugetlb folios early, directly after GUP. While at it, extend the documentation of make_device_exclusive() to clarify some things. Link: https://lkml.kernel.org/r/20250210193801.781278-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Simona Vetter Reviewed-by: Alistair Popple Tested-by: Alistair Popple Cc: Alex Shi Cc: Danilo Krummrich Cc: Dave Airlie Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Jonathan Corbet Cc: Karol Herbst Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Lyude Cc: "Masami Hiramatsu (Google)" Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: SeongJae Park Cc: Vlastimil Babka Cc: Yanteng Si Cc: Barry Song Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 2 +- include/linux/rmap.h | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index e2dd57ca368b..d4e714661826 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -46,7 +46,7 @@ struct mmu_interval_notifier; * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no * longer have exclusive access to the page. When sent during creation of an * exclusive range the owner will be initialised to the value provided by the - * caller of make_device_exclusive_range(), otherwise the owner will be NULL. + * caller of make_device_exclusive(), otherwise the owner will be NULL. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 683a04088f3f..86425d42c1a9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -663,9 +663,8 @@ int folio_referenced(struct folio *, int is_locked, void try_to_migrate(struct folio *folio, enum ttu_flags flags); void try_to_unmap(struct folio *, enum ttu_flags flags); -int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct page **pages, - void *arg); +struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, + void *owner, struct folio **foliop); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) -- cgit v1.2.3 From c25465eb7630ffcadaab29c1010071512f8c9621 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 10 Feb 2025 20:37:48 +0100 Subject: mm: use single SWP_DEVICE_EXCLUSIVE entry type There is no need for the distinction anymore; let's merge the readable and writable device-exclusive entries into a single device-exclusive entry type. 
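To make the merge concrete, a short sketch of the creation and detection sides as they might appear in a page-table walker ('offset' and the pte handling are placeholders; the constructors and the predicate are the ones declared in the swapops.h hunk below):

        swp_entry_t entry;

        /* Before: read vs. write access was encoded in the entry type. */
        if (pte_write(pte))
                entry = make_writable_device_exclusive_entry(offset);
        else
                entry = make_readable_device_exclusive_entry(offset);

        /* After: a single SWP_DEVICE_EXCLUSIVE type covers both cases. */
        entry = make_device_exclusive_entry(offset);

        /* Detection keeps its shape; only the writable-variant check goes away. */
        if (is_device_exclusive_entry(entry))
                /* handle the device-exclusive mapping */;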
Link: https://lkml.kernel.org/r/20250210193801.781278-7-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Simona Vetter Reviewed-by: Alistair Popple Tested-by: Alistair Popple Cc: Alex Shi Cc: Danilo Krummrich Cc: Dave Airlie Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Jonathan Corbet Cc: Karol Herbst Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Lyude Cc: "Masami Hiramatsu (Google)" Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: SeongJae Park Cc: Vlastimil Babka Cc: Yanteng Si Cc: Barry Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 7 +++---- include/linux/swapops.h | 27 ++++----------------------- 2 files changed, 7 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 91b30701274e..9a48e79a0a52 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -74,14 +74,13 @@ static inline int current_is_kswapd(void) * to a special SWP_DEVICE_{READ|WRITE} entry. * * When a page is mapped by the device for exclusive access we set the CPU page - * table entries to special SWP_DEVICE_EXCLUSIVE_* entries. + * table entries to a special SWP_DEVICE_EXCLUSIVE entry. */ #ifdef CONFIG_DEVICE_PRIVATE -#define SWP_DEVICE_NUM 4 +#define SWP_DEVICE_NUM 3 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) -#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) -#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3) +#define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) #else #define SWP_DEVICE_NUM 0 #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 96f26e29fefe..64ea151a7ae3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -186,26 +186,16 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry) return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); } -static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset) +static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { - return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset); -} - -static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset) -{ - return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset); + return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } static inline bool is_device_exclusive_entry(swp_entry_t entry) { - return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ || - swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE; + return swp_type(entry) == SWP_DEVICE_EXCLUSIVE; } -static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE); -} #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -227,12 +217,7 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry) return false; } -static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset) -{ - return swp_entry(0, 0); -} - -static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset) +static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } @@ -242,10 +227,6 @@ static inline bool is_device_exclusive_entry(swp_entry_t entry) return false; } -static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) -{ - return false; -} 
#endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION -- cgit v1.2.3 From 6df8bae8e851eacf2acf2237860213e002aba74f Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 29 Jan 2025 18:06:32 +0000 Subject: mm: zbud: remove zbud The zbud compressed pages allocator is rarely used, most users use zsmalloc. zbud consumes much more memory (only stores 1 or 2 compressed pages per physical page). The only advantage of zbud is a marginal performance improvement that by no means justify the memory overhead. Historically, zsmalloc had significantly worse latency than zbud and z3fold but offered better memory savings. This is no longer the case as shown by a simple recent analysis [1]. In a kernel build test on tmpfs in a limited cgroup, zbud 2-3% less time than zsmalloc, but at the cost of using ~32% more memory (1.5G vs 1.13G). The tradeoff does not make sense for zbud in any practical scenario. The only alleged advantage of zbud is not having the dependency on CONFIG_MMU, but CONFIG_SWAP already depends on CONFIG_MMU anyway, and zbud is only used by zswap. Remove zbud after z3fold's removal, leaving zsmalloc as the one and only zpool allocator. Leave the removal of the zpool API (and its associated config options) to a followup cleanup after no more allocators show up. Deprecating zbud for a few cycles before removing it was initially proposed [2], like z3fold was marked as deprecated for 2 cycles [3]. However, Johannes rightfully pointed out that the 2 cycles is too short for most downstream consumers, and z3fold was deprecated first only as a courtesy anyway. [1]https://lore.kernel.org/lkml/CAJD7tkbRF6od-2x_L8-A1QL3=2Ww13sCj4S3i4bNndqF+3+_Vg@mail.gmail.com/ [2]https://lore.kernel.org/lkml/Z5gdnSX5Lv-nfjQL@google.com/ [3]https://lore.kernel.org/lkml/20240904233343.933462-1-yosryahmed@google.com/ Link: https://lkml.kernel.org/r/20250129180633.3501650-3-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Alexander Gordeev Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Dan Streetman Cc: Heiko Carstens Cc: Huacai Chen Cc: Miaohe Lin Cc: Seth Jennings Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vitaly Wool Cc: Vlastimil Babka Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/zpool.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index a67d62b79698..5e6dc46b8cc4 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -4,9 +4,8 @@ * * Copyright (C) 2014 Dan Streetman * - * This is a common frontend for the zbud and zsmalloc memory - * storage pool implementations. Typically, this is used to - * store compressed memory. + * This is a common frontend for the zswap compressed memory storage + * implementations. */ #ifndef _ZPOOL_H_ -- cgit v1.2.3 From c372473a545edff2fdbc002fc67c181e17c7557b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 31 Jan 2025 12:31:53 +0000 Subject: mm: completely abstract unnecessary adj_start calculation The adj_start calculation has been a constant source of confusion in the VMA merge code. There are two cases to consider, one where we adjust the start of the vmg->middle VMA (i.e. the vmg->__adjust_middle_start merge flag is set), in which case adj_start is calculated as: (1) adj_start = vmg->end - vmg->middle->vm_start And the case where we adjust the start of the vmg->next VMA (i.e. 
the vmg->__adjust_next_start merge flag is set), in which case adj_start is calculated as: (2) adj_start = -(vmg->middle->vm_end - vmg->end) We apply (1) thusly: vmg->middle->vm_start = vmg->middle->vm_start + vmg->end - vmg->middle->vm_start Which simplifies to: vmg->middle->vm_start = vmg->end Similarly, we apply (2) as: vmg->next->vm_start = vmg->next->vm_start + -(vmg->middle->vm_end - vmg->end) Noting that for these VMAs to be mergeable vmg->middle->vm_end == vmg->next->vm_start and so this simplifies to: vmg->next->vm_start = vmg->next->vm_start + -(vmg->next->vm_start - vmg->end) Which simplifies to: vmg->next->vm_start = vmg->end Therefore in each case, we simply need to adjust the start of the VMA to vmg->end (!) and can do away with this adj_start calculation. The only caveat is that we must ensure we update the vm_pgoff field correctly. We therefore abstract this entire calculation to a new function vmg_adjust_set_range() which performs this calculation and sets the adjusted VMA's new range using the general vma_set_range() function. We also must update vma_adjust_trans_huge() which expects the now-abstracted adj_start parameter. It turns out this is wholly unnecessary. In vma_adjust_trans_huge() the relevant code is: if (adjust_next > 0) { struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); unsigned long nstart = next->vm_start; nstart += adjust_next; split_huge_pmd_if_needed(next, nstart); } The only case where this is relevant is when vmg->__adjust_middle_start is specified (in which case adj_next would have been positive), i.e. the one in which the vma specified is vmg->prev and this the sought 'next' VMA would be vmg->middle. We can therefore eliminate the find_vma() invocation altogether and simply provide the vmg->middle VMA in this instance, or NULL otherwise. Again we have an adj_next offset calculation: next->vm_start + vmg->end - vmg->middle->vm_start Where next == vmg->middle this simplifies to vmg->end as previously demonstrated. Therefore nstart is equal to vmg->end, which is already passed to vma_adjust_trans_huge() via the 'end' parameter and so this code (rather delightfully) simplifies to: if (next) split_huge_pmd_if_needed(next, end); With these changes in place, it becomes silly for commit_merge() to return vmg->target, as it is always the same and threaded through vmg, so we finally change commit_merge() to return an error value once again. This patch has no change in functional behaviour. 
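The algebra above collapses into a very small helper. A sketch of the idea behind vmg_adjust_set_range() (the real helper lives in the mm core rather than in any include/ hunk shown here, and may differ in detail):

        static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
        {
                struct vm_area_struct *adjust;
                pgoff_t pgoff;

                if (vmg->__adjust_middle_start) {
                        /* middle's start moves forward to vmg->end. */
                        adjust = vmg->middle;
                        pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start);
                } else {
                        /* next's start moves backward to vmg->end. */
                        adjust = vmg->next;
                        pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end);
                }

                vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff);
        }

Either way the adjusted VMA's new start is simply vmg->end, as derived above; only vm_pgoff needs recomputing so the file offset of the first page stays consistent with the moved start.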
Link: https://lkml.kernel.org/r/7bce2cd4b5afb56211822835d145471280c3dccc.1738326519.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Jann Horn Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 93e509b6c00e..e1bea54820ff 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -404,7 +404,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, - unsigned long end, long adjust_next); + unsigned long end, struct vm_area_struct *next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); @@ -571,7 +571,7 @@ static inline int madvise_collapse(struct vm_area_struct *vma, static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, - long adjust_next) + struct vm_area_struct *next) { } static inline int is_swap_pmd(pmd_t pmd) -- cgit v1.2.3 From 51ff4d7486f0c0b4110a6da4af805b179dd7b11e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Feb 2025 15:18:00 -0800 Subject: mm: avoid extra mem_alloc_profiling_enabled() checks Refactor code to avoid extra mem_alloc_profiling_enabled() checks inside pgalloc_tag_get() function which is often called after that check was already done. Link: https://lkml.kernel.org/r/20250201231803.2661189-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Cc: David Wang <00107082@163.com> Cc: Steven Rostedt Cc: Kent Overstreet Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Vlastimil Babka Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 3469c4b20105..4a82b6b4820e 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -205,28 +205,32 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) } } -static inline struct alloc_tag *pgalloc_tag_get(struct page *page) +/* Should be called only if mem_alloc_profiling_enabled() */ +static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) { struct alloc_tag *tag = NULL; - - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_sub_check(&ref); - if (ref.ct) - tag = ct_to_alloc_tag(ref.ct); - put_page_tag_ref(handle); - } + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub_check(&ref); + if (ref.ct) + tag = ct_to_alloc_tag(ref.ct); + put_page_tag_ref(handle); } return tag; } -static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) +static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) { - if (mem_alloc_profiling_enabled() && tag) + struct alloc_tag *tag; + + if (!mem_alloc_profiling_enabled()) + return; + + tag = __pgalloc_tag_get(page); + if (tag) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } @@ -241,8 +245,7 @@ static inline void 
clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } -static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} +static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} -- cgit v1.2.3 From 93d5440ece3c0aa341fb02e3a44a1b7ab44304c8 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Feb 2025 15:18:02 -0800 Subject: alloc_tag: uninline code gated by mem_alloc_profiling_key in page allocator When a sizable code section is protected by a disabled static key, that code gets into the instruction cache even though it's not executed and consumes the cache, increasing cache misses. This can be remedied by moving such code into a separate uninlined function. On a Pixel6 phone, page allocation profiling overhead measured with CONFIG_MEM_ALLOC_PROFILING=y and profiling disabled is: baseline modified Big core 4.93% 1.53% Medium core 4.39% 1.41% Little core 1.02% 0.36% This improvement comes at the expense of the configuration when profiling gets enabled, since there is now an additional function call. The overhead from this additional call on Pixel6 is: Big core 0.24% Middle core 0.63% Little core 1.1% However this is negligible when compared with the overall overhead of the memory allocation profiling when it is enabled. On x86 this patch does not make noticeable difference because the overhead with mem_alloc_profiling_key disabled is much lower (under 1%) to start with, so any improvement is less visible and hard to distinguish from the noise. The overhead from additional call when profiling is enabled is also within noise levels. 
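The pattern is general: keep only the static-key test inline and push the cold body out of line, so it stops occupying instruction cache while the key is disabled. A generic sketch of the shape (names are hypothetical, not taken from this patch):

        /* header */
        DECLARE_STATIC_KEY_FALSE(feature_key);
        void __feature_slowpath(struct page *page);     /* defined out of line in a .c file */

        static inline void feature_hook(struct page *page)
        {
                if (static_branch_unlikely(&feature_key))
                        __feature_slowpath(page);
        }

The pgalloc_tag.h hunk below applies this shape to clear_page_tag_ref()/__clear_page_tag_ref(), with mem_alloc_profiling_enabled() playing the role of the key test.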
Link: https://lkml.kernel.org/r/20250201231803.2661189-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Steven Rostedt Cc: Vlastimil Babka Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 60 ++++----------------------------------------- 1 file changed, 5 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 4a82b6b4820e..c74077977830 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -162,47 +162,13 @@ static inline void update_page_tag_ref(union pgtag_ref_handle handle, union code } } -static inline void clear_page_tag_ref(struct page *page) -{ - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - set_codetag_empty(&ref); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} - -static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, - unsigned int nr) -{ - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} +/* Should be called only if mem_alloc_profiling_enabled() */ +void __clear_page_tag_ref(struct page *page); -static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) +static inline void clear_page_tag_ref(struct page *page) { - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_sub(&ref, PAGE_SIZE * nr); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } + if (mem_alloc_profiling_enabled()) + __clear_page_tag_ref(page); } /* Should be called only if mem_alloc_profiling_enabled() */ @@ -222,18 +188,6 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) return tag; } -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) -{ - struct alloc_tag *tag; - - if (!mem_alloc_profiling_enabled()) - return; - - tag = __pgalloc_tag_get(page); - if (tag) - this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); -} - void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); void pgalloc_tag_swap(struct folio *new, struct folio *old); @@ -242,10 +196,6 @@ void __init alloc_tag_sec_init(void); #else /* CONFIG_MEM_ALLOC_PROFILING */ static inline void clear_page_tag_ref(struct page *page) {} -static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, - unsigned int nr) {} -static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} -- cgit v1.2.3 From 0eb7d2c337f9df3b6adbcbdce213595d94781e05 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:12 +0800 Subject: mm/swap: remove SWAP_FLAG_PRIO_SHIFT It doesn't make sense to have a zero value of shift. Remove it to avoid confusion. 
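The justification in code form: with a shift of zero the two expressions below yield the same value, so callers such as the swapon path only ever needed the mask (sketch, not quoted from mm/swapfile.c):

        prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
        prio = swap_flags & SWAP_FLAG_PRIO_MASK;        /* identical when the shift is 0 */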
Link: https://lkml.kernel.org/r/20250205092721.9395-4-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 9a48e79a0a52..bbd06cbd1f2b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -24,7 +24,6 @@ struct pagevec; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff -#define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ -- cgit v1.2.3 From c523aa890760b004eea47a0e00b5750a528f3ced Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:19 +0800 Subject: mm/swap: rename swap_swapcount() to swap_entry_swapped() The new function name can reflect the real behaviour of the function more clearly and more accurately. And the renaming avoids the confusion between swap_swapcount() and swp_swapcount(). Link: https://lkml.kernel.org/r/20250205092721.9395-11-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index bbd06cbd1f2b..2fe91c293636 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -499,7 +499,7 @@ int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); -extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); +extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; @@ -582,9 +582,9 @@ static inline int __swap_count(swp_entry_t entry) return 0; } -static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { - return 0; + return false; } static inline int swp_swapcount(swp_entry_t entry) -- cgit v1.2.3 From 94ba17adaba0f651fdcf745c8891a88e2e028cfa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 7 Feb 2025 13:20:33 -0800 Subject: mm/damon: avoid applying DAMOS action to same entity multiple times 'paddr' DAMON operations set can apply a DAMOS scheme's action to a large folio multiple times in single DAMOS-regions-walk if the folio is laid on multiple DAMON regions. Add a field for DAMOS scheme object that can be used by the underlying ops to know what was the last entity that the scheme's action has applied. The core layer unsets the field when each DAMOS-regions-walk is done for the given scheme. And update 'paddr' ops to use the infrastructure to avoid the problem. 
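A rough sketch of how a 'paddr' regions walk can use the new field; the helper name here is illustrative, the real change is in mm/damon/paddr.c and is not part of this include-only log:

static void damos_pa_apply_once(struct damos *s, struct folio *folio)
{
	/* same large folio was already handled earlier in this walk */
	if (s->last_applied == folio)
		return;
	damos_pa_apply_action(s, folio);	/* hypothetical action helper */
	s->last_applied = folio;
}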
Link: https://lkml.kernel.org/r/20250207212033.45269-3-sj@kernel.org Fixes: 57223ac29584 ("mm/damon/paddr: support the pageout scheme") Signed-off-by: SeongJae Park Reported-by: Usama Arif Closes: https://lore.kernel.org/20250203225604.44742-3-usamaarif642@gmail.com Cc: Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index c9074d569596..b4d37d9b9221 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -432,6 +432,7 @@ struct damos_access_pattern { * @wmarks: Watermarks for automated (in)activation of this scheme. * @target_nid: Destination node if @action is "migrate_{hot,cold}". * @filters: Additional set of &struct damos_filter for &action. + * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. * @list: List head for siblings. * @@ -454,6 +455,15 @@ struct damos_access_pattern { * implementation could check pages of the region and skip &action to respect * &filters * + * The minimum entity that @action can be applied depends on the underlying + * &struct damon_operations. Since it may not be aligned with the core layer + * abstract, namely &struct damon_region, &struct damon_operations could apply + * @action to same entity multiple times. Large folios that underlying on + * multiple &struct damon region objects could be such examples. The &struct + * damon_operations can use @last_applied to avoid that. DAMOS core logic + * unsets @last_applied when each regions walking for applying the scheme is + * finished. + * * After applying the &action to each region, &stat_count and &stat_sz is * updated to reflect the number of regions and total size of regions that the * &action is applied. @@ -482,6 +492,7 @@ struct damos { int target_nid; }; struct list_head filters; + void *last_applied; struct damos_stat stat; struct list_head list; }; -- cgit v1.2.3 From a4811f53bb89b3524629b1be41add9349fe0a750 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sat, 8 Feb 2025 15:52:55 +0000 Subject: mm: provide mapping_wrprotect_range() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the fb_defio video driver, page dirty state is used to determine when frame buffer pages have been changed, allowing for batched, deferred I/O to be performed for efficiency. This implementation had only one means of doing so effectively - the use of the folio_mkclean() function. However, this use of the function is inappropriate, as the fb_defio implementation allocates kernel memory to back the framebuffer, and then is forced to specified page->index, mapping fields in order to permit the folio_mkclean() rmap traversal to proceed correctly. It is not correct to specify these fields on kernel-allocated memory, and moreover since these are not folios, page->index, mapping are deprecated fields, soon to be removed. We therefore need to provide a means by which we can correctly traverse the reverse mapping and write-protect mappings for a page backing an address_space page cache object at a given offset. This patch provides this - mapping_wrprotect_range() - which allows for this operation to be performed for a specified address_space, offset, PFN and size, without requiring a folio nor, of course, an inappropriate use of page->index, mapping. 
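Unlike folio_mkclean(), which needs a folio with valid mapping and index fields for the rmap walk, the new helper takes those parameters explicitly. A minimal, hypothetical caller, using the prototype added below:

	/* write-protect all mappings of one kernel-allocated page that the
	 * driver tracks at page offset 'pgoff' in its address_space */
	mapping_wrprotect_range(mapping, pgoff, page_to_pfn(page), 1);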
With this provided, we can subsequently adjust the fb_defio implementation to make use of this function and avoid incorrect invocation of folio_mkclean() and more importantly, incorrect manipulation of page->index and mapping fields. Link: https://lkml.kernel.org/r/e5bf969d64e7f2f2ae944d42341fc8994b736a81.1739029358.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Helge Deller Cc: Jaya Kumar Cc: Kajtar Zsolt Cc: Maíra Canal Cc: Matthew Wilcox Cc: Simona Vetter Cc: Thomas Zimemrmann Signed-off-by: Andrew Morton --- include/linux/rmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 86425d42c1a9..69e9a431a40e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -738,6 +738,9 @@ unsigned long page_address_in_vma(const struct folio *folio, */ int folio_mkclean(struct folio *); +int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, + unsigned long pfn, unsigned long nr_pages); + int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -- cgit v1.2.3 From 6cdef2ddce2b7de8faca69061a6780080cfecc10 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sat, 8 Feb 2025 15:52:56 +0000 Subject: fb_defio: do not use deprecated page->mapping, index fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the introduction of mapping_wrprotect_range() there is no need to use folio_mkclean() in order to write-protect mappings of frame buffer pages, and therefore no need to inappropriately set kernel-allocated page->index, mapping fields to permit this operation. Instead, store the pointer to the page cache object for the mapped driver in the fb_deferred_io object, and use the already stored page offset from the pageref object to look up mappings in order to write-protect them. This is justified, as for the page objects to store a mapping pointer at the point of assignment of pages, they must all reference the same underlying address_space object. Since the life time of the pagerefs is also the lifetime of the fb_deferred_io object, storing the pointer here makes sense. This eliminates the need for all of the logic around setting and maintaining page->index,mapping which we remove. This eliminates the use of folio_mkclean() entirely but otherwise should have no functional change. 
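A sketch of the resulting deferred-I/O write-protect loop; field names follow struct fb_deferred_io_pageref and the fb_deferred_io member added below, while the actual driver change is outside include/ and not shown in this log:

	list_for_each_entry(pageref, &fbdefio->pagereflist, list) {
		pgoff_t pgoff = pageref->offset >> PAGE_SHIFT;

		mapping_wrprotect_range(fbdefio->mapping, pgoff,
					page_to_pfn(pageref->page), 1);
	}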
[lorenzo.stoakes@oracle.com: fixup unused variable warnings] Link: https://lkml.kernel.org/r/d4018405-2762-4385-a816-e54cc23839ac@lucifer.local Link: https://lkml.kernel.org/r/81171ab16c14e3df28f6de9d14982cee528d8519.1739029358.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: Kajtar Zsolt Acked-by: Thomas Zimmermann Cc: David Hildenbrand Cc: Helge Deller Cc: Jaya Kumar Cc: Maíra Canal Cc: Matthew Wilcox Cc: Simona Vetter Signed-off-by: Andrew Morton --- include/linux/fb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fb.h b/include/linux/fb.h index 5ba187e08cf7..cd653862ab99 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -225,6 +225,7 @@ struct fb_deferred_io { int open_count; /* number of opened files; protected by fb_info lock */ struct mutex lock; /* mutex that protects the pageref list */ struct list_head pagereflist; /* list of pagerefs for touched pages */ + struct address_space *mapping; /* page cache object for fb device */ /* callback */ struct page *(*get_page)(struct fb_info *info, unsigned long offset); void (*deferred_io)(struct fb_info *info, struct list_head *pagelist); -- cgit v1.2.3 From 6340584e489f1f8ddb65e44a7ad2ab9f3503c4d3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 11 Feb 2025 12:59:11 -0800 Subject: mm/vmstat: revert "fix a W=1 clang compiler warning" Commit 75b5ab134bb5 enabled -Wenum-enum-conversion in W=1 builds. Commit 30c2de0a267c ("mm/vmstat: fix a W=1 clang compiler warning") fixed a -Wenum-enum-conversion warning. Commit 8f6629c004b1 removed the -Wenum-enum-conversion option again from W=1 builds. Since the W=1 compiler warning fix is no longer necessary, revert it. Link: https://lkml.kernel.org/r/20250211205911.1707684-1-bvanassche@acm.org Signed-off-by: Bart Van Assche Acked-by: Vlastimil Babka Cc: Matthew Wilcox Cc: Ivan Shapovalov Cc: David Laight Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9f3a04345b86..d2761bf8ff32 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -515,7 +515,7 @@ static inline const char *node_stat_name(enum node_stat_item item) static inline const char *lru_list_name(enum lru_list lru) { - return node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru) + 3; // skip "nr_" + return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) -- cgit v1.2.3 From 0431c42622612a96cce97cdd4cfbe7d487606cf3 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Tue, 11 Feb 2025 12:43:40 +0000 Subject: mm/damon: introduce DAMOS filter type hugepage_size Patch series "mm/damon: add support for hugepage_size DAMOS filter", v5. hugepage_size DAMOS filter can be used to gather statistics to check if memory regions of specific access temperatures are backed by hugepages of a size in a specific range. This filter can help to observe and prove the effectiveness of different schemes for shrinking/collapsing hugepages. This patch (of 4): This is to gather statistics to check if memory regions of specific access temperatures are backed by pages of a size in a specific range. This filter can help to observe and prove the effectiveness of different schemes for shrinking/collapsing hugepages.
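For illustration, such a filter could be described roughly as follows, using only the fields added by this patch (how the filter is actually constructed and installed is not shown in this include-only log):

	/* match folios backed by hugepages sized between 64K and 2M */
	struct damos_filter filter = {
		.type = DAMOS_FILTER_TYPE_HUGEPAGE_SIZE,
		.sz_range = { .min = SZ_64K, .max = SZ_2M },
	};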
[sj@kernel.org: add kernel-doc comment for damos_filter->sz_range] Link: https://lkml.kernel.org/r/20250218223058.52459-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250211124437.278873-1-usamaarif642@gmail.com Link: https://lkml.kernel.org/r/20250211124437.278873-2-usamaarif642@gmail.com Signed-off-by: Usama Arif Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Johannes Weiner Cc: Usama Arif Signed-off-by: Andrew Morton --- include/linux/damon.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index b4d37d9b9221..5e7ae7bca5dc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -35,6 +35,16 @@ struct damon_addr_range { unsigned long end; }; +/** + * struct damon_size_range - Represents size for filter to operate on [@min, @max]. + * @min: Min size (inclusive). + * @max: Max size (inclusive). + */ +struct damon_size_range { + unsigned long min; + unsigned long max; +}; + /** * struct damon_region - Represents a monitoring target region. * @ar: The address range of the region. @@ -326,6 +336,7 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_YOUNG: Recently accessed pages. + * @DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: Page is part of a hugepage. * @DAMOS_FILTER_TYPE_ADDR: Address range. * @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target. * @NR_DAMOS_FILTER_TYPES: Number of filter types. @@ -345,6 +356,7 @@ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_YOUNG, + DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, DAMOS_FILTER_TYPE_ADDR, DAMOS_FILTER_TYPE_TARGET, NR_DAMOS_FILTER_TYPES, @@ -360,6 +372,7 @@ enum damos_filter_type { * @target_idx: Index of the &struct damon_target of * &damon_ctx->adaptive_targets if @type is * DAMOS_FILTER_TYPE_TARGET. + * @sz_range: Size range if @type is DAMOS_FILTER_TYPE_HUGEPAGE_SIZE. * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each @@ -376,6 +389,7 @@ struct damos_filter { unsigned short memcg_id; struct damon_addr_range addr_range; int target_idx; + struct damon_size_range sz_range; }; struct list_head list; }; -- cgit v1.2.3 From 1d23b9403aedea2d8eb4175e8eeb05fcd2967219 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 11 Feb 2025 18:13:39 +0800 Subject: mm/mmu_gather: remove unused __tlb_remove_page() Nobody is using __tlb_remove_page() now, clean it up. And also remove the code comment above tlb_remove_page() because it's not meaningful any more. Link: https://lkml.kernel.org/r/Z6si0/A/zzEF/bFJ@MiWiFi-R3L-srv Signed-off-by: Baoquan He Reviewed-by: Qi Zheng Cc: "Aneesh Kumar K.V" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index e402aef79c93..1fac1985127d 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -489,16 +489,6 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, tlb_flush_mmu(tlb); } -static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, - struct page *page, bool delay_rmap) -{ - return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE); -} - -/* tlb_remove_page - * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when - * required. 
- */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); -- cgit v1.2.3 From 7bd1fa0d5624e78628ad7ce70d7e69082c34c634 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 11 Feb 2025 11:43:48 +0800 Subject: mm/mmu_gather: clean up the stale code comment In commit d7f861b9c43a ("mm/mmu_gather: add __tlb_remove_folio_pages()"), helper function __tlb_remove_folio_pages_size() was added. And based on the helper, wrapper functions __tlb_remove_folio_pages() and __tlb_remove_page_size() are created and used by upper level functions. So let's update the code comment to reflect the current code about tlb_remove_page()/tlb_remove_page_size(), etc. Link: https://lkml.kernel.org/r/20250211034348.39531-2-bhe@redhat.com Signed-off-by: Baoquan He Cc: "Aneesh Kumar K.V" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Qi Zheng Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 1fac1985127d..d1adfba8387e 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -67,22 +67,21 @@ * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * - * - tlb_remove_page() / __tlb_remove_page() - * - tlb_remove_page_size() / __tlb_remove_page_size() - * - __tlb_remove_folio_pages() + * - tlb_remove_page() / tlb_remove_page_size() + * - __tlb_remove_folio_pages() / __tlb_remove_page_size() + * - __tlb_remove_folio_pages_size() * - * __tlb_remove_page_size() is the basic primitive that queues a page for - * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a - * boolean indicating if the queue is (now) full and a call to - * tlb_flush_mmu() is required. + * __tlb_remove_folio_pages_size() is the basic primitive that queues pages + * for freeing. It will return a boolean indicating if the queue is (now) + * full and a call to tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * - * __tlb_remove_folio_pages() is similar to __tlb_remove_page(), however, - * instead of removing a single page, remove the given number of consecutive - * pages that are all part of the same (large) folio: just like calling - * __tlb_remove_page() on each page individually. + * __tlb_remove_folio_pages() is similar to __tlb_remove_page_size(), + * however, instead of removing a single page, assume PAGE_SIZE and remove + * the given number of consecutive pages that are all part of the + * same (large) folio. * * - tlb_change_page_size() * -- cgit v1.2.3 From b2ae5fccb8c0ec2167e6e6c5c5a5eb8d656f70ef Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:38 -0800 Subject: mm: introduce vma_start_read_locked{_nested} helpers Patch series "reimplement per-vma lock as a refcount", v10. Back when per-vma locks were introduced, vm_lock was moved out of vm_area_struct in [1] because of the performance regression caused by false cacheline sharing. Recent investigation [2] revealed that the regression is limited to a rather old Broadwell microarchitecture and even there it can be mitigated by disabling adjacent cacheline prefetching, see [3]. Splitting single logical structure into multiple ones leads to more complicated management, extra pointer dereferences and overall less maintainable code.
When that split-away part is a lock, it complicates things even further. With no performance benefits, there are no reasons for this split. Merging the vm_lock back into vm_area_struct also allows vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset. This patchset: 1. moves vm_lock back into vm_area_struct, aligning it at the cacheline boundary and changing the cache to be cacheline-aligned to minimize cacheline sharing; 2. changes vm_area_struct initialization to mark new vma as detached until it is inserted into vma tree; 3. replaces vm_lock and vma->detached flag with a reference counter; 4. regroups vm_area_struct members to fit them into 3 cachelines; 5. changes vm_area_struct cache to SLAB_TYPESAFE_BY_RCU to allow for their reuse and to minimize call_rcu() calls. Pagefault microbenchmarks show performance improvement: Hmean faults/cpu-1 507926.5547 ( 0.00%) 506519.3692 * -0.28%* Hmean faults/cpu-4 479119.7051 ( 0.00%) 481333.6802 * 0.46%* Hmean faults/cpu-7 452880.2961 ( 0.00%) 455845.6211 * 0.65%* Hmean faults/cpu-12 347639.1021 ( 0.00%) 352004.2254 * 1.26%* Hmean faults/cpu-21 200061.2238 ( 0.00%) 229597.0317 * 14.76%* Hmean faults/cpu-30 145251.2001 ( 0.00%) 164202.5067 * 13.05%* Hmean faults/cpu-48 106848.4434 ( 0.00%) 120641.5504 * 12.91%* Hmean faults/cpu-56 92472.3835 ( 0.00%) 103464.7916 * 11.89%* Hmean faults/sec-1 507566.1468 ( 0.00%) 506139.0811 * -0.28%* Hmean faults/sec-4 1880478.2402 ( 0.00%) 1886795.6329 * 0.34%* Hmean faults/sec-7 3106394.3438 ( 0.00%) 3140550.7485 * 1.10%* Hmean faults/sec-12 4061358.4795 ( 0.00%) 4112477.0206 * 1.26%* Hmean faults/sec-21 3988619.1169 ( 0.00%) 4577747.1436 * 14.77%* Hmean faults/sec-30 3909839.5449 ( 0.00%) 4311052.2787 * 10.26%* Hmean faults/sec-48 4761108.4691 ( 0.00%) 5283790.5026 * 10.98%* Hmean faults/sec-56 4885561.4590 ( 0.00%) 5415839.4045 * 10.85%* This patch (of 18): Introduce helper functions which can be used to read-lock a VMA when holding mmap_lock for read. Replace direct accesses to vma->vm_lock with these new helpers. Link: https://lkml.kernel.org/r/20250213224655.1680278-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Reviewed-by: Davidlohr Bueso Reviewed-by: Shakeel Butt Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index fd1e85b4b48a..6a4914bc1a38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -735,6 +735,30 @@ static inline bool vma_start_read(struct vm_area_struct *vma) return true; } +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. 
+ * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +{ + mmap_assert_locked(vma->vm_mm); + down_read_nested(&vma->vm_lock->lock, subclass); +} + +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline void vma_start_read_locked(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); + down_read(&vma->vm_lock->lock); +} + static inline void vma_end_read(struct vm_area_struct *vma) { rcu_read_lock(); /* keeps vma alive till the end of up_read */ -- cgit v1.2.3 From 7b6218ae1253491d56f21f4b1f3609f3dd873600 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:39 -0800 Subject: mm: move per-vma lock into vm_area_struct Back when per-vma locks were introduces, vm_lock was moved out of vm_area_struct in [1] because of the performance regression caused by false cacheline sharing. Recent investigation [2] revealed that the regressions is limited to a rather old Broadwell microarchitecture and even there it can be mitigated by disabling adjacent cacheline prefetching, see [3]. Splitting single logical structure into multiple ones leads to more complicated management, extra pointer dereferences and overall less maintainable code. When that split-away part is a lock, it complicates things even further. With no performance benefits, there are no reasons for this split. Merging the vm_lock back into vm_area_struct also allows vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset. Move vm_lock back into vm_area_struct, aligning it at the cacheline boundary and changing the cache to be cacheline-aligned as well. With kernel compiled using defconfig, this causes VMA memory consumption to grow from 160 (vm_area_struct) + 40 (vm_lock) bytes to 256 bytes: slabinfo before: ... : ... vma_lock ... 40 102 1 : ... vm_area_struct ... 160 51 2 : ... slabinfo after moving vm_lock: ... : ... vm_area_struct ... 256 32 2 : ... Aggregate VMA memory consumption per 1000 VMAs grows from 50 to 64 pages, which is 5.5MB per 100000 VMAs. Note that the size of this structure is dependent on the kernel configuration and typically the original size is higher than 160 bytes. Therefore these calculations are close to the worst case scenario. A more realistic vm_area_struct usage before this change is: ... : ... vma_lock ... 40 102 1 : ... vm_area_struct ... 176 46 2 : ... Aggregate VMA memory consumption per 1000 VMAs grows from 54 to 64 pages, which is 3.9MB per 100000 VMAs. This memory consumption growth can be addressed later by optimizing the vm_lock. [1] https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/ [2] https://lore.kernel.org/all/ZsQyI%2F087V34JoIt@xsang-OptiPlex-9020/ [3] https://lore.kernel.org/all/CAJuCfpEisU8Lfe96AYJDZ+OM4NoPmnw9bP53cT_kbfP_pR+-2g@mail.gmail.com/ Link: https://lkml.kernel.org/r/20250213224655.1680278-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Reviewed-by: Shakeel Butt Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. 
Howlett Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++------------ include/linux/mm_types.h | 6 ++++-- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6a4914bc1a38..bf3b6362927f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -697,6 +697,12 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK +static inline void vma_lock_init(struct vm_area_struct *vma) +{ + init_rwsem(&vma->vm_lock.lock); + vma->vm_lock_seq = UINT_MAX; +} + /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to @@ -714,7 +720,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; - if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) + if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0)) return false; /* @@ -729,7 +735,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { - up_read(&vma->vm_lock->lock); + up_read(&vma->vm_lock.lock); return false; } return true; @@ -744,7 +750,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { mmap_assert_locked(vma->vm_mm); - down_read_nested(&vma->vm_lock->lock, subclass); + down_read_nested(&vma->vm_lock.lock, subclass); } /* @@ -756,13 +762,13 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int static inline void vma_start_read_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); - down_read(&vma->vm_lock->lock); + down_read(&vma->vm_lock.lock); } static inline void vma_end_read(struct vm_area_struct *vma) { rcu_read_lock(); /* keeps vma alive till the end of up_read */ - up_read(&vma->vm_lock->lock); + up_read(&vma->vm_lock.lock); rcu_read_unlock(); } @@ -791,7 +797,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - down_write(&vma->vm_lock->lock); + down_write(&vma->vm_lock.lock); /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). @@ -799,7 +805,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. 
*/ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock->lock); + up_write(&vma->vm_lock.lock); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -811,7 +817,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) static inline void vma_assert_locked(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_lock->lock)) + if (!rwsem_is_locked(&vma->vm_lock.lock)) vma_assert_write_locked(vma); } @@ -844,6 +850,7 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ +static inline void vma_lock_init(struct vm_area_struct *vma) {} static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} @@ -878,10 +885,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf) extern const struct vm_operations_struct vma_dummy_vm_ops; -/* - * WARNING: vma_init does not initialize vma->vm_lock. - * Use vm_area_alloc()/vm_area_free() if vma needs locking. - */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); @@ -890,6 +893,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); vma_numab_state_init(vma); + vma_lock_init(vma); } /* Use when VMA is not part of the VMA tree and needs no locking */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0234f14f2aa6..36dea20cd101 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -730,8 +730,6 @@ struct vm_area_struct { * slowpath. */ unsigned int vm_lock_seq; - /* Unstable RCU readers are allowed to read this. */ - struct vma_lock *vm_lock; #endif /* @@ -784,6 +782,10 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_PER_VMA_LOCK + /* Unstable RCU readers are allowed to read this. */ + struct vma_lock vm_lock ____cacheline_aligned_in_smp; +#endif } __randomize_layout; #ifdef CONFIG_NUMA -- cgit v1.2.3 From 8ef95d8f15f9e785341775814f0ed2ee22017aa5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:40 -0800 Subject: mm: mark vma as detached until it's added into vma tree Current implementation does not set detached flag when a VMA is first allocated. This does not represent the real state of the VMA, which is detached until it is added into mm's VMA tree. Fix this by marking new VMAs as detached and resetting detached flag only after VMA is added into a tree. Introduce vma_mark_attached() to make the API more readable and to simplify possible future cleanup when vma->vm_mm might be used to indicate detached vma and vma_mark_attached() will need an additional mm parameter. Link: https://lkml.kernel.org/r/20250213224655.1680278-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . 
McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index bf3b6362927f..d333d4070362 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -821,12 +821,21 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) vma_assert_write_locked(vma); } -static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) +static inline void vma_mark_attached(struct vm_area_struct *vma) +{ + vma->detached = false; +} + +static inline void vma_mark_detached(struct vm_area_struct *vma) { /* When detaching vma should be write-locked */ - if (detached) - vma_assert_write_locked(vma); - vma->detached = detached; + vma_assert_write_locked(vma); + vma->detached = true; +} + +static inline bool is_vma_detached(struct vm_area_struct *vma) +{ + return vma->detached; } static inline void release_fault_lock(struct vm_fault *vmf) @@ -857,8 +866,8 @@ static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } -static inline void vma_mark_detached(struct vm_area_struct *vma, - bool detached) {} +static inline void vma_mark_attached(struct vm_area_struct *vma) {} +static inline void vma_mark_detached(struct vm_area_struct *vma) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) @@ -891,7 +900,10 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_mark_detached(vma, false); +#ifdef CONFIG_PER_VMA_LOCK + /* vma is not locked, can't use vma_mark_detached() */ + vma->detached = true; +#endif vma_numab_state_init(vma); vma_lock_init(vma); } @@ -1086,6 +1098,7 @@ static inline int vma_iter_bulk_store(struct vma_iterator *vmi, if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; + vma_mark_attached(vma); return 0; } -- cgit v1.2.3 From 55e50223bf3e06abceaf68e2ad125458bb5f874f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:41 -0800 Subject: mm: introduce vma_iter_store_attached() to use with attached vmas vma_iter_store() functions can be used both when adding a new vma and when updating an existing one. However for existing ones we do not need to mark them attached as they are already marked that way. With vma->detached being a separate flag, double-marking a vmas as attached or detached is not an issue because the flag will simply be overwritten with the same value. However once we fold this flag into the refcount later in this series, re-attaching or re-detaching a vma becomes an issue since these operations will be incrementing/decrementing a refcount. Introduce vma_iter_store_new() and vma_iter_store_overwrite() to replace vma_iter_store() and avoid re-attaching a vma during vma update. Add assertions in vma_mark_attached()/vma_mark_detached() to catch invalid usage. Update vma tests to check for vma detached state correctness. 
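Conceptually the split looks as follows; this is a sketch, not the exact mm-internal definitions, and it mirrors the order used by vma_iter_bulk_store() in the hunk above (tree store first, then attach):

/* a "new" store attaches the vma after writing it into the maple tree,
 * an "overwrite" store updates the tree for an already-attached vma and
 * must not touch the attached state */
static inline void vma_iter_store_new(struct vma_iterator *vmi,
				      struct vm_area_struct *vma)
{
	vma_iter_store_overwrite(vmi, vma);	/* tree update only */
	vma_mark_attached(vma);
}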
Link: https://lkml.kernel.org/r/20250213224655.1680278-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d333d4070362..003c3e5c0a96 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -821,8 +821,19 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) vma_assert_write_locked(vma); } +static inline void vma_assert_attached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(vma->detached); +} + +static inline void vma_assert_detached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(!vma->detached); +} + static inline void vma_mark_attached(struct vm_area_struct *vma) { + vma_assert_detached(vma); vma->detached = false; } @@ -830,6 +841,7 @@ static inline void vma_mark_detached(struct vm_area_struct *vma) { /* When detaching vma should be write-locked */ vma_assert_write_locked(vma); + vma_assert_attached(vma); vma->detached = true; } @@ -866,6 +878,8 @@ static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } +static inline void vma_assert_attached(struct vm_area_struct *vma) {} +static inline void vma_assert_detached(struct vm_area_struct *vma) {} static inline void vma_mark_attached(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma) {} -- cgit v1.2.3 From 2c2bd11caba2a45445c1f0c722451fe29e59db79 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:43 -0800 Subject: types: move struct rcuwait into types.h Move rcuwait struct definition into types.h so that rcuwait can be used without including rcuwait.h which includes other headers. Without this change mm_types.h can't use rcuwait due to a the following circular dependency: mm_types.h -> rcuwait.h -> signal.h -> mm_types.h Link: https://lkml.kernel.org/r/20250213224655.1680278-7-surenb@google.com Suggested-by: Matthew Wilcox Signed-off-by: Suren Baghdasaryan Acked-by: Davidlohr Bueso Acked-by: Liam R. Howlett Acked-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . 
McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/rcuwait.h | 13 +------------ include/linux/types.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 27343424225c..9ad134a04b41 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -4,18 +4,7 @@ #include #include - -/* - * rcuwait provides a way of blocking and waking up a single - * task in an rcu-safe manner. - * - * The only time @task is non-nil is when a user is blocked (or - * checking if it needs to) on a condition, and reset as soon as we - * know that the condition has succeeded and are awoken. - */ -struct rcuwait { - struct task_struct __rcu *task; -}; +#include #define __RCUWAIT_INITIALIZER(name) \ { .task = NULL, } diff --git a/include/linux/types.h b/include/linux/types.h index 1c509ce8f7f6..a3d2182c2686 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -248,5 +248,17 @@ typedef void (*swap_func_t)(void *a, void *b, int size); typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv); typedef int (*cmp_func_t)(const void *a, const void *b); +/* + * rcuwait provides a way of blocking and waking up a single + * task in an rcu-safe manner. + * + * The only time @task is non-nil is when a user is blocked (or + * checking if it needs to) on a condition, and reset as soon as we + * know that the condition has succeeded and are awoken. + */ +struct rcuwait { + struct task_struct __rcu *task; +}; + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_TYPES_H */ -- cgit v1.2.3 From 7440adb405dfc4abe21bc95abf3481e6a6649c05 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:44 -0800 Subject: mm: allow vma_start_read_locked/vma_start_read_locked_nested to fail With upcoming replacement of vm_lock with vm_refcnt, we need to handle a possibility of vma_start_read_locked/vma_start_read_locked_nested failing due to refcount overflow. Prepare for such possibility by changing these APIs and adjusting their users. Link: https://lkml.kernel.org/r/20250213224655.1680278-8-surenb@google.com Signed-off-by: Suren Baghdasaryan Cc: Lokesh Gidra Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 003c3e5c0a96..09b48af68699 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -747,10 +747,11 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. 
*/ -static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { mmap_assert_locked(vma->vm_mm); down_read_nested(&vma->vm_lock.lock, subclass); + return true; } /* @@ -759,10 +760,11 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. */ -static inline void vma_start_read_locked(struct vm_area_struct *vma) +static inline bool vma_start_read_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); down_read(&vma->vm_lock.lock); + return true; } static inline void vma_end_read(struct vm_area_struct *vma) -- cgit v1.2.3 From ce0853966085dd8eab7153ce0b815c4a07d86698 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:45 -0800 Subject: mm: move mmap_init_lock() out of the header file mmap_init_lock() is used only from mm_init() in fork.c, therefore it does not have to reside in the header file. This move lets us avoid including additional headers in mmap_lock.h later, when mmap_init_lock() needs to initialize rcuwait object. Link: https://lkml.kernel.org/r/20250213224655.1680278-9-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 45a21faa3ff6..4706c6769902 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -122,12 +122,6 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int #endif /* CONFIG_PER_VMA_LOCK */ -static inline void mmap_init_lock(struct mm_struct *mm) -{ - init_rwsem(&mm->mmap_lock); - mm_lock_seqcount_init(mm); -} - static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); -- cgit v1.2.3 From 45ad9f5290dc4bb2249e951d4b3756d3ebda2d66 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:46 -0800 Subject: mm: uninline the main body of vma_start_write() vma_start_write() is used in many places and will grow in size very soon. It is not used in performance critical paths and uninlining it should limit the future code size growth. No functional changes. 
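The out-of-line body is essentially the code removed from the header below; sketched here since its new home is outside include/ and therefore not part of this diffstat:

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
	down_write(&vma->vm_lock.lock);
	/* WRITE_ONCE() pairs with the lockless check in vma_start_read() */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
	up_write(&vma->vm_lock.lock);
}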
Link: https://lkml.kernel.org/r/20250213224655.1680278-10-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 09b48af68699..c24c521e38a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -787,6 +787,8 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l return (vma->vm_lock_seq == *mm_lock_seq); } +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); + /* * Begin writing to a VMA. * Exclude concurrent readers under the per-VMA lock until the currently @@ -799,15 +801,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - down_write(&vma->vm_lock.lock); - /* - * We should use WRITE_ONCE() here because we can have concurrent reads - * from the early lockless pessimistic check in vma_start_read(). - * We don't really care about the correctness of that early check, but - * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. - */ - WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock.lock); + __vma_start_write(vma, mm_lock_seq); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) -- cgit v1.2.3 From 7f8ceea0c58039dcea3d31b8d5da58aa5f6e12bf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:47 -0800 Subject: refcount: provide ops for cases when object's memory can be reused For speculative lookups where a successful inc_not_zero() pins the object, but where we still need to double check if the object acquired is indeed the one we set out to acquire (identity check), needs this validation to happen *after* the increment. Similarly, when a new object is initialized and its memory might have been previously occupied by another object, all stores to initialize the object should happen *before* refcount initialization. Notably SLAB_TYPESAFE_BY_RCU is one such an example when this ordering is required for reference counting. Add refcount_{add|inc}_not_zero_acquire() to guarantee the proper ordering between acquiring a reference count on an object and performing the identity check for that object. Add refcount_set_release() to guarantee proper ordering between stores initializing object attributes and the store initializing the refcount. refcount_set_release() should be done after all other object attributes are initialized. Once refcount_set_release() is called, the object should be considered visible to other tasks even if it was not yet added into an object collection normally used to discover it. 
This is because other tasks might have discovered the object previously occupying the same memory and after memory reuse they can succeed in taking refcount for the new object and start using it. Object reuse example to consider: consumer: obj = lookup(collection, key); if (!refcount_inc_not_zero_acquire(&obj->ref)) return; if (READ_ONCE(obj->key) != key) { /* identity check */ put_ref(obj); return; } use(obj->value); producer: remove(collection, obj->key); if (!refcount_dec_and_test(&obj->ref)) return; obj->key = KEY_INVALID; free(obj); obj = malloc(); /* obj is reused */ obj->key = new_key; obj->value = new_value; refcount_set_release(obj->ref, 1); add(collection, new_key, obj); refcount_{add|inc}_not_zero_acquire() is required to prevent the following reordering when refcount_inc_not_zero() is used instead: consumer: obj = lookup(collection, key); if (READ_ONCE(obj->key) != key) { /* reordered identity check */ put_ref(obj); return; } producer: remove(collection, obj->key); if (!refcount_dec_and_test(&obj->ref)) return; obj->key = KEY_INVALID; free(obj); obj = malloc(); /* obj is reused */ obj->key = new_key; obj->value = new_value; refcount_set_release(obj->ref, 1); add(collection, new_key, obj); if (!refcount_inc_not_zero(&obj->ref)) return; use(obj->value); /* USING WRONG OBJECT */ refcount_set_release() is required to prevent the following reordering when refcount_set() is used instead: consumer: obj = lookup(collection, key); producer: remove(collection, obj->key); if (!refcount_dec_and_test(&obj->ref)) return; obj->key = KEY_INVALID; free(obj); obj = malloc(); /* obj is reused */ obj->key = new_key; /* new_key == old_key */ refcount_set(obj->ref, 1); if (!refcount_inc_not_zero_acquire(&obj->ref)) return; if (READ_ONCE(obj->key) != key) { /* pass since new_key == old_key */ put_ref(obj); return; } use(obj->value); /* USING STALE obj->value */ obj->value = new_value; /* reordered store */ add(collection, key, obj); [surenb@google.com: fix title underlines in refcount-vs-atomic.rst] Link: https://lkml.kernel.org/r/20250217161645.3137927-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-11-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Vlastimil Babka [slab] Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Peter Zijlstra Cc: Will Deacon Cc: Paul E. McKenney Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/refcount.h | 106 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/slab.h | 9 ++++ 2 files changed, 115 insertions(+) (limited to 'include') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 35f039ecb272..4589d2e7bfea 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -87,6 +87,15 @@ * The decrements dec_and_test() and sub_and_test() also provide acquire * ordering on success. * + * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() provide + * acquire and release ordering for cases when the memory occupied by the + * object might be reused to store another object. 
This is important for the + * cases where secondary validation is required to detect such reuse, e.g. + * SLAB_TYPESAFE_BY_RCU. The secondary validation checks have to happen after + * the refcount is taken, hence acquire order is necessary. Similarly, when the + * object is initialized, all stores to its attributes should be visible before + * the refcount is set, otherwise a stale attribute value might be used by + * another task which succeeds in taking a refcount to the new object. */ #ifndef _LINUX_REFCOUNT_H @@ -125,6 +134,31 @@ static inline void refcount_set(refcount_t *r, int n) atomic_set(&r->refs, n); } +/** + * refcount_set_release - set a refcount's value with release ordering + * @r: the refcount + * @n: value to which the refcount will be set + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides release memory ordering which will order previous memory operations + * against this store. This ensures all updates to this object are visible + * once the refcount is set and stale values from the object previously + * occupying this memory are overwritten with new ones. + * + * This function should be called only after new object is fully initialized. + * After this call the object should be considered visible to other tasks even + * if it was not yet added into an object collection normally used to discover + * it. This is because other tasks might have discovered the object previously + * occupying the same memory and after memory reuse they can succeed in taking + * refcount to the new object and start using it. + */ +static inline void refcount_set_release(refcount_t *r, int n) +{ + atomic_set_release(&r->refs, n); +} + /** * refcount_read - get a refcount's value * @r: the refcount @@ -178,6 +212,52 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) return __refcount_add_not_zero(i, r, NULL); } +static inline __must_check __signed_wrap +bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +{ + int old = refcount_read(r); + + do { + if (!old) + break; + } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); + + if (oldp) + *oldp = old; + + if (unlikely(old < 0 || old + i < 0)) + refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); + + return old; +} + +/** + * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0 + * + * @i: the value to add to the refcount + * @r: the refcount + * + * Will saturate at REFCOUNT_SATURATED and WARN. + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides acquire memory ordering on success, it is assumed the caller has + * guaranteed the object memory to be stable (RCU, etc.). It does provide a + * control dependency and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc_not_zero_acquire() should instead be used to increment a + * reference count. 
+ * + * Return: false if the passed refcount is 0, true otherwise + */ +static inline __must_check bool refcount_add_not_zero_acquire(int i, refcount_t *r) +{ + return __refcount_add_not_zero_acquire(i, r, NULL); +} + static inline __signed_wrap void __refcount_add(int i, refcount_t *r, int *oldp) { @@ -236,6 +316,32 @@ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) return __refcount_inc_not_zero(r, NULL); } +static inline __must_check bool __refcount_inc_not_zero_acquire(refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero_acquire(1, r, oldp); +} + +/** + * refcount_inc_not_zero_acquire - increment a refcount with acquire ordering unless it is 0 + * @r: the refcount to increment + * + * Similar to refcount_inc_not_zero(), but provides acquire memory ordering on + * success. + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides acquire memory ordering on success, it is assumed the caller has + * guaranteed the object memory to be stable (RCU, etc.). It does provide a + * control dependency and thereby orders future stores. See the comment on top. + * + * Return: true if the increment was successful, false otherwise + */ +static inline __must_check bool refcount_inc_not_zero_acquire(refcount_t *r) +{ + return __refcount_inc_not_zero_acquire(r, NULL); +} + static inline void __refcount_inc(refcount_t *r, int *oldp) { __refcount_add(1, r, oldp); diff --git a/include/linux/slab.h b/include/linux/slab.h index 09eedaecf120..ad902a2d692b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -136,6 +136,15 @@ enum _slab_flag_bits { * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. * + * Note that object identity check has to be done *after* acquiring a + * reference, therefore user has to ensure proper ordering for loads. + * Similarly, when initializing objects allocated with SLAB_TYPESAFE_BY_RCU, + * the newly allocated object has to be fully initialized *before* its + * refcount gets initialized and proper ordering for stores is required. + * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() are + * designed with the proper fences required for reference counting objects + * allocated with SLAB_TYPESAFE_BY_RCU. + * * Note that it is not possible to acquire a lock within a structure * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference * as described above. The reason is that SLAB_TYPESAFE_BY_RCU pages -- cgit v1.2.3 From 4e0dbe105d5088c77eb09de6f049aaf44711a2ec Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:48 -0800 Subject: refcount: introduce __refcount_{add|inc}_not_zero_limited_acquire Introduce functions to increase refcount but with a top limit above which they will fail to increase (the limit is inclusive). Setting the limit to INT_MAX indicates no limit. Link: https://lkml.kernel.org/r/20250213224655.1680278-12-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. 
Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/refcount.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 4589d2e7bfea..80dc023ac2bf 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -213,13 +213,20 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) } static inline __must_check __signed_wrap -bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +bool __refcount_add_not_zero_limited_acquire(int i, refcount_t *r, int *oldp, + int limit) { int old = refcount_read(r); do { if (!old) break; + + if (i > limit - old) { + if (oldp) + *oldp = old; + return false; + } } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); if (oldp) @@ -231,6 +238,18 @@ bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) return old; } +static inline __must_check bool +__refcount_inc_not_zero_limited_acquire(refcount_t *r, int *oldp, int limit) +{ + return __refcount_add_not_zero_limited_acquire(1, r, oldp, limit); +} + +static inline __must_check __signed_wrap +bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero_limited_acquire(i, r, oldp, INT_MAX); +} + /** * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0 * -- cgit v1.2.3 From f35ab95ca0af7a27feab57b9d7e906405bddb093 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:49 -0800 Subject: mm: replace vm_lock and detached flag with a reference count rw_semaphore is a sizable structure of 40 bytes and consumes considerable space for each vm_area_struct. However vma_lock has two important specifics which can be used to replace rw_semaphore with a simpler structure: 1. Readers never wait. They try to take the vma_lock and fall back to mmap_lock if that fails. 2. Only one writer at a time will ever try to write-lock a vma_lock because writers first take mmap_lock in write mode. Because of these requirements, full rw_semaphore functionality is not needed and we can replace rw_semaphore and the vma->detached flag with a refcount (vm_refcnt). When vma is in detached state, vm_refcnt is 0 and only a call to vma_mark_attached() can take it out of this state. Note that unlike before, now we enforce both vma_mark_attached() and vma_mark_detached() to be done only after vma has been write-locked. vma_mark_attached() changes vm_refcnt to 1 to indicate that it has been attached to the vma tree. When a reader takes read lock, it increments vm_refcnt, unless the top usable bit of vm_refcnt (0x40000000) is set, indicating presence of a writer. When writer takes write lock, it sets the top usable bit to indicate its presence. If there are readers, writer will wait using newly introduced mm->vma_writer_wait. Since all writers take mmap_lock in write mode first, there can be only one writer at a time. The last reader to release the lock will signal the writer to wake up. refcount might overflow if there are many competing readers, in which case read-locking will fail. 
Readers are expected to handle such failures. In summary: 1. all readers increment the vm_refcnt; 2. writer sets top usable (writer) bit of vm_refcnt; 3. readers cannot increment the vm_refcnt if the writer bit is set; 4. in the presence of readers, writer must wait for the vm_refcnt to drop to 1 (plus the VMA_LOCK_OFFSET writer bit), indicating an attached vma with no readers; 5. vm_refcnt overflow is handled by the readers. While this vm_lock replacement does not yet result in a smaller vm_area_struct (it stays at 256 bytes due to cacheline alignment), it allows for further size optimization by structure member regrouping to bring the size of vm_area_struct below 192 bytes. [surenb@google.com: fix a crash due to vma_end_read() that should have been removed] Link: https://lkml.kernel.org/r/20250220200208.323769-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-13-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Peter Zijlstra Suggested-by: Matthew Wilcox Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 128 +++++++++++++++++++++++++++++++---------------- include/linux/mm_types.h | 22 ++++---- 2 files changed, 95 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c24c521e38a2..06f179c844c3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -697,19 +698,54 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK -static inline void vma_lock_init(struct vm_area_struct *vma) +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) { - init_rwsem(&vma->vm_lock.lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key lockdep_key; + + lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); +#endif + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); vma->vm_lock_seq = UINT_MAX; } +static inline bool is_vma_writer_only(int refcnt) +{ + /* + * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma + * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on + * a detached vma happens only in vma_mark_detached() and is a rare + * case, therefore most of the time there will be no unnecessary wakeup. + */ + return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; +} + +static inline void vma_refcount_put(struct vm_area_struct *vma) +{ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *mm = vma->vm_mm; + int oldcnt; + + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { + + if (is_vma_writer_only(oldcnt - 1)) + rcuwait_wake_up(&mm->vma_writer_wait); + } +} + /* * Try to read-lock a vma. 
The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got + * detached. */ -static inline bool vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) { + int oldcnt; + /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need @@ -718,15 +754,25 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * need ordering is below. */ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) - return false; + return NULL; - if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0)) - return false; + /* + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * Acquire fence is required here to avoid reordering against later + * vm_lock_seq check and checks inside lock_vma_under_rcu(). + */ + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) { + /* return EAGAIN if vma got detached from under us */ + return oldcnt ? NULL : ERR_PTR(-EAGAIN); + } + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); /* - * Overflow might produce false locked result. + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are @@ -735,10 +781,11 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { - up_read(&vma->vm_lock.lock); - return false; + vma_refcount_put(vma); + return NULL; } - return true; + + return vma; } /* @@ -749,8 +796,14 @@ static inline bool vma_start_read(struct vm_area_struct *vma) */ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { + int oldcnt; + mmap_assert_locked(vma->vm_mm); - down_read_nested(&vma->vm_lock.lock, subclass); + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) + return false; + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); return true; } @@ -762,16 +815,12 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int */ static inline bool vma_start_read_locked(struct vm_area_struct *vma) { - mmap_assert_locked(vma->vm_mm); - down_read(&vma->vm_lock.lock); - return true; + return vma_start_read_locked_nested(vma, 0); } static inline void vma_end_read(struct vm_area_struct *vma) { - rcu_read_lock(); /* keeps vma alive till the end of up_read */ - up_read(&vma->vm_lock.lock); - rcu_read_unlock(); + vma_refcount_put(vma); } /* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ @@ -813,38 +862,35 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) static inline void vma_assert_locked(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_lock.lock)) - vma_assert_write_locked(vma); + unsigned int mm_lock_seq; + + VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && + !__is_vma_write_locked(vma, &mm_lock_seq), vma); } +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. + */ static inline void vma_assert_attached(struct vm_area_struct *vma) { - WARN_ON_ONCE(vma->detached); + WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); } static inline void vma_assert_detached(struct vm_area_struct *vma) { - WARN_ON_ONCE(!vma->detached); + WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); } static inline void vma_mark_attached(struct vm_area_struct *vma) { - vma_assert_detached(vma); - vma->detached = false; -} - -static inline void vma_mark_detached(struct vm_area_struct *vma) -{ - /* When detaching vma should be write-locked */ vma_assert_write_locked(vma); - vma_assert_attached(vma); - vma->detached = true; + vma_assert_detached(vma); + refcount_set(&vma->vm_refcnt, 1); } -static inline bool is_vma_detached(struct vm_area_struct *vma) -{ - return vma->detached; -} +void vma_mark_detached(struct vm_area_struct *vma); static inline void release_fault_lock(struct vm_fault *vmf) { @@ -867,9 +913,9 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ -static inline void vma_lock_init(struct vm_area_struct *vma) {} -static inline bool vma_start_read(struct vm_area_struct *vma) - { return false; } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} +static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) + { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -910,12 +956,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); -#ifdef CONFIG_PER_VMA_LOCK - /* vma is not locked, can't use vma_mark_detached() */ - vma->detached = true; -#endif vma_numab_state_init(vma); - vma_lock_init(vma); + vma_lock_init(vma, false); } /* Use when VMA is not part of the VMA tree and needs no locking */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36dea20cd101..9de0a6cb3c2d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -629,9 +630,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) } #endif -struct vma_lock { - struct rw_semaphore lock; -}; +#define VMA_LOCK_OFFSET 0x40000000 +#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1) struct vma_numab_state { /* @@ -709,19 +709,13 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK - /* - * Flag to indicate areas detached from the mm->mm_mt tree. - * Unstable RCU readers are allowed to read this. 
- */ - bool detached; - /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_lock->lock (in write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_lock->lock (in read or write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * @@ -784,7 +778,10 @@ struct vm_area_struct { struct vm_userfaultfd_ctx vm_userfaultfd_ctx; #ifdef CONFIG_PER_VMA_LOCK /* Unstable RCU readers are allowed to read this. */ - struct vma_lock vm_lock ____cacheline_aligned_in_smp; + refcount_t vm_refcnt ____cacheline_aligned_in_smp; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map vmlock_dep_map; +#endif #endif } __randomize_layout; @@ -920,6 +917,7 @@ struct mm_struct { * by mmlist_lock */ #ifdef CONFIG_PER_VMA_LOCK + struct rcuwait vma_writer_wait; /* * This field has lock-like semantics, meaning it is sometimes * accessed with ACQUIRE/RELEASE semantics. -- cgit v1.2.3 From 6bef4c2f97221f3b595d08c8656eb5845ef80fe9 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:50 -0800 Subject: mm: move lesser used vma_area_struct members into the last cacheline Move several vma_area_struct members which are rarely or never used during page fault handling into the last cacheline to better pack vm_area_struct. As a result vm_area_struct will fit into 3 as opposed to 4 cachelines. New typical vm_area_struct layout: struct vm_area_struct { union { struct { long unsigned int vm_start; /* 0 8 */ long unsigned int vm_end; /* 8 8 */ }; /* 0 16 */ freeptr_t vm_freeptr; /* 0 8 */ }; /* 0 16 */ struct mm_struct * vm_mm; /* 16 8 */ pgprot_t vm_page_prot; /* 24 8 */ union { const vm_flags_t vm_flags; /* 32 8 */ vm_flags_t __vm_flags; /* 32 8 */ }; /* 32 8 */ unsigned int vm_lock_seq; /* 40 4 */ /* XXX 4 bytes hole, try to pack */ struct list_head anon_vma_chain; /* 48 16 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct anon_vma * anon_vma; /* 64 8 */ const struct vm_operations_struct * vm_ops; /* 72 8 */ long unsigned int vm_pgoff; /* 80 8 */ struct file * vm_file; /* 88 8 */ void * vm_private_data; /* 96 8 */ atomic_long_t swap_readahead_info; /* 104 8 */ struct mempolicy * vm_policy; /* 112 8 */ struct vma_numab_state * numab_state; /* 120 8 */ /* --- cacheline 2 boundary (128 bytes) --- */ refcount_t vm_refcnt (__aligned__(64)); /* 128 4 */ /* XXX 4 bytes hole, try to pack */ struct { struct rb_node rb (__aligned__(8)); /* 136 24 */ long unsigned int rb_subtree_last; /* 160 8 */ } __attribute__((__aligned__(8))) shared; /* 136 32 */ struct anon_vma_name * anon_name; /* 168 8 */ struct vm_userfaultfd_ctx vm_userfaultfd_ctx; /* 176 8 */ /* size: 192, cachelines: 3, members: 18 */ /* sum members: 176, holes: 2, sum holes: 8 */ /* padding: 8 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 4 */ } __attribute__((__aligned__(64))); Memory consumption per 1000 VMAs becomes 48 pages: slabinfo after vm_area_struct changes: ... : ... vm_area_struct ... 192 42 2 : ... 
Link: https://lkml.kernel.org/r/20250213224655.1680278-14-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9de0a6cb3c2d..c3aa0e20be41 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -725,17 +725,6 @@ struct vm_area_struct { */ unsigned int vm_lock_seq; #endif - - /* - * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree. - * - */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma @@ -755,14 +744,6 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ -#ifdef CONFIG_ANON_VMA_NAME - /* - * For private and shared anonymous mappings, a pointer to a null - * terminated string containing the name given to the vma, or NULL if - * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. - */ - struct anon_vma_name *anon_name; -#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif @@ -775,7 +756,6 @@ struct vm_area_struct { #ifdef CONFIG_NUMA_BALANCING struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif - struct vm_userfaultfd_ctx vm_userfaultfd_ctx; #ifdef CONFIG_PER_VMA_LOCK /* Unstable RCU readers are allowed to read this. */ refcount_t vm_refcnt ____cacheline_aligned_in_smp; @@ -783,6 +763,24 @@ struct vm_area_struct { struct lockdep_map vmlock_dep_map; #endif #endif + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. + * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; #ifdef CONFIG_NUMA -- cgit v1.2.3 From e218d9fedd056a0b17468d6e19fddaf2c3550f97 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:52 -0800 Subject: mm: remove extra vma_numab_state_init() call vma_init() already memset's the whole vm_area_struct to 0, so there is no need to an additional vma_numab_state_init(). 
Link: https://lkml.kernel.org/r/20250213224655.1680278-16-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 06f179c844c3..aad932c4bcf0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -956,7 +956,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_numab_state_init(vma); vma_lock_init(vma, false); } -- cgit v1.2.3 From e49510bf00de4f832eaaebcfc113795f127ad519 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:53 -0800 Subject: mm: prepare lock_vma_under_rcu() for vma reuse possibility Once we make vma cache SLAB_TYPESAFE_BY_RCU, it will be possible for a vma to be reused and attached to another mm after lock_vma_under_rcu() locks the vma. lock_vma_under_rcu() should ensure that vma_start_read() is using the original mm and after locking the vma it should ensure that vma->vm_mm has not changed from under us. Link: https://lkml.kernel.org/r/20250213224655.1680278-17-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index aad932c4bcf0..e3f962de1677 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -739,10 +739,13 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. 
*/ -static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) { int oldcnt; @@ -753,7 +756,7 @@ static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) return NULL; /* @@ -780,7 +783,7 @@ static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { vma_refcount_put(vma); return NULL; } @@ -914,7 +917,8 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} -- cgit v1.2.3 From 3104138517fc66aad21f4a2487bb572e9fc2e3ec Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:54 -0800 Subject: mm: make vma cache SLAB_TYPESAFE_BY_RCU To enable SLAB_TYPESAFE_BY_RCU for vma cache we need to ensure that object reuse before RCU grace period is over will be detected by lock_vma_under_rcu(). Current checks are sufficient as long as vma is detached before it is freed. The only place this is not currently happening is in exit_mmap(). Add the missing vma_mark_detached() in exit_mmap(). Another issue which might trick lock_vma_under_rcu() during vma reuse is vm_area_dup(), which copies the entire content of the vma into a new one, overriding new vma's vm_refcnt and temporarily making it appear as attached. This might trick a racing lock_vma_under_rcu() to operate on a reused vma if it found the vma before it got reused. To prevent this situation, we should ensure that vm_refcnt stays at detached state (0) when it is copied and advances to attached state only after it is added into the vma tree. Introduce vm_area_init_from() which preserves new vma's vm_refcnt and use it in vm_area_dup(). Since all vmas are in detached state with no current readers when they are freed, lock_vma_under_rcu() will not be able to take vm_refcnt after vma got detached even if vma is reused. vma_mark_attached() in modified to include a release fence to ensure all stores to the vma happen before vm_refcnt gets initialized. Finally, make vm_area_cachep SLAB_TYPESAFE_BY_RCU. This will facilitate vm_area_struct reuse and will minimize the number of call_rcu() calls. 
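For illustration only (none of the names below come from the patch), a minimal sketch of the publish/lookup pattern these fences support, assuming a hypothetical object kept in a cache created with SLAB_TYPESAFE_BY_RCU: refcount_set_release() publishes a fully initialized object, and refcount_inc_not_zero_acquire() lets an RCU reader take a reference before re-checking the object's identity.

        #include <linux/slab.h>
        #include <linux/rcupdate.h>
        #include <linux/refcount.h>

        /* Hypothetical object kept in a SLAB_TYPESAFE_BY_RCU cache. */
        struct obj {
                refcount_t ref;
                unsigned long key;
        };

        static struct kmem_cache *obj_cachep;   /* created with SLAB_TYPESAFE_BY_RCU */

        static struct obj *obj_create(unsigned long key)
        {
                struct obj *o = kmem_cache_alloc(obj_cachep, GFP_KERNEL);

                if (!o)
                        return NULL;
                o->key = key;
                /* Make all stores above visible before the refcount becomes live. */
                refcount_set_release(&o->ref, 1);
                return o;       /* safe to publish into a lookup structure now */
        }

        static void obj_put(struct obj *o)
        {
                if (refcount_dec_and_test(&o->ref))
                        kmem_cache_free(obj_cachep, o);
        }

        /*
         * Caller holds rcu_read_lock(); @candidate came from an RCU-protected
         * lookup and its memory may already have been reused for another
         * object, so take a reference first and only then re-check identity
         * (the acquire fence orders the re-check against the increment).
         */
        static struct obj *obj_tryget(unsigned long key, struct obj *candidate)
        {
                if (!refcount_inc_not_zero_acquire(&candidate->ref))
                        return NULL;            /* object is being freed */
                if (candidate->key != key) {
                        obj_put(candidate);     /* memory was reused, back off */
                        return NULL;
                }
                return candidate;
        }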
[surenb@google.com: remove atomic_set_release() usage in tools/] Link: https://lkml.kernel.org/r/20250217054351.2973666-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-18-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 +--- include/linux/mm_types.h | 13 ++++++++++--- include/linux/slab.h | 6 ------ 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e3f962de1677..14115c9949d8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -258,8 +258,6 @@ void setup_initial_init_mm(void *start_code, void *end_code, struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); -/* Use only if VMA has no other users */ -void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; @@ -890,7 +888,7 @@ static inline void vma_mark_attached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_detached(vma); - refcount_set(&vma->vm_refcnt, 1); + refcount_set_release(&vma->vm_refcnt, 1); } void vma_mark_detached(struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3aa0e20be41..6a93abb4452b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -574,6 +574,12 @@ static inline void *folio_get_private(struct folio *folio) typedef unsigned long vm_flags_t; +/* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. These are held in a global tree and are pinned by the VMAs that @@ -677,6 +683,9 @@ struct vma_numab_state { * * Only explicitly marked struct members may be accessed by RCU readers before * getting a stable reference. + * + * WARNING: when adding new members, please update vm_area_init_from() to copy + * them during vm_area_struct content duplication. */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -687,9 +696,7 @@ struct vm_area_struct { unsigned long vm_start; unsigned long vm_end; }; -#ifdef CONFIG_PER_VMA_LOCK - struct rcu_head vm_rcu; /* Used for deferred freeing. 
*/ -#endif + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ }; /* diff --git a/include/linux/slab.h b/include/linux/slab.h index ad902a2d692b..f8924fd6ea26 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -243,12 +243,6 @@ enum _slab_flag_bits { #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif -/* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * -- cgit v1.2.3 From 63a23847dc47113b879a5f53cc0ca5cedc881ffd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:06 +0000 Subject: fs: convert block_commit_write() to take a folio All callers now have a folio, so pass it in instead of converting folio->page->folio. Link: https://lkml.kernel.org/r/20250217192009.437916-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 932139c5d46f..6672e1a5031c 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -271,7 +271,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct folio **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); -void block_commit_write(struct page *page, unsigned int from, unsigned int to); +void block_commit_write(struct folio *folio, size_t from, size_t to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); -- cgit v1.2.3 From 52d671a1a36a16f3a0dd9a2beff964e75bce9787 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:07 +0000 Subject: fs: remove page_file_mapping() This wrapper has no more callers. Delete it. Link: https://lkml.kernel.org/r/20250217192009.437916-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..975c56fb4f85 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -575,11 +575,6 @@ static inline struct address_space *folio_flush_mapping(struct folio *folio) return folio_mapping(folio); } -static inline struct address_space *page_file_mapping(struct page *page) -{ - return folio_file_mapping(page_folio(page)); -} - /** * folio_inode - Get the host inode for this folio. * @folio: The folio. -- cgit v1.2.3 From 0d40cfe63a2f19b9d375382e6d90b9ebd412901e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:08 +0000 Subject: fs: remove folio_file_mapping() No callers of this function remain as filesystems no longer see swapfile pages through their normal read/write paths. 
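As a rough sketch of what callers do instead (my_folio_mapping is a made-up name, not from the patch): ordinary filesystem code asks for the mapping directly, and only flush-time callers that still need to handle swap-cache folios use folio_flush_mapping(), which falls back to swapcache_mapping().

        #include <linux/pagemap.h>

        /* Sketch: the common case simply reads the page-cache mapping. */
        static struct address_space *my_folio_mapping(struct folio *folio)
        {
                return folio_mapping(folio);
        }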
Link: https://lkml.kernel.org/r/20250217192009.437916-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 975c56fb4f85..ad7c0f615e9b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -535,26 +535,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); -/** - * folio_file_mapping - Find the mapping this folio belongs to. - * @folio: The folio. - * - * For folios which are in the page cache, return the mapping that this - * page belongs to. Folios in the swap cache return the mapping of the - * swap file or swap device where the data is stored. This is different - * from the mapping returned by folio_mapping(). The only reason to - * use it is if, like NFS, you return 0 from ->activate_swapfile. - * - * Do not call this for folios which aren't in the page cache or swap cache. - */ -static inline struct address_space *folio_file_mapping(struct folio *folio) -{ - if (unlikely(folio_test_swapcache(folio))) - return swapcache_mapping(folio); - - return folio->mapping; -} - /** * folio_flush_mapping - Find the file mapping this folio belongs to. * @folio: The folio. -- cgit v1.2.3 From 86758b504864913233f6a16076184ba784cd4466 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 18 Feb 2025 15:49:54 +0530 Subject: mm/ioremap: pass pgprot_t to ioremap_prot() instead of unsigned long ioremap_prot() currently accepts pgprot_val parameter as an unsigned long, thus implicitly assuming that pgprot_val and pgprot_t could never be bigger than unsigned long. But this assumption soon will not be true on arm64 when using D128 pgtables. In 128 bit page table configuration, unsigned long is 64 bit, but pgprot_t is 128 bit. Passing platform abstracted pgprot_t argument is better as compared to size based data types. Let's change the parameter to directly pass pgprot_t like another similar helper generic_ioremap_prot(). Without this change in place, D128 configuration does not work on arm64 as the top 64 bits gets silently stripped when passing the protection value to this function. 
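A hedged sketch of a caller after the change; the device name, base address and size are invented, and _PAGE_IOREMAP is the arch-supplied protection value used by the generic ioremap() wrapper, so a real driver may pass a different pgprot.

        #include <linux/io.h>

        /* Invented device register window, for illustration only. */
        #define MYDEV_REG_BASE  0xfed00000UL
        #define MYDEV_REG_SIZE  0x1000

        static void __iomem *mydev_map_regs(void)
        {
                /*
                 * The protection value is now a pgprot_t, so wrap the raw
                 * bits in __pgprot() instead of passing an unsigned long.
                 */
                return ioremap_prot(MYDEV_REG_BASE, MYDEV_REG_SIZE,
                                    __pgprot(_PAGE_IOREMAP));
        }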
Link: https://lkml.kernel.org/r/20250218101954.415331-1-anshuman.khandual@arm.com Signed-off-by: Ryan Roberts Co-developed-by: Anshuman Khandual Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Signed-off-by: Andrew Morton --- include/asm-generic/io.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index a5cbbf3e26ec..402020b23423 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1111,7 +1111,7 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot); void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot); + pgprot_t prot); void iounmap(volatile void __iomem *addr); void generic_iounmap(volatile void __iomem *addr); @@ -1120,7 +1120,7 @@ void generic_iounmap(volatile void __iomem *addr); static inline void __iomem *ioremap(phys_addr_t addr, size_t size) { /* _PAGE_IOREMAP needs to be supplied by the architecture */ - return ioremap_prot(addr, size, _PAGE_IOREMAP); + return ioremap_prot(addr, size, __pgprot(_PAGE_IOREMAP)); } #endif #endif /* !CONFIG_MMU || CONFIG_GENERIC_IOREMAP */ -- cgit v1.2.3 From c009da4258f9885c5a3749fc004870db9c0e7a99 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:03 +0000 Subject: mm, cma: support multiple contiguous ranges, if requested Currently, CMA manages one range of physically contiguous memory. Creation of larger CMA areas with hugetlb_cma may run in to gaps in physical memory, so that they are not able to allocate that contiguous physical range from memblock when creating the CMA area. This can happen, for example, on an AMD system with > 1TB of memory, where there will be a gap just below the 1TB (40bit DMA) line. If you have set aside most of memory for potential hugetlb CMA allocation, cma_declare_contiguous_nid will fail. hugetlb_cma doesn't need the entire area to be one physically contiguous range. It just cares about being able to get physically contiguous chunks of a certain size (e.g. 1G), and it is fine to have the CMA area backed by multiple physical ranges, as long as it gets 1G contiguous allocations. Multi-range support is implemented by introducing an array of ranges, instead of just one big one. Each range has its own bitmap. Effectively, the allocate and release operations work as before, just per-range. So, instead of going through one large bitmap, they now go through a number of smaller ones. The maximum number of supported ranges is 8, as defined in CMA_MAX_RANGES. Since some current users of CMA expect a CMA area to just use one physically contiguous range, only allow for multiple ranges if a new interface, cma_declare_contiguous_nid_multi, is used. The other interfaces will work like before, creating only CMA areas with 1 range. cma_declare_contiguous_nid_multi works as follows, mimicking the default "bottom-up, above 4G" reservation approach: 0) Try cma_declare_contiguous_nid, which will use only one region. If this succeeds, return. This makes sure that for all the cases that currently work, the behavior remains unchanged even if the caller switches from cma_declare_contiguous_nid to cma_declare_contiguous_nid_multi. 1) Select the largest free memblock ranges above 4G, with a maximum number of CMA_MAX_RANGES. 2) If we did not find at most CMA_MAX_RANGES that add up to the total size requested, return -ENOMEM. 3) Sort the selected ranges by base address. 
4) Reserve them bottom-up until we get what we wanted. Link: https://lkml.kernel.org/r/20250228182928.2645936-3-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Arnd Bergmann Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/cma.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index d15b64f51336..863427c27dc2 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -40,6 +40,9 @@ static inline int __init cma_declare_contiguous(phys_addr_t base, return cma_declare_contiguous_nid(base, size, limit, alignment, order_per_bit, fixed, name, res_cma, NUMA_NO_NODE); } +extern int __init cma_declare_contiguous_multi(phys_addr_t size, + phys_addr_t align, unsigned int order_per_bit, + const char *name, struct cma **res_cma, int nid); extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, -- cgit v1.2.3 From 624ab90b7b8758090654520b03c97cecc438a414 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:04 +0000 Subject: mm/cma: introduce cma_intersects function Now that CMA areas can have multiple physical ranges, code can't assume a CMA struct represents a base_pfn plus a size, as returned from cma_get_base. Most cases are ok though, since they all explicitly refer to CMA areas that were created using existing interfaces (cma_declare_contiguous_nid or cma_init_reserved_mem), which guarantees they have just one physical range. An exception is the s390 code, which walks all CMA ranges to see if they intersect with a range of memory that is about to be hotremoved. So, in the future, it might run in to multi-range areas. To keep this check working, define a cma_intersects function. This just checks if a physaddr range intersects any of the ranges. Use it in the s390 check. 
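For illustration, a sketch of the kind of hotremove check described here; the function and structure names are invented rather than copied from the s390 code, and start/end are treated as physical addresses as the commit message suggests.

        #include <linux/cma.h>
        #include <linux/pfn.h>

        struct range_check {
                unsigned long start;    /* physical start of the range */
                unsigned long end;      /* physical end of the range   */
        };

        /* cma_for_each_area() callback: a non-zero return stops the walk. */
        static int overlaps_cma(struct cma *cma, void *data)
        {
                struct range_check *r = data;

                return cma_intersects(cma, r->start, r->end);
        }

        static bool range_is_removable(unsigned long start_pfn, unsigned long nr_pages)
        {
                struct range_check r = {
                        .start = PFN_PHYS(start_pfn),
                        .end   = PFN_PHYS(start_pfn + nr_pages),
                };

                return !cma_for_each_area(overlaps_cma, &r);
        }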
Link: https://lkml.kernel.org/r/20250228182928.2645936-4-fvdl@google.com Signed-off-by: Frank van der Linden Acked-by: Alexander Gordeev Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/cma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index 863427c27dc2..03d85c100dcc 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -53,6 +53,7 @@ extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); +extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end); extern void cma_reserve_pages_on_error(struct cma *cma); -- cgit v1.2.3 From 5b47c02967ab770aa7661c8863a21b2fd59e35ff Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:08 +0000 Subject: mm/hugetlb: convert cmdline parameters from setup to early Convert the cmdline parameters (hugepagesz, hugepages, default_hugepagesz and hugetlb_free_vmemmap) to early parameters. Since parse_early_param might run before MMU setups on some platforms (powerpc), validation of huge page sizes as specified in command line parameters would fail. So instead, for the hstate-related values, just record the them and parse them on demand, from hugetlb_bootmem_alloc. The allocation of hugetlb bootmem pages is now done in hugetlb_bootmem_alloc, which is called explicitly at the start of mm_core_init(). core_initcall would be too late, as that happens with memblock already torn down. This change will allow earlier allocation and initialization of bootmem hugetlb pages later on. No functional change intended. 
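A simplified sketch of the record-now, parse-later pattern the conversion relies on; record_hugepages and hugepages_requested are stand-in names, and the real handlers keep considerably more per-hstate state before hugetlb_bootmem_alloc() acts on it from mm_core_init().

        #include <linux/init.h>
        #include <linux/kernel.h>

        static unsigned long hugepages_requested __initdata;

        static int __init record_hugepages(char *s)
        {
                /*
                 * parse_early_param() may run before the MMU is fully set up
                 * on some platforms, so only record the request here; it is
                 * validated on demand later, from hugetlb_bootmem_alloc().
                 */
                return kstrtoul(s, 10, &hugepages_requested);
        }
        early_param("hugepages", record_hugepages);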
Link: https://lkml.kernel.org/r/20250228182928.2645936-8-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 76a75ec03dd6..a596aaa178d1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -174,6 +174,8 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages[MAX_NUMNODES]; +void hugetlb_bootmem_alloc(void); + /* arch callbacks */ #ifndef CONFIG_HIGHPTE @@ -1257,6 +1259,10 @@ static inline bool hugetlbfs_pagecache_present( { return false; } + +static inline void hugetlb_bootmem_alloc(void) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, -- cgit v1.2.3 From 243a75e236802614075511266c8d1834d4bcffe9 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:10 +0000 Subject: mm/bootmem_info: export register_page_bootmem_memmap If other mm code wants to use this function for early memmap inialization (on the platforms that have it), it should be made available properly, not just unconditionally in mm.h Make this function available for such cases. Link: https://lkml.kernel.org/r/20250228182928.2645936-10-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/bootmem_info.h | 7 +++++++ include/linux/mm.h | 3 --- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index d8a8d245824a..4c506e76a808 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -18,6 +18,8 @@ enum bootmem_type { #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE void __init register_page_bootmem_info_node(struct pglist_data *pgdat); +void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, + unsigned long nr_pages); void get_page_bootmem(unsigned long info, struct page *page, enum bootmem_type type); @@ -58,6 +60,11 @@ static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } +static inline void register_page_bootmem_memmap(unsigned long section_nr, + struct page *map, unsigned long nr_pages) +{ +} + static inline void put_page_bootmem(struct page *page) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index 14115c9949d8..e90ab0bdcc8c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4018,9 +4018,6 @@ static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, } #endif -void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, - unsigned long nr_pages); - enum mf_flags { MF_COUNT_INCREASED = 1 << 0, MF_ACTION_REQUIRED = 
1 << 1, -- cgit v1.2.3 From d65917c42373f70159a3fc453f8f028fd665e04f Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:11 +0000 Subject: mm/sparse: allow for alternate vmemmap section init at boot Add functions that are called just before the per-section memmap is initialized and just before the memmap page structures are initialized. They are called sparse_vmemmap_init_nid_early and sparse_vmemmap_init_nid_late, respectively. This allows for mm subsystems to add calls to initialize memmap and page structures in a specific way, if using SPARSEMEM_VMEMMAP. Specifically, hugetlb can pre-HVO bootmem allocated pages that way, so that no time and resources are wasted on allocating vmemmap pages, only to free them later (and possibly unnecessarily running the system out of memory in the process). Refactor some code and export a few convenience functions for external use. In sparse_init_nid, skip any sections that are already initialized, e.g. they have been initialized by sparse_vmemmap_init_nid_early already. The hugetlb code to use these functions will be added in a later commit. Export section_map_size, as any alternate memmap init code will want to use it. The internal config option to enable this is SPARSEMEM_VMEMMAP_PREINIT, which is selected if an architecture-specific option, ARCH_WANT_HUGETLB_VMEMMAP_PREINIT, is set. In the future, if other subsystems want to do preinit too, they can do it in a similar fashion. The internal config option is there because a section flag is used, and the number of flags available is architecture-dependent (see mmzone.h). Architecures can decide if there is room for the flag when enabling options that select SPARSEMEM_VMEMMAP_PREINIT. Fortunately, as of right now, all sparse vmemmap using architectures do have room. 
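A rough sketch of the skip logic this enables; the helper below is hypothetical, only preinited_vmemmap_section() and the two sparse_vmemmap_init_nid_{early,late}() hooks come from the patch.

        #include <linux/mmzone.h>

        /*
         * A section whose memmap was already populated by
         * sparse_vmemmap_init_nid_early() must not be populated again by the
         * regular sparse_init_nid() path.
         */
        static bool my_section_needs_memmap(unsigned long pnum)
        {
                return !preinited_vmemmap_section(__nr_to_section(pnum));
        }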
Link: https://lkml.kernel.org/r/20250228182928.2645936-11-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Johannes Weiner Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + include/linux/mmzone.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e90ab0bdcc8c..03e4807bd911 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3928,6 +3928,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) #endif void *sparse_buffer_alloc(unsigned long size); +unsigned long section_map_size(void); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9540b41894da..44ecb2f90db4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1933,6 +1933,9 @@ enum { SECTION_IS_EARLY_BIT, #ifdef CONFIG_ZONE_DEVICE SECTION_TAINT_ZONE_DEVICE_BIT, +#endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT + SECTION_IS_VMEMMAP_PREINIT_BIT, #endif SECTION_MAP_LAST_BIT, }; @@ -1944,6 +1947,9 @@ enum { #ifdef CONFIG_ZONE_DEVICE #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT +#define SECTION_IS_VMEMMAP_PREINIT BIT(SECTION_IS_VMEMMAP_PREINIT_BIT) +#endif #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT @@ -1998,6 +2004,30 @@ static inline int online_device_section(struct mem_section *section) } #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT +static inline int preinited_vmemmap_section(struct mem_section *section) +{ + return (section && + (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); +} + +void sparse_vmemmap_init_nid_early(int nid); +void sparse_vmemmap_init_nid_late(int nid); + +#else +static inline int preinited_vmemmap_section(struct mem_section *section) +{ + return 0; +} +static inline void sparse_vmemmap_init_nid_early(int nid) +{ +} + +static inline void sparse_vmemmap_init_nid_late(int nid) +{ +} +#endif + static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); @@ -2035,6 +2065,9 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) } #endif +void sparse_init_early_section(int nid, struct page *map, unsigned long pnum, + unsigned long flags); + #ifndef CONFIG_HAVE_ARCH_PFN_VALID /** * pfn_valid - check if there is a valid memory map entry for a PFN @@ -2116,6 +2149,8 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) +#define sparse_vmemmap_init_nid_early(_nid, _use) do {} while (0) +#define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -- cgit v1.2.3 From 9eb6207b7812cee13431831c9661d21b53f3e93f Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:15 +0000 Subject: mm/sparse: add vmemmap_*_hvo functions Add a few functions to enable early 
HVO: vmemmap_populate_hvo vmemmap_undo_hvo vmemmap_wrprotect_hvo The populate and undo functions are expected to be used in early init, from the sparse_init_nid_early() function. The wrprotect function is to be used, potentially, later. To implement these functions, mostly re-use the existing compound pages vmemmap logic used by DAX. vmemmap_populate_address has its argument changed a bit in this commit: the page structure passed in to be reused in the mapping is replaced by a PFN and a flag. The flag indicates whether an extra ref should be taken on the vmemmap page containing the head page structure. Taking the ref is appropriate to for DAX / ZONE_DEVICE, but not for HugeTLB HVO. The HugeTLB vmemmap optimization maps tail page structure pages read-only. The vmemmap_wrprotect_hvo function that does this is implemented separately, because it cannot be guaranteed that reserved page structures will not be write accessed during memory initialization. Even with CONFIG_DEFERRED_STRUCT_PAGE_INIT, they might still be written to (if they are at the bottom of a zone). So, vmemmap_populate_hvo leaves the tail page structure pages RW initially, and then later during initialization, after memmap init is fully done, vmemmap_wrprotect_hvo must be called to finish the job. Subsequent commits will use these functions for early HugeTLB HVO. Link: https://lkml.kernel.org/r/20250228182928.2645936-15-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 03e4807bd911..9a74a3ee68bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3937,7 +3937,8 @@ p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, - struct vmem_altmap *altmap, struct page *reuse); + struct vmem_altmap *altmap, unsigned long ptpfn, + unsigned long flags); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, @@ -3953,6 +3954,12 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); +int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); +void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, -- cgit v1.2.3 From d58b2498200724e4f8c12d71a5953da03c8c8bdf Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:16 +0000 Subject: mm/hugetlb: deal with multiple calls to hugetlb_bootmem_alloc 
Architectures that want pre-HVO of hugetlb vmemmap pages will need to call hugetlb_bootmem_alloc from an earlier spot in boot (before sparse_init). To facilitate some architectures doing this, protect hugetlb_bootmem_alloc against multiple calls. Also provide a helper function to check if it's been called, so that the early HVO code, to be added later, can see if there is anything to do. Link: https://lkml.kernel.org/r/20250228182928.2645936-16-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a596aaa178d1..f0ab4ca4ecf2 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -175,6 +175,7 @@ extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); +bool hugetlb_bootmem_allocated(void); /* arch callbacks */ @@ -1263,6 +1264,11 @@ static inline bool hugetlbfs_pagecache_present( static inline void hugetlb_bootmem_alloc(void) { } + +static inline bool hugetlb_bootmem_allocated(void) +{ + return false; +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, -- cgit v1.2.3 From 752fe17af693323dba5622b19c858a46fed219a1 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:18 +0000 Subject: mm/hugetlb: add pre-HVO framework Define flags for pre-HVOed bootmem hugetlb pages, and act on them. The most important flag is the HVO flag, signalling that a bootmem allocated gigantic page has already been HVO-ed. If this flag is seen by the hugetlb bootmem gather code, the page is marked as HVO optimized. The HVO code will then not try to optimize it again. Instead, it will just map the tail page mirror pages read-only, completing the HVO steps. No functional change, as nothing sets the flags yet. 
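Illustrative only; a sketch of the gather-time decision the new flag enables, with a made-up function name around the real HUGE_BOOTMEM_HVO bit. A page carrying the flag already had its tail vmemmap trimmed, so only the read-only remapping of the remaining mirror pages is left to do.

        #include <linux/hugetlb.h>

        static bool my_bootmem_page_needs_full_hvo(struct huge_bootmem_page *m)
        {
                return !(m->flags & HUGE_BOOTMEM_HVO);
        }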
Link: https://lkml.kernel.org/r/20250228182928.2645936-18-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f0ab4ca4ecf2..bbccc3e6b9dd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -681,8 +681,12 @@ struct hstate { struct huge_bootmem_page { struct list_head list; struct hstate *hstate; + unsigned long flags; }; +#define HUGE_BOOTMEM_HVO 0x0001 +#define HUGE_BOOTMEM_ZONES_VALID 0x0002 + int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); -- cgit v1.2.3 From b1222550fbf73840e31a103305d03ad53b8f5a59 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:20 +0000 Subject: mm/hugetlb: do pre-HVO for bootmem allocated pages For large systems, the overhead of vmemmap pages for hugetlb is substantial. It's about 1.5% of memory, which is about 45G for a 3T system. If you want to configure most of that system for hugetlb (e.g. to use as backing memory for VMs), there is a chance of running out of memory on boot, even though you know that the 45G will become available later. To avoid this scenario, and since it's a waste to first allocate and then free that 45G during boot, do pre-HVO for hugetlb bootmem allocated pages ('gigantic' pages). pre-HVO is done by adding functions that are called from sparse_init_nid_early and sparse_init_nid_late. The first is called before memmap allocation, so it takes care of allocating memmap HVO-style. The second verifies that all bootmem pages look good, specifically it checks that they do not intersect with multiple zones. This can only be done from sparse_init_nid_late path, when zones have been initialized. The hugetlb page size must be aligned to the section size, and aligned to the size of memory described by the number of page structures contained in one PMD (since pre-HVO is not prepared to split PMDs). This should be true for most 'gigantic' pages, it is for 1G pages on x86, where both of these alignment requirements are 128M. This will only have an effect if hugetlb_bootmem_alloc was called early in boot. If not, it won't do anything, and HVO for bootmem hugetlb pages works as before. 
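To illustrate how the HUGE_BOOTMEM_HVO flag defined in the pre-HVO framework diff above is meant to be consumed, here is a sketch of the check the bootmem gather code performs (the function name is invented; this is not the actual mm/hugetlb.c code):

static void __init gather_one_bootmem_page(struct huge_bootmem_page *m,
					   struct folio *folio)
{
	if (m->flags & HUGE_BOOTMEM_HVO) {
		/*
		 * The vmemmap was already optimized when the page was
		 * allocated; record that so the HVO code does not try to
		 * optimize it again and only write-protects the tail
		 * mirror pages.
		 */
		folio_set_hugetlb_vmemmap_optimized(folio);
	}

	/* ... add the folio to the hstate's boot-time pool ... */
}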
Link: https://lkml.kernel.org/r/20250228182928.2645936-20-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index bbccc3e6b9dd..f6b82b0524ed 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -687,6 +687,8 @@ struct huge_bootmem_page { #define HUGE_BOOTMEM_HVO 0x0001 #define HUGE_BOOTMEM_ZONES_VALID 0x0002 +bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); + int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); -- cgit v1.2.3 From 9320fa2717810a7d3451dd1a18c31f986a1d4068 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:24 +0000 Subject: mm/cma: introduce a cma validate function Define a function to check if a CMA area is valid, which means: do its ranges not cross any zone boundaries. Store the result in the newly created flags for each CMA area, so that multiple calls are dealt with. This allows for checking the validity of a CMA area early, which is needed later in order to be able to allocate hugetlb bootmem pages from it with pre-HVO. Link: https://lkml.kernel.org/r/20250228182928.2645936-24-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/cma.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index 03d85c100dcc..62d9c1cf6326 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -60,6 +60,7 @@ extern void cma_reserve_pages_on_error(struct cma *cma); #ifdef CONFIG_CMA struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp); bool cma_free_folio(struct cma *cma, const struct folio *folio); +bool cma_validate_zones(struct cma *cma); #else static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) { @@ -70,6 +71,10 @@ static inline bool cma_free_folio(struct cma *cma, const struct folio *folio) { return false; } +static inline bool cma_validate_zones(struct cma *cma) +{ + return false; +} #endif #endif -- cgit v1.2.3 From d2d78671408061b5332d9ad357686812bbec5070 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:27 +0000 Subject: mm/hugetlb: enable bootmem allocation from CMA areas If hugetlb_cma_only is enabled, we know that hugetlb pages can only be allocated from CMA. Now that there is an interface to do early reservations from a CMA area (returning memblock memory), it can be used to allocate hugetlb pages from CMA. This also allows for doing pre-HVO on these pages (if enabled). 
Make sure to initialize the page structures and associated data correctly. Create a flag to signal that a hugetlb page has been allocated from CMA to make things a little easier. Some configurations of powerpc have a special hugetlb bootmem allocator, so introduce a boolean arch_specific_huge_bootmem_alloc that returns true if such an allocator is present. In that case, CMA bootmem allocations can't be used, so check that function before trying. Link: https://lkml.kernel.org/r/20250228182928.2645936-27-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f6b82b0524ed..8f3ac832ee7f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -591,6 +591,7 @@ enum hugetlb_page_flags { HPG_freed, HPG_vmemmap_optimized, HPG_raw_hwp_unreliable, + HPG_cma, __NR_HPAGEFLAGS, }; @@ -650,6 +651,7 @@ HPAGEFLAG(Temporary, temporary) HPAGEFLAG(Freed, freed) HPAGEFLAG(VmemmapOptimized, vmemmap_optimized) HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) +HPAGEFLAG(Cma, cma) #ifdef CONFIG_HUGETLB_PAGE @@ -678,14 +680,18 @@ struct hstate { char name[HSTATE_NAME_LEN]; }; +struct cma; + struct huge_bootmem_page { struct list_head list; struct hstate *hstate; unsigned long flags; + struct cma *cma; }; #define HUGE_BOOTMEM_HVO 0x0001 #define HUGE_BOOTMEM_ZONES_VALID 0x0002 +#define HUGE_BOOTMEM_CMA 0x0004 bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); @@ -824,6 +830,17 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, } #endif +#ifndef arch_has_huge_bootmem_alloc +/* + * Some architectures do their own bootmem allocation, so they can't use + * early CMA allocation. + */ +static inline bool arch_has_huge_bootmem_alloc(void) +{ + return false; +} +#endif + static inline struct hstate *folio_hstate(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); -- cgit v1.2.3 From f809b9f3046f702bc1ae40695acb27c1b0abc346 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 19 Feb 2025 14:01:45 -0800 Subject: mm/damon: implement a new DAMOS filter type for unmapped pages Patch series "mm/damon: introduce DAMOS filter type for unmapped pages". User decides whether their memory will be mapped or unmapped. It implies that the two types of memory can have different characteristics and management requirements. Provide the DAMON-observaibility DAMOS-operation capability for the different types by introducing a new DAMOS filter type for unmapped pages. This patch (of 2): Implement yet another DAMOS filter type for unmapped pages on DAMON kernel API, and add support of it from the physical address space DAMON operations set (paddr). Since it is for only unmapped pages, support from the virtual address spaces DAMON operations set (vaddr) is not required. 
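Stepping back to the hugetlb/CMA patch above: a sketch of how an architecture with its own bootmem hugetlb allocator might override the arch_has_huge_bootmem_alloc() hook (purely illustrative; any real override would live in that architecture's asm/hugetlb.h):

/* In <asm/hugetlb.h> of the architecture: */
#define arch_has_huge_bootmem_alloc arch_has_huge_bootmem_alloc

static inline bool arch_has_huge_bootmem_alloc(void)
{
	/*
	 * Returning true tells the generic code not to attempt early CMA
	 * allocation of bootmem hugetlb pages on this platform.  The
	 * condition is a placeholder; real code would check the platform.
	 */
	return true;
}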
Link: https://lkml.kernel.org/r/20250219220146.133650-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250219220146.133650-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5e7ae7bca5dc..242910b190c9 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -337,6 +337,7 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_YOUNG: Recently accessed pages. * @DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: Page is part of a hugepage. + * @DAMOS_FILTER_TYPE_UNMAPPED: Unmapped pages. * @DAMOS_FILTER_TYPE_ADDR: Address range. * @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target. * @NR_DAMOS_FILTER_TYPES: Number of filter types. @@ -357,6 +358,7 @@ enum damos_filter_type { DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_YOUNG, DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, + DAMOS_FILTER_TYPE_UNMAPPED, DAMOS_FILTER_TYPE_ADDR, DAMOS_FILTER_TYPE_TARGET, NR_DAMOS_FILTER_TYPES, -- cgit v1.2.3 From 58abac769b05f6e972d34a02a46a04589d6e9853 Mon Sep 17 00:00:00 2001 From: Liu Ye Date: Wed, 12 Feb 2025 10:58:42 +0800 Subject: mm/folio_queue: delete __folio_order and use folio_order directly __folio_order is the same as folio_order, remove __folio_order and then just include mm.h and use folio_order directly. Link: https://lkml.kernel.org/r/20250212025843.80283-2-liuye@kylinos.cn Signed-off-by: Liu Ye Reviewed-by: Shivank Garg Reviewed-by: Dev Jain Acked-by: David Howells Cc: Christian Brauner Signed-off-by: Andrew Morton --- include/linux/folio_queue.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 4d3f8074c137..45ad2408a80c 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -15,6 +15,7 @@ #define _LINUX_FOLIO_QUEUE_H #include +#include /* * Segment in a queue of running buffers. Each segment can hold a number of @@ -216,13 +217,6 @@ static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot) clear_bit(slot, &folioq->marks3); } -static inline unsigned int __folio_order(struct folio *folio) -{ - if (!folio_test_large(folio)) - return 0; - return folio->_flags_1 & 0xff; -} - /** * folioq_append: Add a folio to a folio queue segment * @folioq: The segment to add to @@ -241,7 +235,7 @@ static inline unsigned int folioq_append(struct folio_queue *folioq, struct foli unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; - folioq->orders[slot] = __folio_order(folio); + folioq->orders[slot] = folio_order(folio); return slot; } @@ -263,7 +257,7 @@ static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; - folioq->orders[slot] = __folio_order(folio); + folioq->orders[slot] = folio_order(folio); folioq_mark(folioq, slot); return slot; } -- cgit v1.2.3 From 44f76413496ec343da0d8292ceecdcabe3e6ec16 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 3 Mar 2025 11:03:23 +0900 Subject: zsmalloc: introduce new object mapping API Current object mapping API is a little cumbersome. First, it's inconsistent, sometimes it returns with page-faults disabled and sometimes with page-faults enabled. Second, and most importantly, it enforces atomicity restrictions on its users. 
zs_map_object() has to return a liner object address which is not always possible because some objects span multiple physical (non-contiguous) pages. For such objects zsmalloc uses a per-CPU buffer to which object's data is copied before a pointer to that per-CPU buffer is returned back to the caller. This leads to another, final, issue - extra memcpy(). Since the caller gets a pointer to per-CPU buffer it can memcpy() data only to that buffer, and during zs_unmap_object() zsmalloc will memcpy() from that per-CPU buffer to physical pages that object in question spans across. New API splits functions by access mode: - zs_obj_read_begin(handle, local_copy) Returns a pointer to handle memory. For objects that span two physical pages a local_copy buffer is used to store object's data before the address is returned to the caller. Otherwise the object's page is kmap_local mapped directly. - zs_obj_read_end(handle, buf) Unmaps the page if it was kmap_local mapped by zs_obj_read_begin(). - zs_obj_write(handle, buf, len) Copies len-bytes from compression buffer to handle memory (takes care of objects that span two pages). This does not need any additional (e.g. per-CPU) buffers and writes the data directly to zsmalloc pool pages. In terms of performance, on a synthetic and completely reproducible test that allocates fixed number of objects of fixed sizes and iterates over those objects, first mapping in RO then in RW mode: OLD API ======= 3 first results out of 10 369,205,778 instructions # 0.80 insn per cycle 40,467,926 branches # 113.732 M/sec 369,002,122 instructions # 0.62 insn per cycle 40,426,145 branches # 189.361 M/sec 369,036,706 instructions # 0.63 insn per cycle 40,430,860 branches # 204.105 M/sec [..] NEW API ======= 3 first results out of 10 265,799,293 instructions # 0.51 insn per cycle 29,834,567 branches # 170.281 M/sec 265,765,970 instructions # 0.55 insn per cycle 29,829,019 branches # 161.602 M/sec 265,764,702 instructions # 0.51 insn per cycle 29,828,015 branches # 189.677 M/sec [..] T-test on all 10 runs ===================== Difference at 95.0% confidence -1.03219e+08 +/- 55308.7 -27.9705% +/- 0.0149878% (Student's t, pooled s = 58864.4) The old API will stay around until the remaining users switch to the new one. After that we'll also remove zsmalloc per-CPU buffer and CPU hotplug handling. The split of map(RO) and map(WO) into read_{begin/end}/write is suggested by Yosry Ahmed. 
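A usage sketch of the new read and write entry points described above (the prototypes follow in the diff below). This is illustrative code, not the actual zram conversion; note that local_copy must be large enough to hold the largest possible object, since it is only used when an object spans two physical pages.

static void read_compressed_obj(struct zs_pool *pool, unsigned long handle,
				void *dst, size_t len, void *local_copy)
{
	void *src;

	src = zs_obj_read_begin(pool, handle, local_copy);
	memcpy(dst, src, len);
	zs_obj_read_end(pool, handle, src);
}

static void write_compressed_obj(struct zs_pool *pool, unsigned long handle,
				 void *src, size_t len)
{
	/* Writes go straight to the pool pages, no mapping step needed. */
	zs_obj_write(pool, handle, src, len);
}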
Link: https://lkml.kernel.org/r/20250303022425.285971-15-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Suggested-by: Yosry Ahmed Reviewed-by: Yosry Ahmed Cc: Hillf Danton Cc: Kairui Song Cc: Minchan Kim Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/zsmalloc.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index a48cd0ffe57d..7d70983cf398 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -58,4 +58,12 @@ unsigned long zs_compact(struct zs_pool *pool); unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); + +void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, + void *local_copy); +void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, + void *handle_mem); +void zs_obj_write(struct zs_pool *pool, unsigned long handle, + void *handle_mem, size_t mem_len); + #endif -- cgit v1.2.3 From 1b7e90020eb770aa52991a34f21552b4c38ea690 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 14 Mar 2025 00:59:33 +0800 Subject: mm, swap: use percpu cluster as allocation fast path Current allocation workflow first traverses the plist with a global lock held, after choosing a device, it uses the percpu cluster on that swap device. This commit moves the percpu cluster variable out of being tied to individual swap devices, making it a global percpu variable, and will be used directly for allocation as a fast path. The global percpu cluster variable will never point to a HDD device, and allocations on a HDD device are still globally serialized. This improves the allocator performance and prepares for removal of the slot cache in later commits. There shouldn't be much observable behavior change, except one thing: this changes how swap device allocation rotation works. Currently, each allocation will rotate the plist, and because of the existence of slot cache (one order 0 allocation usually returns 64 entries), swap devices of the same priority are rotated for every 64 order 0 entries consumed. High order allocations are different, they will bypass the slot cache, and so swap device is rotated for every 16K, 32K, or up to 2M allocation. The rotation rule was never clearly defined or documented, it was changed several times without mentioning. After this commit, and once slot cache is gone in later commits, swap device rotation will happen for every consumed cluster. Ideally non-HDD devices will be rotated if 2M space has been consumed for each order. Fragmented clusters will rotate the device faster, which seems OK. HDD devices is rotated for every allocation regardless of the allocation order, which should be OK too and trivial. This commit also slightly changes allocation behaviour for slot cache. The new added cluster allocation fast path may allocate entries from different device to the slot cache, this is not observable from user space, only impact performance very slightly, and slot cache will be just gone in next commit, so this can be ignored. 
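A conceptual sketch of the fast path described above. All names here (struct percpu_swap_cluster, alloc_from_percpu_cluster(), slow_path_alloc()) are invented for illustration; the real logic lives in mm/swapfile.c.

struct percpu_swap_cluster {
	local_lock_t lock;
	/* per-order next offsets, preferred device, ... */
};

static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static swp_entry_t swap_alloc(int order)
{
	swp_entry_t entry = (swp_entry_t){ 0 };

	/* Try the global per-CPU cluster first, without the plist lock. */
	local_lock(&percpu_swap_cluster.lock);
	entry = alloc_from_percpu_cluster(order);
	local_unlock(&percpu_swap_cluster.lock);

	/* Fall back to the locked plist scan, which rotates devices. */
	if (!entry.val)
		entry = slow_path_alloc(order);

	return entry;
}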
Link: https://lkml.kernel.org/r/20250313165935.63303-6-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kalesh Singh Cc: Matthew Wilcow (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe91c293636..374bffc87427 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -284,12 +284,10 @@ enum swap_cluster_flags { #endif /* - * We assign a cluster to each CPU, so each CPU can allocate swap entry from - * its own cluster and swapout sequentially. The purpose is to optimize swapout - * throughput. + * We keep using same cluster for rotational device so IO will be sequential. + * The purpose is to optimize SWAP throughput on these device. */ -struct percpu_cluster { - local_lock_t lock; /* Protect the percpu_cluster above */ +struct swap_sequential_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; @@ -315,8 +313,7 @@ struct swap_info_struct { atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ - struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ - struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ + struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ -- cgit v1.2.3 From 0ff67f990bd45726e0d9e91111d998e7a3595b32 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 14 Mar 2025 00:59:34 +0800 Subject: mm, swap: remove swap slot cache Slot cache is no longer needed now, removing it and all related code. - vm-scalability with: `usemem --init-time -O -y -x -R -31 1G`, 12G memory cgroup using simulated pmem as SWAP (32G pmem, 32 CPUs), 16 test runs for each case, measuring the total throughput: Before (KB/s) (stdev) After (KB/s) (stdev) Random (4K): 424907.60 (24410.78) 414745.92 (34554.78) Random (64K): 163308.82 (11635.72) 167314.50 (18434.99) Sequential (4K, !-R): 6150056.79 (103205.90) 6321469.06 (115878.16) The performance changes are below noise level. - Build linux kernel with make -j96, using 4K folio with 1.5G memory cgroup limit and 64K folio with 2G memory cgroup limit, on top of tmpfs, 12 test runs, measuring the system time: Before (s) (stdev) After (s) (stdev) make -j96 (4K): 6445.69 (61.95) 6408.80 (69.46) make -j96 (64K): 6841.71 (409.04) 6437.99 (435.55) Similar to above, 64k mTHP case showed a slight improvement. 
Link: https://lkml.kernel.org/r/20250313165935.63303-7-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baoquan He Cc: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kalesh Singh Cc: Matthew Wilcow (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 --- include/linux/swap_slots.h | 28 ---------------------------- 2 files changed, 31 deletions(-) delete mode 100644 include/linux/swap_slots.h (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 374bffc87427..c5856dcc263a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -465,7 +465,6 @@ void free_pages_and_swap_cache(struct encoded_page **, int); extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; -extern bool has_usable_swap(void); /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) @@ -483,13 +482,11 @@ swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); -extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h deleted file mode 100644 index 840aec3523b2..000000000000 --- a/include/linux/swap_slots.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SWAP_SLOTS_H -#define _LINUX_SWAP_SLOTS_H - -#include -#include -#include - -#define SWAP_SLOTS_CACHE_SIZE SWAP_BATCH -#define THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE (5*SWAP_SLOTS_CACHE_SIZE) -#define THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE (2*SWAP_SLOTS_CACHE_SIZE) - -struct swap_slots_cache { - bool lock_initialized; - struct mutex alloc_lock; /* protects slots, nr, cur */ - swp_entry_t *slots; - int nr; - int cur; - int n_ret; -}; - -void disable_swap_slots_cache_lock(void); -void reenable_swap_slots_cache_unlock(void); -void enable_swap_slots_cache(void); - -extern bool swap_slot_cache_enabled; - -#endif /* _LINUX_SWAP_SLOTS_H */ -- cgit v1.2.3 From b487a2da3575b6cdfb6d6559311830c8fea70bb9 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 14 Mar 2025 00:59:35 +0800 Subject: mm, swap: simplify folio swap allocation With slot cache gone, clean up the allocation helpers even more. folio_alloc_swap will be the only entry for allocation and adding the folio to swap cache (except suspend), making it opposite of folio_free_swap. 
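To show the new calling convention of folio_alloc_swap() described above, a small sketch (not the actual mm/vmscan.c or mm/shmem.c code; the gfp flags are illustrative):

static bool try_add_to_swap(struct folio *folio)
{
	/*
	 * New convention: 0 on success, in which case the folio has a swap
	 * entry allocated and has been added to the swap cache; a negative
	 * errno on failure.
	 */
	if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
		return false;

	return true;
}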
Link: https://lkml.kernel.org/r/20250313165935.63303-8-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kalesh Singh Cc: Matthew Wilcow (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index c5856dcc263a..9c99eee160f9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -478,7 +478,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -swp_entry_t folio_alloc_swap(struct folio *folio); +int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); @@ -586,11 +586,9 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline swp_entry_t folio_alloc_swap(struct folio *folio) +static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) { - swp_entry_t entry; - entry.val = 0; - return entry; + return -EINVAL; } static inline bool folio_free_swap(struct folio *folio) -- cgit v1.2.3 From 7b54a96f30dec4b812841d179a26e88a7887dc06 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Fri, 31 Jan 2025 17:08:25 +0530 Subject: crash: remove an unused argument from reserve_crashkernel_generic() cmdline argument is not used in reserve_crashkernel_generic() so remove it. Correspondingly, all the callers have been updated as well. No functional change intended. Link: https://lkml.kernel.org/r/20250131113830.925179-3-sourabhjain@linux.ibm.com Signed-off-by: Sourabh Jain Acked-by: Hari Bathini Acked-by: Baoquan He Cc: Madhavan Srinivasan Cc: Mahesh Salgaonkar Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/crash_reserve.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h index 5a9df944fb80..1fe7e7d1b214 100644 --- a/include/linux/crash_reserve.h +++ b/include/linux/crash_reserve.h @@ -32,13 +32,12 @@ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, #define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() #endif -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high); +void __init reserve_crashkernel_generic(unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high); #else -static inline void __init reserve_crashkernel_generic(char *cmdline, +static inline void __init reserve_crashkernel_generic( unsigned long long crash_size, unsigned long long crash_base, unsigned long long crash_low_size, -- cgit v1.2.3 From 9f0552c978f8950b4661f1222290fb57af811a8b Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Fri, 31 Jan 2025 17:08:26 +0530 Subject: crash: let arch decide usable memory range in reserved area Although the crashkernel area is reserved, on architectures like PowerPC, it is possible for the crashkernel reserved area to contain components like RTAS, TCE, OPAL, etc. To avoid placing kexec segments over these components, PowerPC has its own set of APIs to locate holes in the crashkernel reserved area. 
Add an arch hook in the generic locate mem hole APIs so that architectures can handle such special regions in the crashkernel area while locating memory holes for kexec segments using generic APIs. With this, a lot of redundant arch-specific code can be removed, as it performs the exact same job as the generic APIs. To keep the generic and arch-specific changes separate, the changes related to moving PowerPC to use the generic APIs and the removal of PowerPC-specific APIs for memory hole allocation are done in a subsequent patch titled "powerpc/crash: Use generic APIs to locate memory hole for kdump. Link: https://lkml.kernel.org/r/20250131113830.925179-4-sourabhjain@linux.ibm.com Signed-off-by: Sourabh Jain Acked-by: Baoquan He Cc: Hari Bathini Cc: Madhavan Srinivasan Cc: Mahesh Salgaonkar Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/kexec.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f0e9f8eda7a3..407f8b0346aa 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -205,6 +205,15 @@ static inline int arch_kimage_file_post_load_cleanup(struct kimage *image) } #endif +#ifndef arch_check_excluded_range +static inline int arch_check_excluded_range(struct kimage *image, + unsigned long start, + unsigned long end) +{ + return 0; +} +#endif + #ifdef CONFIG_KEXEC_SIG #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len); -- cgit v1.2.3 From 8c6bbda879b62f16bb03321a84554b4f63415c55 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 3 Feb 2025 16:05:22 +0100 Subject: rcu: provide a static initializer for hlist_nulls_head Patch series "ucount: Simplify refcounting with rcuref_t". I noticed that the atomic_dec_and_lock_irqsave() in put_ucounts() loops sometimes even during boot. Something like 2-3 iterations but still. This series replaces the refcounting with rcuref_t and adds a RCU lookup. This allows a lockless lookup in alloc_ucounts() if the entry is available and a cmpxchg()less put of the item. This patch (of 4): Provide a static initializer for hlist_nulls_head so that it can be used in statically defined data structures. Link: https://lkml.kernel.org/r/20250203150525.456525-1-bigeasy@linutronix.de Link: https://lkml.kernel.org/r/20250203150525.456525-2-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Steven Rostedt Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: "Paul E . 
McKenney" Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/list_nulls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index fa6e8471bd22..248db9b77ee2 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -28,6 +28,7 @@ struct hlist_nulls_node { #define NULLS_MARKER(value) (1UL | (((long)value) << 1)) #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) +#define HLIST_NULLS_HEAD_INIT(nulls) {.first = (struct hlist_nulls_node *)NULLS_MARKER(nulls)} #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) -- cgit v1.2.3 From 5f01a22c5b231dd590f61a2591b3090665733bcb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 3 Feb 2025 16:05:24 +0100 Subject: ucount: use RCU for ucounts lookups The ucounts element is looked up under ucounts_lock. This can be optimized by using RCU for a lockless lookup and return and element if the reference can be obtained. Replace hlist_head with hlist_nulls_head which is RCU compatible. Let find_ucounts() search for the required item within a RCU section and return the item if a reference could be obtained. This means alloc_ucounts() will always return an element (unless the memory allocation failed). Let put_ucounts() RCU free the element if the reference counter dropped to zero. Link: https://lkml.kernel.org/r/20250203150525.456525-4-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: Steven Rostedt Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/user_namespace.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 7183e5aca282..ad4dbef92597 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -115,9 +116,10 @@ struct user_namespace { } __randomize_layout; struct ucounts { - struct hlist_node node; + struct hlist_nulls_node node; struct user_namespace *ns; kuid_t uid; + struct rcu_head rcu; atomic_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; -- cgit v1.2.3 From b4dc0bee2a749083028afba346910e198653f42a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 3 Feb 2025 16:05:25 +0100 Subject: ucount: use rcuref_t for reference counting Use rcuref_t for reference counting. This eliminates the cmpxchg loop in the get and put path. This also eliminates the need to acquire the lock in the put path because once the final user returns the reference, it can no longer be obtained anymore. Use rcuref_t for reference counting. Link: https://lkml.kernel.org/r/20250203150525.456525-5-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. 
McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: Steven Rostedt Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/user_namespace.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index ad4dbef92597..a0bb6d012137 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -120,7 +121,7 @@ struct ucounts { struct user_namespace *ns; kuid_t uid; struct rcu_head rcu; - atomic_t count; + rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; @@ -133,9 +134,15 @@ void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); -struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); void put_ucounts(struct ucounts *ucounts); +static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) +{ + if (rcuref_get(&ucounts->count)) + return ucounts; + return NULL; +} + static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); -- cgit v1.2.3 From 318f05a057157c400a92f17c5f2fa3dd96f57c7e Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 17 Feb 2025 21:39:41 +0100 Subject: reboot: replace __hw_protection_shutdown bool action parameter with an enum Patch series "reboot: support runtime configuration of emergency hw_protection action", v3. We currently leave the decision of whether to shutdown or reboot to protect hardware in an emergency situation to the individual drivers. This works out in some cases, where the driver detecting the critical failure has inside knowledge: It binds to the system management controller for example or is guided by hardware description that defines what to do. This is inadequate in the general case though as a driver reporting e.g. an imminent power failure can't know whether a shutdown or a reboot would be more appropriate for a given hardware platform. To address this, this series adds a hw_protection kernel parameter and sysfs toggle that can be used to change the action from the shutdown default to reboot. A new hw_protection_trigger API then makes use of this default action. My particular use case is unattended embedded systems that don't have support for shutdown and that power on automatically when power is supplied: - A brief power cycle gets detected by the driver - The kernel powers down the system and SoC goes into shutdown mode - Power is restored - The system remains oblivious to the restored power - System needs to be manually power cycled for a duration long enough to drain the capacitors With this series, such systems can configure the kernel with hw_protection=reboot to have the boot firmware worry about critical conditions. This patch (of 12): Currently __hw_protection_shutdown() either reboots or shuts down the system according to its shutdown argument. To make the logic easier to follow, both inside __hw_protection_shutdown and at caller sites, lets replace the bool parameter with an enum. 
This will be extra useful, when in a later commit, a third action is added to the enumeration. No functional change. Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-0-e1c09b090c0c@pengutronix.de Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-1-e1c09b090c0c@pengutronix.de Signed-off-by: Ahmad Fatoum Reviewed-by: Tzung-Bi Shih Cc: Benson Leung Cc: Mark Brown Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Liam Girdwood Cc: Lukasz Luba Cc: Matteo Croce Cc: Matti Vaittinen Cc: "Rafael J. Wysocki" Cc: Rob Herring Cc: Rui Zhang Cc: Sascha Hauer Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton --- include/linux/reboot.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index abcdde4df697..e97f6b8e8586 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -177,16 +177,28 @@ void ctrl_alt_del(void); extern void orderly_poweroff(bool force); extern void orderly_reboot(void); -void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown); + +/** + * enum hw_protection_action - Hardware protection action + * + * @HWPROT_ACT_SHUTDOWN: + * The system should be shut down (powered off) for HW protection. + * @HWPROT_ACT_REBOOT: + * The system should be rebooted for HW protection. + */ +enum hw_protection_action { HWPROT_ACT_SHUTDOWN, HWPROT_ACT_REBOOT }; + +void __hw_protection_shutdown(const char *reason, int ms_until_forced, + enum hw_protection_action action); static inline void hw_protection_reboot(const char *reason, int ms_until_forced) { - __hw_protection_shutdown(reason, ms_until_forced, false); + __hw_protection_shutdown(reason, ms_until_forced, HWPROT_ACT_REBOOT); } static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) { - __hw_protection_shutdown(reason, ms_until_forced, true); + __hw_protection_shutdown(reason, ms_until_forced, HWPROT_ACT_SHUTDOWN); } /* -- cgit v1.2.3 From 81cab0f94ab864f13e29e561382d722daec6a55a Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 17 Feb 2025 21:39:45 +0100 Subject: reboot: rename now misleading __hw_protection_shutdown symbols The __hw_protection_shutdown function name has become misleading since it can cause either a shutdown (poweroff) or a reboot depending on its argument. To avoid further confusion, let's rename it, so it doesn't suggest that a poweroff is all it can do. Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-5-e1c09b090c0c@pengutronix.de Signed-off-by: Ahmad Fatoum Reviewed-by: Tzung-Bi Shih Cc: Benson Leung Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Liam Girdwood Cc: Lukasz Luba Cc: Mark Brown Cc: Matteo Croce Cc: Matti Vaittinen Cc: "Rafael J. Wysocki" Cc: Rob Herring (Arm) Cc: Rui Zhang Cc: Sascha Hauer Cc: "Serge E. 
Hallyn" Signed-off-by: Andrew Morton --- include/linux/reboot.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index e97f6b8e8586..53c64e31b3cf 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -188,17 +188,17 @@ extern void orderly_reboot(void); */ enum hw_protection_action { HWPROT_ACT_SHUTDOWN, HWPROT_ACT_REBOOT }; -void __hw_protection_shutdown(const char *reason, int ms_until_forced, - enum hw_protection_action action); +void __hw_protection_trigger(const char *reason, int ms_until_forced, + enum hw_protection_action action); static inline void hw_protection_reboot(const char *reason, int ms_until_forced) { - __hw_protection_shutdown(reason, ms_until_forced, HWPROT_ACT_REBOOT); + __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_REBOOT); } static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) { - __hw_protection_shutdown(reason, ms_until_forced, HWPROT_ACT_SHUTDOWN); + __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_SHUTDOWN); } /* -- cgit v1.2.3 From e016173f656b89f5f71b7d45dfc599ada8107eef Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 17 Feb 2025 21:39:47 +0100 Subject: reboot: add support for configuring emergency hardware protection action We currently leave the decision of whether to shutdown or reboot to protect hardware in an emergency situation to the individual drivers. This works out in some cases, where the driver detecting the critical failure has inside knowledge: It binds to the system management controller for example or is guided by hardware description that defines what to do. In the general case, however, the driver detecting the issue can't know what the appropriate course of action is and shouldn't be dictating the policy of dealing with it. Therefore, add a global hw_protection toggle that allows the user to specify whether shutdown or reboot should be the default action when the driver doesn't set policy. This introduces no functional change yet as hw_protection_trigger() has no callers, but these will be added in subsequent commits. [arnd@arndb.de: hide unused hw_protection_attr] Link: https://lkml.kernel.org/r/20250224141849.1546019-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-7-e1c09b090c0c@pengutronix.de Signed-off-by: Ahmad Fatoum Reviewed-by: Tzung-Bi Shih Cc: Benson Leung Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Liam Girdwood Cc: Lukasz Luba Cc: Mark Brown Cc: Matteo Croce Cc: Matti Vaittinen Cc: "Rafael J. Wysocki" Cc: Rob Herring (Arm) Cc: Rui Zhang Cc: Sascha Hauer Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton --- include/linux/reboot.h | 22 +++++++++++++++++++++- include/uapi/linux/capability.h | 1 + 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 53c64e31b3cf..79e02876f2ba 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -181,16 +181,36 @@ extern void orderly_reboot(void); /** * enum hw_protection_action - Hardware protection action * + * @HWPROT_ACT_DEFAULT: + * The default action should be taken. This is HWPROT_ACT_SHUTDOWN + * by default, but can be overridden. * @HWPROT_ACT_SHUTDOWN: * The system should be shut down (powered off) for HW protection. * @HWPROT_ACT_REBOOT: * The system should be rebooted for HW protection. 
*/ -enum hw_protection_action { HWPROT_ACT_SHUTDOWN, HWPROT_ACT_REBOOT }; +enum hw_protection_action { HWPROT_ACT_DEFAULT, HWPROT_ACT_SHUTDOWN, HWPROT_ACT_REBOOT }; void __hw_protection_trigger(const char *reason, int ms_until_forced, enum hw_protection_action action); +/** + * hw_protection_trigger - Trigger default emergency system hardware protection action + * + * @reason: Reason of emergency shutdown or reboot to be printed. + * @ms_until_forced: Time to wait for orderly shutdown or reboot before + * triggering it. Negative value disables the forced + * shutdown or reboot. + * + * Initiate an emergency system shutdown or reboot in order to protect + * hardware from further damage. The exact action taken is controllable at + * runtime and defaults to shutdown. + */ +static inline void hw_protection_trigger(const char *reason, int ms_until_forced) +{ + __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_DEFAULT); +} + static inline void hw_protection_reboot(const char *reason, int ms_until_forced) { __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_REBOOT); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 5bb906098697..2e21b5594f81 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -275,6 +275,7 @@ struct vfs_ns_cap_data { /* Allow setting encryption key on loopback filesystem */ /* Allow setting zone reclaim policy */ /* Allow everything under CAP_BPF and CAP_PERFMON for backward compatibility */ +/* Allow setting hardware protection emergency action */ #define CAP_SYS_ADMIN 21 -- cgit v1.2.3 From 5e40d0d196595874c3f0606f0642021e6ade8054 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 17 Feb 2025 21:39:52 +0100 Subject: reboot: retire hw_protection_reboot and hw_protection_shutdown helpers The hw_protection_reboot and hw_protection_shutdown functions mix mechanism with policy: They let the driver requesting an emergency action for hardware protection also decide how to deal with it. This is inadequate in the general case as a driver reporting e.g. an imminent power failure can't know whether a shutdown or a reboot would be more appropriate for a given hardware platform. With the addition of the hw_protection parameter, it's now possible to configure at runtime the default emergency action and drivers are expected to use hw_protection_trigger to have this parameter dictate policy. As no current users of either hw_protection_shutdown or hw_protection_shutdown helpers remain, remove them, as not to tempt driver authors to call them. Existing users now either defer to hw_protection_trigger or call __hw_protection_trigger with a suitable argument directly when they have inside knowledge on whether a reboot or shutdown would be more appropriate. Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-12-e1c09b090c0c@pengutronix.de Signed-off-by: Ahmad Fatoum Reviewed-by: Tzung-Bi Shih Cc: Benson Leung Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Liam Girdwood Cc: Lukasz Luba Cc: Mark Brown Cc: Matteo Croce Cc: Matti Vaittinen Cc: "Rafael J. Wysocki" Cc: Rob Herring (Arm) Cc: Rui Zhang Cc: Sascha Hauer Cc: "Serge E. 
Hallyn" Signed-off-by: Andrew Morton --- include/linux/reboot.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 79e02876f2ba..aa08c3bbbf59 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -211,16 +211,6 @@ static inline void hw_protection_trigger(const char *reason, int ms_until_forced __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_DEFAULT); } -static inline void hw_protection_reboot(const char *reason, int ms_until_forced) -{ - __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_REBOOT); -} - -static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) -{ - __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_SHUTDOWN); -} - /* * Emergency restart, callable from an interrupt handler. */ -- cgit v1.2.3 From 0ac451ecec6dd516fdac1ca064467e753ef6ae1a Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 16 Feb 2025 00:56:18 +0800 Subject: lib min_heap: use size_t for array size and index variables Replace the int type with size_t for variables representing array sizes and indices in the min-heap implementation. Using size_t aligns with standard practices for size-related variables and avoids potential issues on platforms where int may be insufficient to represent all valid sizes or indices. Link: https://lkml.kernel.org/r/20250215165618.1757219-1-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: Yu-Chun Lin Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 1160bed6579e..79ddc0adbf2b 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -218,7 +218,7 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size) /* Initialize a min-heap. */ static __always_inline -void __min_heap_init_inline(min_heap_char *heap, void *data, int size) +void __min_heap_init_inline(min_heap_char *heap, void *data, size_t size) { heap->nr = 0; heap->size = size; @@ -254,7 +254,7 @@ bool __min_heap_full_inline(min_heap_char *heap) /* Sift the element at pos down the heap. 
*/ static __always_inline -void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, +void __min_heap_sift_down_inline(min_heap_char *heap, size_t pos, size_t elem_size, const struct min_heap_callbacks *func, void *args) { const unsigned long lsbit = elem_size & -elem_size; @@ -324,7 +324,7 @@ static __always_inline void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size, const struct min_heap_callbacks *func, void *args) { - int i; + ssize_t i; for (i = heap->nr / 2 - 1; i >= 0; i--) __min_heap_sift_down_inline(heap, i, elem_size, func, args); @@ -379,7 +379,7 @@ bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t ele const struct min_heap_callbacks *func, void *args) { void *data = heap->data; - int pos; + size_t pos; if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap")) return false; @@ -428,10 +428,10 @@ bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, __min_heap_del_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ __minheap_obj_size(_heap), _idx, _func, _args) -void __min_heap_init(min_heap_char *heap, void *data, int size); +void __min_heap_init(min_heap_char *heap, void *data, size_t size); void *__min_heap_peek(struct min_heap_char *heap); bool __min_heap_full(min_heap_char *heap); -void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, +void __min_heap_sift_down(min_heap_char *heap, size_t pos, size_t elem_size, const struct min_heap_callbacks *func, void *args); void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, const struct min_heap_callbacks *func, void *args); -- cgit v1.2.3 From 758502918e77323bf999a3eddd71466bf6135173 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 21 Feb 2025 05:02:21 -0800 Subject: rhashtable: remove needless return in three void APIs Remove needless 'return' in the following void APIs: rhltable_walk_enter() rhltable_free_and_destroy() rhltable_destroy() Since both the API and callee involved are void functions. Link: https://lkml.kernel.org/r/20250221-rmv_return-v1-16-cc8dff275827@quicinc.com Signed-off-by: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/rhashtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 8463a128e2f4..6c85b28ea30b 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1259,7 +1259,7 @@ static inline int rhashtable_replace_fast( static inline void rhltable_walk_enter(struct rhltable *hlt, struct rhashtable_iter *iter) { - return rhashtable_walk_enter(&hlt->ht, iter); + rhashtable_walk_enter(&hlt->ht, iter); } /** @@ -1275,12 +1275,12 @@ static inline void rhltable_free_and_destroy(struct rhltable *hlt, void *arg), void *arg) { - return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); + rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); } static inline void rhltable_destroy(struct rhltable *hlt) { - return rhltable_free_and_destroy(hlt, NULL, NULL); + rhltable_free_and_destroy(hlt, NULL, NULL); } #endif /* _LINUX_RHASHTABLE_H */ -- cgit v1.2.3 From ede7cd607a1d206b9579264a7405d78316b2d7fd Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 21 Feb 2025 05:02:07 -0800 Subject: cpu: remove needless return in void API suspend_enable_secondary_cpus() Remove needless 'return' in void API suspend_enable_secondary_cpus() since both the API and thaw_secondary_cpus() are void functions. 
Link: https://lkml.kernel.org/r/20250221-rmv_return-v1-2-cc8dff275827@quicinc.com Signed-off-by: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/cpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 6a0a8f1c7c90..e3049543008b 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -148,7 +148,7 @@ static inline int suspend_disable_secondary_cpus(void) } static inline void suspend_enable_secondary_cpus(void) { - return thaw_secondary_cpus(); + thaw_secondary_cpus(); } #else /* !CONFIG_PM_SLEEP_SMP */ -- cgit v1.2.3 From 720ba85040a6549674e5599685ca7df86a902448 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 26 Feb 2025 14:22:57 +0100 Subject: mm/mmu_notifier: use MMU_NOTIFY_CLEAR in remove_device_exclusive_entry() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's limit the use of MMU_NOTIFY_EXCLUSIVE to the case where we convert a present PTE to device-exclusive. For the other case, we can simply use MMU_NOTIFY_CLEAR, because it really is clearing the device-exclusive entry first, to then install the present entry. Update the documentation of MMU_NOTIFY_EXCLUSIVE, to document the single use case more thoroughly. If ever required, we could add a separate MMU_NOTIFY_CLEAR_EXCLUSIVE; for now using MMU_NOTIFY_CLEAR seems to be sufficient. Link: https://lkml.kernel.org/r/20250226132257.2826043-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Alistair Popple Cc: Jason Gunthorpe Cc: Jérôme Glisse Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d4e714661826..bc2402a45741 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -43,10 +43,10 @@ struct mmu_interval_notifier; * a device driver to possibly ignore the invalidation if the * owner field matches the driver's device private pgmap owner. * - * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no - * longer have exclusive access to the page. When sent during creation of an - * exclusive range the owner will be initialised to the value provided by the - * caller of make_device_exclusive(), otherwise the owner will be NULL. + * @MMU_NOTIFY_EXCLUSIVE: conversion of a page table entry to device-exclusive. + * The owner is initialized to the value provided by the caller of + * make_device_exclusive(), such that this caller can filter out these + * events. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, -- cgit v1.2.3 From 1eb3471bf5749ff3769ec52723bd9b8d773c7a62 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 3 Mar 2025 14:17:19 -0800 Subject: mm/damon: add data structure for monitoring intervals auto-tuning Patch series "mm/damon: auto-tune aggregation interval". DAMON requires time-consuming and repetitive aggregation interval tuning. Introduce a feature for automating it using a feedback loop that aims an amount of observed access events, like auto-exposing cameras. Background: Access Frequency Monitoring and Aggregation Interval ================================================================ DAMON checks if each memory element (damon_region) is accessed or not for every user-specified time interval called 'sampling interval'. It aggregates the check intervals on per-element counter called 'nr_accesses'. 
DAMON users can read the counters to get the access temperature of a given element. The counters are reset every 'aggregation interval', a second user-specified time interval. This can be pictured as DAMON continuously capturing a snapshot of the access events that happened and were captured within the last aggregation interval. This implies the aggregation interval plays a key role in the quality of the snapshots, like the camera exposure time. If it is too short, the number of access events captured in each snapshot is small, so each snapshot will not show many interesting things but just a cold and dark world with hopefully one pale blue dot or two. If it is too long, too many events are aggregated in a single shot, so each snapshot will look like a world of flames, or Muspellheim. It will be difficult to find practical insights in both cases. Problem: Time Consuming and Repetitive Tuning ============================================= The appropriate length of the aggregation interval depends on how frequently the system and workloads are making access events that DAMON can observe. Hence, users have to tune the interval with an excessive amount of tests against the target system and workloads. If the system and workloads are changed, the tuning should be done again. If the characteristics of the workloads are dynamic, it becomes more challenging. It is therefore time-consuming and repetitive. The tuning challenge mainly stems from the wrong question. It is not asking users what quality of monitoring results they want, but how DAMON should operate for their hidden goal. To give the right answer, users need to fully understand DAMON's mechanisms and the characteristics of their workloads. Users shouldn't be asked to understand the underlying mechanism. Understanding the characteristics of the workloads shouldn't be the role of users but of DAMON. Aim-oriented Feedback-driven Auto-Tuning ========================================= Fortunately, the appropriate length of the aggregation interval can be inferred using a feedback loop. If the current snapshots are showing not much interesting information, in other words, if they show only rare access events, increasing the aggregation interval helps, and vice versa. We tested this theory on a few real-world workloads, and documented one of those experiences in an official DAMON monitoring intervals tuning guideline. Since it is a simple theory that requires repeatable tries, it can be a good job for machines. Based on the guideline's theory, we designed an automation of aggregation interval tuning, in a way similar to a camera's auto-exposure feature. It defines the amount of interesting information as the ratio of the access events that DAMON actually observed to the theoretical maximum amount of such events within each snapshot. Events are accounted at byte and sampling-attempt granularity. For example, let's say there is a region of size 'X' bytes. DAMON tried access check sampling for the region 'Y' times in total for a given aggregation. Among the 'Y' attempts, 'Z' times it showed positive results. Then, the theoretical maximum number of access events for the region is 'X * Y', and the number of access events that DAMON has observed for the region is 'X * Z'. The amount of interesting information is therefore '(X * Z) / (X * Y)'. Note that each snapshot would have multiple regions. Users can set an arbitrary value of the ratio as their target.
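Restating the example above as a formula (the whole-snapshot value is presumably the same ratio accumulated over all regions of the snapshot):

	single region:   ratio = (X * Z) / (X * Y) = Z / Y
	whole snapshot:  ratio = sum_i(X_i * Z_i) / sum_i(X_i * Y_i)

For one region the byte factor cancels out, so the ratio is simply the fraction of sampling attempts that found an access; the byte weighting only matters when regions of different sizes are combined.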
Once the target is set, the automation periodically measures the current value of the ratio and increases or decreases the aggregation interval if the ratio value is lower or higher than the target. The amount of the change is proportional to the distance between the current and the target values. To avoid the auto-tuning going too far, users can set the minimum and the maximum aggregation interval. Changing only the aggregation interval while the sampling interval is kept makes the maximum level of access frequency in each snapshot, or the discernment of regions, inconsistent. Also, an unnecessarily short sampling interval causes meaningless monitoring overhead. The automation therefore adjusts the sampling interval together with the aggregation interval, while keeping the ratio between the two intervals. Users can set the ratio, or the discernment. Discussion ========== The modified question (aimed amount of access events, or light, in each snapshot) is easy to answer for both the users and the kernel. If users are interested in finding more cold regions, the value should be lower, and vice versa. If users have no idea, the kernel can suggest a fair default value based on some theories and experiments. For example, based on the Pareto principle (80/20 rule), we could expect a 20% target ratio to capture 80% of real access events. Since 80% might be too high, applying the rule once again, 4% (20% * 20%) may capture about 56% (80% * 80%) of real access events. The sampling to aggregation intervals ratio and the min/max aggregation intervals are also arguably easy to answer. What users want is discernment of regions for efficient system operation, for example, X amount of colder regions or Y amount of warmer regions, not exactly how many times each cache line is accessed at nanosecond granularity. The appropriate min/max aggregation interval can be set relatively naively, and may better be set based on the aimed monitoring overhead. Since the sampling interval directly decides the overhead, setting it based on the sampling interval can be easy. From my experience, I'd argue an intervals ratio of 0.05, and a 5 millisecond to 20 second sampling interval range (100 millisecond to 400 second aggregation interval) can be a good default suggestion. Evaluation ========== On a machine running a real world server workload, I ran DAMON to monitor its physical address space for about 23 hours, with this feature turned on. We set it to tune the sampling interval in a range from 5 milliseconds to 10 seconds, aiming for a 4 % DAMON-observed access ratio per three aggregation intervals. The exact command I used is as below. damo start --monitoring_intervals_goal 4% 3 5ms 10s --damos_action stat During the test run, DAMON continuously updated the sampling and aggregation intervals as designed, within the given range. For all the time, DAMON was able to find intervals that meet the target access events ratio in the given intervals range (sampling interval between 5 milliseconds and 10 seconds). For most of the time, the tuned sampling interval converged to 300-400 milliseconds. It made only a small amount of changes within the range. The average of the tuned sampling interval during the test was about 380 milliseconds. The workload periodically gets less load and decreases its CPU usage. Presumably this also caused it to make fewer memory access events. Reacting to such events, DAMON also increased the intervals as expected. It was still able to find the optimum interval satisfying the target access ratio within the given intervals range. 
Usually it converged to about 5 seconds. Once the workload got a normal amount of load again, DAMON reactively reduced the intervals to the normal range. I collected and visualized DAMON's monitoring results on the server a few times. Every time, the visualized access pattern looked not biased to only cold or hot pages but diverse and balanced. Let me show some of the snapshots that I collected near the end of the test (after about 23 hours had passed since starting DAMON on the server). The recency histogram looks as below. Please note that this visualization shows only very coarse grained information. For more details about the visualization format, please refer to the DAMON user-space tool documentation[1]. # ./damo report access --style recency-sz-hist --tried_regions_of 0 0 0 --access_rate 0 0 [-19 h 7 m 45.514 s, -17 h 12 m 58.963 s) 6.198 GiB |**** | [-17 h 12 m 58.963 s, -15 h 18 m 12.412 s) 0 B | | [-15 h 18 m 12.412 s, -13 h 23 m 25.860 s) 0 B | | [-13 h 23 m 25.860 s, -11 h 28 m 39.309 s) 0 B | | [-11 h 28 m 39.309 s, -9 h 33 m 52.757 s) 0 B | | [-9 h 33 m 52.757 s, -7 h 39 m 6.206 s) 0 B | | [-7 h 39 m 6.206 s, -5 h 44 m 19.654 s) 0 B | | [-5 h 44 m 19.654 s, -3 h 49 m 33.103 s) 0 B | | [-3 h 49 m 33.103 s, -1 h 54 m 46.551 s) 0 B | | [-1 h 54 m 46.551 s, -0 ns) 16.967 GiB |********* | [-0 ns, --6886551440000 ns) 38.835 GiB |********************| memory bw estimate: 9.425 GiB per second total size: 62.000 GiB It shows about 38 GiB of memory was accessed at least once within the last aggregation interval (given the ~300 millisecond tuned sampling interval, this is about six seconds). This is about 61 % of the total memory. In other words, DAMON found the warmest 61 % of the system's memory. The number is particularly interesting given our Pareto principle based theory for the tuning goal value. We set it as 20 % of 20 % (4 %), thinking it would capture 80 % of 80 % (64 %) of real access events. And it found 61 % hot memory, or working set. Nevertheless, to make the theory clearer, much more discussion and testing would be needed. At the moment, nonetheless, we can say making the target value higher helps finding more hot memory regions. The histogram also shows an amount of cold memory. About 17 GiB of the system's memory has not been accessed for at least the last aggregation interval (about six seconds), and at most for about the last two hours. The real longest unaccessed time of the 17 GiB memory was about 19 minutes, though. This is a limitation of this visualization format. It further found a very cold 6 GiB of memory. It has not been accessed for at least the last 17 hours and at most 19 hours. What about the hot memory distribution? To see this, I captured and visualized the snapshot as an access temperature histogram. Again, please refer to the DAMON user-space tool documentation[1] for the format and what access temperature means. Both the visualization and the metric show only very coarse grained and limited information. The resulting histogram looks like below. 
# ./damo report access --style temperature-sz-hist --tried_regions_of 0 0 0 [-6,840,763,776,000, -5,501,580,939,800) 6.198 GiB |*** | [-5,501,580,939,800, -4,162,398,103,600) 0 B | | [-4,162,398,103,600, -2,823,215,267,400) 0 B | | [-2,823,215,267,400, -1,484,032,431,200) 0 B | | [-1,484,032,431,200, -144,849,595,000) 0 B | | [-144,849,595,000, 1,194,333,241,200) 55.802 GiB |********************| [1,194,333,241,200, 2,533,516,077,400) 4.000 KiB |* | [2,533,516,077,400, 3,872,698,913,600) 4.000 KiB |* | [3,872,698,913,600, 5,211,881,749,800) 8.000 KiB |* | [5,211,881,749,800, 6,551,064,586,000) 12.000 KiB |* | [6,551,064,586,000, 7,890,247,422,200) 4.000 KiB |* | memory bw estimate: 5.178 GiB per second total size: 62.000 GiB We can see most of the memory is in similar access temperature range, and definitely some pages are extremely hot. To see the picture in more detail, let's capture and visualize the snapshot per DAMON-region, sorted by their access temperature. The total number of the regions was about 300. Due to the limited space, I'm showing only a few parts of the output here. # ./damo report access --style hot --tried_regions_of 0 0 0 heatmap: 00000000888888889999999888888888888888888888888888888888888888888888888888888888 # min/max temperatures: -6,827,258,184,000, 17,589,052,500, column size: 793.600 MiB |999999999999999999999999999999999999999| 4.000 KiB access 100 % 18 h 9 m 43.918 s |999999999999999999999999999999999999999| 8.000 KiB access 100 % 17 h 56 m 5.351 s |999999999999999999999999999999999999999| 4.000 KiB access 100 % 15 h 24 m 19.634 s |999999999999999999999999999999999999999| 4.000 KiB access 100 % 14 h 10 m 55.606 s |999999999999999999999999999999999999999| 4.000 KiB access 100 % 11 h 34 m 18.993 s [...] |99999999999999999999999999999| 8.000 KiB access 100 % 1 m 27.945 s |11111111111111111111111111111| 80.000 KiB access 15 % 1 m 21.180 s |00000000000000000000000000000| 24.000 KiB access 5 % 1 m 21.180 s |00000000000000000000000000000| 5.919 GiB access 10 % 1 m 14.415 s |99999999999999999999999999999| 12.000 KiB access 100 % 1 m 7.650 s [...] |0| 4.000 KiB access 5 % 0 ns |0| 12.000 KiB access 5 % 0 ns |0| 188.000 KiB access 0 % 0 ns |0| 24.000 KiB access 0 % 0 ns |0| 48.000 KiB access 0 % 0 ns [...] |0000000000000000000000000000000| 8.000 KiB access 0 % 6 m 45.901 s |00000000000000000000000000000000| 36.000 KiB access 0 % 7 m 26.491 s |00000000000000000000000000000000| 4.000 KiB access 0 % 12 m 37.682 s |000000000000000000000000000000000| 8.000 KiB access 0 % 18 m 9.168 s |000000000000000000000000000000000| 16.000 KiB access 0 % 19 m 3.288 s |0000000000000000000000000000000000000000| 6.198 GiB access 0 % 18 h 57 m 52.582 s memory bw estimate: 8.798 GiB per second total size: 62.000 GiB We can see DAMON found small and extremely hot regions that accessed for all access check sampling (once per about 300 milliseconds) for more than 10 hours. The access temperature rapidly decreases. DAMON was also able to find small and big regions that not accessed for up to about 19 minutes. It even found an outlier cold region of 6 GiB that not accessed for about 19 hours. It is unclear what the outlier region is, as of this writing. For the testing, DAMON was consuming about 0.1% of single CPU time. This is again expected results, since DAMON was using about 370 milliseconds sampling interval in most case. # ps -p $kdamond_pid -o %cpu %CPU 0.1 I also ran similar tests against kernel build workload and an in-memory cache workload benchmark[2]. 
Detialed results including tuned intervals and captured access pattern were of course different sicne those depend on the workloads. But the auto-tuning feature was always working as expected like the above results for the real world workload. To wrap up, with intervals auto-tuning feature, DAMON was able to capture access pattern snapshots of a quality on a real world server workload. The auto-tuning feature was able to adaptively react to the dynamic access patterns of the workload and reliably provide consistent monitoring results without manual human interventions. Also, the auto-tuning made DAMON consumes only necessary amount of resource for the required quality. References ========== [1] https://github.com/damonitor/damo/blob/next/USAGE.md#access-report-styles [2] https://github.com/facebookresearch/DCPerf/blob/main/packages/tao_bench/README.md This patch (of 8): Add data structures for DAMON sampling and aggregation intervals automatic tuning that aims specific amount of DAMON-observed access events per snapshot. In more detail, define the data structure for the tuning goal, link it to the monitoring attributes data structure so that DAMON kernel API callers can make the request, and update parameters setup DAMON function to respect the new parameter. Link: https://lkml.kernel.org/r/20250303221726.484227-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250303221726.484227-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 242910b190c9..5f2609f24761 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -659,12 +659,38 @@ struct damon_call_control { bool canceled; }; +/** + * struct damon_intervals_goal - Monitoring intervals auto-tuning goal. + * + * @access_bp: Access events observation ratio to achieve in bp. + * @aggrs: Number of aggregations to acheive @access_bp within. + * @min_sample_us: Minimum resulting sampling interval in microseconds. + * @max_sample_us: Maximum resulting sampling interval in microseconds. + * + * DAMON automatically tunes &damon_attrs->sample_interval and + * &damon_attrs->aggr_interval aiming the ratio in bp (1/10,000) of + * DAMON-observed access events to theoretical maximum amount within @aggrs + * aggregations be same to @access_bp. The logic increases + * &damon_attrs->aggr_interval and &damon_attrs->sampling_interval in same + * ratio if the current access events observation ratio is lower than the + * target for each @aggrs aggregations, and vice versa. + * + * If @aggrs is zero, the tuning is disabled and hence this struct is ignored. + */ +struct damon_intervals_goal { + unsigned long access_bp; + unsigned long aggrs; + unsigned long min_sample_us; + unsigned long max_sample_us; +}; + /** * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. * @ops_update_interval: The time between monitoring operations updates. + * @intervals_goal: Intervals auto-tuning goal. * @min_nr_regions: The minimum number of adaptive monitoring * regions. 
* @max_nr_regions: The maximum number of adaptive monitoring @@ -684,6 +710,7 @@ struct damon_attrs { unsigned long sample_interval; unsigned long aggr_interval; unsigned long ops_update_interval; + struct damon_intervals_goal intervals_goal; unsigned long min_nr_regions; unsigned long max_nr_regions; }; -- cgit v1.2.3 From f04b0fedbe714f822bd066b319a60faa39a985a1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 3 Mar 2025 14:17:20 -0800 Subject: mm/damon/core: implement intervals auto-tuning Implement the DAMON sampling and aggregation intervals auto-tuning mechanism as briefly described on 'struct damon_intervals_goal'. The core part for deciding the direction and amount of the changes is implemented reusing the feedback loop function which is being used for DAMOS quotas auto-tuning. Unlike the DAMOS quotas auto-tuning use case, limit the maximum decreasing amount after the adjustment to 50% of the current value, though. This is because the intervals have no good merits at rapid reductions since it could unnecessarily increase the monitoring overhead. Link: https://lkml.kernel.org/r/20250303221726.484227-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5f2609f24761..b3e2c793c1f4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -713,6 +713,17 @@ struct damon_attrs { struct damon_intervals_goal intervals_goal; unsigned long min_nr_regions; unsigned long max_nr_regions; +/* private: internal use only */ + /* + * @aggr_interval to @sample_interval ratio. + * Core-external components call damon_set_attrs() with &damon_attrs + * that this field is unset. In the case, damon_set_attrs() sets this + * field of resulting &damon_attrs. Core-internal components such as + * kdamond_tune_intervals() calls damon_set_attrs() with &damon_attrs + * that this field is set. In the case, damon_set_attrs() just keep + * it. + */ + unsigned long aggr_samples; }; /** @@ -761,6 +772,11 @@ struct damon_ctx { * update */ unsigned long next_ops_update_sis; + /* + * number of sample intervals that should be passed before next + * intervals tuning + */ + unsigned long next_intervals_tune_sis; /* for waiting until the execution of the kdamond_fn is started */ struct completion kdamond_started; /* for scheme quotas prioritization */ -- cgit v1.2.3 From 691ee97e1a9de0cdb3efb893c1f180e3f4a35e32 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 3 Mar 2025 14:15:35 +0000 Subject: mm: fix lazy mmu docs and usage Patch series "Fix lazy mmu mode", v2. I'm planning to implement lazy mmu mode for arm64 to optimize vmalloc. As part of that, I will extend lazy mmu mode to cover kernel mappings in vmalloc table walkers. While lazy mmu mode is already used for kernel mappings in a few places, this will extend it's use significantly. Having reviewed the existing lazy mmu implementations in powerpc, sparc and x86, it looks like there are a bunch of bugs, some of which may be more likely to trigger once I extend the use of lazy mmu. So this series attempts to clarify the requirements and fix all the bugs in advance of that series. See patch #1 commit log for all the details. This patch (of 5): The docs, implementations and use of arch_[enter|leave]_lazy_mmu_mode() is a bit of a mess (to put it politely). 
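As a usage sketch for the DAMON intervals auto-tuning structures introduced by the two patches above (struct damon_intervals_goal and the new damon_attrs fields), a kernel API caller might set up the goal roughly as follows. The values mirror the 4%-per-three-aggregations example from the cover letter and are not defaults; the final call is only indicative:

    	/* Hypothetical caller of the DAMON kernel API. */
    	struct damon_attrs attrs = {
    		.sample_interval = 5000,		/* 5 ms, in microseconds */
    		.aggr_interval = 100000,		/* 100 ms */
    		.ops_update_interval = 1000000,		/* 1 s */
    		.min_nr_regions = 10,
    		.max_nr_regions = 1000,
    		.intervals_goal = {
    			.access_bp = 400,		/* aim for 4% observed events */
    			.aggrs = 3,			/* ...per three aggregations */
    			.min_sample_us = 5000,		/* 5 ms floor */
    			.max_sample_us = 10000000,	/* 10 s ceiling */
    		},
    	};

    	/* damon_set_attrs(ctx, &attrs) would then let kdamond tune toward this goal. */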
There are a number of issues related to nesting of lazy mmu regions and confusion over whether the task, when in a lazy mmu region, is preemptible or not. Fix all the issues relating to the core-mm. Follow up commits will fix the arch-specific implementations. 3 arches implement lazy mmu; powerpc, sparc and x86. When arch_[enter|leave]_lazy_mmu_mode() was first introduced by commit 6606c3e0da53 ("[PATCH] paravirt: lazy mmu mode hooks.patch"), it was expected that lazy mmu regions would never nest and that the appropriate page table lock(s) would be held while in the region, thus ensuring the region is non-preemptible. Additionally lazy mmu regions were only used during manipulation of user mappings. Commit 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy updates") started invoking the lazy mmu mode in apply_to_pte_range(), which is used for both user and kernel mappings. For kernel mappings the region is no longer protected by any lock so there is no longer any guarantee about non-preemptibility. Additionally, for RT configs, the holding the PTL only implies no CPU migration, it doesn't prevent preemption. Commit bcc6cc832573 ("mm: add default definition of set_ptes()") added arch_[enter|leave]_lazy_mmu_mode() to the default implementation of set_ptes(), used by x86. So after this commit, lazy mmu regions can be nested. Additionally commit 1a10a44dfc1d ("sparc64: implement the new page table range API") and commit 9fee28baa601 ("powerpc: implement the new page table range API") did the same for the sparc and powerpc set_ptes() overrides. powerpc couldn't deal with preemption so avoids it in commit b9ef323ea168 ("powerpc/64s: Disable preemption in hash lazy mmu mode"), which explicitly disables preemption for the whole region in its implementation. x86 can support preemption (or at least it could until it tried to add support nesting; more on this below). Sparc looks to be totally broken in the face of preemption, as far as I can tell. powerpc can't deal with nesting, so avoids it in commit 47b8def9358c ("powerpc/mm: Avoid calling arch_enter/leave_lazy_mmu() in set_ptes"), which removes the lazy mmu calls from its implementation of set_ptes(). x86 attempted to support nesting in commit 49147beb0ccb ("x86/xen: allow nesting of same lazy mode") but as far as I can tell, this breaks its support for preemption. In short, it's all a mess; the semantics for arch_[enter|leave]_lazy_mmu_mode() are not clearly defined and as a result the implementations all have different expectations, sticking plasters and bugs. arm64 is aiming to start using these hooks, so let's clean everything up before adding an arm64 implementation. Update the documentation to state that lazy mmu regions can never be nested, must not be called in interrupt context and preemption may or may not be enabled for the duration of the region. And fix the generic implementation of set_ptes() to avoid nesting. arch-specific fixes to conform to the new spec will proceed this one. These issues were spotted by code review and I have no evidence of issues being reported in the wild. Link: https://lkml.kernel.org/r/20250303141542.3371656-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20250303141542.3371656-2-ryan.roberts@arm.com Fixes: bcc6cc832573 ("mm: add default definition of set_ptes()") Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Acked-by: Juergen Gross Cc: Andreas Larsson Cc: Borislav Betkov Cc: Boris Ostrovsky Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Juegren Gross Cc: Matthew Wilcow (Oracle) Cc: Thomas Gleinxer Cc: Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 94d267d02372..787c632ee2c9 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -222,10 +222,14 @@ static inline int pmd_dirty(pmd_t pmd) * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be - * up to date. This mode can only be entered and left under the protection of - * the page table locks for all page tables which may be modified. In the UP - * case, this is required so that preemption is disabled, and in the SMP case, - * it must synchronize the delayed page table writes properly on other CPUs. + * up to date. + * + * In the general case, no lock is guaranteed to be held between entry and exit + * of the lazy mode. So the implementation must assume preemption may be enabled + * and cpu migration is possible; it must take steps to be robust against this. + * (In practice, for user PTE updates, the appropriate page table lock(s) are + * held, but for kernel PTE updates, no lock is held). Nesting is not permitted + * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE #define arch_enter_lazy_mmu_mode() do {} while (0) @@ -287,7 +291,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, { page_table_check_ptes_set(mm, ptep, pte, nr); - arch_enter_lazy_mmu_mode(); for (;;) { set_pte(ptep, pte); if (--nr == 0) @@ -295,7 +298,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, ptep++; pte = pte_next_pfn(pte); } - arch_leave_lazy_mmu_mode(); } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) -- cgit v1.2.3 From 0e2759afcaf9bff25a63201856fa89b64181749f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 27 Feb 2025 23:58:07 -0800 Subject: page_counter: track failcnt only for legacy cgroups Currently page_counter tracks failcnt for counters used by v1 and v2 controllers. However failcnt is only exported for v1 deployment and thus there is no need to maintain it in v2. The oom report does expose failcnt for memory and swap in v2 but v2 already maintains MEMCG_MAX and MEMCG_SWAP_MAX event counters which can be used. Link: https://lkml.kernel.org/r/20250228075808.207484-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin (Cruise) Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 46406f3fe34d..e4bd8fd427be 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -28,12 +28,13 @@ struct page_counter { unsigned long watermark; /* Latest cg2 reset watermark */ unsigned long local_watermark; - unsigned long failcnt; + unsigned long failcnt; /* v1-only field */ /* Keep all the read most fields in a separete cacheline. 
*/ CACHELINE_PADDING(_pad2_); bool protection_support; + bool track_failcnt; unsigned long min; unsigned long low; unsigned long high; @@ -58,6 +59,7 @@ static inline void page_counter_init(struct page_counter *counter, counter->max = PAGE_COUNTER_MAX; counter->parent = parent; counter->protection_support = protection_support; + counter->track_failcnt = false; } static inline unsigned long page_counter_read(struct page_counter *counter) -- cgit v1.2.3 From f7b0797d36e75d2a622580b56b9bfd3130d5d0e9 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 27 Feb 2025 23:58:08 -0800 Subject: page_counter: reduce struct page_counter size The struct page_counter has explicit padding for better cache alignment. The commit c6f53ed8f213a ("mm, memcg: cg2 memory{.swap,}.peak write handlers") added a field to the struct page_counter and accidently increased its size. Let's move the failcnt field which is v1-only field to the same cacheline of usage to reduce the size of struct page_counter. Link: https://lkml.kernel.org/r/20250228075808.207484-4-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin (Cruise) Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index e4bd8fd427be..d649b6bbbc87 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -9,10 +9,12 @@ struct page_counter { /* - * Make sure 'usage' does not share cacheline with any other field. The - * memcg->memory.usage is a hot member of struct mem_cgroup. + * Make sure 'usage' does not share cacheline with any other field in + * v2. The memcg->memory.usage is a hot member of struct mem_cgroup. */ atomic_long_t usage; + unsigned long failcnt; /* v1-only field */ + CACHELINE_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ @@ -28,7 +30,6 @@ struct page_counter { unsigned long watermark; /* Latest cg2 reset watermark */ unsigned long local_watermark; - unsigned long failcnt; /* v1-only field */ /* Keep all the read most fields in a separete cacheline. */ CACHELINE_PADDING(_pad2_); -- cgit v1.2.3 From f1ab2831e2a4312046bca79256b2efc41d373eaf Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Tue, 4 Mar 2025 19:03:16 +0800 Subject: writeback: let trace_balance_dirty_pages() take struct dtc as parameter Patch series "Fix calculations in trace_balance_dirty_pages() for cgwb", v2. In my experiment, I found that the output of trace_balance_dirty_pages() in the cgroup writeback scenario was strange because trace_balance_dirty_pages() always uses global_wb_domain.dirty_limit for related calculations instead of the dirty_limit of the corresponding memcg's wb_domain. The basic idea of the fix is to store the hard dirty limit value computed in wb_position_ratio() into struct dirty_throttle_control and use it for calculations in trace_balance_dirty_pages(). This patch (of 3): Currently, trace_balance_dirty_pages() already has 12 parameters. In the patch #3, I initially attempted to introduce an additional parameter. However, in include/linux/trace_events.h, bpf_trace_run12() only supports up to 12 parameters and bpf_trace_run13() does not exist. To reduce the number of parameters in trace_balance_dirty_pages(), we can make it accept a pointer to struct dirty_throttle_control as a parameter. 
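The call site in balance_dirty_pages() is not part of this diff, but with the prototype change shown below it would roughly become the following (the local variable names here are assumptions):

    	/* Before: twelve scalar arguments, the bpf_trace_run12() ceiling. */
    	trace_balance_dirty_pages(wb, thresh, bg_thresh, dirty, bdi_thresh,
    				  bdi_dirty, dirty_ratelimit, task_ratelimit,
    				  pages_dirtied, period, pause, start_time);

    	/* After: the consolidated throttling state travels in one struct. */
    	trace_balance_dirty_pages(wb, sdtc, dirty_ratelimit, task_ratelimit,
    				  pages_dirtied, period, pause, start_time);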
To achieve this, we need to move the definition of struct dirty_throttle_control from mm/page-writeback.c to include/linux/writeback.h. Link: https://lkml.kernel.org/r/20250304110318.159567-1-yizhou.tang@shopee.com Link: https://lkml.kernel.org/r/20250304110318.159567-2-yizhou.tang@shopee.com Signed-off-by: Tang Yizhou Cc: Alexei Starovoitov Cc: Christian Brauner Cc: Steven Rostedt Cc: Jan Kara Cc: "Masami Hiramatsu (Google)" Cc: Matthew Wilcow (Oracle) Cc: Tang Yizhou Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/writeback.h | 23 +++++++++++++++++++++++ include/trace/events/writeback.h | 16 ++++++---------- 2 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d11b903c2edb..32095928365c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -313,6 +313,29 @@ static inline void cgroup_writeback_umount(struct super_block *sb) /* * mm/page-writeback.c */ +/* consolidated parameters for balance_dirty_pages() and its subroutines */ +struct dirty_throttle_control { +#ifdef CONFIG_CGROUP_WRITEBACK + struct wb_domain *dom; + struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ +#endif + struct bdi_writeback *wb; + struct fprop_local_percpu *wb_completions; + + unsigned long avail; /* dirtyable */ + unsigned long dirty; /* file_dirty + write + nfs */ + unsigned long thresh; /* dirty threshold */ + unsigned long bg_thresh; /* dirty background threshold */ + + unsigned long wb_dirty; /* per-wb counterparts */ + unsigned long wb_thresh; + unsigned long wb_bg_thresh; + + unsigned long pos_ratio; + bool freerun; + bool dirty_exceeded; +}; + void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); void laptop_mode_timer_fn(struct timer_list *t); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index a261e86e61fa..3213b9023794 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -629,11 +629,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, + struct dirty_throttle_control *dtc, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, @@ -641,7 +637,7 @@ TRACE_EVENT(balance_dirty_pages, long pause, unsigned long start_time), - TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, + TP_ARGS(wb, dtc, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), @@ -664,16 +660,16 @@ TRACE_EVENT(balance_dirty_pages, ), TP_fast_assign( - unsigned long freerun = (thresh + bg_thresh) / 2; + unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + freerun) / 2; - __entry->dirty = dirty; + __entry->dirty = dtc->dirty; __entry->bdi_setpoint = __entry->setpoint * - bdi_thresh / (thresh + 1); - __entry->bdi_dirty = bdi_dirty; + dtc->wb_thresh / (dtc->thresh + 1); + __entry->bdi_dirty = dtc->wb_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; -- cgit v1.2.3 From 28c24ef9e04f95672b72b1297eff7dae91cceea8 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Tue, 4 Mar 2025 19:03:17 +0800 Subject: writeback: rename variables in 
trace_balance_dirty_pages() Rename bdi_setpoint and bdi_dirty in the tracepoint to wb_setpoint and wb_dirty, respectively. These changes were omitted by Tejun in the cgroup writeback patchset. Link: https://lkml.kernel.org/r/20250304110318.159567-3-yizhou.tang@shopee.com Signed-off-by: Tang Yizhou Cc: Alexei Starovoitov Cc: Christian Brauner Cc: Jan Kara Cc: "Masami Hiramatsu (Google)" Cc: Matthew Wilcow (Oracle) Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/trace/events/writeback.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 3213b9023794..3046ca6b08ea 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -646,8 +646,8 @@ TRACE_EVENT(balance_dirty_pages, __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) - __field(unsigned long, bdi_setpoint) - __field(unsigned long, bdi_dirty) + __field(unsigned long, wb_setpoint) + __field(unsigned long, wb_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned int, dirtied) @@ -667,9 +667,9 @@ TRACE_EVENT(balance_dirty_pages, __entry->setpoint = (global_wb_domain.dirty_limit + freerun) / 2; __entry->dirty = dtc->dirty; - __entry->bdi_setpoint = __entry->setpoint * + __entry->wb_setpoint = __entry->setpoint * dtc->wb_thresh / (dtc->thresh + 1); - __entry->bdi_dirty = dtc->wb_dirty; + __entry->wb_dirty = dtc->wb_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; @@ -685,7 +685,7 @@ TRACE_EVENT(balance_dirty_pages, TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " - "bdi_setpoint=%lu bdi_dirty=%lu " + "wb_setpoint=%lu wb_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", @@ -693,8 +693,8 @@ TRACE_EVENT(balance_dirty_pages, __entry->limit, __entry->setpoint, __entry->dirty, - __entry->bdi_setpoint, - __entry->bdi_dirty, + __entry->wb_setpoint, + __entry->wb_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, -- cgit v1.2.3 From 6cc4c3aa714bc58ec5d20f3054ca5f23534984d1 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Tue, 4 Mar 2025 19:03:18 +0800 Subject: writeback: fix calculations in trace_balance_dirty_pages() for cgwb In the commit dcc25ae76eb7 ("writeback: move global_dirty_limit into wb_domain") of the cgroup writeback backpressure propagation patchset, Tejun made some adaptations to trace_balance_dirty_pages() for cgroup writeback. However, this adaptation was incomplete and Tejun missed further adaptation in the subsequent patches. In the cgroup writeback scenario, if sdtc in balance_dirty_pages() is assigned to mdtc, then upon entering trace_balance_dirty_pages(), __entry->limit should be assigned based on the dirty_limit of the corresponding memcg's wb_domain, rather than global_wb_domain. To address this issue and simplify the implementation, introduce a 'limit' field in struct dirty_throttle_control to store the hard_limit value computed in wb_position_ratio() by calling hard_dirty_limit(). This field will then be used in trace_balance_dirty_pages() to assign the value to __entry->limit. 
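With purely illustrative numbers, the effect is easy to see: for a memcg wb_domain whose hard dirty limit is 1,000 pages and whose freerun point is (thresh + bg_thresh) / 2 = 400 pages, the tracepoint should report limit=1000 and setpoint=(1000 + 400) / 2 = 700; computing them from global_wb_domain.dirty_limit of, say, 100,000 pages instead reports limit=100000 and setpoint=50200, values that bear no relation to that cgroup's writeback state.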
Link: https://lkml.kernel.org/r/20250304110318.159567-4-yizhou.tang@shopee.com Fixes: dcc25ae76eb7 ("writeback: move global_dirty_limit into wb_domain") Signed-off-by: Tang Yizhou Acked-by: Tejun Heo Cc: Alexei Starovoitov Cc: Christian Brauner Cc: Jan Kara Cc: "Masami Hiramatsu (Google)" Cc: Matthew Wilcow (Oracle) Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/linux/writeback.h | 1 + include/trace/events/writeback.h | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 32095928365c..58bda3347914 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -326,6 +326,7 @@ struct dirty_throttle_control { unsigned long dirty; /* file_dirty + write + nfs */ unsigned long thresh; /* dirty threshold */ unsigned long bg_thresh; /* dirty background threshold */ + unsigned long limit; /* hard dirty limit */ unsigned long wb_dirty; /* per-wb counterparts */ unsigned long wb_thresh; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 3046ca6b08ea..0ff388131fc9 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -663,9 +663,8 @@ TRACE_EVENT(balance_dirty_pages, unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); - __entry->limit = global_wb_domain.dirty_limit; - __entry->setpoint = (global_wb_domain.dirty_limit + - freerun) / 2; + __entry->limit = dtc->limit; + __entry->setpoint = (dtc->limit + freerun) / 2; __entry->dirty = dtc->dirty; __entry->wb_setpoint = __entry->setpoint * dtc->wb_thresh / (dtc->thresh + 1); -- cgit v1.2.3 From ab82e57981d0e4cb46d2817e7b65b9d5fdcf3832 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 4 Mar 2025 13:19:05 -0800 Subject: mm/damon/core: introduce damos->ops_filters Patch series "mm/damon: make allow filters after reject filters useful and intuitive". DAMOS filters do allow or reject elements of memory for given DAMOS scheme only if those match the filter criterias. For elements that don't match any DAMOS filter, 'allowing' is the default behavior. This makes allow-filters that don't have any reject-filter after them meaningless sources of overhead. The decision was made to keep the behavior consistent with that before the introduction of allow-filters. This, however, makes usage of DAMOS filters confusing and inefficient. It is more intuitive and still consistent behavior to reject by default unless there is no filter at all or the last filter is a reject filter. Update the filtering logic in the way and update documents to clarify the behavior. Note that this is changing the old behavior. But the old behavior for the problematic filter combination was definitely confusing, inefficient and anyway useless. Also, the behavior has relatively recently introduced. It is difficult to anticipate any user that depends on the behavior. Hence this is not a user-breaking behavior change but an obvious improvement. This patch (of 9): DAMOS filters can be categorized into two groups depending on which layer they are handled, namely core layer and ops layer. The groups are important because the filtering behavior depends on evaluation sequence of filters, and core layer-handled filters are evaluated before operations layer-handled ones. The behavior is clearly documented, but the implementation is bit inefficient and complicated. All filters are maintained in a single list (damos->filters) in mix. 
Filters evaluation logics in core layer and operations layer iterates all the filters on the list, while skipping filters that should be not handled by the layer of the logic. It is inefficient. Making future extensions having differentiations for filters of different handling layers will also be complicated. Add a new list that will be used for having all operations layer-handled DAMOS filters to DAMOS scheme data structure. Also add the support of its initialization and basic traversal functions. Link: https://lkml.kernel.org/r/20250304211913.53574-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250304211913.53574-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index b3e2c793c1f4..7f76e2e99f37 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -448,6 +448,7 @@ struct damos_access_pattern { * @wmarks: Watermarks for automated (in)activation of this scheme. * @target_nid: Destination node if @action is "migrate_{hot,cold}". * @filters: Additional set of &struct damos_filter for &action. + * @ops_filters: ops layer handling &struct damos_filter objects list. * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. * @list: List head for siblings. @@ -508,6 +509,7 @@ struct damos { int target_nid; }; struct list_head filters; + struct list_head ops_filters; void *last_applied; struct damos_stat stat; struct list_head list; @@ -858,6 +860,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damos_for_each_filter_safe(f, next, scheme) \ list_for_each_entry_safe(f, next, &(scheme)->filters, list) +#define damos_for_each_ops_filter(f, scheme) \ + list_for_each_entry(f, &(scheme)->ops_filters, list) + +#define damos_for_each_ops_filter_safe(f, next, scheme) \ + list_for_each_entry_safe(f, next, &(scheme)->ops_filters, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); -- cgit v1.2.3 From dd038b728c8a2a0e1a632b767a50f09f076dab79 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 4 Mar 2025 13:19:10 -0800 Subject: mm/damon: add default allow/reject behavior fields to struct damos Current default allow/reject behavior of filters handling stage has made before introduction of the allow behavior. For allow-filters usage, it is confusing and inefficient. It is more intuitive to decide the default filtering stage allow/reject behavior as opposite to the last filter's behavior. The decision should be made separately for core and operations layers' filtering stages, since last core layer-handled filter is not really a last filter if there are operations layer handling filters. Keeping separate decisions for the two categories can make the logic simpler. Add fields for storing the two decisions. Link: https://lkml.kernel.org/r/20250304211913.53574-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 7f76e2e99f37..52559475dbe7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -502,6 +502,9 @@ struct damos { * layer-handled filters. If true, operations layer allows it, too. 
*/ bool core_filters_allowed; + /* whether to reject core/ops filters umatched regions */ + bool core_filters_default_reject; + bool ops_filters_default_reject; /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; -- cgit v1.2.3 From 9bbe033c75a56d72fc35e7c8ca6f3258d9782fa5 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:29 +0000 Subject: mm: zpool: add interfaces for object read/write APIs Patch series "Switch zswap to object read/write APIs". This patch series updates zswap to use the new object read/write APIs defined by zsmalloc in [1], and remove the old object mapping APIs and the related code from zpool and zsmalloc. This patch (of 5): Zsmalloc introduced new APIs to read/write objects besides mapping them. Add the necessary zpool interfaces. Link: https://lkml.kernel.org/r/20250305061134.4105762-1-yosry.ahmed@linux.dev Link: https://lkml.kernel.org/r/20250305061134.4105762-2-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/zpool.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 5e6dc46b8cc4..1784e735ee04 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -52,6 +52,16 @@ void *zpool_map_handle(struct zpool *pool, unsigned long handle, void zpool_unmap_handle(struct zpool *pool, unsigned long handle); + +void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, + void *local_copy); + +void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, + void *handle_mem); + +void zpool_obj_write(struct zpool *zpool, unsigned long handle, + void *handle_mem, size_t mem_len); + u64 zpool_get_total_pages(struct zpool *pool); @@ -90,6 +100,13 @@ struct zpool_driver { enum zpool_mapmode mm); void (*unmap)(void *pool, unsigned long handle); + void *(*obj_read_begin)(void *pool, unsigned long handle, + void *local_copy); + void (*obj_read_end)(void *pool, unsigned long handle, + void *handle_mem); + void (*obj_write)(void *pool, unsigned long handle, + void *handle_mem, size_t mem_len); + u64 (*total_pages)(void *pool); }; -- cgit v1.2.3 From fcbea574754c63f7035d0c4ef7dfb161b60b5bde Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:31 +0000 Subject: mm: zpool: remove object mapping APIs zpool_map_handle(), zpool_unmap_handle(), and zpool_can_sleep_mapped() are no longer used. Remove them with the underlying driver callbacks. Link: https://lkml.kernel.org/r/20250305061134.4105762-4-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/zpool.h | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'include') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 1784e735ee04..2c8a9d2654f6 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -13,25 +13,6 @@ struct zpool; -/* - * Control how a handle is mapped. It will be ignored if the - * implementation does not support it. Its use is optional. 
- * Note that this does not refer to memory protection, it - * refers to how the memory will be copied in/out if copying - * is necessary during mapping; read-write is the safest as - * it copies the existing memory in on map, and copies the - * changed memory back out on unmap. Write-only does not copy - * in the memory and should only be used for initialization. - * If in doubt, use ZPOOL_MM_DEFAULT which is read-write. - */ -enum zpool_mapmode { - ZPOOL_MM_RW, /* normal read-write mapping */ - ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */ - ZPOOL_MM_WO, /* write-only (no copy-in at map time) */ - - ZPOOL_MM_DEFAULT = ZPOOL_MM_RW -}; - bool zpool_has_pool(char *type); struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp); @@ -47,12 +28,6 @@ int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, void zpool_free(struct zpool *pool, unsigned long handle); -void *zpool_map_handle(struct zpool *pool, unsigned long handle, - enum zpool_mapmode mm); - -void zpool_unmap_handle(struct zpool *pool, unsigned long handle); - - void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, void *local_copy); @@ -95,11 +70,6 @@ struct zpool_driver { unsigned long *handle); void (*free)(void *pool, unsigned long handle); - bool sleep_mapped; - void *(*map)(void *pool, unsigned long handle, - enum zpool_mapmode mm); - void (*unmap)(void *pool, unsigned long handle); - void *(*obj_read_begin)(void *pool, unsigned long handle, void *local_copy); void (*obj_read_end)(void *pool, unsigned long handle, -- cgit v1.2.3 From 07864f1a57fb1f798a7d21f13e4929c9cb52daf7 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:32 +0000 Subject: mm: zsmalloc: remove object mapping APIs and per-CPU map areas zs_map_object() and zs_unmap_object() are no longer used, remove them. Since these are the only users of per-CPU mapping_areas, remove them and the associated CPU hotplug callbacks too. [yosry.ahmed@linux.dev: update the docs] Link: https://lkml.kernel.org/r/Z8ier-ZZp8T6MOTH@google.com Link: https://lkml.kernel.org/r/20250305061134.4105762-5-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Acked-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/cpuhotplug.h | 1 - include/linux/zsmalloc.h | 21 --------------------- 2 files changed, 22 deletions(-) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 6cc5e484547c..1987400000b4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -116,7 +116,6 @@ enum cpuhp_state { CPUHP_NET_IUCV_PREPARE, CPUHP_ARM_BL_PREPARE, CPUHP_TRACE_RB_PREPARE, - CPUHP_MM_ZS_PREPARE, CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 7d70983cf398..c26baf9fb331 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -16,23 +16,6 @@ #include -/* - * zsmalloc mapping modes - * - * NOTE: These only make a difference when a mapped object spans pages. - */ -enum zs_mapmode { - ZS_MM_RW, /* normal read-write mapping */ - ZS_MM_RO, /* read-only (no copy-out at unmap time) */ - ZS_MM_WO /* write-only (no copy-in at map time) */ - /* - * NOTE: ZS_MM_WO should only be used for initializing new - * (uninitialized) allocations. 
Partial writes to already - * initialized allocations should use ZS_MM_RW to preserve the - * existing data. - */ -}; - struct zs_pool_stats { /* How many pages were migrated (freed) */ atomic_long_t pages_compacted; @@ -48,10 +31,6 @@ void zs_free(struct zs_pool *pool, unsigned long obj); size_t zs_huge_class_size(struct zs_pool *pool); -void *zs_map_object(struct zs_pool *pool, unsigned long handle, - enum zs_mapmode mm); -void zs_unmap_object(struct zs_pool *pool, unsigned long handle); - unsigned long zs_get_total_pages(struct zs_pool *pool); unsigned long zs_compact(struct zs_pool *pool); -- cgit v1.2.3 From 7b60041156079f256a7df3f7de469de7db618580 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:33 +0000 Subject: mm: zpool: remove zpool_malloc_support_movable() zpool_malloc_support_movable() always returns true for zsmalloc, the only remaining zpool driver. Remove it and set the gfp flags in zswap_compress() accordingly. Opportunistically use GFP_NOWAIT instead of __GFP_NOWARN | __GFP_KSWAPD_RECLAIM for conciseness as they are equivalent. Link: https://lkml.kernel.org/r/20250305061134.4105762-6-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Thomas Gleixner Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/zpool.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 2c8a9d2654f6..52f30e526607 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -21,8 +21,6 @@ const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); -bool zpool_malloc_support_movable(struct zpool *pool); - int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, unsigned long *handle); @@ -65,7 +63,6 @@ struct zpool_driver { void *(*create)(const char *name, gfp_t gfp); void (*destroy)(void *pool); - bool malloc_support_movable; int (*malloc)(void *pool, size_t size, gfp_t gfp, unsigned long *handle); void (*free)(void *pool, unsigned long handle); -- cgit v1.2.3 From bba38b874886b227283d56ecb2f7ebbaa01a781d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 19 Feb 2025 14:41:14 +0100 Subject: rtc: pm8xxx: add support for uefi offset On many Qualcomm platforms the PMIC RTC control and time registers are read-only so that the RTC time can not be updated. Instead an offset needs be stored in some machine-specific non-volatile memory, which the driver can take into account. Add support for storing a 32-bit offset from the GPS time epoch in a UEFI variable so that the RTC time can be set on such platforms. The UEFI variable is 882f8c2b-9646-435f-8de5-f208ff80c1bd-RTCInfo and holds a 12-byte structure where the first four bytes is a GPS time offset in little-endian byte order. Note that this format is not arbitrary as the variable is shared with the UEFI firmware (and Windows). 
Tested-by: Jens Glathe Tested-by: Steev Klimaszewski Tested-by: Joel Stanley Tested-by: Sebastian Reichel # Lenovo T14s Gen6 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20250219134118.31017-3-johan+linaro@kernel.org Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 3f4d315aaec9..95da051fb155 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -170,6 +170,7 @@ struct rtc_device { /* useful timestamps */ #define RTC_TIMESTAMP_BEGIN_0000 -62167219200ULL /* 0000-01-01 00:00:00 */ #define RTC_TIMESTAMP_BEGIN_1900 -2208988800LL /* 1900-01-01 00:00:00 */ +#define RTC_TIMESTAMP_EPOCH_GPS 315964800LL /* 1980-01-06 00:00:00 */ #define RTC_TIMESTAMP_BEGIN_2000 946684800LL /* 2000-01-01 00:00:00 */ #define RTC_TIMESTAMP_END_2063 2966371199LL /* 2063-12-31 23:59:59 */ #define RTC_TIMESTAMP_END_2079 3471292799LL /* 2079-12-31 23:59:59 */ -- cgit v1.2.3 From 16b1936ae6d1858252413bf4bae8bcf247eb4b4c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 10 Mar 2025 07:49:34 +0000 Subject: lib/rbtree: add random seed Current test use pseudo rand function with fixed seed, which means the test data is the same pattern each time. Add random seed parameter to randomize the test. Link: https://lkml.kernel.org/r/20250310074938.26756-4-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- include/linux/types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/types.h b/include/linux/types.h index 1c509ce8f7f6..864ab701f297 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -92,6 +92,7 @@ typedef unsigned char unchar; typedef unsigned short ushort; typedef unsigned int uint; typedef unsigned long ulong; +typedef unsigned long long ullong; #ifndef __BIT_TYPES_DEFINED__ #define __BIT_TYPES_DEFINED__ -- cgit v1.2.3 From 19811285784f7f3d4abaa6e94623f2e7d10219d0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 10 Mar 2025 07:49:37 +0000 Subject: lib/interval_tree: skip the check before go to the right subtree The interval_tree_subtree_search() holds the loop invariant: start <= node->ITSUBTREE Let's say we have a following tree: node / \ left right So we know node->ITSUBTREE is contributed by one of the following: * left->ITSUBTREE * ITLAST(node) * right->ITSUBTREE When we come to the right node, we are sure the first two don't contribute to node->ITSUBTREE and it must be the right node does the job. So skip the check before go to the right subtree. 
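A concrete instance of the argument, with made-up numbers: suppose a query for [50, 60] reaches a node holding [10, 20] whose subtree maximum ITSUBTREE is 90 and whose left child's ITSUBTREE is 40. The loop invariant guarantees 50 <= 90. Since the left subtree (40) and the node's own last value (20) are both below 50, the 90 can only come from the right subtree, so the right child must exist and already satisfies start <= right->ITSUBTREE; re-checking it before descending is redundant.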
Link: https://lkml.kernel.org/r/20250310074938.26756-7-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- include/linux/interval_tree_generic.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index aaa8a0767aa3..1b400f26f63d 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -104,12 +104,8 @@ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ if (ITSTART(node) <= last) { /* Cond1 */ \ if (start <= ITLAST(node)) /* Cond2 */ \ return node; /* node is leftmost match */ \ - if (node->ITRB.rb_right) { \ - node = rb_entry(node->ITRB.rb_right, \ - ITSTRUCT, ITRB); \ - if (start <= node->ITSUBTREE) \ - continue; \ - } \ + node = rb_entry(node->ITRB.rb_right, ITSTRUCT, ITRB); \ + continue; \ } \ return NULL; /* No match */ \ } \ -- cgit v1.2.3 From 35a566a24e58f1b5f89737edf60b77de58719ed0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Feb 2025 18:14:26 -0500 Subject: NFSv4: Avoid unnecessary scans of filesystems for returning delegations The amount of looping through the list of delegations is occasionally leading to soft lockups. If the state manager was asked to return delegations asynchronously, it should only scan those filesystems that hold delegations that need to be returned. Fixes: af3b61bf6131 ("NFSv4: Clean up nfs_client_return_marked_delegations()") Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index f00bfcee7120..4e9ad6f6e907 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -250,6 +250,8 @@ struct nfs_server { struct list_head ss_copies; struct list_head ss_src_copies; + unsigned long delegation_flags; +#define NFS4SERV_DELEGRETURN (1) unsigned long delegation_gen; unsigned long mig_gen; unsigned long mig_status; -- cgit v1.2.3 From f163aa81a799e2d46d7f8f0b42a0e7770eaa0d06 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Feb 2025 19:03:21 -0500 Subject: NFSv4: Avoid unnecessary scans of filesystems for expired delegations The amount of looping through the list of delegations is occasionally leading to soft lockups. If the state manager was asked to reap the expired delegations, it should scan only those filesystems that hold delegations that need to be reaped. Fixes: 7f156ef0bf45 ("NFSv4: Clean up nfs_delegation_reap_expired()") Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 4e9ad6f6e907..7d6f164036fa 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -252,6 +252,7 @@ struct nfs_server { unsigned long delegation_flags; #define NFS4SERV_DELEGRETURN (1) +#define NFS4SERV_DELEGATION_EXPIRED (2) unsigned long delegation_gen; unsigned long mig_gen; unsigned long mig_status; -- cgit v1.2.3 From e767b59e29b8327d25edde65efc743f479f30d0a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Feb 2025 18:37:51 -0500 Subject: NFSv4: Avoid unnecessary scans of filesystems for delayed delegations The amount of looping through the list of delegations is occasionally leading to soft lockups. 
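The per-server delegation_flags bits added across these three patches enable a scan-skipping pattern along the following lines. This is a sketch only; the real users live in fs/nfs/delegation.c, which is outside these hunks, and the helper names here are made up:

    /* Mark a server when one of its delegations needs follow-up work... */
    static void example_mark_server(struct nfs_server *server)
    {
    	set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags);
    }

    /* ...so the state manager can skip servers with nothing to do. */
    static void example_scan(struct nfs_client *clp)
    {
    	struct nfs_server *server;

    	rcu_read_lock();
    	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
    		if (!test_and_clear_bit(NFS4SERV_DELEGRETURN,
    					&server->delegation_flags))
    			continue;
    		/* walk only this server's delegation list here */
    	}
    	rcu_read_unlock();
    }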
If the state manager was asked to manage the delayed return of delegations, then only scan those filesystems containing delegations that were marked as being delayed. Fixes: be20037725d1 ("NFSv4: Fix delegation return in cases where we have to retry") Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 7d6f164036fa..108862d81b57 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -253,6 +253,7 @@ struct nfs_server { unsigned long delegation_flags; #define NFS4SERV_DELEGRETURN (1) #define NFS4SERV_DELEGATION_EXPIRED (2) +#define NFS4SERV_DELEGRETURN_DELAYED (3) unsigned long delegation_gen; unsigned long mig_gen; unsigned long mig_status; -- cgit v1.2.3 From 8955e7ce61fb6ba28f88e0a38479c992991ba6ab Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 13 Jan 2025 10:32:39 -0500 Subject: NFS: Implement NFSv4.2's OFFLOAD_STATUS XDR Add XDR encoding and decoding functions for the NFSv4.2 OFFLOAD_STATUS operation. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Link: https://lore.kernel.org/r/20250113153235.48706-13-cel@kernel.org Signed-off-by: Trond Myklebust --- include/linux/nfs4.h | 1 + include/linux/nfs_xdr.h | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 9ac83ca88326..5fa60fe441b5 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -691,6 +691,7 @@ enum { NFSPROC4_CLNT_LISTXATTRS, NFSPROC4_CLNT_REMOVEXATTR, NFSPROC4_CLNT_READ_PLUS, + NFSPROC4_CLNT_OFFLOAD_STATUS, }; /* nfs41 types */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9155a6ffc370..dc5288111999 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1515,8 +1515,9 @@ struct nfs42_offload_status_args { struct nfs42_offload_status_res { struct nfs4_sequence_res osr_seq_res; - uint64_t osr_count; - int osr_status; + u64 osr_count; + int complete_count; + u32 osr_complete; }; struct nfs42_copy_notify_args { -- cgit v1.2.3 From 77dd8a302f56679178924f82a29eaf437857ea75 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 13 Jan 2025 10:32:40 -0500 Subject: NFS: Implement NFSv4.2's OFFLOAD_STATUS operation Enable the Linux NFS client to observe the progress of an offloaded asynchronous COPY operation. This new operation will be put to use in a subsequent patch. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Link: https://lore.kernel.org/r/20250113153235.48706-14-cel@kernel.org Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 108862d81b57..1711eefcf24f 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -293,6 +293,7 @@ struct nfs_server { #define NFS_CAP_CASE_INSENSITIVE (1U << 6) #define NFS_CAP_CASE_PRESERVING (1U << 7) #define NFS_CAP_REBOOT_LAYOUTRETURN (1U << 8) +#define NFS_CAP_OFFLOAD_STATUS (1U << 9) #define NFS_CAP_OPEN_XOR (1U << 12) #define NFS_CAP_DELEGTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) -- cgit v1.2.3 From e6fa3963a30d53427d33a577c5ba4bfd3373bc93 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:30:59 +1100 Subject: fs/dax: refactor wait for dax idle page A FS DAX page is considered idle when its refcount drops to one. 
This is currently open-coded in all file systems supporting FS DAX. Move the idle detection to a common function to make future changes easier. Link: https://lkml.kernel.org/r/c2c9d269110b90224eeb1dc661ffbc1d82aa20c9.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Acked-by: Theodore Ts'o Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chunyan Zhang Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index df41a0017b31..9b1ce984d410 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -207,6 +207,14 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); +static inline int dax_wait_page_idle(struct page *page, + void (cb)(struct inode *), + struct inode *inode) +{ + return ___wait_var_event(page, page_ref_count(page) == 1, + TASK_INTERRUPTIBLE, 0, 0, cb(inode)); +} + #if IS_ENABLED(CONFIG_DAX) int dax_read_lock(void); void dax_read_unlock(int id); -- cgit v1.2.3 From d5b3afea22a52517e6bc835432e6dfd079b8bf7c Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:00 +1100 Subject: fs/dax: create a common implementation to break DAX layouts Prior to freeing a block file systems supporting FS DAX must check that the associated pages are both unmapped from user-space and not undergoing DMA or other access from eg. get_user_pages(). This is achieved by unmapping the file range and scanning the FS DAX page-cache to see if any pages within the mapping have an elevated refcount. This is done using two functions - dax_layout_busy_page_range() which returns a page to wait for the refcount to become idle on. Rather than open-code this introduce a common implementation to both unmap and wait for the page to become idle. Link: https://lkml.kernel.org/r/c4d381e41fc618296cee2820403c166d80599d5c.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Dan Williams Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 9b1ce984d410..a6b277f1e13a 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -207,12 +207,9 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -static inline int dax_wait_page_idle(struct page *page, - void (cb)(struct inode *), - struct inode *inode) +static inline bool dax_page_is_idle(struct page *page) { - return ___wait_var_event(page, page_ref_count(page) == 1, - TASK_INTERRUPTIBLE, 0, 0, cb(inode)); + return page && page_ref_count(page) == 1; } #if IS_ENABLED(CONFIG_DAX) @@ -228,6 +225,15 @@ static inline void dax_read_unlock(int id) { } #endif /* CONFIG_DAX */ + +#if !IS_ENABLED(CONFIG_FS_DAX) +static inline int __must_check dax_break_layout(struct inode *inode, + loff_t start, loff_t end, void (cb)(struct inode *)) +{ + return 0; +} +#endif + bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, @@ -251,6 +257,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); +int __must_check dax_break_layout(struct inode *inode, loff_t start, + loff_t end, void (cb)(struct inode *)); +static inline int __must_check dax_break_layout_inode(struct inode *inode, + void (cb)(struct inode *)) +{ + return dax_break_layout(inode, 0, LLONG_MAX, cb); +} int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same, -- cgit v1.2.3 From bde708f1a65d025c45575bfe1e7bf7bdf7e71e87 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:01 +1100 Subject: fs/dax: always remove DAX page-cache entries when breaking layouts Prior to any truncation operations file systems call dax_break_mapping() to ensure pages in the range are not under going DMA. Later DAX page-cache entries will be removed by truncate_folio_batch_exceptionals() in the generic page-cache code. However this makes it possible for folios to be removed from the page-cache even though they are still DMA busy if the file-system hasn't called dax_break_mapping(). It also means they can never be waited on in future because FS DAX will lose track of them once the page-cache entry has been deleted. Instead it is better to delete the FS DAX entry when the file-system calls dax_break_mapping() as part of it's truncate operation. This ensures only idle pages can be removed from the FS DAX page-cache and makes it easy to detect if a file-system hasn't called dax_break_mapping() prior to a truncate operation. 
Link: https://lkml.kernel.org/r/3be6115eaaa8d28fee37fcba3287be4f226a7d24.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Dan Williams Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index a6b277f1e13a..2fbb262092ca 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -255,6 +255,8 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); +void dax_delete_mapping_range(struct address_space *mapping, + loff_t start, loff_t end); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); int __must_check dax_break_layout(struct inode *inode, loff_t start, -- cgit v1.2.3 From 0e2f80afcfa699ce722c01afc9286a942bd57211 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:02 +1100 Subject: fs/dax: ensure all pages are idle prior to filesystem unmount File systems call dax_break_mapping() prior to reallocating file system blocks to ensure the page is not undergoing any DMA or other accesses. Generally this is needed when a file is truncated to ensure that if a block is reallocated nothing is writing to it. However filesystems currently don't call this when an FS DAX inode is evicted. This can cause problems when the file system is unmounted as a page can continue to be under going DMA or other remote access after unmount. This means if the file system is remounted any truncate or other operation which requires the underlying file system block to be freed will not wait for the remote access to complete. Therefore a busy block may be reallocated to a new file leading to corruption. Link: https://lkml.kernel.org/r/2d3cf575bbd095084993154be2f0aa7442e5cd28.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 2fbb262092ca..2333c30f6d36 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -232,6 +232,10 @@ static inline int __must_check dax_break_layout(struct inode *inode, { return 0; } + +static inline void dax_break_layout_final(struct inode *inode) +{ +} #endif bool dax_alive(struct dax_device *dax_dev); @@ -266,6 +270,7 @@ static inline int __must_check dax_break_layout_inode(struct inode *inode, { return dax_break_layout(inode, 0, LLONG_MAX, cb); } +void dax_break_layout_final(struct inode *inode); int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same, -- cgit v1.2.3 From cbe298d82cf70aa6bb16cfc8ae4db522f39a0034 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:03 +1100 Subject: fs/dax: remove PAGE_MAPPING_DAX_SHARED mapping flag The page ->mapping pointer can have magic values like PAGE_MAPPING_DAX_SHARED and PAGE_MAPPING_ANON for page owner specific usage. Currently PAGE_MAPPING_DAX_SHARED and PAGE_MAPPING_ANON alias to the same value. This isn't a problem because FS DAX pages are never seen by the anonymous mapping code and vice versa. However a future change will make FS DAX pages more like normal pages, so folio_test_anon() must not return true for a FS DAX page. We could explicitly test for a FS DAX page in folio_test_anon(), etc. however the PAGE_MAPPING_DAX_SHARED flag isn't actually needed. Instead we can use the page->mapping field to implicitly track the first mapping of a page. If page->mapping is non-NULL it implies the page is associated with a single mapping at page->index. If the page is associated with a second mapping clear page->mapping and set page->share to 1. This is possible because a shared mapping implies the file-system implements dax_holder_operations which makes the ->mapping and ->index, which is a union with ->share, unused. The page is considered shared when page->mapping == NULL and page->share > 0 or page->mapping != NULL, implying it is present in at least one address space. This also makes it easier for a future change to detect when a page is first mapped into an address space which requires special handling. Link: https://lkml.kernel.org/r/c22f699202db0acee2f7039eb026e68261ce42d6.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Tested-by: Alison Schofield Cc: Asahi Lina Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Ted Ts'o Cc: Vishal Verma Cc: WANG Xuerui Cc: Will Deacon Cc: Alexander Gordeev Cc: Balbir Singh Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Jason Gunthorpe Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vivek Goyal Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 36d283552f80..cab382bd965e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -673,12 +673,6 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) -/* - * Different with flags above, this flag is used only for fsdax mode. It - * indicates that this page->mapping is now under reflink case. - */ -#define PAGE_MAPPING_DAX_SHARED ((void *)0x1) - static __always_inline bool folio_mapping_flags(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; -- cgit v1.2.3 From 82ba975e4c43d98afebced82d940ddb7aec42a9d Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:06 +1100 Subject: mm: allow compound zone device pages Zone device pages are used to represent various type of device memory managed by device drivers. Currently compound zone device pages are not supported. This is because MEMORY_DEVICE_FS_DAX pages are the only user of higher order zone device pages and have their own page reference counting. A future change will unify FS DAX reference counting with normal page reference counting rules and remove the special FS DAX reference counting. Supporting that requires compound zone device pages. Supporting compound zone device pages requires compound_head() to distinguish between head and tail pages whilst still preserving the special struct page fields that are specific to zone device pages. A tail page is distinguished by having bit zero being set in page->compound_head, with the remaining bits pointing to the head page. For zone device pages page->compound_head is shared with page->pgmap. The page->pgmap field must be common to all pages within a folio, even if the folio spans memory sections. Therefore pgmap is the same for both head and tail pages and can be moved into the folio and we can use the standard scheme to find compound_head from a tail page. Link: https://lkml.kernel.org/r/67055d772e6102accf85161d0b57b0b3944292bf.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Signed-off-by: Balbir Singh Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memremap.h | 6 +++--- include/linux/migrate.h | 4 ++-- include/linux/mm_types.h | 9 +++++++-- include/linux/mmzone.h | 12 +++++++++++- 4 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 3f7143ade32c..0256a4218dc3 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -161,7 +161,7 @@ static inline bool is_device_private_page(const struct page *page) { return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; + page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE; } static inline bool folio_is_device_private(const struct folio *folio) @@ -173,13 +173,13 @@ static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; + page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA; } static inline bool is_device_coherent_page(const struct page *page) { return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_COHERENT; + page_pgmap(page)->type == MEMORY_DEVICE_COHERENT; } static inline bool folio_is_device_coherent(const struct folio *folio) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 29919faea2f1..61899ec7a9a3 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -205,8 +205,8 @@ struct migrate_vma { unsigned long end; /* - * Set to the owner value also stored in page->pgmap->owner for - * migrating out of device private memory. The flags also need to + * Set to the owner value also stored in page_pgmap(page)->owner + * for migrating out of device private memory. The flags also need to * be set to MIGRATE_VMA_SELECT_DEVICE_PRIVATE. * The caller should always set this field when using mmu notifier * callbacks to avoid device MMU invalidations for device private diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a93abb4452b..0fa7907d437e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -130,8 +130,11 @@ struct page { unsigned long compound_head; /* Bit zero is set */ }; struct { /* ZONE_DEVICE pages */ - /** @pgmap: Points to the hosting device page map. */ - struct dev_pagemap *pgmap; + /* + * The first word is used for compound_head or folio + * pgmap + */ + void *_unused_pgmap_compound_head; void *zone_device_data; /* * ZONE_DEVICE private pages are counted as being @@ -300,6 +303,7 @@ typedef struct { * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. + * @pgmap: Metadata for ZONE_DEVICE mappings * @virtual: Virtual address in the kernel direct map. * @_last_cpupid: IDs of last CPU and last process that accessed the folio. * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). 
@@ -338,6 +342,7 @@ struct folio { /* private: */ }; /* public: */ + struct dev_pagemap *pgmap; }; struct address_space *mapping; pgoff_t index; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 44ecb2f90db4..550dbba92521 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1158,6 +1158,12 @@ static inline bool is_zone_device_page(const struct page *page) return page_zonenum(page) == ZONE_DEVICE; } +static inline struct dev_pagemap *page_pgmap(const struct page *page) +{ + VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page); + return page_folio(page)->pgmap; +} + /* * Consecutive zone device pages should not be merged into the same sgl * or bvec segment with other types of pages or if they belong to different @@ -1173,7 +1179,7 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, return false; if (!is_zone_device_page(a)) return true; - return a->pgmap == b->pgmap; + return page_pgmap(a) == page_pgmap(b); } extern void memmap_init_zone_device(struct zone *, unsigned long, @@ -1188,6 +1194,10 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, { return true; } +static inline struct dev_pagemap *page_pgmap(const struct page *page) +{ + return NULL; +} #endif static inline bool folio_is_zone_device(const struct folio *folio) -- cgit v1.2.3 From ec2e0cc67f9c3ed5926da390ad86b25c142a2e4a Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:08 +1100 Subject: mm/memory: add vmf_insert_page_mkwrite() Currently to map a DAX page the DAX driver calls vmf_insert_pfn. This creates a special devmap PTE entry for the pfn but does not take a reference on the underlying struct page for the mapping. This is because DAX page refcounts are treated specially, as indicated by the presence of a devmap entry. To allow DAX page refcounts to be managed the same as normal page refcounts introduce vmf_insert_page_mkwrite(). This will take a reference on the underlying page much the same as vmf_insert_page, except it also permits upgrading an existing mapping to be writable if requested/possible. Link: https://lkml.kernel.org/r/4ce3aa984c060f370105e0bfef1035869578be47.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9a74a3ee68bc..c6fb9300f6c0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3644,6 +3644,8 @@ int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num); +vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, + bool write); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, -- cgit v1.2.3 From 349994cf61e6eaa5996d53e45ffd2272d32d5e4e Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:09 +1100 Subject: mm/rmap: add support for PUD sized mappings to rmap The rmap doesn't currently support adding a PUD mapping of a folio. This patch adds support for entire PUD mappings of folios, primarily to allow for more standard refcounting of device DAX folios. Currently DAX is the only user of this and it doesn't require support for partially mapped PUD-sized folios so we don't support for that for now. Link: https://lkml.kernel.org/r/248582c07896e30627d1aeaeebc6949cfd91b851.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Reviewed-by: Dan Williams Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/rmap.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 69e9a431a40e..6abf7960077a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -192,6 +192,7 @@ typedef int __bitwise rmap_t; enum rmap_level { RMAP_LEVEL_PTE = 0, RMAP_LEVEL_PMD, + RMAP_LEVEL_PUD, }; static inline void __folio_rmap_sanity_checks(const struct folio *folio, @@ -228,6 +229,14 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); break; + case RMAP_LEVEL_PUD: + /* + * Assume that we are creating a single "entire" mapping of the + * folio. 
+ */ + VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio); + VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); + break; default: VM_WARN_ON_ONCE(true); } @@ -251,12 +260,16 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, folio_add_file_rmap_ptes(folio, page, 1, vma) void folio_add_file_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); +void folio_add_file_rmap_pud(struct folio *, struct page *, + struct vm_area_struct *); void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_remove_rmap_pte(folio, page, vma) \ folio_remove_rmap_ptes(folio, page, 1, vma) void folio_remove_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); +void folio_remove_rmap_pud(struct folio *, struct page *, + struct vm_area_struct *); void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); @@ -341,6 +354,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: + case RMAP_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); atomic_inc(&folio->_large_mapcount); break; @@ -437,6 +451,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: + case RMAP_LEVEL_PUD: if (PageAnonExclusive(page)) { if (unlikely(maybe_pinned)) return -EBUSY; -- cgit v1.2.3 From dbe54153296d0931144a63295fc9367794bbe797 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:10 +1100 Subject: mm/huge_memory: add vmf_insert_folio_pud() Currently DAX folio/page reference counts are managed differently to normal pages. To allow these to be managed the same as normal pages introduce vmf_insert_folio_pud. This will map the entire PUD-sized folio and take references as it would for a normally mapped page. This is distinct from the current mechanism, vmf_insert_pfn_pud, which simply inserts a special devmap PUD entry into the page table without holding a reference to the page for the mapping. Link: https://lkml.kernel.org/r/649a1ef91d556593948351e94f51ef73a14f6794.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Dan Williams Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e1bea54820ff..4eeb54b50621 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -39,6 +39,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); +vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, + bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_UNSUPPORTED, -- cgit v1.2.3 From 6c88f72691f88fd1fc493a3c2eed97f91b82b3f0 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:11 +1100 Subject: mm/huge_memory: add vmf_insert_folio_pmd() Currently DAX folio/page reference counts are managed differently to normal pages. To allow these to be managed the same as normal pages introduce vmf_insert_folio_pmd. This will map the entire PMD-sized folio and take references as it would for a normally mapped page. This is distinct from the current mechanism, vmf_insert_pfn_pmd, which simply inserts a special devmap PMD entry into the page table without holding a reference to the page for the mapping. It is not currently useful to implement a more generic vmf_insert_folio() which selects the correct behaviour based on folio_order(). This is because PTE faults require only a subpage of the folio to be PTE mapped rather than the entire folio. It would be possible to add this context somewhere but callers already need to handle PTE faults and PMD faults separately so a more generic function is not useful. Link: https://lkml.kernel.org/r/7bf92a2e68225d13ea368d53bbfee327314d1c40.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4eeb54b50621..e57e811cfd3c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -39,6 +39,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); +vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, + bool write); vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, bool write); -- cgit v1.2.3 From e5cb23256347469c80138a2a8804887239465d0b Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:12 +1100 Subject: mm/gup: don't allow FOLL_LONGTERM pinning of FS DAX pages Longterm pinning of FS DAX pages should already be disallowed by various pXX_devmap checks. However a future change will cause these checks to be invalid for FS DAX pages so make folio_is_longterm_pinnable() return false for FS DAX pages. Link: https://lkml.kernel.org/r/250a31876704b79f7c65b159f3c835e547f052df.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: John Hubbard Reviewed-by: Dan Williams Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memremap.h | 11 +++++++++++ include/linux/mm.h | 7 +++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 0256a4218dc3..4aa151914eab 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -187,6 +187,17 @@ static inline bool folio_is_device_coherent(const struct folio *folio) return is_device_coherent_page(&folio->page); } +static inline bool is_fsdax_page(const struct page *page) +{ + return is_zone_device_page(page) && + page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX; +} + +static inline bool folio_is_fsdax(const struct folio *folio) +{ + return is_fsdax_page(&folio->page); +} + #ifdef CONFIG_ZONE_DEVICE void zone_device_page_init(struct page *page); void *memremap_pages(struct dev_pagemap *pgmap, int nid); diff --git a/include/linux/mm.h b/include/linux/mm.h index c6fb9300f6c0..009484a07074 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2115,6 +2115,13 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio) if (folio_is_device_coherent(folio)) return false; + /* + * Filesystems can only tolerate transient delays to truncate and + * hole-punch operations + */ + if (folio_is_fsdax(folio)) + return false; + /* Otherwise, non-movable zone folios can be pinned. */ return !folio_is_zone_movable(folio); -- cgit v1.2.3 From 38607c62b34b46317c46d5baf1df03ac6e48a1c6 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:14 +1100 Subject: fs/dax: properly refcount fs dax pages Currently fs dax pages are considered free when the refcount drops to one and their refcounts are not increased when mapped via PTEs or decreased when unmapped. This requires special logic in mm paths to detect that these pages should not be properly refcounted, and to detect when the refcount drops to one instead of zero. On the other hand get_user_pages(), etc. will properly refcount fs dax pages by taking a reference and dropping it when the page is unpinned. Tracking this special behaviour requires extra PTE bits (eg. pte_devmap) and introduces rules that are potentially confusing and specific to FS DAX pages. To fix this, and to possibly allow removal of the special PTE bits in future, convert the fs dax page refcounts to be zero based and instead take a reference on the page each time it is mapped as is currently the case for normal pages. This may also allow a future clean-up to remove the pgmap refcounting that is currently done in mm/gup.c. Link: https://lkml.kernel.org/r/c7d886ad7468a20452ef6e0ddab6cfe220874e7c.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Dan Williams Tested-by: Alison Schofield Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 2 +- include/linux/mm.h | 27 ++------------------------- include/linux/mm_types.h | 7 ++++++- 3 files changed, 9 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 2333c30f6d36..dcc9fcdf14e4 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -209,7 +209,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, static inline bool dax_page_is_idle(struct page *page) { - return page && page_ref_count(page) == 1; + return page && page_ref_count(page) == 0; } #if IS_ENABLED(CONFIG_DAX) diff --git a/include/linux/mm.h b/include/linux/mm.h index 009484a07074..de008efd96aa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1192,6 +1192,8 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; +extern void prep_compound_page(struct page *page, unsigned int order); + /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be @@ -1513,25 +1515,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf); * back into memory. */ -#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) -DECLARE_STATIC_KEY_FALSE(devmap_managed_key); - -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs); -static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) -{ - if (!static_branch_unlikely(&devmap_managed_key)) - return false; - if (!folio_is_zone_device(folio)) - return false; - return __put_devmap_managed_folio_refs(folio, refs); -} -#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ -static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) -{ - return false; -} -#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ - /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) @@ -1652,12 +1635,6 @@ static inline void put_page(struct page *page) if (folio_test_slab(folio)) return; - /* - * For some devmap managed pages we need to catch refcount transition - * from 2 to 1: - */ - if (put_devmap_managed_folio_refs(folio, 1)) - return; folio_put(folio); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0fa7907d437e..b1827d78ff89 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -296,6 +296,8 @@ typedef struct { * anonymous memory. * @index: Offset within the file, in units of pages. For anonymous memory, * this is the index from the beginning of the mmap. + * @share: number of DAX mappings that reference this folio. See + * dax_associate_entry. * @private: Filesystem per-folio data (see folio_attach_private()). * @swap: Used for swp_entry_t if folio_test_swapcache(). * @_mapcount: Do not access this member directly. 
Use folio_mapcount() to @@ -345,7 +347,10 @@ struct folio { struct dev_pagemap *pgmap; }; struct address_space *mapping; - pgoff_t index; + union { + pgoff_t index; + unsigned long share; + }; union { void *private; swp_entry_t swap; -- cgit v1.2.3 From 6220ea5583e977e1eeea06c237cc440e5ac3de46 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:54 +0100 Subject: mm: factor out large folio handling from folio_order() into folio_large_order() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: MM owner tracking for large folios (!hugetlb) + CONFIG_NO_PAGE_MAPCOUNT", v3. Let's add an "easy" way to decide -- without false positives, without page-mapcounts and without page table/rmap scanning -- whether a large folio is "certainly mapped exclusively" into a single MM, or whether it "maybe mapped shared" into multiple MMs. Use that information to implement Copy-on-Write reuse, to convert folio_likely_mapped_shared() to folio_maybe_mapped_share(), and to introduce a kernel config option that lets us not use+maintain per-page mapcounts in large folios anymore. The bigger picture was presented at LSF/MM [1]. This series is effectively a follow-up on my early work [2], which implemented a more precise, but also more complicated, way to identify whether a large folio is "mapped shared" into multiple MMs or "mapped exclusively" into a single MM. 1 Patch Organization ==================== Patch #1 -> #6: make more room in order-1 folios, so we have two "unsigned long" available for our purposes Patch #7 -> #11: preparations Patch #12: MM owner tracking for large folios Patch #13: COW reuse for PTE-mapped anon THP Patch #14: folio_maybe_mapped_shared() Patch #15 -> #20: introduce and implement CONFIG_NO_PAGE_MAPCOUNT 2 MM owner tracking =================== We assign each MM a unique ID ("MM ID"), to be able to squeeze more information in our folios. On 32bit we use 15-bit IDs, on 64bit we use 31-bit IDs. For each large folios, we now store two MM-ID+mapcount ("slot") combinations: * mm0_id + mm0_mapcount * mm1_id + mm1_mapcount On 32bit, we use a 16-bit per-MM mapcount, on 64bit an ordinary 32bit mapcount. This way, we require 2x "unsigned long" on 32bit and 64bit for both slots. Paired with the large mapcount, we can reliably identify whether one of these MMs is the current owner (-> owns all mappings) or even holds all folio references (-> owns all mappings, and all references are from mappings). As long as only two MMs map folio pages at a time, we can reliably and precisely identify whether a large folio is "mapped shared" or "mapped exclusively". Any additional MM that starts mapping the folio while there are no free slots becomes an "untracked MM". If one such "untracked MM" is the last one mapping a folio exclusively, we will not detect the folio as "mapped exclusively" but instead as "maybe mapped shared". (exception: only a single mapping remains) So that's where the approach gets imprecise. For now, we use a bit-spinlock to sync the large mapcount + slots, and make sure we do keep the machinery fast, to not degrade (un)map performance drastically: for example, we make sure to only use a single atomic (when grabbing the bit-spinlock), like we would already perform when updating the large mapcount. 3 CONFIG_NO_PAGE_MAPCOUNT ========================= patch #15 -> #20 spell out and document what exactly is affected when not maintaining the per-page mapcounts in large folios anymore. 
Most importantly, as we cannot maintain folio->_nr_pages_mapped anymore when (un)mapping pages, we'll account a complete folio as mapped if a single page is mapped. In addition, we'll not detect partially mapped anonymous folios as such in all cases yet. Likely less relevant changes include that we might now under-estimate the USS (Unique Set Size) of a process, but never over-estimate it. The goal is to make CONFIG_NO_PAGE_MAPCOUNT the default at some point, to then slowly make it the only option, as we learn about real-life impacts and possible ways to mitigate them. 4 Performance ============= Detailed performance numbers were included in v1 [3], and not that much changed between v1 and v2. I did plenty of measurements on different systems in the meantime, which all revealed slightly different results. The pte-mapped-folio micro-benchmarks [4] are fairly sensitive to code layout changes on some systems. Especially the fork() benchmark started being more-shaky-than-before on recent kernels for some reason. In summary, with my micro-benchmarks: * Small folios are not impacted. * CoW performance seems to be mostly unchanged across all folio sizes. * CoW reuse performance of large folios now matches CoW reuse performance of small folios, because we now actually implement the CoW reuse optimization. On an Intel Xeon Silver 4210R I measured a ~65% reduction in runtime, on an arm64 system I measured ~54% reduction. * munmap() performance improves with CONFIG_NO_PAGE_MAPCOUNT. I saw double-digit % reduction (up to ~30% on an Intel Xeon Silver 4210R and up to ~70% on an AmpereOne A192-32X) with larger folios. The larger the folios, the larger the performance improvement. * munmap() performance very slightly (a couple of percent) degrades without CONFIG_NO_PAGE_MAPCOUNT for smaller folios. For larger folios, there seems to be no change at all. * fork() performance improves with CONFIG_NO_PAGE_MAPCOUNT. I saw double-digit % reduction (up to ~20% on an Intel Xeon Silver 4210R and up to ~10% on an AmpereOne A192-32X) with larger folios. The larger the folios, the larger the performance improvement. * While fork() performance without CONFIG_NO_PAGE_MAPCOUNT seems to be almost unchanged on some systems, I saw some degradation for smaller folios on the AmpereOne A192-32X. I did not investigate the details yet, but I suspect code layout changes or suboptimal code placement / inlining. I'm not too worried about the fork() micro-benchmarks for smaller folios given how shaky the results are lately and by how much we improved fork() performance recently. I also ran case-anon-cow-rand and case-anon-cow-seq, part of vm-scalability, to assess the scalability and the impact of the bit-spinlock. My measurements on a 2-socket 10-core Intel Xeon Silver 4210R CPU revealed no significant changes. Similarly, running these benchmarks with 2 MiB THPs enabled on the AmpereOne A192-32X with 192 cores, I got < 1% difference with < 1% stdev, which is nice. So far, I did not get my hands on a similarly large system with multiple sockets. I found no other fitting scalability benchmarks that seem to really hammer on concurrent mapping/unmapping of large folio pages like case-anon-cow-seq does. 5 Concerns ========== 5.1 Bit spinlock ---------------- I'm not quite happy about the bit-spinlock, but so far it does not seem to affect scalability in my measurements.
If it ever becomes a problem we could either investigate improving the locking, or simply stopping the MM tracking once there are "too many mappings" and simply assume that the folio is "mapped shared" until it was freed. This would be similar (but slightly different) to the "0,1,2,stopped" counting idea Willy had at some point. Adding that logic to "stop tracking" adds more code to the hot path, so I avoided that for now. 5.2 folio_maybe_mapped_shared() ------------------------------- I documented the change from folio_likely_mapped_shared() to folio_maybe_mapped_shared() quite extensively. If we run into surprises, I have some ideas on how to resolve them. For now, I think we should be fine. 5.3 Added code to map/unmap hot path ------------------------------------ So far, it looks like the added code on the rmap hot path does not really seem to matter much in the bigger picture. I'd like to further reduce it (and possibly improve fork() performance further), but I don't easily see how right now. Well, and I am out of puff 🙂 Having that said, alternatives I considered (e.g., per-MM per-folio mapcount) would add a lot more overhead to these hot paths. 6 Future Work ============= 6.1 Large mapcount ------------------ It would be very handy if the large mapcount would count how often folio pages are actually mapped into page tables: a PMD on x86-64 would count 512 times. Calculating the average per-page mapcount will be easy, and remapping (PMD->PTE) folios would get even faster. That would also remove the need for the entire mapcount (except for PMD-sized folios for memory statistics reasons ...), and allow for mapping folios larger than PMDs (e.g., 4 MiB) easily. We likely would also have to take the same number of folio references to make our folio_mapcount() == folio_ref_count() work, and we'd want to be able to avoid mapcount+refcount overflows: this could already become an issue with pte-mapped PUD-sized folios (fsdax). One approach we discussed in the THP cabal meeting is (1) extending the mapcount for large folios to 64bit (at least on 64bit systems) and (2) keeping the refcount at 32bit, but (3) having exactly one reference if the the mapcount != 0. It should be doable, but there are some corner cases to consider on the unmap path; it is something that I will be looking into next. 6.2 hugetlb ----------- I'd love to make use of the same tracking also for hugetlb. The real problem is PMD table sharing: getting a page mapped by MM X and unmapped by MM Y will not work. With mshare, that problem should not exist (all mapping/unmapping will be routed through the mshare MM). [1] https://lwn.net/Articles/974223/ [2] https://lore.kernel.org/linux-mm/a9922f58-8129-4f15-b160-e0ace581bcbe@redhat.com/T/ [3] https://lkml.kernel.org/r/20240829165627.2256514-1-david@redhat.com [4] https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/pte-mapped-folio-benchmarks.c This patch (of 20): Let's factor it out into a simple helper function. This helper will also come in handy when working with code where we know that our folio is large. Maybe in the future we'll have the order readily available for small and large folios; in that case, folio_large_order() would simply translate to folio_order(). Link: https://lkml.kernel.org/r/20250303163014.1128035-1-david@redhat.com Link: https://lkml.kernel.org/r/20250303163014.1128035-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lance Yang Reviewed-by: Kirill A. 
Shutemov Cc: Thomas Gleixner Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: David Hildenbrand Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index de008efd96aa..e5cdbb94ef81 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1194,6 +1194,11 @@ struct inode; extern void prep_compound_page(struct page *page, unsigned int order); +static inline unsigned int folio_large_order(const struct folio *folio) +{ + return folio->_flags_1 & 0xff; +} + /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be @@ -1207,7 +1212,7 @@ static inline unsigned int compound_order(struct page *page) if (!test_bit(PG_head, &folio->flags)) return 0; - return folio->_flags_1 & 0xff; + return folio_large_order(folio); } /** @@ -1223,7 +1228,7 @@ static inline unsigned int folio_order(const struct folio *folio) { if (!folio_test_large(folio)) return 0; - return folio->_flags_1 & 0xff; + return folio_large_order(folio); } #include @@ -2145,7 +2150,7 @@ static inline long folio_nr_pages(const struct folio *folio) #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else - return 1L << (folio->_flags_1 & 0xff); + return 1L << folio_large_order(folio); #endif } @@ -2170,7 +2175,7 @@ static inline unsigned long compound_nr(struct page *page) #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else - return 1L << (folio->_flags_1 & 0xff); + return 1L << folio_large_order(folio); #endif } -- cgit v1.2.3 From 1ea5212aed0682ae1249cba9df7ad7ef539d0a7d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:55 +0100 Subject: mm: factor out large folio handling from folio_nr_pages() into folio_large_nr_pages() Let's factor it out into a simple helper function. This helper will also come in handy when working with code where we know that our folio is large. While at it, let's consistently return a "long" value from all these similar functions. Note that we cannot use "unsigned int" (even though _folio_nr_pages is of that type), because it would break some callers that do stuff like "-folio_nr_pages()". Both "int" or "unsigned long" would work as well. Maybe in the future we'll have the nr_pages readily available for all large folios, maybe even for small folios, or maybe for none. Link: https://lkml.kernel.org/r/20250303163014.1128035-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Kirill A. 
Shutemov Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e5cdbb94ef81..cf892c08a88c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1199,6 +1199,18 @@ static inline unsigned int folio_large_order(const struct folio *folio) return folio->_flags_1 & 0xff; } +#ifdef CONFIG_64BIT +static inline long folio_large_nr_pages(const struct folio *folio) +{ + return folio->_folio_nr_pages; +} +#else +static inline long folio_large_nr_pages(const struct folio *folio) +{ + return 1L << folio_large_order(folio); +} +#endif + /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be @@ -2147,11 +2159,7 @@ static inline long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; -#ifdef CONFIG_64BIT - return folio->_folio_nr_pages; -#else - return 1L << folio_large_order(folio); -#endif + return folio_large_nr_pages(folio); } /* Only hugetlbfs can allocate folios larger than MAX_ORDER */ @@ -2166,24 +2174,20 @@ static inline long folio_nr_pages(const struct folio *folio) * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ -static inline unsigned long compound_nr(struct page *page) +static inline long compound_nr(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 1; -#ifdef CONFIG_64BIT - return folio->_folio_nr_pages; -#else - return 1L << folio_large_order(folio); -#endif + return folio_large_nr_pages(folio); } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. */ -static inline int thp_nr_pages(struct page *page) +static inline long thp_nr_pages(struct page *page) { return folio_nr_pages((struct folio *)page); } -- cgit v1.2.3 From 4996fc547f5b49f4a43c261dfadb02cf165cdb51 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:56 +0100 Subject: mm: let _folio_nr_pages overlay memcg_data in first tail page Let's free up some more of the "unconditionally available on 64BIT" space in order-1 folios by letting _folio_nr_pages overlay memcg_data in the first tail page (second folio page). Consequently, we have the optimization now whenever we have CONFIG_MEMCG, independent of 64BIT. We have to make sure that page->memcg on tail pages does not return "surprises". page_memcg_check() already properly refuses PageTail(). Let's do that earlier in print_page_owner_memcg() to avoid printing wrong "Slab cache page" information. No other code should touch that field on tail pages of compound pages. Reset the "_nr_pages" to 0 when splitting folios, or when freeing them back to the buddy (to avoid false page->memcg_data "bad page" reports). Note that in __split_huge_page(), folio_nr_pages() would stop working already as soon as we start messing with the subpages. Most kernel configs should have at least CONFIG_MEMCG enabled, even if disabled at runtime. 64byte "struct memmap" is what we usually have on 64BIT. 
While at it, rename "_folio_nr_pages" to "_nr_pages". Hopefully memdescs / dynamically allocating "strut folio" in the future will further clean this up, e.g., making _nr_pages available in all configs and maybe even in small folios. Doing that should be fairly easy on top of this change. [david@redhat.com: make "make htmldoc" happy] Link: https://lkml.kernel.org/r/a97f8a91-ec41-4796-81e3-7c9e0e491ba4@redhat.com Link: https://lkml.kernel.org/r/20250303163014.1128035-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Kirill A. Shutemov Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- include/linux/mm_types.h | 32 ++++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index cf892c08a88c..c9c2ca345350 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1199,10 +1199,10 @@ static inline unsigned int folio_large_order(const struct folio *folio) return folio->_flags_1 & 0xff; } -#ifdef CONFIG_64BIT +#ifdef NR_PAGES_IN_LARGE_FOLIO static inline long folio_large_nr_pages(const struct folio *folio) { - return folio->_folio_nr_pages; + return folio->_nr_pages; } #else static inline long folio_large_nr_pages(const struct folio *folio) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b1827d78ff89..f3b519fbeca1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -287,6 +287,11 @@ typedef struct { unsigned long val; } swp_entry_t; +#if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT) +/* We have some extra room after the refcount in tail pages. */ +#define NR_PAGES_IN_LARGE_FOLIO +#endif + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. @@ -312,7 +317,7 @@ typedef struct { * @_large_mapcount: Do not use directly, call folio_mapcount(). * @_nr_pages_mapped: Do not use outside of rmap and debug code. * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). - * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). + * @_nr_pages: Do not use directly, call folio_nr_pages(). * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. 
@@ -376,14 +381,23 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; + union { + struct { /* public: */ - atomic_t _large_mapcount; - atomic_t _entire_mapcount; - atomic_t _nr_pages_mapped; - atomic_t _pincount; -#ifdef CONFIG_64BIT - unsigned int _folio_nr_pages; -#endif + atomic_t _large_mapcount; + atomic_t _entire_mapcount; + atomic_t _nr_pages_mapped; + atomic_t _pincount; + /* private: the union with struct page is transitional */ + }; + unsigned long _usable_1[4]; + }; + atomic_t _mapcount_1; + atomic_t _refcount_1; + /* public: */ +#ifdef NR_PAGES_IN_LARGE_FOLIO + unsigned int _nr_pages; +#endif /* NR_PAGES_IN_LARGE_FOLIO */ /* private: the union with struct page is transitional */ }; struct page __page_1; @@ -435,6 +449,8 @@ FOLIO_MATCH(_last_cpupid, _last_cpupid); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); +FOLIO_MATCH(_mapcount, _mapcount_1); +FOLIO_MATCH(_refcount, _refcount_1); #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ -- cgit v1.2.3 From 4eeec8c89a0c4a8c20fb13a4e7093cc8efce383d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:57 +0100 Subject: mm: move hugetlb specific things in folio to page[3] Let's just move the hugetlb specific stuff to a separate page, and stop letting it overlay other fields for now. This frees up some space in page[2], which we will use on 32bit to free up some space in page[1]. While we could move these things to page[3] instead, it's cleaner to just move the hugetlb specific things out of the way and pack the core-folio stuff as tight as possible. ... and we can minimize the work required in dump_folio. We can now avoid re-initializing &folio->_deferred_list in hugetlb code. Hopefully dynamically allocating "struct folio" in the future will further clean this up. Link: https://lkml.kernel.org/r/20250303163014.1128035-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. 
Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f3b519fbeca1..727322ecbfdd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -407,20 +407,23 @@ struct folio { unsigned long _flags_2; unsigned long _head_2; /* public: */ - void *_hugetlb_subpool; - void *_hugetlb_cgroup; - void *_hugetlb_cgroup_rsvd; - void *_hugetlb_hwpoison; + struct list_head _deferred_list; /* private: the union with struct page is transitional */ }; + struct page __page_2; + }; + union { struct { - unsigned long _flags_2a; - unsigned long _head_2a; + unsigned long _flags_3; + unsigned long _head_3; /* public: */ - struct list_head _deferred_list; + void *_hugetlb_subpool; + void *_hugetlb_cgroup; + void *_hugetlb_cgroup_rsvd; + void *_hugetlb_hwpoison; /* private: the union with struct page is transitional */ }; - struct page __page_2; + struct page __page_3; }; }; @@ -457,8 +460,12 @@ FOLIO_MATCH(_refcount, _refcount_1); offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); -FOLIO_MATCH(flags, _flags_2a); -FOLIO_MATCH(compound_head, _head_2a); +#undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + 3 * sizeof(struct page)) +FOLIO_MATCH(flags, _flags_3); +FOLIO_MATCH(compound_head, _head_3); #undef FOLIO_MATCH /** -- cgit v1.2.3 From 31a31da8a6187f1e5448ec73222e01d7d3fed4aa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:58 +0100 Subject: mm: move _pincount in folio to page[2] on 32bit Let's free up some space on 32bit in page[1] by moving the _pincount to page[2]. For order-1 folios (never anon folios!) on 32bit, we will now also use the GUP_PIN_COUNTING_BIAS approach. A fully-mapped order-1 folio requires 2 references. With GUP_PIN_COUNTING_BIAS being 1024, we'd detect such folios as "maybe pinned" with 512 full mappings, instead of 1024 for order-0. As anon folios are out of the picture (which are the most relevant users of checking for pinnings on *mapped* pages) and we are talking about 32bit, this is not expected to cause any trouble. In __dump_page(), copy one additional folio page if we detect a folio with an order > 1, so we can dump the pincount on order > 1 folios reliably. Note that THPs on 32bit are not particularly common (and we don't care too much about performance), but we want to keep it working reliably, because likely we want to use large folios there as well in the future, independent of PMD leaf support. Once we dynamically allocate "struct folio", fortunately the 32bit specifics will likely go away again; even small folios could then have a pincount and folio_has_pincount() would essentially always return "true". Link: https://lkml.kernel.org/r/20250303163014.1128035-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. 
Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 +++++++++-- include/linux/mm_types.h | 5 +++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c9c2ca345350..860082ba8978 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2010,6 +2010,13 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } +static inline bool folio_has_pincount(const struct folio *folio) +{ + if (IS_ENABLED(CONFIG_64BIT)) + return folio_test_large(folio); + return folio_order(folio) > 1; +} + /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. @@ -2026,7 +2033,7 @@ static inline struct folio *pfn_folio(unsigned long pfn) * get that many refcounts, and b) all the callers of this routine are * expected to be able to deal gracefully with a false positive. * - * For large folios, the result will be exactly correct. That's because + * For most large folios, the result will be exactly correct. That's because * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * @@ -2037,7 +2044,7 @@ static inline struct folio *pfn_folio(unsigned long pfn) */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { - if (folio_test_large(folio)) + if (folio_has_pincount(folio)) return atomic_read(&folio->_pincount) > 0; /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 727322ecbfdd..3ea2019a1aac 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -387,7 +387,9 @@ struct folio { atomic_t _large_mapcount; atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; +#ifdef CONFIG_64BIT atomic_t _pincount; +#endif /* CONFIG_64BIT */ /* private: the union with struct page is transitional */ }; unsigned long _usable_1[4]; @@ -408,6 +410,9 @@ struct folio { unsigned long _head_2; /* public: */ struct list_head _deferred_list; +#ifndef CONFIG_64BIT + atomic_t _pincount; +#endif /* !CONFIG_64BIT */ /* private: the union with struct page is transitional */ }; struct page __page_2; -- cgit v1.2.3 From 845d2be6d41f016da670dcc4c8f5357c22172be8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:59 +0100 Subject: mm: move _entire_mapcount in folio to page[2] on 32bit Let's free up some space on 32bit in page[1] by moving the _entire_mapcount to page[2]. Ordinary folios only use the entire mapcount with PMD mappings, so order-1 folios don't apply. Similarly, hugetlb folios are always larger than order-1, turning the entire mapcount essentially unused for all order-1 folios. Moving it out of page[1] therefore changes nothing for order-1 folios. On 32bit, simply check in folio_entire_mapcount() whether we have an order-1 folio, and return 0 in that case. Note that THPs on 32bit are not particularly common (and we don't care too much about performance), but we want to keep it working reliably, because likely we want to use large folios there as well in the future, independent of PMD leaf support. Once we dynamically allocate "struct folio", the 32bit specifics will go away again; even small folios could then have a pincount. 
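To make the GUP_PIN_COUNTING_BIAS trade-off described for the _pincount move concrete, here is a rough editorial sketch (not part of either patch; it only assumes the mainline bias value of 1024):

/*
 * Sketch only, not from the patches above: how many full mappings of a
 * folio can coexist with zero pins before the refcount-bias fallback in
 * folio_maybe_dma_pinned() could report a false positive.
 * Assumes GUP_PIN_COUNTING_BIAS == 1024, as in mainline.
 */
static unsigned int example_bias_false_positive_threshold(unsigned int order)
{
	/* each full mapping of an order-N folio contributes 1 << N references */
	return 1024 / (1U << order);	/* 1024 for order-0, 512 for order-1 */
}
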
Link: https://lkml.kernel.org/r/20250303163014.1128035-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ include/linux/mm_types.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 860082ba8978..f366c180f2b6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1333,6 +1333,8 @@ static inline int is_vmalloc_or_module_addr(const void *x) static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1)) + return 0; return atomic_read(&folio->_entire_mapcount) + 1; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3ea2019a1aac..9499eb8e8e66 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -385,9 +385,9 @@ struct folio { struct { /* public: */ atomic_t _large_mapcount; - atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; #ifdef CONFIG_64BIT + atomic_t _entire_mapcount; atomic_t _pincount; #endif /* CONFIG_64BIT */ /* private: the union with struct page is transitional */ @@ -411,6 +411,7 @@ struct folio { /* public: */ struct list_head _deferred_list; #ifndef CONFIG_64BIT + atomic_t _entire_mapcount; atomic_t _pincount; #endif /* !CONFIG_64BIT */ /* private: the union with struct page is transitional */ -- cgit v1.2.3 From 405c4ef769c7c5e83e3556b48a190fe1afe82425 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:00 +0100 Subject: mm/rmap: pass dst_vma to folio_dup_file_rmap_pte() and friends We'll need access to the destination MM when modifying the large mapcount of a non-hugetlb large folios next. So pass in the destination VMA. Link: https://lkml.kernel.org/r/20250303163014.1128035-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. 
Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/rmap.h | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6abf7960077a..e795610bade8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -335,7 +335,8 @@ static inline void hugetlb_remove_rmap(struct folio *folio) } static __always_inline void __folio_dup_file_rmap(struct folio *folio, - struct page *page, int nr_pages, enum rmap_level level) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma, + enum rmap_level level) { const int orig_nr_pages = nr_pages; @@ -366,45 +367,47 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, * @folio: The folio to duplicate the mappings of * @page: The first page to duplicate the mappings of * @nr_pages: The number of pages of which the mapping will be duplicated + * @dst_vma: The destination vm area * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ static inline void folio_dup_file_rmap_ptes(struct folio *folio, - struct page *page, int nr_pages) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE); } static __always_inline void folio_dup_file_rmap_pte(struct folio *folio, - struct page *page) + struct page *page, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, 1, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE); } /** * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio * @folio: The folio to duplicate the mapping of * @page: The first page to duplicate the mapping of + * @dst_vma: The destination vm area * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ static inline void folio_dup_file_rmap_pmd(struct folio *folio, - struct page *page) + struct page *page, struct vm_area_struct *dst_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PTE); #else WARN_ON_ONCE(true); #endif } static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, - struct page *page, int nr_pages, struct vm_area_struct *src_vma, - enum rmap_level level) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, enum rmap_level level) { const int orig_nr_pages = nr_pages; bool maybe_pinned; @@ -470,6 +473,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, * @folio: The folio to duplicate the mappings of * @page: The first page to duplicate the mappings of * @nr_pages: The number of pages of which the mapping will be duplicated + * @dst_vma: The destination vm area * @src_vma: The vm area from which the mappings are duplicated * * The page range of the folio is defined by [page, page + nr_pages) @@ -488,16 +492,18 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise. 
*/ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, - struct page *page, int nr_pages, struct vm_area_struct *src_vma) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { - return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma, - RMAP_LEVEL_PTE); + return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma, + src_vma, RMAP_LEVEL_PTE); } static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, - struct page *page, struct vm_area_struct *src_vma) + struct page *page, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { - return __folio_try_dup_anon_rmap(folio, page, 1, src_vma, + return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma, RMAP_LEVEL_PTE); } @@ -506,6 +512,7 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, * of a folio * @folio: The folio to duplicate the mapping of * @page: The first page to duplicate the mapping of + * @dst_vma: The destination vm area * @src_vma: The vm area from which the mapping is duplicated * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) @@ -524,11 +531,12 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise. */ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, - struct page *page, struct vm_area_struct *src_vma) + struct page *page, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma, - RMAP_LEVEL_PMD); + return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma, + src_vma, RMAP_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; -- cgit v1.2.3 From 932961c4b666937e27f7ec5358fb7aabf0f17d41 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:02 +0100 Subject: mm/rmap: abstract large mapcount operations for large folios (!hugetlb) Let's abstract the operations so we can extend these operations easily. Link: https://lkml.kernel.org/r/20250303163014.1128035-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/rmap.h | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e795610bade8..d1e888cc97a5 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -173,6 +173,30 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *folio_get_anon_vma(const struct folio *folio); +static inline void folio_set_large_mapcount(struct folio *folio, int mapcount, + struct vm_area_struct *vma) +{ + /* Note: mapcounts start at -1. 
*/ + atomic_set(&folio->_large_mapcount, mapcount - 1); +} + +static inline void folio_add_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + atomic_add(diff, &folio->_large_mapcount); +} + +static inline void folio_sub_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + atomic_sub(diff, &folio->_large_mapcount); +} + +#define folio_inc_large_mapcount(folio, vma) \ + folio_add_large_mapcount(folio, 1, vma) +#define folio_dec_large_mapcount(folio, vma) \ + folio_sub_large_mapcount(folio, 1, vma) + /* RMAP flags, currently only relevant for some anon rmap operations. */ typedef int __bitwise rmap_t; @@ -352,12 +376,12 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, do { atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); - atomic_add(orig_nr_pages, &folio->_large_mapcount); + folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case RMAP_LEVEL_PMD: case RMAP_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); - atomic_inc(&folio->_large_mapcount); + folio_inc_large_mapcount(folio, dst_vma); break; } } @@ -451,7 +475,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, ClearPageAnonExclusive(page); atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); - atomic_add(orig_nr_pages, &folio->_large_mapcount); + folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case RMAP_LEVEL_PMD: case RMAP_LEVEL_PUD: @@ -461,7 +485,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, ClearPageAnonExclusive(page); } atomic_inc(&folio->_entire_mapcount); - atomic_inc(&folio->_large_mapcount); + folio_inc_large_mapcount(folio, dst_vma); break; } return 0; -- cgit v1.2.3 From b85aa9b11b67ed4b086bf5366392adec1b4a6a81 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:03 +0100 Subject: bit_spinlock: __always_inline (un)lock functions The compiler might decide that it is a smart idea to not inline bit_spin_lock(), primarily when a couple of functions in the same file end up calling it. Especially when used in RMAP map/unmap code next, the compiler sometimes decides to not inline, which is then observable in some micro-benchmarks. Let's simply flag all lock/unlock functions as __always_inline; arch_test_and_set_bit_lock() and friends are already tagged like that (but not test_and_set_bit_lock() for some reason). If ever a problem, we could split it into a fast and a slow path, and only force the fast path to be inlined. But there is nothing particularly "big" here. Link: https://lkml.kernel.org/r/20250303163014.1128035-11-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/bit_spinlock.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index bbc4730a6505..c0989b5b0407 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -13,7 +13,7 @@ * Don't use this unless you really need to: spin_lock() and spin_unlock() * are significantly faster. 
*/ -static inline void bit_spin_lock(int bitnum, unsigned long *addr) +static __always_inline void bit_spin_lock(int bitnum, unsigned long *addr) { /* * Assuming the lock is uncontended, this never enters @@ -38,7 +38,7 @@ static inline void bit_spin_lock(int bitnum, unsigned long *addr) /* * Return true if it was acquired */ -static inline int bit_spin_trylock(int bitnum, unsigned long *addr) +static __always_inline int bit_spin_trylock(int bitnum, unsigned long *addr) { preempt_disable(); #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) @@ -54,7 +54,7 @@ static inline int bit_spin_trylock(int bitnum, unsigned long *addr) /* * bit-based spin_unlock() */ -static inline void bit_spin_unlock(int bitnum, unsigned long *addr) +static __always_inline void bit_spin_unlock(int bitnum, unsigned long *addr) { #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); @@ -71,7 +71,7 @@ static inline void bit_spin_unlock(int bitnum, unsigned long *addr) * non-atomic version, which can be used eg. if the bit lock itself is * protecting the rest of the flags in the word. */ -static inline void __bit_spin_unlock(int bitnum, unsigned long *addr) +static __always_inline void __bit_spin_unlock(int bitnum, unsigned long *addr) { #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); -- cgit v1.2.3 From 6af8cb80d3a9a6bbd521d8a7c949b4eafb7dba5d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:05 +0100 Subject: mm/rmap: basic MM owner tracking for large folios (!hugetlb) For small folios, we traditionally use the mapcount to decide whether it was "certainly mapped exclusively" by a single MM (mapcount == 1) or whether it "maybe mapped shared" by multiple MMs (mapcount > 1). For PMD-sized folios that were PMD-mapped, we were able to use a similar mechanism (single PMD mapping), but for PTE-mapped folios and in the future folios that span multiple PMDs, this does not work. So we need a different mechanism to handle large folios. Let's add a new mechanism to detect whether a large folio is "certainly mapped exclusively", or whether it is "maybe mapped shared". We'll use this information next to optimize CoW reuse for PTE-mapped anonymous THP, and to convert folio_likely_mapped_shared() to folio_maybe_mapped_shared(), independent of per-page mapcounts. For each large folio, we'll have two slots, whereby a slot stores: (1) an MM id: unique id assigned to each MM (2) a per-MM mapcount If a slot is unoccupied, it can be taken by the next MM that maps folio page. In addition, we'll remember the current state -- "mapped exclusively" vs. "maybe mapped shared" -- and use a bit spinlock to sync on updates and to reduce the total number of atomic accesses on updates. In the future, it might be possible to squeeze a proper spinlock into "struct folio". For now, keep it simple, as we require the whole thing with THP only, that is incompatible with RT. As we have to squeeze this information into the "struct folio" of even folios of order-1 (2 pages), and we generally want to reduce the required metadata, we'll assign each MM a unique ID that can fit into an int. In total, we can squeeze everything into 4x int (2x long) on 64bit. 32bit support is a bit challenging, because we only have 2x long == 2x int in order-1 folios. But we can make it work for now, because we neither expect many MMs nor very large folios on 32bit. We will reliably detect folios as "mapped exclusively" vs. 
"mapped shared" as long as only two MMs map pages of a folio at one point in time -- for example with fork() and short-lived child processes, or with apps that hand over state from one instance to another. As soon as three MMs are involved at the same time, we might detect "maybe mapped shared" although the folio is "mapped exclusively". Example 1: (1) App1 faults in a (shmem/file-backed) folio page -> Tracked as MM0 (2) App2 faults in a folio page -> Tracked as MM1 (4) App1 unmaps all folio pages -> We will detect "mapped exclusively". Example 2: (1) App1 faults in a (shmem/file-backed) folio page -> Tracked as MM0 (2) App2 faults in a folio page -> Tracked as MM1 (3) App3 faults in a folio page -> No slot available, tracked as "unknown" (4) App1 and App2 unmap all folio pages -> We will detect "maybe mapped shared". Make use of __always_inline to keep possible performance degradation when (un)mapping large folios to a minimum. Note: by squeezing the two flags into the "unsigned long" that stores the MM ids, we can use non-atomic __bit_spin_unlock() and non-atomic setting/clearing of the "maybe mapped shared" bit, effectively not adding any new atomics on the hot path when updating the large mapcount + new metadata, which further helps reduce the runtime overhead in micro-benchmarks. Link: https://lkml.kernel.org/r/20250303163014.1128035-13-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 49 ++++++++++++++ include/linux/page-flags.h | 4 ++ include/linux/rmap.h | 165 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 218 insertions(+) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9499eb8e8e66..aac7c87b04e1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -292,6 +292,44 @@ typedef struct { #define NR_PAGES_IN_LARGE_FOLIO #endif +/* + * On 32bit, we can cut the required metadata in half, because: + * (a) PID_MAX_LIMIT implicitly limits the number of MMs we could ever have, + * so we can limit MM IDs to 15 bit (32767). + * (b) We don't expect folios where even a single complete PTE mapping by + * one MM would exceed 15 bits (order-15). + */ +#ifdef CONFIG_64BIT +typedef int mm_id_mapcount_t; +#define MM_ID_MAPCOUNT_MAX INT_MAX +typedef unsigned int mm_id_t; +#else /* !CONFIG_64BIT */ +typedef short mm_id_mapcount_t; +#define MM_ID_MAPCOUNT_MAX SHRT_MAX +typedef unsigned short mm_id_t; +#endif /* CONFIG_64BIT */ + +/* We implicitly use the dummy ID for init-mm etc. where we never rmap pages. */ +#define MM_ID_DUMMY 0 +#define MM_ID_MIN (MM_ID_DUMMY + 1) + +/* + * We leave the highest bit of each MM id unused, so we can store a flag + * in the highest bit of each folio->_mm_id[]. + */ +#define MM_ID_BITS ((sizeof(mm_id_t) * BITS_PER_BYTE) - 1) +#define MM_ID_MASK ((1U << MM_ID_BITS) - 1) +#define MM_ID_MAX MM_ID_MASK + +/* + * In order to use bit_spin_lock(), which requires an unsigned long, we + * operate on folio->_mm_ids when working on flags. 
+ */ +#define FOLIO_MM_IDS_LOCK_BITNUM MM_ID_BITS +#define FOLIO_MM_IDS_LOCK_BIT BIT(FOLIO_MM_IDS_LOCK_BITNUM) +#define FOLIO_MM_IDS_SHARED_BITNUM (2 * MM_ID_BITS + 1) +#define FOLIO_MM_IDS_SHARED_BIT BIT(FOLIO_MM_IDS_SHARED_BITNUM) + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. @@ -318,6 +356,9 @@ typedef struct { * @_nr_pages_mapped: Do not use outside of rmap and debug code. * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_nr_pages: Do not use directly, call folio_nr_pages(). + * @_mm_id: Do not use outside of rmap code. + * @_mm_ids: Do not use outside of rmap code. + * @_mm_id_mapcount: Do not use outside of rmap code. * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. @@ -390,6 +431,11 @@ struct folio { atomic_t _entire_mapcount; atomic_t _pincount; #endif /* CONFIG_64BIT */ + mm_id_mapcount_t _mm_id_mapcount[2]; + union { + mm_id_t _mm_id[2]; + unsigned long _mm_ids; + }; /* private: the union with struct page is transitional */ }; unsigned long _usable_1[4]; @@ -1114,6 +1160,9 @@ struct mm_struct { #endif } lru_gen; #endif /* CONFIG_LRU_GEN_WALKS_MMU */ +#ifdef CONFIG_MM_ID + mm_id_t mm_id; +#endif /* CONFIG_MM_ID */ } __randomize_layout; /* diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cab382bd965e..f26ce54c7aa4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1185,6 +1185,10 @@ static inline int folio_has_private(const struct folio *folio) return !!(folio->flags & PAGE_FLAGS_PRIVATE); } +static inline bool folio_test_large_maybe_mapped_shared(const struct folio *folio) +{ + return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); +} #undef PF_ANY #undef PF_HEAD #undef PF_NO_TAIL diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d1e888cc97a5..c131b0efff0f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -13,6 +13,7 @@ #include #include #include +#include /* * The anon_vma heads a list of private "related" vmas, to scan if @@ -173,6 +174,169 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *folio_get_anon_vma(const struct folio *folio); +#ifdef CONFIG_MM_ID +static __always_inline void folio_lock_large_mapcount(struct folio *folio) +{ + bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids); +} + +static __always_inline void folio_unlock_large_mapcount(struct folio *folio) +{ + __bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids); +} + +static inline unsigned int folio_mm_id(const struct folio *folio, int idx) +{ + VM_WARN_ON_ONCE(idx != 0 && idx != 1); + return folio->_mm_id[idx] & MM_ID_MASK; +} + +static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id) +{ + VM_WARN_ON_ONCE(idx != 0 && idx != 1); + folio->_mm_id[idx] &= ~MM_ID_MASK; + folio->_mm_id[idx] |= id; +} + +static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio, + int diff, mm_id_t mm_id) +{ + VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio)); + VM_WARN_ON_ONCE(diff <= 0); + VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX); + + /* + * Make sure we can detect at least one complete PTE mapping of the + * folio in a single MM as "exclusively mapped". This is primarily + * a check on 32bit, where we currently reduce the size of the per-MM + * mapcount to a short. 
+ */ + VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio)); + VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX); + + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY && + folio->_mm_id_mapcount[0] != -1); + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY && + folio->_mm_id_mapcount[0] < 0); + VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY && + folio->_mm_id_mapcount[1] != -1); + VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY && + folio->_mm_id_mapcount[1] < 0); + VM_WARN_ON_ONCE(!folio_mapped(folio) && + folio_test_large_maybe_mapped_shared(folio)); +} + +static __always_inline void folio_set_large_mapcount(struct folio *folio, + int mapcount, struct vm_area_struct *vma) +{ + __folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id); + + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY); + VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY); + + /* Note: mapcounts start at -1. */ + atomic_set(&folio->_large_mapcount, mapcount - 1); + folio->_mm_id_mapcount[0] = mapcount - 1; + folio_set_mm_id(folio, 0, vma->vm_mm->mm_id); +} + +static __always_inline void folio_add_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + const mm_id_t mm_id = vma->vm_mm->mm_id; + int new_mapcount_val; + + folio_lock_large_mapcount(folio); + __folio_large_mapcount_sanity_checks(folio, diff, mm_id); + + new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff; + atomic_set(&folio->_large_mapcount, new_mapcount_val); + + /* + * If a folio is mapped more than once into an MM on 32bit, we + * can in theory overflow the per-MM mapcount (although only for + * fairly large folios), turning it negative. In that case, just + * free up the slot and mark the folio "mapped shared", otherwise + * we might be in trouble when unmapping pages later. + */ + if (folio_mm_id(folio, 0) == mm_id) { + folio->_mm_id_mapcount[0] += diff; + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) { + folio->_mm_id_mapcount[0] = -1; + folio_set_mm_id(folio, 0, MM_ID_DUMMY); + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } + } else if (folio_mm_id(folio, 1) == mm_id) { + folio->_mm_id_mapcount[1] += diff; + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) { + folio->_mm_id_mapcount[1] = -1; + folio_set_mm_id(folio, 1, MM_ID_DUMMY); + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } + } else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) { + folio_set_mm_id(folio, 0, mm_id); + folio->_mm_id_mapcount[0] = diff - 1; + /* We might have other mappings already. */ + if (new_mapcount_val != diff - 1) + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) { + folio_set_mm_id(folio, 1, mm_id); + folio->_mm_id_mapcount[1] = diff - 1; + /* Slot 0 certainly has mappings as well. 
*/ + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } + folio_unlock_large_mapcount(folio); +} + +static __always_inline void folio_sub_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + const mm_id_t mm_id = vma->vm_mm->mm_id; + int new_mapcount_val; + + folio_lock_large_mapcount(folio); + __folio_large_mapcount_sanity_checks(folio, diff, mm_id); + + new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff; + atomic_set(&folio->_large_mapcount, new_mapcount_val); + + /* + * There are valid corner cases where we might underflow a per-MM + * mapcount (some mappings added when no slot was free, some mappings + * added once a slot was free), so we always set it to -1 once we go + * negative. + */ + if (folio_mm_id(folio, 0) == mm_id) { + folio->_mm_id_mapcount[0] -= diff; + if (folio->_mm_id_mapcount[0] >= 0) + goto out; + folio->_mm_id_mapcount[0] = -1; + folio_set_mm_id(folio, 0, MM_ID_DUMMY); + } else if (folio_mm_id(folio, 1) == mm_id) { + folio->_mm_id_mapcount[1] -= diff; + if (folio->_mm_id_mapcount[1] >= 0) + goto out; + folio->_mm_id_mapcount[1] = -1; + folio_set_mm_id(folio, 1, MM_ID_DUMMY); + } + + /* + * If one MM slot owns all mappings, the folio is mapped exclusively. + * Note that if the folio is now unmapped (new_mapcount_val == -1), both + * slots must be free (mapcount == -1), and we'll also mark it as + * exclusive. + */ + if (folio->_mm_id_mapcount[0] == new_mapcount_val || + folio->_mm_id_mapcount[1] == new_mapcount_val) + folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT; +out: + folio_unlock_large_mapcount(folio); +} +#else /* !CONFIG_MM_ID */ +/* + * See __folio_rmap_sanity_checks(), we might map large folios even without + * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now. + */ static inline void folio_set_large_mapcount(struct folio *folio, int mapcount, struct vm_area_struct *vma) { @@ -191,6 +355,7 @@ static inline void folio_sub_large_mapcount(struct folio *folio, { atomic_sub(diff, &folio->_large_mapcount); } +#endif /* CONFIG_MM_ID */ #define folio_inc_large_mapcount(folio, vma) \ folio_add_large_mapcount(folio, 1, vma) -- cgit v1.2.3 From 003fde4492c88ac3a1fee3d97b3834a679780af3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:07 +0100 Subject: mm: convert folio_likely_mapped_shared() to folio_maybe_mapped_shared() Let's reuse our new MM ownership tracking infrastructure for large folios to make folio_likely_mapped_shared() never return false negatives -- never indicating "not mapped shared" although the folio *is* mapped shared. With that, we can rename it to folio_maybe_mapped_shared() and get rid of the dependency on the mapcount of the first folio page. The semantics are now arguably clearer: no mixture of "false negatives" and "false positives", only the remaining possibility for "false positives". Thoroughly document the new semantics. We might now detect that a large folio is "maybe mapped shared" although it *no longer* is -- but once was. Now, if more than two MMs mapped a folio at the same time, and the MM mapping the folio exclusively at the end is not one tracked in the two folio MM slots, we will detect the folio as "maybe mapped shared". For anonymous folios, usually (except weird corner cases) all PTEs that target a "maybe mapped shared" folio are R/O. As soon as a child process would write to them (iow, actively use them), we would CoW and effectively replace these PTEs. Most cases (below) are not expected to really matter with large anonymous folios for this reason. 
Most importantly, there will be no change at all for: * small folios * hugetlb folios * PMD-mapped PMD-sized THPs (single mapping) This change has the potential to affect existing callers of folio_likely_mapped_shared() -> folio_maybe_mapped_shared(): (1) fs/proc/task_mmu.c: no change (hugetlb) (2) khugepaged counts PTEs that target shared folios towards max_ptes_shared (default: HPAGE_PMD_NR / 2), meaning we could skip a collapse where we would have previously collapsed. This only applies to anonymous folios and is not expected to matter in practice. Worth noting that this change sorts out case (A) documented in commit 1bafe96e89f0 ("mm/khugepaged: replace page_mapcount() check by folio_likely_mapped_shared()") by removing the possibility for "false negatives". (3) MADV_COLD / MADV_PAGEOUT / MADV_FREE will not try splitting PTE-mapped THPs that are considered shared but not fully covered by the requested range, consequently not processing them. PMD-mapped PMD-sized THP are not affected, or when all PTEs are covered. These functions are usually only called on anon/file folios that are exclusively mapped most of the time (no other file mappings or no fork()), so the "false negatives" are not expected to matter in practice. (4) mbind() / migrate_pages() / move_pages() will refuse to migrate shared folios unless MPOL_MF_MOVE_ALL is effective (requires CAP_SYS_NICE). We will now reject some folios that could be migrated. Similar to (3), especially with MPOL_MF_MOVE_ALL, so this is not expected to matter in practice. Note that cpuset_migrate_mm_workfn() calls do_migrate_pages() with MPOL_MF_MOVE_ALL. (5) NUMA hinting mm/migrate.c:migrate_misplaced_folio_prepare() will skip file folios that are probably shared libraries (-> "mapped shared" and executable). This check would have detected it as a shared library at some point (at least 3 MMs mapping it), so detecting it afterwards does not sound wrong (still a shared library). Not expected to matter. mm/memory.c:numa_migrate_check() will indicate TNF_SHARED in MAP_SHARED file mappings when encountering a shared folio. Similar reasoning, not expected to matter. mm/mprotect.c:change_pte_range() will skip folios detected as shared in CoW mappings. Similarly, this is not expected to matter in practice, but if it would ever be a problem we could relax that check a bit (e.g., basing it on the average page-mapcount in a folio), because it was only an optimization when many (e.g., 288) processes were mapping the same folios -- see commit 859d4adc3415 ("mm: numa: do not trap faults on shared data section pages.") (6) mm/rmap.c:folio_referenced_one() will skip exclusive swapbacked folios in dying processes. Applies to anonymous folios only. Without "false negatives", we'll now skip all actually shared ones. Skipping ones that are actually exclusive won't really matter, it's a pure optimization, and is not expected to matter in practice. In theory, one can detect the problematic scenario: folio_mapcount() > 0 and no folio MM slot is occupied ("state unknown"). One could reset the MM slots while doing an rmap walk, which migration / folio split already do when setting everything up. Further, when batching PTEs we might naturally learn about a owner (e.g., folio_mapcount() == nr_ptes) and could update the owner. However, we'll defer that until the scenarios where it would really matter are clear. 
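As a hedged illustration of how callers consume the new semantics (the helper and policy below are hypothetical, not part of the patch), a migration-style check might read:

/*
 * Hypothetical caller sketch: folio_maybe_mapped_shared() can return a
 * false positive (the folio was mapped by more than two MMs at some
 * point), but never a false negative, so skipping here errs on the
 * safe side.
 */
static bool example_can_migrate_folio(struct folio *folio, bool move_all)
{
	if (!move_all && folio_maybe_mapped_shared(folio))
		return false;	/* possibly mapped by more than one MM */
	return true;
}
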
Link: https://lkml.kernel.org/r/20250303163014.1128035-15-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f366c180f2b6..82776b409391 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2251,23 +2251,18 @@ static inline size_t folio_size(const struct folio *folio) } /** - * folio_likely_mapped_shared - Estimate if the folio is mapped into the page - * tables of more than one MM + * folio_maybe_mapped_shared - Whether the folio is mapped into the page + * tables of more than one MM * @folio: The folio. * - * This function checks if the folio is currently mapped into more than one - * MM ("mapped shared"), or if the folio is only mapped into a single MM - * ("mapped exclusively"). + * This function checks if the folio maybe currently mapped into more than one + * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single + * MM ("mapped exclusively"). * * For KSM folios, this function also returns "mapped shared" when a folio is * mapped multiple times into the same MM, because the individual page mappings * are independent. * - * As precise information is not easily available for all folios, this function - * estimates the number of MMs ("sharers") that are currently mapping a folio - * using the number of times the first page of the folio is currently mapped - * into page tables. - * * For small anonymous folios and anonymous hugetlb folios, the return * value will be exactly correct: non-KSM folios can only be mapped at most once * into an MM, and they cannot be partially mapped. KSM folios are @@ -2275,8 +2270,8 @@ static inline size_t folio_size(const struct folio *folio) * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly - * indicate "mapped exclusively" (false negative) when the folio is - * only partially mapped into at least one MM. + * indicate "mapped shared" (false positive) if a folio was mapped by + * more than two MMs at one point in time. * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. @@ -2293,7 +2288,7 @@ static inline size_t folio_size(const struct folio *folio) * * Return: Whether the folio is estimated to be mapped into more than one MM. */ -static inline bool folio_likely_mapped_shared(struct folio *folio) +static inline bool folio_maybe_mapped_shared(struct folio *folio) { int mapcount = folio_mapcount(folio); @@ -2301,16 +2296,22 @@ static inline bool folio_likely_mapped_shared(struct folio *folio) if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) return mapcount > 1; - /* A single mapping implies "mapped exclusively". */ - if (mapcount <= 1) - return false; - - /* If any page is mapped more than once we treat it "mapped shared". 
*/ - if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio)) + /* + * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ... + * simply assume "mapped shared", nobody should really care + * about this for arbitrary kernel allocations. + */ + if (!IS_ENABLED(CONFIG_MM_ID)) return true; - /* Let's guess based on the first subpage. */ - return atomic_read(&folio->_mapcount) > 0; + /* + * A single mapping implies "mapped exclusively", even if the + * folio flag says something different: it's easier to handle this + * case here instead of on the RMAP hot path. + */ + if (mapcount <= 1) + return false; + return folio_test_large_maybe_mapped_shared(folio); } #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE -- cgit v1.2.3 From 749492229e3bd6222dda7267b8244135229d1fd8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:13 +0100 Subject: mm: stop maintaining the per-page mapcount of large folios (CONFIG_NO_PAGE_MAPCOUNT) Everything is in place to stop using the per-page mapcounts in large folios: the mapcount of tail pages will always be logically 0 (-1 value), just like it currently is for hugetlb folios already, and the page mapcount of the head page is either 0 (-1 value) or contains a page type (e.g., hugetlb). Maintaining _nr_pages_mapped without per-page mapcounts is impossible, so that one also has to go with CONFIG_NO_PAGE_MAPCOUNT. There are two remaining implications: (1) Per-node, per-cgroup and per-lruvec stats of "NR_ANON_MAPPED" ("mapped anonymous memory") and "NR_FILE_MAPPED" ("mapped file memory"): As soon as any page of the folio is mapped -- folio_mapped() -- we now account the complete folio as mapped. Once the last page is unmapped -- !folio_mapped() -- we account the complete folio as unmapped. This implies that ... * "AnonPages" and "Mapped" in /proc/meminfo and /sys/devices/system/node/*/meminfo * cgroup v2: "anon" and "file_mapped" in "memory.stat" and "memory.numa_stat" * cgroup v1: "rss" and "mapped_file" in "memory.stat" and "memory.numa_stat" ... can now appear higher than before. But note that these folios do consume that memory, simply not all pages are actually currently mapped. It's worth noting that other accounting in the kernel (esp. cgroup charging on allocation) is not affected by this change. [why oh why is "anon" called "rss" in cgroup v1] (2) Detecting partial mappings Detecting whether anon THPs are partially mapped gets a bit more unreliable. As long as a single MM maps such a large folio ("exclusively mapped"), we can reliably detect it. Especially before fork() / after a short-lived child process quit, we will detect partial mappings reliably, which is the common case. In essence, if the average per-page mapcount in an anon THP is < 1, we know for sure that we have a partial mapping. However, as soon as multiple MMs are involved, we might miss detecting partial mappings: this might be relevant with long-lived child processes. If we have a fully-mapped anon folio before fork(), once our child processes and our parent all unmap (zap/COW) the same pages (but not the complete folio), we might not detect the partial mapping. However, once the child processes quit we would detect the partial mapping. How relevant this case is in practice remains to be seen. Swapout/migration will likely mitigate this. In the future, RMAP walkers could check for that case (e.g., when collecting access bits during reclaim) and simply flag them for deferred-splitting. 
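The partial-mapping heuristic described in (2) can be written out as a small sketch (illustrative only; the helper name is made up and not part of the patch):

/*
 * Sketch, not from the patch: when the total mapcount cannot cover
 * every page (average per-page mapcount < 1), the anon folio is
 * certainly partially mapped. The converse does not hold once
 * multiple MMs are involved.
 */
static bool example_certainly_partially_mapped(struct folio *folio)
{
	return folio_test_anon(folio) && folio_mapped(folio) &&
	       folio_mapcount(folio) < folio_nr_pages(folio);
}
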
Link: https://lkml.kernel.org/r/20250303163014.1128035-21-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/rmap.h | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c131b0efff0f..6b82b618846e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -240,7 +240,7 @@ static __always_inline void folio_set_large_mapcount(struct folio *folio, folio_set_mm_id(folio, 0, vma->vm_mm->mm_id); } -static __always_inline void folio_add_large_mapcount(struct folio *folio, +static __always_inline int folio_add_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { const mm_id_t mm_id = vma->vm_mm->mm_id; @@ -286,9 +286,11 @@ static __always_inline void folio_add_large_mapcount(struct folio *folio, folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; } folio_unlock_large_mapcount(folio); + return new_mapcount_val + 1; } +#define folio_add_large_mapcount folio_add_return_large_mapcount -static __always_inline void folio_sub_large_mapcount(struct folio *folio, +static __always_inline int folio_sub_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { const mm_id_t mm_id = vma->vm_mm->mm_id; @@ -331,7 +333,9 @@ static __always_inline void folio_sub_large_mapcount(struct folio *folio, folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT; out: folio_unlock_large_mapcount(folio); + return new_mapcount_val + 1; } +#define folio_sub_large_mapcount folio_sub_return_large_mapcount #else /* !CONFIG_MM_ID */ /* * See __folio_rmap_sanity_checks(), we might map large folios even without @@ -350,17 +354,33 @@ static inline void folio_add_large_mapcount(struct folio *folio, atomic_add(diff, &folio->_large_mapcount); } +static inline int folio_add_return_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + BUILD_BUG(); +} + static inline void folio_sub_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { atomic_sub(diff, &folio->_large_mapcount); } + +static inline int folio_sub_return_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + BUILD_BUG(); +} #endif /* CONFIG_MM_ID */ #define folio_inc_large_mapcount(folio, vma) \ folio_add_large_mapcount(folio, 1, vma) +#define folio_inc_return_large_mapcount(folio, vma) \ + folio_add_return_large_mapcount(folio, 1, vma) #define folio_dec_large_mapcount(folio, vma) \ folio_sub_large_mapcount(folio, 1, vma) +#define folio_dec_return_large_mapcount(folio, vma) \ + folio_sub_return_large_mapcount(folio, 1, vma) /* RMAP flags, currently only relevant for some anon rmap operations. 
*/ typedef int __bitwise rmap_t; @@ -538,9 +558,11 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, break; } - do { - atomic_inc(&page->_mapcount); - } while (page++, --nr_pages > 0); + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { + do { + atomic_inc(&page->_mapcount); + } while (page++, --nr_pages > 0); + } folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case RMAP_LEVEL_PMD: @@ -638,7 +660,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, do { if (PageAnonExclusive(page)) ClearPageAnonExclusive(page); - atomic_inc(&page->_mapcount); + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; -- cgit v1.2.3 From f7f0d88b7d6da29d2b073740aa130685f0e18609 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 5 Mar 2025 14:27:29 -0800 Subject: mm/damon/core: expose damos_filter_for_ops() to DAMON kernel API callers damos_filter_for_ops() can be useful to avoid putting wrong type of filters in wrong place. Make it be exposed to DAMON kernel API callers. Link: https://lkml.kernel.org/r/20250305222733.59089-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 52559475dbe7..eed008b64a23 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -894,6 +894,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damos_filter *damos_new_filter(enum damos_filter_type type, bool matching, bool allow); void damos_add_filter(struct damos *s, struct damos_filter *f); +bool damos_filter_for_ops(enum damos_filter_type type); void damos_destroy_filter(struct damos_filter *f); struct damos_quota_goal *damos_new_quota_goal( -- cgit v1.2.3 From ff22f9299d7b2c7874b560993c21543708b7e1b6 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 6 Mar 2025 12:50:10 -0800 Subject: page_io: zswap: do not crash the kernel on decompression failure Currently, we crash the kernel when a decompression failure occurs in zswap (either because of memory corruption, or a bug in the compression algorithm). This is overkill. We should only SIGBUS the unfortunate process asking for the zswap entry on zswap load, and skip the corrupted entry in zswap writeback. See [1] for a recent upstream discussion about this. The zswap writeback case is relatively straightforward to fix. For the zswap_load() case, we change the return behavior: * Return 0 on success. * Return -ENOENT (with the folio locked) if zswap does not own the swapped out content. * Return -EIO if zswap owns the swapped out content, but encounters a decompression failure for some reasons. The folio will be unlocked, but not be marked up-to-date, which will eventually cause the process requesting the page to SIGBUS (see the handling of not-up-to-date folio in do_swap_page() in mm/memory.c), without crashing the kernel. * Return -EINVAL if we encounter a large folio, as large folio should not be swapped in while zswap is being used. Similar to the -EIO case, we also unlock the folio but do not mark it as up-to-date to SIGBUS the faulting process. As a side effect, we require one extra zswap tree traversal in the load and writeback paths. 
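To keep the new contract in one place, here is a sketch of how a caller might classify the zswap_load() return values (illustrative only; this is not the actual page_io.c code):

/*
 * Sketch only: maps the documented zswap_load() return values to the
 * outcomes described in the changelog above.
 */
static const char *example_zswap_load_outcome(struct folio *folio)
{
	int ret = zswap_load(folio);

	if (!ret)
		return "decompressed from zswap";		/* success */
	if (ret == -ENOENT)
		return "not owned by zswap, read from swap";	/* folio still locked */
	/* -EIO or -EINVAL: folio left !uptodate, faulting task gets SIGBUS */
	return "zswap entry present but unusable";
}
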
Quick benchmarking on a kernel build test shows no performance difference: With the new scheme: real: mean: 125.1s, stdev: 0.12s user: mean: 3265.23s, stdev: 9.62s sys: mean: 2156.41s, stdev: 13.98s The old scheme: real: mean: 125.78s, stdev: 0.45s user: mean: 3287.18s, stdev: 5.95s sys: mean: 2177.08s, stdev: 26.52s [nphamcs@gmail.com: fix documentation of zswap_load()] Link: https://lkml.kernel.org/r/20250306222453.1269456-1-nphamcs@gmail.com Link: https://lore.kernel.org/all/ZsiLElTykamcYZ6J@casper.infradead.org/ [1] Link: https://lkml.kernel.org/r/20250306205011.784787-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Matthew Wilcox Suggested-by: Yosry Ahmed Suggested-by: Johannes Weiner Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/zswap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index d961ead91bf1..30c193a1207e 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -26,7 +26,7 @@ struct zswap_lruvec_state { unsigned long zswap_total_pages(void); bool zswap_store(struct folio *folio); -bool zswap_load(struct folio *folio); +int zswap_load(struct folio *folio); void zswap_invalidate(swp_entry_t swp); int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); @@ -44,9 +44,9 @@ static inline bool zswap_store(struct folio *folio) return false; } -static inline bool zswap_load(struct folio *folio) +static inline int zswap_load(struct folio *folio) { - return false; + return -ENOENT; } static inline void zswap_invalidate(swp_entry_t swp) {} -- cgit v1.2.3 From 8268af309d07d1c6279080b4e6fd16ec75cc977c Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:59 +0200 Subject: arch, mm: set max_mapnr when allocating memory map for FLATMEM max_mapnr is essentially the size of the memory map for systems that use FLATMEM. There is no reason to calculate it in each and every architecture when it's anyway calculated in alloc_node_mem_map(). Drop setting of max_mapnr from architecture code and set it once in alloc_node_mem_map(). While on it, move definition of mem_map and max_mapnr to mm/mm_init.c so there won't be two copies for MMU and !MMU variants. Link: https://lkml.kernel.org/r/20250313135003.836600-10-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/memory_model.h | 5 +++-- include/linux/mm.h | 11 ----------- 2 files changed, 3 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 6d1fb6162ac1..a3b5029aebbd 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -19,11 +19,12 @@ #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ARCH_PFN_OFFSET) +/* avoid include hell */ +extern unsigned long max_mapnr; + #ifndef pfn_valid static inline int pfn_valid(unsigned long pfn) { - /* avoid include hell */ - extern unsigned long max_mapnr; unsigned long pfn_offset = ARCH_PFN_OFFSET; return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr; diff --git a/include/linux/mm.h b/include/linux/mm.h index 82776b409391..8c4cb8c28507 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -46,17 +46,6 @@ extern int sysctl_page_lock_unfairness; void mm_core_init(void); void init_mm_internals(void); -#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ -extern unsigned long max_mapnr; - -static inline void set_max_mapnr(unsigned long limit) -{ - max_mapnr = limit; -} -#else -static inline void set_max_mapnr(unsigned long limit) { } -#endif - extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { -- cgit v1.2.3 From 6faea3422e3b4e8de44a55aa3e6e843320da66d2 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:01 +0200 Subject: arch, mm: streamline HIGHMEM freeing All architectures that support HIGHMEM have their code that frees high memory pages to the buddy allocator while __free_memory_core() is limited to freeing only low memory. There is no actual reason for that. The memory map is completely ready by the time memblock_free_all() is called and high pages can be released to the buddy allocator along with low memory. Remove low memory limit from __free_memory_core() and drop per-architecture code that frees high memory pages. Link: https://lkml.kernel.org/r/20250313135003.836600-12-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8c4cb8c28507..6c519a5098d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3275,7 +3275,6 @@ extern void reserve_bootmem_region(phys_addr_t start, /* Free the reserved page into the buddy system, so it gets managed. */ void free_reserved_page(struct page *page); -#define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) { -- cgit v1.2.3 From 0d98484ee3330c35d1dc0da0be0871b3596cbe0d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:02 +0200 Subject: arch, mm: introduce arch_mm_preinit Currently, implementation of mem_init() in every architecture consists of one or more of the following: * initializations that must run before page allocator is active, for instance swiotlb_init() * a call to memblock_free_all() to release all the memory to the buddy allocator * initializations that must run after page allocator is ready and there is no arch-specific hook other than mem_init() for that, like for example register_page_bootmem_info() in x86 and sparc64 or simple setting of mem_init_done = 1 in several architectures * a bunch of semi-related stuff that apparently had no better place to live, for example a ton of BUILD_BUG_ON()s in parisc. Introduce arch_mm_preinit() that will be the first thing called from mm_core_init(). On architectures that have initializations that must happen before the page allocator is ready, move those into arch_mm_preinit() along with the code that does not depend on ordering with page allocator setup. On several architectures this results in reduction of mem_init() to a single call to memblock_free_all() that allows its consolidation next. Link: https://lkml.kernel.org/r/20250313135003.836600-13-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6c519a5098d4..c417e5634a58 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -43,6 +43,7 @@ struct folio_batch; extern int sysctl_page_lock_unfairness; +void arch_mm_preinit(void); void mm_core_init(void); void init_mm_internals(void); -- cgit v1.2.3 From 8afa901c147a41f92e83943cddf154bbb7995ee6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:03 +0200 Subject: arch, mm: make releasing of memory to page allocator more explicit The point where the memory is released from memblock to the buddy allocator is hidden inside arch-specific mem_init()s and the call to memblock_free_all() is needlessly duplicated in every artiste cure and after introduction of arch_mm_preinit() hook, mem_init() implementation on many architecture only contains the call to memblock_free_all(). Pull memblock_free_all() call into mm_core_init() and drop mem_init() on relevant architectures to make it more explicit where the free memory is released from memblock to the buddy allocator and to reduce code duplication in architecture specific code. Link: https://lkml.kernel.org/r/20250313135003.836600-14-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Acked-by: Geert Uytterhoeven [m68k] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memblock.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e79eb6ac516f..ef5a1ecc6e59 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -133,7 +133,6 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size); -void memblock_free_all(void); void memblock_free(void *ptr, size_t size); void reset_all_zones_managed_pages(void); -- cgit v1.2.3 From 53058c762afff714ceaec14b87cf8fdce3b6d33e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:04 -0800 Subject: mm/damon: remove damon_callback->private The field was added to let users keep their personal data to use inside of the callbacks. However, no one is actively using that now. Remove it. 
Link: https://lkml.kernel.org/r/20250306175908.66300-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index eed008b64a23..dab4bb0fe39d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -609,12 +609,10 @@ struct damon_operations { * @after_aggregation: Called after each aggregation. * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. - * @private: User private data. * * The monitoring thread (&damon_ctx.kdamond) calls @before_start and * @before_terminate just before starting and finishing the monitoring, - * respectively. Therefore, those are good places for installing and cleaning - * @private. + * respectively. * * The monitoring thread calls @after_wmarks_check after each DAMON-based * operation schemes' watermarks check. If users need to make changes to the @@ -630,8 +628,6 @@ struct damon_operations { * If any callback returns non-zero, monitoring stops. */ struct damon_callback { - void *private; - int (*before_start)(struct damon_ctx *context); int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); -- cgit v1.2.3 From 07da21855b270c17b2a2d20e644c1419fcaafdd1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:05 -0800 Subject: mm/damon: remove ->before_start of damon_callback The function pointer field was added to be used as a place to do some initialization works just before DAMON starts working. However, nobody is using it now. Remove it. Link: https://lkml.kernel.org/r/20250306175908.66300-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index dab4bb0fe39d..043de2408c65 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -603,16 +603,14 @@ struct damon_operations { /** * struct damon_callback - Monitoring events notification callbacks. * - * @before_start: Called before starting the monitoring. * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. * - * The monitoring thread (&damon_ctx.kdamond) calls @before_start and - * @before_terminate just before starting and finishing the monitoring, - * respectively. + * The monitoring thread (&damon_ctx.kdamond) calls @before_terminate just + * before finishing the monitoring. * * The monitoring thread calls @after_wmarks_check after each DAMON-based * operation schemes' watermarks check. If users need to make changes to the @@ -628,7 +626,6 @@ struct damon_operations { * If any callback returns non-zero, monitoring stops. 
*/ struct damon_callback { - int (*before_start)(struct damon_ctx *context); int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); -- cgit v1.2.3 From cedee98f68875605dad644a95a63eae04de250b2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:06 -0800 Subject: mm/damon: remove damon_callback->after_sampling The callback was used by DAMON sysfs interface for reading DAMON internal data. But it is no more being used, and damon_call() can do similar works in a better way. Remove it. Link: https://lkml.kernel.org/r/20250306175908.66300-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 043de2408c65..5aa277f4c948 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -604,7 +604,6 @@ struct damon_operations { * struct damon_callback - Monitoring events notification callbacks. * * @after_wmarks_check: Called after each schemes' watermarks check. - * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. @@ -617,17 +616,15 @@ struct damon_operations { * attributes of the monitoring context while it's deactivated due to the * watermarks, this is the good place to do. * - * The monitoring thread calls @after_sampling and @after_aggregation for each - * of the sampling intervals and aggregation intervals, respectively. - * Therefore, users can safely access the monitoring results without additional - * protection. For the reason, users are recommended to use these callback for - * the accesses to the results. + * The monitoring thread calls @after_aggregation for each of the aggregation + * intervals. Therefore, users can safely access the monitoring results + * without additional protection. For the reason, users are recommended to use + * these callback for the accesses to the results. * * If any callback returns non-zero, monitoring stops. */ struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); - int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); int (*before_damos_apply)(struct damon_ctx *context, struct damon_target *target, -- cgit v1.2.3 From 99ce7c9c6d855716356f2f839c53905592fd780b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:07 -0800 Subject: mm/damon: remove damon_callback->before_damos_apply The hook was introduced to let DAMON kernel API users access DAMOS schemes-eligible regions in a safe way. Now it is no more used by anyone, and the functionality is provided in a better way by damos_walk(). Remove it. Link: https://lkml.kernel.org/r/20250306175908.66300-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5aa277f4c948..be7b281fb922 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -605,7 +605,6 @@ struct damon_operations { * * @after_wmarks_check: Called after each schemes' watermarks check. * @after_aggregation: Called after each aggregation. 
- * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. * * The monitoring thread (&damon_ctx.kdamond) calls @before_terminate just @@ -626,10 +625,6 @@ struct damon_operations { struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); - int (*before_damos_apply)(struct damon_ctx *context, - struct damon_target *target, - struct damon_region *region, - struct damos *scheme); void (*before_terminate)(struct damon_ctx *context); }; -- cgit v1.2.3 From 105f830fa35c49ada7db785a7f9b70386f193529 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:08 -0800 Subject: mm/damon: remove damon_operations->reset_aggregated The operations layer hook was introduced to let operations set do any aggregation data reset if needed. But it is not really be used now. Remove it. Link: https://lkml.kernel.org/r/20250306175908.66300-14-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index be7b281fb922..3db4f77261f5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -542,7 +542,6 @@ enum damon_ops_id { * @update: Update operations-related data structures. * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. - * @reset_aggregated: Reset aggregated accesses monitoring results. * @get_scheme_score: Get the score of a region for a scheme. * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. @@ -554,8 +553,7 @@ enum damon_ops_id { * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting * the monitoring, @update after each &damon_attrs.ops_update_interval, and * @check_accesses, @target_valid and @prepare_access_checks after each - * &damon_attrs.sample_interval. Finally, @reset_aggregated is called after - * each &damon_attrs.aggr_interval. + * &damon_attrs.sample_interval. * * Each &struct damon_operations instance having valid @id can be registered * via damon_register_ops() and selected by damon_select_ops() later. @@ -570,8 +568,6 @@ enum damon_ops_id { * last preparation and update the number of observed accesses of each region. * It should also return max number of observed accesses that made as a result * of its update. The value will be used for regions adjustment threshold. - * @reset_aggregated should reset the access monitoring results that aggregated - * by @check_accesses. * @get_scheme_score should return the priority score of a region for a scheme * as an integer in [0, &DAMOS_MAX_SCORE]. 
* @apply_scheme is called from @kdamond when a region for user provided @@ -589,7 +585,6 @@ struct damon_operations { void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); - void (*reset_aggregated)(struct damon_ctx *context); int (*get_scheme_score)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); -- cgit v1.2.3 From 9039b9096ea27a20f0349d1537537663c935c8ed Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Thu, 6 Mar 2025 17:44:50 -0500 Subject: mm: page_ext: add an iteration API for page extensions Patch series "mm: page_ext: Introduce new iteration API", v3. Introduction ============ [ Thanks to David Hildenbrand for identifying the root cause of this issue and proving guidance on how to fix it. The new API idea, bugs and misconceptions are all mine though ] Currently, trying to reserve 1G pages with page_owner=on and sparsemem causes a crash. The reproducer is very simple: 1. Build the kernel with CONFIG_SPARSEMEM=y and the table extensions 2. Pass 'default_hugepagesz=1 page_owner=on' in the kernel command-line 3. Reserve one 1G page at run-time, this should crash (see patch 1 for backtrace) [ A crash with page_table_check is also possible, but harder to trigger ] Apparently, starting with commit cf54f310d0d3 ("mm/hugetlb: use __GFP_COMP for gigantic folios") we now pass the full allocation order to page extension clients and the page extension implementation assumes that all PFNs of an allocation range will be stored in the same memory section (which is not true for 1G pages). To fix this, this series introduces a new iteration API for page extension objects. The API checks if the next page extension object can be retrieved from the current section or if it needs to look up for it in another section. Please, find all details in patch 1. I tested this series on arm64 and x86 by reserving 1G pages at run-time and doing kernel builds (always with page_owner=on and page_table_check=on). This patch (of 3): The page extension implementation assumes that all page extensions of a given page order are stored in the same memory section. The function page_ext_next() relies on this assumption by adding an offset to the current object to return the next adjacent page extension. This behavior works as expected for flatmem but fails for sparsemem when using 1G pages. The commit cf54f310d0d3 ("mm/hugetlb: use __GFP_COMP for gigantic folios") exposes this issue, making it possible for a crash when using page_owner or page_table_check page extensions. The problem is that for 1G pages, the page extensions may span memory section boundaries and be stored in different memory sections. This issue was not visible before commit cf54f310d0d3 ("mm/hugetlb: use __GFP_COMP for gigantic folios") because alloc_contig_pages() never passed more than MAX_PAGE_ORDER to post_alloc_hook(). However, the series introducing mentioned commit changed this behavior allowing the full 1G page order to be passed. Reproducer: 1. Build the kernel with CONFIG_SPARSEMEM=y and table extensions support 2. Pass 'default_hugepagesz=1 page_owner=on' in the kernel command-line 3. Reserve one 1G page at run-time, this should crash (backtrace below) To address this issue, this commit introduces a new API for iterating through page extensions. The main iteration macro is for_each_page_ext() and it must be called with the RCU read lock taken. 
Here's an usage example: """ struct page_ext_iter iter; struct page_ext *page_ext; ... rcu_read_lock(); for_each_page_ext(page, 1 << order, page_ext, iter) { struct my_page_ext *obj = get_my_page_ext_obj(page_ext); ... } rcu_read_unlock(); """ The loop construct uses page_ext_iter_next() which checks to see if we have crossed sections in the iteration. In this case, page_ext_iter_next() retrieves the next page_ext object from another section. Thanks to David Hildenbrand for helping identify the root cause and providing suggestions on how to fix and optmize the solution (final implementation and bugs are all mine through). Lastly, here's the backtrace, without kasan you can get random crashes: [ 76.052526] BUG: KASAN: slab-out-of-bounds in __update_page_owner_handle+0x238/0x298 [ 76.060283] Write of size 4 at addr ffff07ff96240038 by task tee/3598 [ 76.066714] [ 76.068203] CPU: 88 UID: 0 PID: 3598 Comm: tee Kdump: loaded Not tainted 6.13.0-rep1 #3 [ 76.076202] Hardware name: WIWYNN Mt.Jade Server System B81.030Z1.0007/Mt.Jade Motherboard, BIOS 2.10.20220810 (SCP: 2.10.20220810) 2022/08/10 [ 76.088972] Call trace: [ 76.091411] show_stack+0x20/0x38 (C) [ 76.095073] dump_stack_lvl+0x80/0xf8 [ 76.098733] print_address_description.constprop.0+0x88/0x398 [ 76.104476] print_report+0xa8/0x278 [ 76.108041] kasan_report+0xa8/0xf8 [ 76.111520] __asan_report_store4_noabort+0x20/0x30 [ 76.116391] __update_page_owner_handle+0x238/0x298 [ 76.121259] __set_page_owner+0xdc/0x140 [ 76.125173] post_alloc_hook+0x190/0x1d8 [ 76.129090] alloc_contig_range_noprof+0x54c/0x890 [ 76.133874] alloc_contig_pages_noprof+0x35c/0x4a8 [ 76.138656] alloc_gigantic_folio.isra.0+0x2c0/0x368 [ 76.143616] only_alloc_fresh_hugetlb_folio.isra.0+0x24/0x150 [ 76.149353] alloc_pool_huge_folio+0x11c/0x1f8 [ 76.153787] set_max_huge_pages+0x364/0xca8 [ 76.157961] __nr_hugepages_store_common+0xb0/0x1a0 [ 76.162829] nr_hugepages_store+0x108/0x118 [ 76.167003] kobj_attr_store+0x3c/0x70 [ 76.170745] sysfs_kf_write+0xfc/0x188 [ 76.174492] kernfs_fop_write_iter+0x274/0x3e0 [ 76.178927] vfs_write+0x64c/0x8e0 [ 76.182323] ksys_write+0xf8/0x1f0 [ 76.185716] __arm64_sys_write+0x74/0xb0 [ 76.189630] invoke_syscall.constprop.0+0xd8/0x1e0 [ 76.194412] do_el0_svc+0x164/0x1e0 [ 76.197891] el0_svc+0x40/0xe0 [ 76.200939] el0t_64_sync_handler+0x144/0x168 [ 76.205287] el0t_64_sync+0x1ac/0x1b0 Link: https://lkml.kernel.org/r/cover.1741301089.git.luizcap@redhat.com Link: https://lkml.kernel.org/r/a45893880b7e1601082d39d2c5c8b50bcc096305.1741301089.git.luizcap@redhat.com Fixes: cf54f310d0d3 ("mm/hugetlb: use __GFP_COMP for gigantic folios") Signed-off-by: Luiz Capitulino Acked-by: David Hildenbrand Cc: Johannes Weiner Cc: Luiz Capitulino Cc: Muchun Song Cc: Pasha Tatashin Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'include') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index e4b48a0dda24..76c817162d2f 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -3,6 +3,7 @@ #define __LINUX_PAGE_EXT_H #include +#include #include struct pglist_data; @@ -69,16 +70,31 @@ extern void page_ext_init(void); static inline void page_ext_init_flatmem_late(void) { } + +static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) +{ + /* + * page_ext is allocated per memory section. Once we cross a + * memory section, we have to fetch the new pointer. 
+ */ + return next_pfn % PAGES_PER_SECTION; +} #else extern void page_ext_init_flatmem(void); extern void page_ext_init_flatmem_late(void); static inline void page_ext_init(void) { } + +static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) +{ + return true; +} #endif extern struct page_ext *page_ext_get(const struct page *page); extern void page_ext_put(struct page_ext *page_ext); +extern struct page_ext *page_ext_lookup(unsigned long pfn); static inline void *page_ext_data(struct page_ext *page_ext, struct page_ext_operations *ops) @@ -93,6 +109,83 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr) return next; } +struct page_ext_iter { + unsigned long index; + unsigned long start_pfn; + struct page_ext *page_ext; +}; + +/** + * page_ext_iter_begin() - Prepare for iterating through page extensions. + * @iter: page extension iterator. + * @pfn: PFN of the page we're interested in. + * + * Must be called with RCU read lock taken. + * + * Return: NULL if no page_ext exists for this page. + */ +static inline struct page_ext *page_ext_iter_begin(struct page_ext_iter *iter, + unsigned long pfn) +{ + iter->index = 0; + iter->start_pfn = pfn; + iter->page_ext = page_ext_lookup(pfn); + + return iter->page_ext; +} + +/** + * page_ext_iter_next() - Get next page extension + * @iter: page extension iterator. + * + * Must be called with RCU read lock taken. + * + * Return: NULL if no next page_ext exists. + */ +static inline struct page_ext *page_ext_iter_next(struct page_ext_iter *iter) +{ + unsigned long pfn; + + if (WARN_ON_ONCE(!iter->page_ext)) + return NULL; + + iter->index++; + pfn = iter->start_pfn + iter->index; + + if (page_ext_iter_next_fast_possible(pfn)) + iter->page_ext = page_ext_next(iter->page_ext); + else + iter->page_ext = page_ext_lookup(pfn); + + return iter->page_ext; +} + +/** + * page_ext_iter_get() - Get current page extension + * @iter: page extension iterator. + * + * Return: NULL if no page_ext exists for this iterator. + */ +static inline struct page_ext *page_ext_iter_get(const struct page_ext_iter *iter) +{ + return iter->page_ext; +} + +/** + * for_each_page_ext(): iterate through page_ext objects. + * @__page: the page we're interested in + * @__pgcount: how many pages to iterate through + * @__page_ext: struct page_ext pointer where the current page_ext + * object is returned + * @__iter: struct page_ext_iter object (defined in the stack) + * + * IMPORTANT: must be called with RCU read lock taken. + */ +#define for_each_page_ext(__page, __pgcount, __page_ext, __iter) \ + for (__page_ext = page_ext_iter_begin(&__iter, page_to_pfn(__page));\ + __page_ext && __iter.index < __pgcount; \ + __page_ext = page_ext_iter_next(&__iter)) + #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; -- cgit v1.2.3 From 3fec86f8aa8c7ae84567a0d1396e84ada96141d8 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 7 Mar 2025 12:39:54 -0500 Subject: xarray: add xas_try_split() to split a multi-index entry Patch series "Buddy allocator like (or non-uniform) folio split", v10. This patchset adds a new buddy allocator like (or non-uniform) large folio split from a order-n folio to order-m with m < n. It reduces 1. the total number of after-split folios from 2^(n-m) to n-m+1; 2. the amount of memory needed for multi-index xarray split from 2^(n/6-m/6) to n/6-m/6, assuming XA_CHUNK_SHIFT=6; 3. keep more large folios after a split from all order-m folios to order-(n-1) to order-m folios. 
For example, to split an order-9 to order-0, folio split generates 10 (or 11 for anonymous memory) folios instead of 512, allocates 1 xa_node instead of 8, and leaves 1 order-8, 1 order-7, ..., 1 order-1 and 2 order-0 folios (or 4 order-0 for anonymous memory) instead of 512 order-0 folios. Instead of duplicating existing split_huge_page*() code, __folio_split() is introduced as the shared backend code for both split_huge_page_to_list_to_order() and folio_split(). __folio_split() can support both uniform split and buddy allocator like (or non-uniform) split. All existing split_huge_page*() users can be gradually converted to use folio_split() if possible. In this patchset, I converted truncate_inode_partial_folio() to use folio_split(). xfstests quick group passed for both tmpfs and xfs. I also semi-replicated Hugh's test[12] and ran it without any issue for almost 24 hours. This patch (of 8): A preparation patch for non-uniform folio split, which always split a folio into half iteratively, and minimal xarray entry split. Currently, xas_split_alloc() and xas_split() always split all slots from a multi-index entry. They cost the same number of xa_node as the to-be-split slots. For example, to split an order-9 entry, which takes 2^(9-6)=8 slots, assuming XA_CHUNK_SHIFT is 6 (!CONFIG_BASE_SMALL), 8 xa_node are needed. Instead xas_try_split() is intended to be used iteratively to split the order-9 entry into 2 order-8 entries, then split one order-8 entry, based on the given index, to 2 order-7 entries, ..., and split one order-1 entry to 2 order-0 entries. When splitting the order-6 entry and a new xa_node is needed, xas_try_split() will try to allocate one if possible. As a result, xas_try_split() would only need 1 xa_node instead of 8. When a new xa_node is needed during the split, xas_try_split() can try to allocate one but no more. -ENOMEM will be return if a node cannot be allocated. -EINVAL will be return if a sibling node is split or cascade split happens, where two or more new nodes are needed, and these are not supported by xas_try_split(). xas_split_alloc() and xas_split() split an order-9 to order-0: --------------------------------- | | | | | | | | | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | | | | | | | | | | --------------------------------- | | | | ------- --- --- ------- | | ... | | V V V V ----------- ----------- ----------- ----------- | xa_node | | xa_node | ... | xa_node | | xa_node | ----------- ----------- ----------- ----------- xas_try_split() splits an order-9 to order-0: --------------------------------- | | | | | | | | | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | | | | | | | | | | --------------------------------- | | V ----------- | xa_node | ----------- Link: https://lkml.kernel.org/r/20250307174001.242794-1-ziy@nvidia.com Link: https://lkml.kernel.org/r/20250307174001.242794-2-ziy@nvidia.com Signed-off-by: Zi Yan Cc: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: John Hubbard Cc: Kefeng Wang Cc: Kirill A. 
Shuemov Cc: Miaohe Lin Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zi Yan Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/xarray.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 0b618ec04115..4010195201c9 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1555,6 +1555,7 @@ int xa_get_order(struct xarray *, unsigned long index); int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); +void xas_try_split(struct xa_state *xas, void *entry, unsigned int order); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { @@ -1576,6 +1577,11 @@ static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } + +static inline void xas_try_split(struct xa_state *xas, void *entry, + unsigned int order) +{ +} #endif /** -- cgit v1.2.3 From 7460b470a131f985a70302a322617121efdd7caa Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 7 Mar 2025 12:40:00 -0500 Subject: mm/truncate: use folio_split() in truncate operation Instead of splitting the large folio uniformly during truncation, try to use buddy allocator like folio_split() at the start and the end of a truncation range to minimize the number of resulting folios if it is supported. try_folio_split() is introduced to use folio_split() if supported and it falls back to uniform split otherwise. For example, to truncate a order-4 folio [0, 1, 2, 3, 4, 5, ..., 15] between [3, 10] (inclusive), folio_split() splits the folio at 3 to [0,1], [2], [3], [4..7], [8..15] and [3], [4..7] can be dropped and [8..15] is kept with zeros in [8..10], then another folio_split() is done at 10, so [8..10] can be dropped. One possible optimization is to make folio_split() to split a folio based on a given range, like [3..10] above. But that complicates folio_split(), so it will be investigated when necessary. Link: https://lkml.kernel.org/r/20250226210032.2044041-8-ziy@nvidia.com Link: https://lkml.kernel.org/r/20250307174001.242794-8-ziy@nvidia.com Signed-off-by: Zi Yan Cc: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: John Hubbard Cc: Kefeng Wang Cc: Kirill A. Shuemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e57e811cfd3c..e893d546a49f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -345,6 +345,36 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); +bool uniform_split_supported(struct folio *folio, unsigned int new_order, + bool warns); +bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, + bool warns); +int folio_split(struct folio *folio, unsigned int new_order, struct page *page, + struct list_head *list); +/* + * try_folio_split - try to split a @folio at @page using non uniform split. 
+ * @folio: folio to be split + * @page: split to order-0 at the given page + * @list: store the after-split folios + * + * Try to split a @folio at @page using non uniform split to order-0, if + * non uniform split is not supported, fall back to uniform split. + * + * Return: 0: split is successful, otherwise split failed. + */ +static inline int try_folio_split(struct folio *folio, struct page *page, + struct list_head *list) +{ + int ret = min_order_for_split(folio); + + if (ret < 0) + return ret; + + if (!non_uniform_split_supported(folio, 0, false)) + return split_huge_page_to_list_to_order(&folio->page, list, + ret); + return folio_split(folio, ret, page, list); +} static inline int split_huge_page(struct page *page) { struct folio *folio = page_folio(page); @@ -537,6 +567,12 @@ static inline int split_folio_to_list(struct folio *folio, struct list_head *lis return 0; } +static inline int try_folio_split(struct folio *folio, struct page *page, + struct list_head *list) +{ + return 0; +} + static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) -- cgit v1.2.3 From 200a89c159a7a416115e6e309183c82183bf98aa Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 14 Mar 2025 18:21:12 -0400 Subject: mm/filemap: use xas_try_split() in __filemap_add_folio() Patch series "Minimize xa_node allocation during xarry split", v3. When splitting a multi-index entry in XArray from order-n to order-m, existing xas_split_alloc()+xas_split() approach requires 2^(n % XA_CHUNK_SHIFT) xa_node allocations. But its callers, __filemap_add_folio() and shmem_split_large_entry(), use at most 1 xa_node. To minimize xa_node allocation and remove the limitation of no split from order-12 (or above) to order-0 (or anything between 0 and 5)[1], xas_try_split() was added[2], which allocates (n / XA_CHUNK_SHIFT - m / XA_CHUNK_SHIFT) xa_node. It is used for non-uniform folio split, but can be used by __filemap_add_folio() and shmem_split_large_entry(). xas_split_alloc() and xas_split() split an order-9 to order-0: --------------------------------- | | | | | | | | | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | | | | | | | | | | --------------------------------- | | | | ------- --- --- ------- | | ... | | V V V V ----------- ----------- ----------- ----------- | xa_node | | xa_node | ... | xa_node | | xa_node | ----------- ----------- ----------- ----------- xas_try_split() splits an order-9 to order-0: --------------------------------- | | | | | | | | | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | | | | | | | | | | --------------------------------- | | V ----------- | xa_node | ----------- xas_try_split() is designed to be called iteratively with n = m + 1. xas_try_split_mini_order() is added to minmize the number of calls to xas_try_split() by telling the caller the next minimal order to split to instead of n - 1. Splitting order-n to order-m when m= l * XA_CHUNK_SHIFT does not require xa_node allocation and requires 1 xa_node when n=l * XA_CHUNK_SHIFT and m = n - 1, so it is OK to use xas_try_split() with n > m + 1 when no new xa_node is needed. xfstests quick group test passed on xfs and tmpfs. [1] https://lore.kernel.org/linux-mm/Z6YX3RznGLUD07Ao@casper.infradead.org/ [2] https://lore.kernel.org/linux-mm/20250226210032.2044041-1-ziy@nvidia.com/ This patch (of 2): During __filemap_add_folio(), a shadow entry is covering n slots and a folio covers m slots with m < n is to be added. 
Instead of splitting all n slots, only the m slots covered by the folio need to be split and the remaining n-m shadow entries can be retained with orders ranging from m to n-1. This method only requires (n/XA_CHUNK_SHIFT) - (m/XA_CHUNK_SHIFT) new xa_nodes instead of (n % XA_CHUNK_SHIFT) * ((n/XA_CHUNK_SHIFT) - (m/XA_CHUNK_SHIFT)) new xa_nodes, compared to the original xas_split_alloc() + xas_split() one. For example, to insert an order-0 folio when an order-9 shadow entry is present (assuming XA_CHUNK_SHIFT is 6), 1 xa_node is needed instead of 8. xas_try_split_min_order() is introduced to reduce the number of calls to xas_try_split() during split. Link: https://lkml.kernel.org/r/20250314222113.711703-1-ziy@nvidia.com Link: https://lkml.kernel.org/r/20250314222113.711703-2-ziy@nvidia.com Signed-off-by: Zi Yan Cc: Baolin Wang Cc: Hugh Dickins Cc: Kairui Song Cc: Miaohe Lin Cc: Mattew Wilcox Cc: David Hildenbrand Cc: John Hubbard Cc: Kefeng Wang Cc: Kirill A. Shuemov Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/xarray.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 4010195201c9..78eede109b1a 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1556,6 +1556,7 @@ int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); void xas_try_split(struct xa_state *xas, void *entry, unsigned int order); +unsigned int xas_try_split_min_order(unsigned int order); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { @@ -1582,6 +1583,12 @@ static inline void xas_try_split(struct xa_state *xas, void *entry, unsigned int order) { } + +static inline unsigned int xas_try_split_min_order(unsigned int order) +{ + return 0; +} + #endif /** -- cgit v1.2.3 From 61659efdb35ce6c6ac7639342098f3c4548b794b Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 12 Mar 2025 09:30:43 +1000 Subject: drivers/base/memory: improve add_boot_memory_block() Patch series "drivers/base/memory: Two cleanups", v3. Two cleanups to drivers/base/memory. This patch (of 2)L It's unnecessary to count the present sections for the specified block since the block will be added if any section in the block is present. Besides, for_each_present_section_nr() can be reused as Andrew Morton suggested. Improve by using for_each_present_section_nr() and dropping the unnecessary @section_count. No functional changes intended. Link: https://lkml.kernel.org/r/20250311233045.148943-1-gshan@redhat.com Link: https://lkml.kernel.org/r/20250311233045.148943-2-gshan@redhat.com Signed-off-by: Gavin Shan Acked-by: David Hildenbrand Acked-by: Oscar Salvador Cc: Danilo Krummrich Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 550dbba92521..dbb0ad69e17f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2140,6 +2140,11 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) return -1; } +#define for_each_present_section_nr(start, section_nr) \ + for (section_nr = next_present_section_nr(start - 1); \ + section_nr != -1; \ + section_nr = next_present_section_nr(section_nr)) + /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate -- cgit v1.2.3 From 1a24776fca39ed79d7f5e0b0592b5addd784981e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 12 Mar 2025 09:30:44 +1000 Subject: drivers/base/memory: correct the field name in the header Replace @blocks with @memory_blocks to match with the definition of struct memory_group. Link: https://lkml.kernel.org/r/20250311233045.148943-3-gshan@redhat.com Signed-off-by: Gavin Shan Acked-by: David Hildenbrand Acked-by: Oscar Salvador Cc: Danilo Krummrich Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton --- include/linux/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index c0afee5d126e..12daa6ec7d09 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -25,7 +25,7 @@ /** * struct memory_group - a logical group of memory blocks * @nid: The node id for all memory blocks inside the memory group. - * @blocks: List of all memory blocks belonging to this memory group. + * @memory_blocks: List of all memory blocks belonging to this memory group. * @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this * memory group. * @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this -- cgit v1.2.3 From 8c02048d1c6126527f15752a5e0849dc49cefeeb Mon Sep 17 00:00:00 2001 From: Martin Liu Date: Sat, 8 Mar 2025 03:46:00 +0000 Subject: mm/page_alloc: add trace event for per-zone watermark setup Patch series "Add tracepoints for lowmem reserves, watermarks and totalreserve_pages", v2. This patchset introduces tracepoints to track changes in the lowmem reserves, watermarks and totalreserve_pages. This helps to track the exact timing of such changes and understand their relation to reclaim activities. The tracepoints added are: mm_setup_per_zone_lowmem_reserve mm_setup_per_zone_wmarks mm_calculate_totalreserve_pagesi This patch (of 3): This commit introduces the `mm_setup_per_zone_wmarks` trace event, which provides detailed insights into the kernel's per-zone watermark configuration, offering precise timing and the ability to correlate watermark changes with specific kernel events. While `/proc/zoneinfo` provides some information about zone watermarks, this trace event offers: 1. The ability to link watermark changes to specific kernel events and logic. 2. The ability to capture rapid or short-lived changes in watermarks that may be missed by user-space polling 3. Diagnosing unexpected kswapd activity or excessive direct reclaim triggered by rapidly changing watermarks. 
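The diff below only adds the event definition to include/trace/events/kmem.h; the emitting call site lives in mm/page_alloc.c. A minimal sketch of how the hook would be invoked once the per-zone watermarks have been written (watermark math and locking omitted):

"""
	struct zone *zone;

	for_each_zone(zone) {
		/* ... recompute zone->_watermark[WMARK_MIN/LOW/HIGH/PROMO] ... */
		trace_mm_setup_per_zone_wmarks(zone);
	}
"""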
Link: https://lkml.kernel.org/r/20250308034606.2036033-1-liumartin@google.com Link: https://lkml.kernel.org/r/20250308034606.2036033-2-liumartin@google.com Signed-off-by: Martin Liu Acked-by: David Rientjes Cc: Steven Rostedt Cc: Martin Liu Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Signed-off-by: Andrew Morton --- include/trace/events/kmem.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index b37eb0a7060f..5fd392dae503 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -342,6 +342,39 @@ TRACE_EVENT(mm_alloc_contig_migrate_range_info, __entry->nr_mapped) ); +TRACE_EVENT(mm_setup_per_zone_wmarks, + + TP_PROTO(struct zone *zone), + + TP_ARGS(zone), + + TP_STRUCT__entry( + __field(int, node_id) + __string(name, zone->name) + __field(unsigned long, watermark_min) + __field(unsigned long, watermark_low) + __field(unsigned long, watermark_high) + __field(unsigned long, watermark_promo) + ), + + TP_fast_assign( + __entry->node_id = zone->zone_pgdat->node_id; + __assign_str(name); + __entry->watermark_min = zone->_watermark[WMARK_MIN]; + __entry->watermark_low = zone->_watermark[WMARK_LOW]; + __entry->watermark_high = zone->_watermark[WMARK_HIGH]; + __entry->watermark_promo = zone->_watermark[WMARK_PROMO]; + ), + + TP_printk("node_id=%d zone name=%s watermark min=%lu low=%lu high=%lu promo=%lu", + __entry->node_id, + __get_str(name), + __entry->watermark_min, + __entry->watermark_low, + __entry->watermark_high, + __entry->watermark_promo) +); + /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ -- cgit v1.2.3 From a293aba4a584709889f77a0ad0c45746aecf1b9f Mon Sep 17 00:00:00 2001 From: Martin Liu Date: Sat, 8 Mar 2025 03:46:01 +0000 Subject: mm/page_alloc: add trace event for per-zone lowmem reserve setup This commit introduces the `mm_setup_per_zone_lowmem_reserve` trace event,which provides detailed insights into the kernel's per-zone lowmem reserve configuration. The trace event provides precise timestamps, allowing developers to 1. Correlate lowmem reserve changes with specific kernel events and able to diagnose unexpected kswapd or direct reclaim behavior triggered by dynamic changes in lowmem reserve. 2. Know memory allocation failures that occur due to insufficient lowmem reserve, by precisely correlating allocation attempts with reserve adjustments. 
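As with the watermark event, only the event definition is part of this include/ diff; the call site would sit in setup_per_zone_lowmem_reserve() in mm/page_alloc.c. A sketch of the inner loop that reports each (zone, upper zone) pair, where pgdat, zone, i and j come from the enclosing loops and the reserve calculation itself is elided:

"""
	for (j = i + 1; j < MAX_NR_ZONES; j++) {
		struct zone *upper_zone = &pgdat->node_zones[j];

		/* ... update zone->lowmem_reserve[j] against upper_zone ... */
		trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone,
						       zone->lowmem_reserve[j]);
	}
"""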
Link: https://lkml.kernel.org/r/20250308034606.2036033-3-liumartin@google.com Signed-off-by: Martin Liu Acked-by: David Rientjes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/trace/events/kmem.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 5fd392dae503..9623e68d4d26 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -375,6 +375,33 @@ TRACE_EVENT(mm_setup_per_zone_wmarks, __entry->watermark_promo) ); +TRACE_EVENT(mm_setup_per_zone_lowmem_reserve, + + TP_PROTO(struct zone *zone, struct zone *upper_zone, long lowmem_reserve), + + TP_ARGS(zone, upper_zone, lowmem_reserve), + + TP_STRUCT__entry( + __field(int, node_id) + __string(name, zone->name) + __string(upper_name, upper_zone->name) + __field(long, lowmem_reserve) + ), + + TP_fast_assign( + __entry->node_id = zone->zone_pgdat->node_id; + __assign_str(name); + __assign_str(upper_name); + __entry->lowmem_reserve = lowmem_reserve; + ), + + TP_printk("node_id=%d zone name=%s upper_zone name=%s lowmem_reserve_pages=%ld", + __entry->node_id, + __get_str(name), + __get_str(upper_name), + __entry->lowmem_reserve) +); + /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ -- cgit v1.2.3 From 15766485e4a51bec2dcce304c089a95550720033 Mon Sep 17 00:00:00 2001 From: Martin Liu Date: Sat, 8 Mar 2025 03:46:02 +0000 Subject: mm/page_alloc: add trace event for totalreserve_pages calculation This commit introduces a new trace event, `mm_calculate_totalreserve_pages`, which reports the new reserve value at the exact time when it takes effect. The `totalreserve_pages` value represents the total amount of memory reserved across all zones and nodes in the system. This reserved memory is crucial for ensuring that critical kernel operations have access to sufficient memory, even under memory pressure. By tracing the `totalreserve_pages` value, developers can gain insights that how the total reserved memory changes over time. Link: https://lkml.kernel.org/r/20250308034606.2036033-4-liumartin@google.com Signed-off-by: Martin Liu Acked-by: David Rientjes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/trace/events/kmem.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 9623e68d4d26..f74925a6cf69 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -402,6 +402,24 @@ TRACE_EVENT(mm_setup_per_zone_lowmem_reserve, __entry->lowmem_reserve) ); +TRACE_EVENT(mm_calculate_totalreserve_pages, + + TP_PROTO(unsigned long totalreserve_pages), + + TP_ARGS(totalreserve_pages), + + TP_STRUCT__entry( + __field(unsigned long, totalreserve_pages) + ), + + TP_fast_assign( + __entry->totalreserve_pages = totalreserve_pages; + ), + + TP_printk("totalreserve_pages=%lu", __entry->totalreserve_pages) +); + + /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ -- cgit v1.2.3 From 67914ac08604345f620566ccf5bac87b40d5881d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Mar 2025 17:05:32 -0400 Subject: mm: compaction: push watermark into compaction_suitable() callers Patch series "mm: reliable huge page allocator". 
This series makes changes to the allocator and reclaim/compaction code to try harder to avoid fragmentation. As a result, this makes huge page allocations cheaper, more reliable and more sustainable. It's a subset of the huge page allocator RFC initially proposed here: https://lore.kernel.org/lkml/20230418191313.268131-1-hannes@cmpxchg.org/ The following results are from a kernel build test, with additional concurrent bursts of THP allocations on a memory-constrained system. Comparing before and after the changes over 15 runs: before after Hugealloc Time mean 52739.45 ( +0.00%) 28904.00 ( -45.19%) Hugealloc Time stddev 56541.26 ( +0.00%) 33464.37 ( -40.81%) Kbuild Real time 197.47 ( +0.00%) 196.59 ( -0.44%) Kbuild User time 1240.49 ( +0.00%) 1231.67 ( -0.71%) Kbuild System time 70.08 ( +0.00%) 59.10 ( -15.45%) THP fault alloc 46727.07 ( +0.00%) 63223.67 ( +35.30%) THP fault fallback 21910.60 ( +0.00%) 5412.47 ( -75.29%) Direct compact fail 195.80 ( +0.00%) 59.07 ( -69.48%) Direct compact success 7.93 ( +0.00%) 2.80 ( -57.46%) Direct compact success rate % 3.51 ( +0.00%) 3.99 ( +10.49%) Compact daemon scanned migrate 3369601.27 ( +0.00%) 2267500.33 ( -32.71%) Compact daemon scanned free 5075474.47 ( +0.00%) 2339773.00 ( -53.90%) Compact direct scanned migrate 161787.27 ( +0.00%) 47659.93 ( -70.54%) Compact direct scanned free 163467.53 ( +0.00%) 40729.67 ( -75.08%) Compact total migrate scanned 3531388.53 ( +0.00%) 2315160.27 ( -34.44%) Compact total free scanned 5238942.00 ( +0.00%) 2380502.67 ( -54.56%) Alloc stall 2371.07 ( +0.00%) 638.87 ( -73.02%) Pages kswapd scanned 2160926.73 ( +0.00%) 4002186.33 ( +85.21%) Pages kswapd reclaimed 533191.07 ( +0.00%) 718577.80 ( +34.77%) Pages direct scanned 400450.33 ( +0.00%) 355172.73 ( -11.31%) Pages direct reclaimed 94441.73 ( +0.00%) 31162.80 ( -67.00%) Pages total scanned 2561377.07 ( +0.00%) 4357359.07 ( +70.12%) Pages total reclaimed 627632.80 ( +0.00%) 749740.60 ( +19.46%) Swap out 47959.53 ( +0.00%) 110084.33 ( +129.53%) Swap in 7276.00 ( +0.00%) 24457.00 ( +236.10%) File refaults 138043.00 ( +0.00%) 188226.93 ( +36.35%) THP latencies are cut in half, and failure rates are cut by 75%. These metrics also hold up over time, while the vanilla kernel sees a steady downward trend in success rates with each subsequent run, owed to the cumulative effects of fragmentation. A more detailed discussion of results is in the patch changelogs. The patches first introduce a vm.defrag_mode sysctl, which enforces the existing ALLOC_NOFRAGMENT alloc flag until after reclaim and compaction have run. They then change kswapd and kcompactd to target pageblocks, which boosts success in the ALLOC_NOFRAGMENT hotpaths. Patches #1 and #2 are somewhat unrelated cleanups, but touch the same code and so are included here to avoid conflicts from re-ordering. This patch (of 5): compaction_suitable() hardcodes the min watermark, with a boost to the low watermark for costly orders. However, compaction_ready() requires order-0 at the high watermark. It currently checks the marks twice. Make the watermark a parameter to compaction_suitable() and have the callers pass in what they require: - compaction_zonelist_suitable() is used by the direct reclaim path, so use the min watermark. - compact_suit_allocation_order() has a watermark in context derived from cc->alloc_flags. The only quirk is that kcompactd doesn't initialize cc->alloc_flags explicitly. 
There is a direct check in kcompactd_do_work() that passes ALLOC_WMARK_MIN, but there is another check downstack in compact_zone() that ends up passing the unset alloc_flags. Since they default to 0, and that coincides with ALLOC_WMARK_MIN, it is correct. But it's subtle. Set cc->alloc_flags explicitly. - should_continue_reclaim() is direct reclaim, use the min watermark. - Finally, consolidate the two checks in compaction_ready() to a single compaction_suitable() call passing the high watermark. There is a tiny change in behavior: before, compaction_suitable() would check order-0 against min or low, depending on costly order. Then there'd be another high watermark check. Now, the high watermark is passed to compaction_suitable(), and the costly order-boost (low - min) is added on top. This means compaction_ready() sets a marginally higher target for free pages. In a kernelbuild + THP pressure test, though, this didn't show any measurable negative effects on memory pressure or reclaim rates. As the comment above the check says, reclaim is usually stopped short on should_continue_reclaim(), and this just defines the worst-case reclaim cutoff in case compaction is not making any headway. [hughd@google.com: stop oops on out-of-range highest_zoneidx] Link: https://lkml.kernel.org/r/005ace8b-07fa-01d4-b54b-394a3e029c07@google.com Link: https://lkml.kernel.org/r/20250313210647.1314586-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20250313210647.1314586-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Hugh Dickins Acked-by: Zi Yan Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/compaction.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 7bf0c521db63..173d9c07a895 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -95,7 +95,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern bool compaction_suitable(struct zone *zone, int order, - int highest_zoneidx); + unsigned long watermark, int highest_zoneidx); extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); @@ -113,7 +113,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) } static inline bool compaction_suitable(struct zone *zone, int order, - int highest_zoneidx) + unsigned long watermark, + int highest_zoneidx) { return false; } -- cgit v1.2.3 From a211c6550efcc87aa2459ca347bda10721c7a46a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Mar 2025 17:05:36 -0400 Subject: mm: page_alloc: defrag_mode kswapd/kcompactd watermarks The previous patch added pageblock_order reclaim to kswapd/kcompactd, which helps, but produces only one block at a time. Allocation stalls and THP failure rates are still higher than they could be. To adequately reflect ALLOC_NOFRAGMENT demand for pageblocks, change the watermarking for kswapd & kcompactd: instead of targeting the high watermark in order-0 pages and checking for one suitable block, simply require that the high watermark is entirely met in pageblocks. To this end, track the number of free pages within contiguous pageblocks, then change pgdat_balanced() and compact_finished() to check watermarks against this new value. 
This further reduces THP latencies and allocation stalls, and improves THP success rates against the previous patch: DEFRAGMODE-ASYNC DEFRAGMODE-ASYNC-WMARKS Hugealloc Time mean 34300.36 ( +0.00%) 28904.00 ( -15.73%) Hugealloc Time stddev 36390.42 ( +0.00%) 33464.37 ( -8.04%) Kbuild Real time 196.13 ( +0.00%) 196.59 ( +0.23%) Kbuild User time 1234.74 ( +0.00%) 1231.67 ( -0.25%) Kbuild System time 62.62 ( +0.00%) 59.10 ( -5.54%) THP fault alloc 57054.53 ( +0.00%) 63223.67 ( +10.81%) THP fault fallback 11581.40 ( +0.00%) 5412.47 ( -53.26%) Direct compact fail 107.80 ( +0.00%) 59.07 ( -44.79%) Direct compact success 4.53 ( +0.00%) 2.80 ( -31.33%) Direct compact success rate % 3.20 ( +0.00%) 3.99 ( +18.66%) Compact daemon scanned migrate 5461033.93 ( +0.00%) 2267500.33 ( -58.48%) Compact daemon scanned free 5824897.93 ( +0.00%) 2339773.00 ( -59.83%) Compact direct scanned migrate 58336.93 ( +0.00%) 47659.93 ( -18.30%) Compact direct scanned free 32791.87 ( +0.00%) 40729.67 ( +24.21%) Compact total migrate scanned 5519370.87 ( +0.00%) 2315160.27 ( -58.05%) Compact total free scanned 5857689.80 ( +0.00%) 2380502.67 ( -59.36%) Alloc stall 2424.60 ( +0.00%) 638.87 ( -73.62%) Pages kswapd scanned 2657018.33 ( +0.00%) 4002186.33 ( +50.63%) Pages kswapd reclaimed 559583.07 ( +0.00%) 718577.80 ( +28.41%) Pages direct scanned 722094.07 ( +0.00%) 355172.73 ( -50.81%) Pages direct reclaimed 107257.80 ( +0.00%) 31162.80 ( -70.95%) Pages total scanned 3379112.40 ( +0.00%) 4357359.07 ( +28.95%) Pages total reclaimed 666840.87 ( +0.00%) 749740.60 ( +12.43%) Swap out 77238.20 ( +0.00%) 110084.33 ( +42.53%) Swap in 11712.80 ( +0.00%) 24457.00 ( +108.80%) File refaults 143438.80 ( +0.00%) 188226.93 ( +31.22%) Also of note is that compaction work overall is reduced. The reason for this is that when free pageblocks are more readily available, allocations are also much more likely to get physically placed in LRU order, instead of being forced to scavenge free space here and there. This means that reclaim by itself has better chances of freeing up whole blocks, and the system relies less on compaction. 
Comparing all changes to the vanilla kernel: VANILLA DEFRAGMODE-ASYNC-WMARKS Hugealloc Time mean 52739.45 ( +0.00%) 28904.00 ( -45.19%) Hugealloc Time stddev 56541.26 ( +0.00%) 33464.37 ( -40.81%) Kbuild Real time 197.47 ( +0.00%) 196.59 ( -0.44%) Kbuild User time 1240.49 ( +0.00%) 1231.67 ( -0.71%) Kbuild System time 70.08 ( +0.00%) 59.10 ( -15.45%) THP fault alloc 46727.07 ( +0.00%) 63223.67 ( +35.30%) THP fault fallback 21910.60 ( +0.00%) 5412.47 ( -75.29%) Direct compact fail 195.80 ( +0.00%) 59.07 ( -69.48%) Direct compact success 7.93 ( +0.00%) 2.80 ( -57.46%) Direct compact success rate % 3.51 ( +0.00%) 3.99 ( +10.49%) Compact daemon scanned migrate 3369601.27 ( +0.00%) 2267500.33 ( -32.71%) Compact daemon scanned free 5075474.47 ( +0.00%) 2339773.00 ( -53.90%) Compact direct scanned migrate 161787.27 ( +0.00%) 47659.93 ( -70.54%) Compact direct scanned free 163467.53 ( +0.00%) 40729.67 ( -75.08%) Compact total migrate scanned 3531388.53 ( +0.00%) 2315160.27 ( -34.44%) Compact total free scanned 5238942.00 ( +0.00%) 2380502.67 ( -54.56%) Alloc stall 2371.07 ( +0.00%) 638.87 ( -73.02%) Pages kswapd scanned 2160926.73 ( +0.00%) 4002186.33 ( +85.21%) Pages kswapd reclaimed 533191.07 ( +0.00%) 718577.80 ( +34.77%) Pages direct scanned 400450.33 ( +0.00%) 355172.73 ( -11.31%) Pages direct reclaimed 94441.73 ( +0.00%) 31162.80 ( -67.00%) Pages total scanned 2561377.07 ( +0.00%) 4357359.07 ( +70.12%) Pages total reclaimed 627632.80 ( +0.00%) 749740.60 ( +19.46%) Swap out 47959.53 ( +0.00%) 110084.33 ( +129.53%) Swap in 7276.00 ( +0.00%) 24457.00 ( +236.10%) File refaults 138043.00 ( +0.00%) 188226.93 ( +36.35%) THP allocation latencies and %sys time are down dramatically. THP allocation failures are down from nearly 50% to 8.5%. And to recall previous data points, the success rates are steady and reliable without the cumulative deterioration of fragmentation events. Compaction work is down overall. Direct compaction work especially is drastically reduced. As an aside, its success rate of 4% indicates there is room for improvement. For now it's good to rely on it less. Reclaim work is up overall, however direct reclaim work is down. Part of the increase can be attributed to a higher use of THPs, which due to internal fragmentation increase the memory footprint. This is not necessarily an unexpected side-effect for users of THP. However, taken both points together, there may well be some opportunities for fine tuning in the reclaim/compaction coordination. 
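As a rough, hedged sketch only (the placement and surrounding function are assumed here, since the mm/ hunks of this patch fall outside the include/ view), the new balance check amounts to comparing the high watermark against whole free pageblocks rather than order-0 free pages:

        /* hedged sketch: zone and caller context are assumed; the helpers are existing ones */
        unsigned long free_block_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);

        if (free_block_pages < high_wmark_pages(zone))
                return false;   /* not balanced yet: keep reclaiming/compacting */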
[hannes@cmpxchg.org: fix squawks from rebasing] Link: https://lkml.kernel.org/r/20250314210558.GD1316033@cmpxchg.org Link: https://lkml.kernel.org/r/20250313210647.1314586-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dbb0ad69e17f..37c29f3fbca8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -138,6 +138,7 @@ enum numa_stat_item { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, + NR_FREE_PAGES_BLOCKS, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, -- cgit v1.2.3 From e36ba5ab808ef6237c3148d469c8238674230e2b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:23 -0700 Subject: iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC Introduce a new IOMMUFD_OBJ_VEVENTQ object for vIOMMU Event Queue that provides user space (VMM) another FD to read the vIOMMU Events. Allow a vIOMMU object to allocate vEVENTQs, with a condition that each vIOMMU can only have one single vEVENTQ per type. Add iommufd_veventq_alloc() with iommufd_veventq_ops for the new ioctl. Link: https://patch.msgid.link/r/21acf0751dd5c93846935ee06f93b9c65eff5e04.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 3 ++ include/uapi/linux/iommufd.h | 82 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) (limited to 'include') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 11110c749200..8948b1836940 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -34,6 +34,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_FAULT, IOMMUFD_OBJ_VIOMMU, IOMMUFD_OBJ_VDEVICE, + IOMMUFD_OBJ_VEVENTQ, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -93,6 +94,8 @@ struct iommufd_viommu { const struct iommufd_viommu_ops *ops; struct xarray vdevs; + struct list_head veventqs; + struct rw_semaphore veventqs_rwsem; unsigned int type; }; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 78747b24bd0f..dbb8787d9c63 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -55,6 +55,7 @@ enum { IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92, + IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93, }; /** @@ -1014,4 +1015,85 @@ struct iommu_ioas_change_process { #define IOMMU_IOAS_CHANGE_PROCESS \ _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS) +/** + * enum iommu_veventq_flag - flag for struct iommufd_vevent_header + * @IOMMU_VEVENTQ_FLAG_LOST_EVENTS: vEVENTQ has lost vEVENTs + */ +enum iommu_veventq_flag { + IOMMU_VEVENTQ_FLAG_LOST_EVENTS = (1U << 0), +}; + +/** + * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status + * @flags: Combination of enum iommu_veventq_flag + * @sequence: The sequence index of a vEVENT in the vEVENTQ, with a range of + * [0, INT_MAX] where the following index of INT_MAX is 0 + * + * Each iommufd_vevent_header reports a sequence index of the following vEVENT: + * ------------------------------------------------------------------------- + * | header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| 
dataN | + * ------------------------------------------------------------------------- + * And this sequence index is expected to be monotonic to the sequence index of + * the previous vEVENT. If two adjacent sequence indexes has a delta larger than + * 1, it means that delta - 1 number of vEVENTs has lost, e.g. two lost vEVENTs: + * ------------------------------------------------------------------------- + * | ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... | + * ------------------------------------------------------------------------- + * If a vEVENT lost at the tail of the vEVENTQ and there is no following vEVENT + * providing the next sequence index, an IOMMU_VEVENTQ_FLAG_LOST_EVENTS header + * would be added to the tail, and no data would follow this header: + * --------------------------------------------------------------------------- + * |..| header3 {sequence=3} | data3 | header4 {flags=LOST_EVENTS, sequence=4} | + * --------------------------------------------------------------------------- + */ +struct iommufd_vevent_header { + __u32 flags; + __u32 sequence; +}; + +/** + * enum iommu_veventq_type - Virtual Event Queue Type + * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use + */ +enum iommu_veventq_type { + IOMMU_VEVENTQ_TYPE_DEFAULT = 0, +}; + +/** + * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC) + * @size: sizeof(struct iommu_veventq_alloc) + * @flags: Must be 0 + * @viommu_id: virtual IOMMU ID to associate the vEVENTQ with + * @type: Type of the vEVENTQ. Must be defined in enum iommu_veventq_type + * @veventq_depth: Maximum number of events in the vEVENTQ + * @out_veventq_id: The ID of the new vEVENTQ + * @out_veventq_fd: The fd of the new vEVENTQ. User space must close the + * successfully returned fd after using it + * @__reserved: Must be 0 + * + * Explicitly allocate a virtual event queue interface for a vIOMMU. A vIOMMU + * can have multiple FDs for different types, but is confined to one per @type. + * User space should open the @out_veventq_fd to read vEVENTs out of a vEVENTQ, + * if there are vEVENTs available. A vEVENTQ will lose events due to overflow, + * if the number of the vEVENTs hits @veventq_depth. + * + * Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by + * a type-specific data structure, in a normal case: + * ------------------------------------------------------------- + * || header0 | data0 | header1 | data1 | ... | headerN | dataN || + * ------------------------------------------------------------- + * unless a tailing IOMMU_VEVENTQ_FLAG_LOST_EVENTS header is logged (refer to + * struct iommufd_vevent_header). + */ +struct iommu_veventq_alloc { + __u32 size; + __u32 flags; + __u32 viommu_id; + __u32 type; + __u32 veventq_depth; + __u32 out_veventq_id; + __u32 out_veventq_fd; + __u32 __reserved; +}; +#define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC) #endif -- cgit v1.2.3 From ea94b211c5483080b749c142090f4c4de4926e51 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:24 -0700 Subject: iommufd/viommu: Add iommufd_viommu_get_vdev_id helper This is a reverse search v.s. iommufd_viommu_find_dev, as drivers may want to convert a struct device pointer (physical) to its virtual device ID for an event injection to the user space VM. Again, this avoids exposing more core structures to the drivers, than the iommufd_viommu alone. 
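As a hedged illustration (the calling driver, its viommu and dev pointers, and the error handling are assumed rather than taken from this patch), the reverse lookup would be used roughly like this before forwarding an event:

        unsigned long vdev_id;
        int rc;

        /* translate the physical device into the ID the VMM knows it by */
        rc = iommufd_viommu_get_vdev_id(viommu, dev, &vdev_id);
        if (rc)
                return rc;      /* no vDEVICE registered for this device */
        /* vdev_id can now be placed into the event payload forwarded to user space */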
Link: https://patch.msgid.link/r/18b8e8bc1b8104d43b205d21602c036fd0804e56.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 8948b1836940..05cb393aff0a 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -190,6 +190,8 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, enum iommufd_object_type type); struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id); +int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, unsigned long *vdev_id); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ static inline struct iommufd_object * _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, @@ -203,6 +205,13 @@ iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id) { return NULL; } + +static inline int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, + unsigned long *vdev_id) +{ + return -ENOENT; +} #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ /* -- cgit v1.2.3 From e8e1ef9b77a7a09b7809890a52229f24d3c8b532 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:25 -0700 Subject: iommufd/viommu: Add iommufd_viommu_report_event helper Similar to iommu_report_device_fault, this allows IOMMU drivers to report vIOMMU events from threaded IRQ handlers to user space hypervisors. Link: https://patch.msgid.link/r/44be825042c8255e75d0151b338ffd8ba0e4920b.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 05cb393aff0a..60eff9272551 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -11,6 +11,7 @@ #include #include #include +#include struct device; struct file; @@ -192,6 +193,9 @@ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id); int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, struct device *dev, unsigned long *vdev_id); +int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, void *event_data, + size_t data_len); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ static inline struct iommufd_object * _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, @@ -212,6 +216,13 @@ static inline int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, { return -ENOENT; } + +static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, + void *event_data, size_t data_len) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ /* -- cgit v1.2.3 From e7d3fa3d29d5b2ed12d247cf57a0a34fffe89eb8 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:31 -0700 Subject: iommu/arm-smmu-v3: Report events that belong to devices attached to vIOMMU Aside from the IOPF framework, iommufd provides an additional pathway to report hardware events, via the vEVENTQ of vIOMMU infrastructure. Define an iommu_vevent_arm_smmuv3 uAPI structure, and report stage-1 events in the threaded IRQ handler. 
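In rough, hedged form (the viommu lookup and the rewrite of the StreamID field are assumed here, not quoted from the driver change), that reporting call from the threaded IRQ handler might look like:

        struct iommu_vevent_arm_smmuv3 vevt;

        memcpy(vevt.evt, evt, sizeof(vevt.evt));
        /* assumed step: replace the physical StreamID with the virtual device ID */
        iommufd_viommu_report_event(viommu, IOMMU_VEVENTQ_TYPE_ARM_SMMUV3,
                                    &vevt, sizeof(vevt));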
Also, add another four event record types that can be forwarded to a VM. Link: https://patch.msgid.link/r/5cf6719682fdfdabffdb08374cdf31ad2466d75a.1741719725.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Acked-by: Will Deacon Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index dbb8787d9c63..8719d4f5d618 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -1054,9 +1054,32 @@ struct iommufd_vevent_header { /** * enum iommu_veventq_type - Virtual Event Queue Type * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use + * @IOMMU_VEVENTQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event Queue */ enum iommu_veventq_type { IOMMU_VEVENTQ_TYPE_DEFAULT = 0, + IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 = 1, +}; + +/** + * struct iommu_vevent_arm_smmuv3 - ARM SMMUv3 Virtual Event + * (IOMMU_VEVENTQ_TYPE_ARM_SMMUV3) + * @evt: 256-bit ARM SMMUv3 Event record, little-endian. + * Reported event records: (Refer to "7.3 Event records" in SMMUv3 HW Spec) + * - 0x04 C_BAD_STE + * - 0x06 F_STREAM_DISABLED + * - 0x08 C_BAD_SUBSTREAMID + * - 0x0a C_BAD_CD + * - 0x10 F_TRANSLATION + * - 0x11 F_ADDR_SIZE + * - 0x12 F_ACCESS + * - 0x13 F_PERMISSION + * + * StreamID field reports a virtual device ID. To receive a virtual event for a + * device, a vDEVICE must be allocated via IOMMU_VDEVICE_ALLOC. + */ +struct iommu_vevent_arm_smmuv3 { + __aligned_le64 evt[4]; }; /** -- cgit v1.2.3 From e794dc30be8b69e44646a162db09b43f719525b7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 13 Feb 2025 16:07:15 +0200 Subject: i2c: Introduce i2c_10bit_addr_*_from_msg() helpers There are already a lot of drivers that have been using i2c_8bit_addr_from_msg() for 7-bit addresses, now it's time to have the similar for 10-bit addresses. Signed-off-by: Andy Shevchenko Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250213141045.2716943-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andi Shyti --- include/linux/i2c.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 2b2af24d2a43..1e35b1e23165 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -952,6 +952,21 @@ static inline u8 i2c_8bit_addr_from_msg(const struct i2c_msg *msg) return (msg->addr << 1) | (msg->flags & I2C_M_RD); } +/* + * 10-bit address + * addr_1: 5'b11110 | addr[9:8] | (R/nW) + * addr_2: addr[7:0] + */ +static inline u8 i2c_10bit_addr_hi_from_msg(const struct i2c_msg *msg) +{ + return 0xf0 | ((msg->addr & GENMASK(9, 8)) >> 7) | (msg->flags & I2C_M_RD); +} + +static inline u8 i2c_10bit_addr_lo_from_msg(const struct i2c_msg *msg) +{ + return msg->addr & GENMASK(7, 0); +} + u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold); void i2c_put_dma_safe_msg_buf(u8 *buf, struct i2c_msg *msg, bool xferred); -- cgit v1.2.3 From a72f418703b55072d837b72ac87091e18049260c Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:16 +0100 Subject: tty: convert "TTY Struct Flags" to an enum Convert TTY_* macros (flags) to an enum. This allows for easier kernel-doc (the comment needed fine tuning), grouping of these nicely, and proper checking. Note that these are bit positions. So they are used such as test_bit(TTY_THROTTLED, ...). 
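For instance, a minimal hedged sketch of that pattern, with the surrounding caller assumed:

        if (test_bit(TTY_IO_ERROR, &tty->flags))
                return -EIO;            /* documented semantic of TTY_IO_ERROR */
        set_bit(TTY_THROTTLED, &tty->flags);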
Given these are not the user API (only in-kernel API/ABI), the bit positions are NOT preserved in this patch. All are renumbered naturally using the enum-auto-numbering. Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-2-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/tty.h b/include/linux/tty.h index 2372f9357240..6bb4fb3845f0 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -251,7 +251,7 @@ struct tty_file_private { }; /** - * DOC: TTY Struct Flags + * enum tty_struct_flags - TTY Struct Flags * * These bits are used in the :c:member:`tty_struct.flags` field. * @@ -260,62 +260,64 @@ struct tty_file_private { * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. * - * TTY_THROTTLED + * @TTY_THROTTLED: * Driver input is throttled. The ldisc should call * :c:member:`tty_driver.unthrottle()` in order to resume reception when * it is ready to process more data (at threshold min). * - * TTY_IO_ERROR + * @TTY_IO_ERROR: * If set, causes all subsequent userspace read/write calls on the tty to * fail, returning -%EIO. (May be no ldisc too.) * - * TTY_OTHER_CLOSED + * @TTY_OTHER_CLOSED: * Device is a pty and the other side has closed. * - * TTY_EXCLUSIVE + * @TTY_EXCLUSIVE: * Exclusive open mode (a single opener). * - * TTY_DO_WRITE_WAKEUP + * @TTY_DO_WRITE_WAKEUP: * If set, causes the driver to call the * :c:member:`tty_ldisc_ops.write_wakeup()` method in order to resume * transmission when it can accept more data to transmit. * - * TTY_LDISC_OPEN + * @TTY_LDISC_OPEN: * Indicates that a line discipline is open. For debugging purposes only. * - * TTY_PTY_LOCK + * @TTY_PTY_LOCK: * A flag private to pty code to implement %TIOCSPTLCK/%TIOCGPTLCK logic. * - * TTY_NO_WRITE_SPLIT + * @TTY_NO_WRITE_SPLIT: * Prevent driver from splitting up writes into smaller chunks (preserve * write boundaries to driver). * - * TTY_HUPPED + * @TTY_HUPPED: * The TTY was hung up. This is set post :c:member:`tty_driver.hangup()`. * - * TTY_HUPPING + * @TTY_HUPPING: * The TTY is in the process of hanging up to abort potential readers. * - * TTY_LDISC_CHANGING + * @TTY_LDISC_CHANGING: * Line discipline for this TTY is being changed. I/O should not block * when this is set. Use tty_io_nonblock() to check. * - * TTY_LDISC_HALTED + * @TTY_LDISC_HALTED: * Line discipline for this TTY was stopped. No work should be queued to * this ldisc. */ -#define TTY_THROTTLED 0 -#define TTY_IO_ERROR 1 -#define TTY_OTHER_CLOSED 2 -#define TTY_EXCLUSIVE 3 -#define TTY_DO_WRITE_WAKEUP 5 -#define TTY_LDISC_OPEN 11 -#define TTY_PTY_LOCK 16 -#define TTY_NO_WRITE_SPLIT 17 -#define TTY_HUPPED 18 -#define TTY_HUPPING 19 -#define TTY_LDISC_CHANGING 20 -#define TTY_LDISC_HALTED 22 +enum tty_struct_flags { + TTY_THROTTLED, + TTY_IO_ERROR, + TTY_OTHER_CLOSED, + TTY_EXCLUSIVE, + TTY_DO_WRITE_WAKEUP, + TTY_LDISC_OPEN, + TTY_PTY_LOCK, + TTY_NO_WRITE_SPLIT, + TTY_HUPPED, + TTY_HUPPING, + TTY_LDISC_CHANGING, + TTY_LDISC_HALTED, +}; static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { -- cgit v1.2.3 From d2e38e713bada24539b2b049f9be8d40dea79f41 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:19 +0100 Subject: tty: move N_TTY_BUF_SIZE to n_tty "N_TTY_BUF_SIZE" is private to n_tty and shall not be exposed to the world. 
Definitely not in tty.h somewhere in the middle of "struct tty_struct". This is a remnant of moving "read_flags" to "struct n_tty_data" in commit 3fe780b379fa ("TTY: move ldisc data from tty_struct: bitmaps"). But some cleanup was needed first (in previous patches). Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-5-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/tty.h b/include/linux/tty.h index 6bb4fb3845f0..0a46e4054dec 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -239,7 +239,6 @@ struct tty_struct { struct list_head tty_files; -#define N_TTY_BUF_SIZE 4096 struct work_struct SAK_work; } __randomize_layout; -- cgit v1.2.3 From 67acbcc324272ed06cd1c8f360afdeceb919af89 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:27 +0100 Subject: tty: tty_driver: move TTY macros to the top So that they can be referenced in structs once converted to enums (in the next patches). Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-13-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 156 ++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index d4cdc089f6c3..f3be6d56e9e5 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -16,6 +16,84 @@ struct tty_driver; struct serial_icounter_struct; struct serial_struct; +/** + * DOC: TTY Driver Flags + * + * TTY_DRIVER_RESET_TERMIOS + * Requests the tty layer to reset the termios setting when the last + * process has closed the device. Used for PTYs, in particular. + * + * TTY_DRIVER_REAL_RAW + * Indicates that the driver will guarantee not to set any special + * character handling flags if this is set for the tty: + * + * ``(IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || !INPCK)`` + * + * That is, if there is no reason for the driver to + * send notifications of parity and break characters up to the line + * driver, it won't do so. This allows the line driver to optimize for + * this case if this flag is set. (Note that there is also a promise, if + * the above case is true, not to signal overruns, either.) + * + * TTY_DRIVER_DYNAMIC_DEV + * The individual tty devices need to be registered with a call to + * tty_register_device() when the device is found in the system and + * unregistered with a call to tty_unregister_device() so the devices will + * be show up properly in sysfs. If not set, all &tty_driver.num entries + * will be created by the tty core in sysfs when tty_register_driver() is + * called. This is to be used by drivers that have tty devices that can + * appear and disappear while the main tty driver is registered with the + * tty core. + * + * TTY_DRIVER_DEVPTS_MEM + * Don't use the standard arrays (&tty_driver.ttys and + * &tty_driver.termios), instead use dynamic memory keyed through the + * devpts filesystem. This is only applicable to the PTY driver. + * + * TTY_DRIVER_HARDWARE_BREAK + * Hardware handles break signals. Pass the requested timeout to the + * &tty_operations.break_ctl instead of using a simple on/off interface. + * + * TTY_DRIVER_DYNAMIC_ALLOC + * Do not allocate structures which are needed per line for this driver + * (&tty_driver.ports) as it would waste memory. The driver will take + * care. 
This is only applicable to the PTY driver. + * + * TTY_DRIVER_UNNUMBERED_NODE + * Do not create numbered ``/dev`` nodes. For example, create + * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a + * driver for a single tty device is being allocated. + */ +#define TTY_DRIVER_INSTALLED 0x0001 +#define TTY_DRIVER_RESET_TERMIOS 0x0002 +#define TTY_DRIVER_REAL_RAW 0x0004 +#define TTY_DRIVER_DYNAMIC_DEV 0x0008 +#define TTY_DRIVER_DEVPTS_MEM 0x0010 +#define TTY_DRIVER_HARDWARE_BREAK 0x0020 +#define TTY_DRIVER_DYNAMIC_ALLOC 0x0040 +#define TTY_DRIVER_UNNUMBERED_NODE 0x0080 + +/* tty driver types */ +#define TTY_DRIVER_TYPE_SYSTEM 0x0001 +#define TTY_DRIVER_TYPE_CONSOLE 0x0002 +#define TTY_DRIVER_TYPE_SERIAL 0x0003 +#define TTY_DRIVER_TYPE_PTY 0x0004 +#define TTY_DRIVER_TYPE_SCC 0x0005 /* scc driver */ +#define TTY_DRIVER_TYPE_SYSCONS 0x0006 + +/* system subtypes (magic, used by tty_io.c) */ +#define SYSTEM_TYPE_TTY 0x0001 +#define SYSTEM_TYPE_CONSOLE 0x0002 +#define SYSTEM_TYPE_SYSCONS 0x0003 +#define SYSTEM_TYPE_SYSPTMX 0x0004 + +/* pty subtypes (magic, used by tty_io.c) */ +#define PTY_TYPE_MASTER 0x0001 +#define PTY_TYPE_SLAVE 0x0002 + +/* serial subtype definitions */ +#define SERIAL_TYPE_NORMAL 1 + /** * struct tty_operations -- interface between driver and tty * @@ -494,84 +572,6 @@ static inline void tty_set_operations(struct tty_driver *driver, driver->ops = op; } -/** - * DOC: TTY Driver Flags - * - * TTY_DRIVER_RESET_TERMIOS - * Requests the tty layer to reset the termios setting when the last - * process has closed the device. Used for PTYs, in particular. - * - * TTY_DRIVER_REAL_RAW - * Indicates that the driver will guarantee not to set any special - * character handling flags if this is set for the tty: - * - * ``(IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || !INPCK)`` - * - * That is, if there is no reason for the driver to - * send notifications of parity and break characters up to the line - * driver, it won't do so. This allows the line driver to optimize for - * this case if this flag is set. (Note that there is also a promise, if - * the above case is true, not to signal overruns, either.) - * - * TTY_DRIVER_DYNAMIC_DEV - * The individual tty devices need to be registered with a call to - * tty_register_device() when the device is found in the system and - * unregistered with a call to tty_unregister_device() so the devices will - * be show up properly in sysfs. If not set, all &tty_driver.num entries - * will be created by the tty core in sysfs when tty_register_driver() is - * called. This is to be used by drivers that have tty devices that can - * appear and disappear while the main tty driver is registered with the - * tty core. - * - * TTY_DRIVER_DEVPTS_MEM - * Don't use the standard arrays (&tty_driver.ttys and - * &tty_driver.termios), instead use dynamic memory keyed through the - * devpts filesystem. This is only applicable to the PTY driver. - * - * TTY_DRIVER_HARDWARE_BREAK - * Hardware handles break signals. Pass the requested timeout to the - * &tty_operations.break_ctl instead of using a simple on/off interface. - * - * TTY_DRIVER_DYNAMIC_ALLOC - * Do not allocate structures which are needed per line for this driver - * (&tty_driver.ports) as it would waste memory. The driver will take - * care. This is only applicable to the PTY driver. - * - * TTY_DRIVER_UNNUMBERED_NODE - * Do not create numbered ``/dev`` nodes. For example, create - * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. 
Applicable only when a - * driver for a single tty device is being allocated. - */ -#define TTY_DRIVER_INSTALLED 0x0001 -#define TTY_DRIVER_RESET_TERMIOS 0x0002 -#define TTY_DRIVER_REAL_RAW 0x0004 -#define TTY_DRIVER_DYNAMIC_DEV 0x0008 -#define TTY_DRIVER_DEVPTS_MEM 0x0010 -#define TTY_DRIVER_HARDWARE_BREAK 0x0020 -#define TTY_DRIVER_DYNAMIC_ALLOC 0x0040 -#define TTY_DRIVER_UNNUMBERED_NODE 0x0080 - -/* tty driver types */ -#define TTY_DRIVER_TYPE_SYSTEM 0x0001 -#define TTY_DRIVER_TYPE_CONSOLE 0x0002 -#define TTY_DRIVER_TYPE_SERIAL 0x0003 -#define TTY_DRIVER_TYPE_PTY 0x0004 -#define TTY_DRIVER_TYPE_SCC 0x0005 /* scc driver */ -#define TTY_DRIVER_TYPE_SYSCONS 0x0006 - -/* system subtypes (magic, used by tty_io.c) */ -#define SYSTEM_TYPE_TTY 0x0001 -#define SYSTEM_TYPE_CONSOLE 0x0002 -#define SYSTEM_TYPE_SYSCONS 0x0003 -#define SYSTEM_TYPE_SYSPTMX 0x0004 - -/* pty subtypes (magic, used by tty_io.c) */ -#define PTY_TYPE_MASTER 0x0001 -#define PTY_TYPE_SLAVE 0x0002 - -/* serial subtype definitions */ -#define SERIAL_TYPE_NORMAL 1 - int tty_register_driver(struct tty_driver *driver); void tty_unregister_driver(struct tty_driver *driver); struct device *tty_register_device(struct tty_driver *driver, unsigned index, -- cgit v1.2.3 From 109e06ae1dcf41d470705c54a67b123792424279 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:28 +0100 Subject: tty: tty_driver: convert "TTY Driver Flags" to an enum Convert TTY_DRIVER_* macros (flags) to an enum. This allows for easier kernel-doc (the comment needed fine tuning), grouping of these nicely, and proper checking. Given these are flags, define them using modern BIT() instead of hex constants. It turns out (thanks, kernel-doc checker) that internal TTY_DRIVER_INSTALLED was undocumented. Fix that too. Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-14-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index f3be6d56e9e5..d0d940236580 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -17,13 +17,19 @@ struct serial_icounter_struct; struct serial_struct; /** - * DOC: TTY Driver Flags + * enum tty_driver_flag -- TTY Driver Flags * - * TTY_DRIVER_RESET_TERMIOS + * These are flags passed to tty_alloc_driver(). + * + * @TTY_DRIVER_INSTALLED: + * Whether this driver was succesfully installed. This is a tty internal + * flag. Do not touch. + * + * @TTY_DRIVER_RESET_TERMIOS: * Requests the tty layer to reset the termios setting when the last * process has closed the device. Used for PTYs, in particular. * - * TTY_DRIVER_REAL_RAW + * @TTY_DRIVER_REAL_RAW: * Indicates that the driver will guarantee not to set any special * character handling flags if this is set for the tty: * @@ -35,7 +41,7 @@ struct serial_struct; * this case if this flag is set. (Note that there is also a promise, if * the above case is true, not to signal overruns, either.) * - * TTY_DRIVER_DYNAMIC_DEV + * @TTY_DRIVER_DYNAMIC_DEV: * The individual tty devices need to be registered with a call to * tty_register_device() when the device is found in the system and * unregistered with a call to tty_unregister_device() so the devices will @@ -45,33 +51,35 @@ struct serial_struct; * appear and disappear while the main tty driver is registered with the * tty core. 
* - * TTY_DRIVER_DEVPTS_MEM + * @TTY_DRIVER_DEVPTS_MEM: * Don't use the standard arrays (&tty_driver.ttys and * &tty_driver.termios), instead use dynamic memory keyed through the * devpts filesystem. This is only applicable to the PTY driver. * - * TTY_DRIVER_HARDWARE_BREAK + * @TTY_DRIVER_HARDWARE_BREAK: * Hardware handles break signals. Pass the requested timeout to the * &tty_operations.break_ctl instead of using a simple on/off interface. * - * TTY_DRIVER_DYNAMIC_ALLOC + * @TTY_DRIVER_DYNAMIC_ALLOC: * Do not allocate structures which are needed per line for this driver * (&tty_driver.ports) as it would waste memory. The driver will take * care. This is only applicable to the PTY driver. * - * TTY_DRIVER_UNNUMBERED_NODE + * @TTY_DRIVER_UNNUMBERED_NODE: * Do not create numbered ``/dev`` nodes. For example, create * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a * driver for a single tty device is being allocated. */ -#define TTY_DRIVER_INSTALLED 0x0001 -#define TTY_DRIVER_RESET_TERMIOS 0x0002 -#define TTY_DRIVER_REAL_RAW 0x0004 -#define TTY_DRIVER_DYNAMIC_DEV 0x0008 -#define TTY_DRIVER_DEVPTS_MEM 0x0010 -#define TTY_DRIVER_HARDWARE_BREAK 0x0020 -#define TTY_DRIVER_DYNAMIC_ALLOC 0x0040 -#define TTY_DRIVER_UNNUMBERED_NODE 0x0080 +enum tty_driver_flag { + TTY_DRIVER_INSTALLED = BIT(0), + TTY_DRIVER_RESET_TERMIOS = BIT(1), + TTY_DRIVER_REAL_RAW = BIT(2), + TTY_DRIVER_DYNAMIC_DEV = BIT(3), + TTY_DRIVER_DEVPTS_MEM = BIT(4), + TTY_DRIVER_HARDWARE_BREAK = BIT(5), + TTY_DRIVER_DYNAMIC_ALLOC = BIT(6), + TTY_DRIVER_UNNUMBERED_NODE = BIT(7), +}; /* tty driver types */ #define TTY_DRIVER_TYPE_SYSTEM 0x0001 -- cgit v1.2.3 From 63f3cd5d80d720fc6c9fd321138bbbf322ade392 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:29 +0100 Subject: tty: tty_driver: document both {,__}tty_alloc_driver() properly __tty_alloc_driver()'s kernel-doc needed some care: describe the return value using the standard "Returns:", and use the new enum tty_driver_flag for @flags. Then, the tty_alloc_driver() macro was undocumented, but referenced many times in the docs. Copy the docs from the above (except the @owner parameter, obviously). Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-15-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index d0d940236580..0fe38befa1b8 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -564,7 +564,13 @@ struct tty_driver *tty_find_polling_driver(char *name, int *line); void tty_driver_kref_put(struct tty_driver *driver); -/* Use TTY_DRIVER_* flags below */ +/** + * tty_alloc_driver - allocate tty driver + * @lines: count of lines this driver can handle at most + * @flags: some of enum tty_driver_flag, will be set in driver->flags + * + * Returns: struct tty_driver or a PTR-encoded error (use IS_ERR() and friends). + */ #define tty_alloc_driver(lines, flags) \ __tty_alloc_driver(lines, THIS_MODULE, flags) -- cgit v1.2.3 From 52443558adcdb11cff76beec34bf75f6779e1a08 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:30 +0100 Subject: tty: tty_driver: introduce TTY driver sub/types enums Convert TTY_DRIVER_TYPE_*, and subtype macros to two enums: tty_driver_type and tty_driver_subtype. 
This allows for easier kernel-doc (later), grouping of these nicely, and proper checking. The tty_driver's ::type and ::subtype now use these enums instead of bare "short". Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-16-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 0fe38befa1b8..188ee9b768eb 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -81,26 +81,26 @@ enum tty_driver_flag { TTY_DRIVER_UNNUMBERED_NODE = BIT(7), }; -/* tty driver types */ -#define TTY_DRIVER_TYPE_SYSTEM 0x0001 -#define TTY_DRIVER_TYPE_CONSOLE 0x0002 -#define TTY_DRIVER_TYPE_SERIAL 0x0003 -#define TTY_DRIVER_TYPE_PTY 0x0004 -#define TTY_DRIVER_TYPE_SCC 0x0005 /* scc driver */ -#define TTY_DRIVER_TYPE_SYSCONS 0x0006 +enum tty_driver_type { + TTY_DRIVER_TYPE_SYSTEM, + TTY_DRIVER_TYPE_CONSOLE, + TTY_DRIVER_TYPE_SERIAL, + TTY_DRIVER_TYPE_PTY, + TTY_DRIVER_TYPE_SCC, + TTY_DRIVER_TYPE_SYSCONS, +}; -/* system subtypes (magic, used by tty_io.c) */ -#define SYSTEM_TYPE_TTY 0x0001 -#define SYSTEM_TYPE_CONSOLE 0x0002 -#define SYSTEM_TYPE_SYSCONS 0x0003 -#define SYSTEM_TYPE_SYSPTMX 0x0004 +enum tty_driver_subtype { + SYSTEM_TYPE_TTY = 1, + SYSTEM_TYPE_CONSOLE, + SYSTEM_TYPE_SYSCONS, + SYSTEM_TYPE_SYSPTMX, -/* pty subtypes (magic, used by tty_io.c) */ -#define PTY_TYPE_MASTER 0x0001 -#define PTY_TYPE_SLAVE 0x0002 + PTY_TYPE_MASTER = 1, + PTY_TYPE_SLAVE, -/* serial subtype definitions */ -#define SERIAL_TYPE_NORMAL 1 + SERIAL_TYPE_NORMAL = 1, +}; /** * struct tty_operations -- interface between driver and tty @@ -500,8 +500,8 @@ struct tty_operations { * @major: major /dev device number (zero for autoassignment) * @minor_start: the first minor /dev device number * @num: number of devices allocated - * @type: type of tty driver (%TTY_DRIVER_TYPE_) - * @subtype: subtype of tty driver (%SYSTEM_TYPE_, %PTY_TYPE_, %SERIAL_TYPE_) + * @type: type of tty driver (enum tty_driver_type) + * @subtype: subtype of tty driver (enum tty_driver_subtype) * @init_termios: termios to set to each tty initially (e.g. %tty_std_termios) * @flags: tty driver flags (%TTY_DRIVER_) * @proc_entry: proc fs entry, used internally @@ -533,8 +533,8 @@ struct tty_driver { int major; int minor_start; unsigned int num; - short type; - short subtype; + enum tty_driver_type type; + enum tty_driver_subtype subtype; struct ktermios init_termios; unsigned long flags; struct proc_dir_entry *proc_entry; -- cgit v1.2.3 From 5af89030960fd987a6c25e33405bbaaff3c7298c Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:31 +0100 Subject: tty: serdev: drop serdev_controller_ops::write_room() In particular, serdev_device_write_room() is not called, so the whole serdev's write_room() can go. 
Signed-off-by: Jiri Slaby (SUSE) Cc: Rob Herring Link: https://lore.kernel.org/r/20250317070046.24386-17-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index ff78efc1f60d..34562eb99931 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -84,7 +84,6 @@ enum serdev_parity { struct serdev_controller_ops { ssize_t (*write_buf)(struct serdev_controller *, const u8 *, size_t); void (*write_flush)(struct serdev_controller *); - int (*write_room)(struct serdev_controller *); int (*open)(struct serdev_controller *); void (*close)(struct serdev_controller *); void (*set_flow_control)(struct serdev_controller *, bool); @@ -212,7 +211,6 @@ int serdev_device_break_ctl(struct serdev_device *serdev, int break_state); void serdev_device_write_wakeup(struct serdev_device *); ssize_t serdev_device_write(struct serdev_device *, const u8 *, size_t, long); void serdev_device_write_flush(struct serdev_device *); -int serdev_device_write_room(struct serdev_device *); /* * serdev device driver functions */ @@ -273,10 +271,6 @@ static inline ssize_t serdev_device_write(struct serdev_device *sdev, return -ENODEV; } static inline void serdev_device_write_flush(struct serdev_device *sdev) {} -static inline int serdev_device_write_room(struct serdev_device *sdev) -{ - return 0; -} #define serdev_device_driver_register(x) #define serdev_device_driver_unregister(x) -- cgit v1.2.3 From c25951eb7518844fcb7fc9ec58e888731e8c46d0 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 15 Nov 2024 15:20:55 +0000 Subject: bus: fsl-mc: Remove deadcode fsl_mc_allocator_driver_exit() was added explicitly by commit 1e8ac83b6caf ("bus: fsl-mc: add fsl_mc_allocator cleanup function") but was never used. Remove it. fsl_mc_portal_reset() was added in 2015 by commit 197f4d6a4a00 ("staging: fsl-mc: fsl-mc object allocator driver") but was never used. Remove it. fsl_mc_portal_reset() was the only caller of dpmcp_reset(). Remove it. Signed-off-by: Dr. David Alan Gilbert Acked-by: Ioana Ciornei Acked-by: Christophe Leroy Link: https://lore.kernel.org/r/20241115152055.279732-1-linux@treblig.org Signed-off-by: Christophe Leroy --- include/linux/fsl/mc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 99f30c7d6208..897d6211c163 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -417,8 +417,6 @@ int __must_check fsl_mc_portal_allocate(struct fsl_mc_device *mc_dev, void fsl_mc_portal_free(struct fsl_mc_io *mc_io); -int fsl_mc_portal_reset(struct fsl_mc_io *mc_io); - int __must_check fsl_mc_object_allocate(struct fsl_mc_device *mc_dev, enum fsl_mc_pool_type pool_type, struct fsl_mc_device **new_mc_adev); -- cgit v1.2.3 From cfe1f8776f23fd6dfe4ba9fcb695b129f0761187 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 13 Mar 2025 13:01:22 -0400 Subject: NFS: Extend rdirplus mount option with "force|none" There are certain users that wish to force the NFS client to choose READDIRPLUS over READDIR for a particular mount. Update the "rdirplus" mount option to optionally accept values. For "rdirplus=force", the NFS client will always attempt to use READDIRPLUS. The setting of "rdirplus=none" is aliased to the existing "nordirplus".
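As a hedged usage example (the server and export path are invented for illustration), such a mount could be requested with "mount -t nfs -o rdirplus=force server:/export /mnt", while "rdirplus=none" behaves like the existing "nordirplus".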
Signed-off-by: Benjamin Coddington Link: https://lore.kernel.org/r/c4cf0de4c8be0930b91bc74bee310d289781cd3b.1741885071.git.bcodding@redhat.com Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 1711eefcf24f..b83d16a42afc 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -167,6 +167,7 @@ struct nfs_server { #define NFS_MOUNT_TRUNK_DISCOVERY 0x04000000 #define NFS_MOUNT_SHUTDOWN 0x08000000 #define NFS_MOUNT_NO_ALIGNWRITE 0x10000000 +#define NFS_MOUNT_FORCE_RDIRPLUS 0x20000000 unsigned int fattr_valid; /* Valid attributes */ unsigned int caps; /* server capabilities */ -- cgit v1.2.3 From df210d9b0951d714c1668c511ca5c8ff38cf6916 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 7 Feb 2025 15:42:24 -0500 Subject: sunrpc: Add a sysfs file for adding a new xprt Writing to this file will clone the 'main' xprt of an xprt_switch and add it to be used as an additional connection. -- Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker v3: Replace call to xprt_iter_get_xprt() with xprt_iter_get_next() Link: https://lore.kernel.org/r/20250207204225.594002-5-anna@kernel.org Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtmultipath.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index e411368cdacf..e4db5022fe92 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -56,6 +56,7 @@ extern void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt); extern void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, bool offline); +extern struct rpc_xprt *rpc_xprt_switch_get_main_xprt(struct rpc_xprt_switch *xps); extern void xprt_iter_init(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps); -- cgit v1.2.3 From 487fae09d7e266a58a13784983cc1ef6a2076526 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Mar 2025 11:45:06 -0400 Subject: NFS: Add a mount option to make ENETUNREACH errors fatal If the NFS client was initially created in a container, and that container is torn down, there is usually no possibility to go back and destroy any NFS clients that are hung because their virtual network devices have been unlinked. Add a flag that tells the NFS client that in these circumstances, it should treat ENETDOWN and ENETUNREACH errors as fatal to the NFS client. The option defaults to being on when the mount happens from inside a net namespace that is not "init_net".
Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Tested-by: Jeff Layton Acked-by: Chuck Lever --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b83d16a42afc..a6ce8590eaaf 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -168,6 +168,7 @@ struct nfs_server { #define NFS_MOUNT_SHUTDOWN 0x08000000 #define NFS_MOUNT_NO_ALIGNWRITE 0x10000000 #define NFS_MOUNT_FORCE_RDIRPLUS 0x20000000 +#define NFS_MOUNT_NETUNREACH_FATAL 0x40000000 unsigned int fattr_valid; /* Valid attributes */ unsigned int caps; /* server capabilities */ -- cgit v1.2.3 From 9827144bfb2b1baf1e78600da73dcc9e92e28415 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Mar 2025 10:50:26 -0400 Subject: NFS: Treat ENETUNREACH errors as fatal in containers Propagate the NFS_MOUNT_NETUNREACH_FATAL flag to work with the generic NFS client. If the flag is set, the client will receive ENETDOWN and ENETUNREACH errors from the RPC layer, and is expected to treat them as being fatal. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Tested-by: Jeff Layton Acked-by: Chuck Lever --- include/linux/nfs_fs_sb.h | 1 + include/linux/sunrpc/clnt.h | 5 ++++- include/linux/sunrpc/sched.h | 1 + include/trace/events/sunrpc.h | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index a6ce8590eaaf..71319637a84e 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -50,6 +50,7 @@ struct nfs_client { #define NFS_CS_DS 7 /* - Server is a DS */ #define NFS_CS_REUSEPORT 8 /* - reuse src port on reconnect */ #define NFS_CS_PNFS 9 /* - Server used for pnfs */ +#define NFS_CS_NETUNREACH_FATAL 10 /* - ENETUNREACH errors are fatal */ struct sockaddr_storage cl_addr; /* server identifier */ size_t cl_addrlen; char * cl_hostname; /* hostname of server */ diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index fec976e58174..f8b406b0a1af 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -64,7 +64,9 @@ struct rpc_clnt { cl_noretranstimeo: 1,/* No retransmit timeouts */ cl_autobind : 1,/* use getport() */ cl_chatty : 1,/* be verbose */ - cl_shutdown : 1;/* rpc immediate -EIO */ + cl_shutdown : 1,/* rpc immediate -EIO */ + cl_netunreach_fatal : 1; + /* Treat ENETUNREACH errors as fatal */ struct xprtsec_parms cl_xprtsec; /* transport security policy */ struct rpc_rtt * cl_rtt; /* RTO estimator data */ @@ -175,6 +177,7 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_SOFTERR (1UL << 10) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) #define RPC_CLNT_CREATE_CONNECTED (1UL << 12) +#define RPC_CLNT_CREATE_NETUNREACH_FATAL (1UL << 13) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index eac57914dcf3..ccba79ebf893 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -134,6 +134,7 @@ struct rpc_task_setup { #define RPC_TASK_MOVEABLE 0x0004 /* nfs4.1+ rpc tasks */ #define RPC_TASK_NULLCREDS 0x0010 /* Use AUTH_NULL credential */ #define RPC_CALL_MAJORSEEN 0x0020 /* major timeout seen */ +#define RPC_TASK_NETUNREACH_FATAL 0x0040 /* ENETUNREACH is fatal */ #define RPC_TASK_DYNAMIC 0x0080 /* task was kmalloc'ed */ #define RPC_TASK_NO_ROUND_ROBIN 0x0100 /* send requests on "main" xprt */ #define 
RPC_TASK_SOFT 0x0200 /* Use soft timeouts */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 851841336ee6..5d331383047b 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -343,6 +343,7 @@ TRACE_EVENT(rpc_request, { RPC_TASK_MOVEABLE, "MOVEABLE" }, \ { RPC_TASK_NULLCREDS, "NULLCREDS" }, \ { RPC_CALL_MAJORSEEN, "MAJORSEEN" }, \ + { RPC_TASK_NETUNREACH_FATAL, "NETUNREACH_FATAL"}, \ { RPC_TASK_DYNAMIC, "DYNAMIC" }, \ { RPC_TASK_NO_ROUND_ROBIN, "NO_ROUND_ROBIN" }, \ { RPC_TASK_SOFT, "SOFT" }, \ -- cgit v1.2.3 From 8c9f0df7a82a9f3bbdab4c796d302b20a33c1cc6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Mar 2025 08:04:35 -0400 Subject: pNFS/flexfiles: Treat ENETUNREACH errors as fatal in containers Propagate the NFS_MOUNT_NETUNREACH_FATAL flag to work with the pNFS flexfiles client. In these circumstances, the client needs to treat the ENETDOWN and ENETUNREACH errors as fatal, and should abandon the attempted I/O. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Tested-by: Jeff Layton Acked-by: Chuck Lever --- include/linux/nfs4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 5fa60fe441b5..d8cad844870a 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -300,6 +300,7 @@ enum nfsstat4 { /* error codes for internal client use */ #define NFS4ERR_RESET_TO_MDS 12001 #define NFS4ERR_RESET_TO_PNFS 12002 +#define NFS4ERR_FATAL_IOERROR 12003 static inline bool seqid_mutating_err(u32 err) { -- cgit v1.2.3 From fa23a338de93aa03eb0b6146a0440f5762309f85 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Mar 2025 13:36:11 +0000 Subject: mm: separate folio_split_memcg_refs() from split_page_memcg() Patch series "Minor memcg cleanups & prep for memdescs", v2. Separate the handling of accounted folios and GFP_ACCOUNT pages for easier to understand code. For more detail, see https://lore.kernel.org/linux-mm/Z9LwTOudOlCGny3f@casper.infradead.org/ This patch (of 5): Folios always use memcg_data to refer to the mem_cgroup while pages allocated with GFP_ACCOUNT have a pointer to the obj_cgroup. Since the caller already knows what it has, split the function into two and then we don't need to check. Move the assignment of split folio memcg_data to the point where we set up the other parts of the new folio. That leaves folio_split_memcg_refs() just handling the memcg accounting. 
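A hedged sketch of the intended call site (the split loop and the new_folio setup live in mm/ and are assumed here, not shown by this include/ hunk):

        /* adjust only the memcg/objcg reference counts for the split */
        folio_split_memcg_refs(folio, old_order, new_order);
        ...
        /* assumed: memcg_data is now copied where each new folio is initialized */
        new_folio->memcg_data = folio->memcg_data;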
Link: https://lkml.kernel.org/r/20250314133617.138071-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250314133617.138071-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Zi Yan Acked-by: Roman Gushchin Cc: David Hildenbrand Cc: Matthew Wilcow (Oracle) Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 57664e2a8fb7..d090089c5497 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1039,6 +1039,8 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, } void split_page_memcg(struct page *head, int old_order, int new_order); +void folio_split_memcg_refs(struct folio *folio, unsigned old_order, + unsigned new_order); static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { @@ -1463,6 +1465,11 @@ static inline void split_page_memcg(struct page *head, int old_order, int new_or { } +static inline void folio_split_memcg_refs(struct folio *folio, + unsigned old_order, unsigned new_order) +{ +} + static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { return 0; -- cgit v1.2.3 From 1506c25508acd740ced5e92c539ed3d12f622c5b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Mar 2025 13:36:12 +0000 Subject: mm: simplify split_page_memcg() The last argument to split_page_memcg() is now always 0, so remove it, effectively reverting commit b8791381d7ed. Link: https://lkml.kernel.org/r/20250314133617.138071-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Zi Yan Acked-by: Roman Gushchin Cc: David Hildenbrand Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d090089c5497..ea28cacfb0d2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1038,7 +1038,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -void split_page_memcg(struct page *head, int old_order, int new_order); +void split_page_memcg(struct page *first, unsigned order); void folio_split_memcg_refs(struct folio *folio, unsigned old_order, unsigned new_order); @@ -1461,7 +1461,7 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void split_page_memcg(struct page *head, int old_order, int new_order) +static inline void split_page_memcg(struct page *first, unsigned order) { } -- cgit v1.2.3 From 8492936abb49eda26e00eab01e58d7e69f2355ba Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Mar 2025 13:36:14 +0000 Subject: mm: simplify folio_memcg_charged() There's no need to check which kind of pointer is in the memcg_data field, all we actually care about is whether it's zero or not. Saves 70 bytes in workingset_activation() with the Debian config. 
Link: https://lkml.kernel.org/r/20250314133617.138071-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Roman Gushchin Cc: David Hildenbrand Cc: Michal Hocko Cc: Muchun Song Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ea28cacfb0d2..53364526d877 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -438,9 +438,7 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) */ static inline bool folio_memcg_charged(struct folio *folio) { - if (folio_memcg_kmem(folio)) - return __folio_objcg(folio) != NULL; - return __folio_memcg(folio) != NULL; + return folio->memcg_data != 0; } /* -- cgit v1.2.3 From 835de37603ef6412949df0a607128f5fffed4576 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Fri, 14 Mar 2025 15:37:54 -0600 Subject: meminfo: add a per node counter for balloon drivers Patch series "track memory used by balloon drivers", v2. This series introduces a way to track memory used by balloon drivers. Add a NR_BALLOON_PAGES counter to track how many pages are reclaimed by the balloon drivers. First add the accounting, then updates the balloon drivers (virtio, Hyper-V, VMware, Pseries-cmm, and Xen) to maintain this counter. The virtio, Vmware, and pseries-cmm balloon drivers utilize the balloon_compaction interface to allocate and free balloon pages. Other balloon drivers will have to maintain this counter manually. This makes the information visible in memory reporting interfaces like /proc/meminfo, show_mem, and OOM reporting. This provides admins visibility into their VM balloon sizes without requiring different virtualization tooling. Furthermore, this information is helpful when debugging an OOM inside a VM. This patch (of 4): Add NR_BALLOON_PAGES counter to track memory used by balloon drivers and expose it through /proc/meminfo and other memory reporting interfaces. [npache@redhat.com: document Balloon Meminfo entry] Link: https://lkml.kernel.org/r/a0315ccf-f244-460e-8643-fd7388724fe5@redhat.com Link: https://lkml.kernel.org/r/20250314213757.244258-1-npache@redhat.com Link: https://lkml.kernel.org/r/20250314213757.244258-2-npache@redhat.com Signed-off-by: Nico Pache Cc: Alexander Atanasov Cc: Chengming Zhou Cc: David Hildenbrand Cc: Dexuan Cui Cc: Haiyang Zhang Cc: Johannes Weiner Cc: Juegren Gross Cc: Kanchana P Sridhar Cc: K. Y. Srinivasan Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Muchun Song Cc: Nhat Pham Cc: Oleksandr Tyshchenko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Stefano Stabellini Cc: Wei Liu Cc: Michael Kelley Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 37c29f3fbca8..a9db0fbd2b94 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -224,6 +224,7 @@ enum node_stat_item { #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif + NR_BALLOON_PAGES, NR_VM_NODE_STAT_ITEMS }; -- cgit v1.2.3 From 3b23a44f1f196741596616082e759f7f3a400e78 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 18 Mar 2025 11:30:28 -0700 Subject: mm/damon: implement a new DAMOS filter type for active pages Patch series "mm/damon: introduce DAMOS filter type for active pages". 
The memory reclaim algorithm categorizes pages into active and inactive lists, separately for file and anon pages. The system's performance relies heavily on the (relative and absolute) accuracy of this categorization. This patch series add a new DAMOS filter for pages' activeness, giving us visibility into the access frequency of the pages on each list. This insight can help us diagnose issues with the active-inactive balancing dynamics, and make decisions to optimize reclaim efficiency and memory utilization. For instance, we might decide to enable DAMON_LRU_SORT, if we find that there are pages on the active list that are infrequently accessed, or less frequently accessed than pages on the inactive list. This patch (of 2): Implement a DAMOS filter type for active pages on DAMON kernel API, and add support of it from the physical address space DAMON operations set (paddr). Link: https://lkml.kernel.org/r/20250318183029.2062917-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20250318183029.2062917-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 3db4f77261f5..47e36e6ea203 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -334,6 +334,7 @@ struct damos_stat { /** * enum damos_filter_type - Type of memory for &struct damos_filter * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. + * @DAMOS_FILTER_TYPE_ACTIVE: Active pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_YOUNG: Recently accessed pages. * @DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: Page is part of a hugepage. @@ -355,6 +356,7 @@ struct damos_stat { */ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, + DAMOS_FILTER_TYPE_ACTIVE, DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_YOUNG, DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, -- cgit v1.2.3 From e452872b40e3f1fb92adf0d573a0a6a7c9f6ce22 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Tue, 18 Mar 2025 15:58:32 +0800 Subject: mm: vmscan: split proactive reclaim statistics from direct reclaim statistics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Adding Proactive Memory Reclaim Statistics". These two patches are related to proactive memory reclaim. Patch 1 Split proactive reclaim statistics from direct reclaim counters and introduces new counters: pgsteal_proactive, pgdemote_proactive, and pgscan_proactive. Patch 2 Adds pswpin and pswpout items to the cgroup-v2 documentation. This patch (of 2): In proactive memory reclaim scenarios, it is necessary to accurately track proactive reclaim statistics to dynamically adjust the frequency and amount of memory being reclaimed proactively. Currently, proactive reclaim is included in direct reclaim statistics, which can make these direct reclaim statistics misleading. Therefore, separate proactive reclaim memory from the direct reclaim counters by introducing new counters: pgsteal_proactive, pgdemote_proactive, and pgscan_proactive, to avoid confusion with direct reclaim. 
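Since the point of the split is to let a proactive reclaim controller observe its own statistics, a minimal userspace sketch follows. It assumes the new counters are exported through /proc/vmstat under the names given above (pgsteal_proactive, pgscan_proactive, pgdemote_proactive), in line with how the existing reclaim counters are exposed.

    /* Minimal userspace sketch: print the proactive-reclaim counters. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            static const char *keys[] = {
                    "pgsteal_proactive", "pgscan_proactive", "pgdemote_proactive",
            };
            char name[64];
            unsigned long long val;
            FILE *f = fopen("/proc/vmstat", "r");

            if (!f)
                    return 1;
            while (fscanf(f, "%63s %llu", name, &val) == 2) {
                    for (unsigned i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
                            if (!strcmp(name, keys[i]))
                                    printf("%s %llu\n", name, val);
            }
            fclose(f);
            return 0;
    }
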
Link: https://lkml.kernel.org/r/20250318075833.90615-1-jiahao.kernel@gmail.com Link: https://lkml.kernel.org/r/20250318075833.90615-2-jiahao.kernel@gmail.com Signed-off-by: Hao Jia Acked-by: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + include/linux/vm_event_item.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a9db0fbd2b94..f7fe1126dc75 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -221,6 +221,7 @@ enum node_stat_item { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, + PGDEMOTE_PROACTIVE, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index f70d0958095c..f11b6fa9c5b3 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -41,9 +41,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSTEAL_KHUGEPAGED, + PGSTEAL_PROACTIVE, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, + PGSCAN_PROACTIVE, PGSCAN_DIRECT_THROTTLE, PGSCAN_ANON, PGSCAN_FILE, -- cgit v1.2.3 From 5f5ee52d4f58605330b09851273d6e56aaadd29e Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Tue, 18 Mar 2025 16:39:38 +0800 Subject: mm/hwpoison: introduce folio_contain_hwpoisoned_page() helper Patch series "mm/vmscan: don't try to reclaim hwpoison folio". Fix a bug during memory reclaim if folio is hwpoisoned. This patch (of 2): Introduce helper folio_contain_hwpoisoned_page() to check if the entire folio is hwpoisoned or it contains hwpoisoned pages. Link: https://lkml.kernel.org/r/20250318083939.987651-1-tujinjiang@huawei.com Link: https://lkml.kernel.org/r/20250318083939.987651-2-tujinjiang@huawei.com Signed-off-by: Jinjiang Tu Acked-by: Miaohe Lin Cc: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f26ce54c7aa4..68f4b188fc6b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1098,6 +1098,12 @@ static inline bool is_page_hwpoison(const struct page *page) return folio_test_hugetlb(folio) && PageHWPoison(&folio->page); } +static inline bool folio_contain_hwpoisoned_page(struct folio *folio) +{ + return folio_test_hwpoison(folio) || + (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)); +} + bool is_free_buddy_page(const struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); -- cgit v1.2.3 From 3cf67d61ff98672120f6ad07528afa165df12588 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 25 Feb 2025 16:02:34 +0900 Subject: hung_task: show the blocker task if the task is hung on mutex Patch series "hung_task: Dump the blocking task stacktrace", v4. The hung_task detector is very useful for detecting the lockup. However, since it only dumps the blocked (uninterruptible sleep) processes, it is not enough to identify the root cause of that lockup. For example, if a process holds a mutex and sleep an event in interruptible state long time, the other processes will wait on the mutex in uninterruptible state. In this case, the waiter processes are dumped, but the blocker process is not shown because it is sleep in interruptible state. 
This adds a feature to dump the blocker task which holds a mutex when detecting a hung task. e.g. INFO: task cat:115 blocked for more than 122 seconds. Not tainted 6.14.0-rc3-00003-ga8946be3de00 #156 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:cat state:D stack:13432 pid:115 tgid:115 ppid:106 task_flags:0x400100 flags:0x00000002 Call Trace: __schedule+0x731/0x960 ? schedule_preempt_disabled+0x54/0xa0 schedule+0xb7/0x140 ? __mutex_lock+0x51b/0xa60 ? __mutex_lock+0x51b/0xa60 schedule_preempt_disabled+0x54/0xa0 __mutex_lock+0x51b/0xa60 read_dummy+0x23/0x70 full_proxy_read+0x6a/0xc0 vfs_read+0xc2/0x340 ? __pfx_direct_file_splice_eof+0x10/0x10 ? do_sendfile+0x1bd/0x2e0 ksys_read+0x76/0xe0 do_syscall_64+0xe3/0x1c0 ? exc_page_fault+0xa9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4840cd RSP: 002b:00007ffe99071828 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd RDX: 0000000000001000 RSI: 00007ffe99071870 RDI: 0000000000000003 RBP: 00007ffe99071870 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000 R13: 00000000132fd3a0 R14: 0000000000000001 R15: ffffffffffffffff INFO: task cat:115 is blocked on a mutex likely owned by task cat:114. task:cat state:S stack:13432 pid:114 tgid:114 ppid:106 task_flags:0x400100 flags:0x00000002 Call Trace: __schedule+0x731/0x960 ? schedule_timeout+0xa8/0x120 schedule+0xb7/0x140 schedule_timeout+0xa8/0x120 ? __pfx_process_timeout+0x10/0x10 msleep_interruptible+0x3e/0x60 read_dummy+0x2d/0x70 full_proxy_read+0x6a/0xc0 vfs_read+0xc2/0x340 ? __pfx_direct_file_splice_eof+0x10/0x10 ? do_sendfile+0x1bd/0x2e0 ksys_read+0x76/0xe0 do_syscall_64+0xe3/0x1c0 ? exc_page_fault+0xa9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4840cd RSP: 002b:00007ffe3e0147b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd RDX: 0000000000001000 RSI: 00007ffe3e014800 RDI: 0000000000000003 RBP: 00007ffe3e014800 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000 R13: 000000001a0a93a0 R14: 0000000000000001 R15: ffffffffffffffff TBD: We can extend this feature to cover other locks like rwsem and rt_mutex, but rwsem requires to dump all the tasks which acquire and wait that rwsem. We can follow the waiter link but the output will be a bit different compared with mutex case. This patch (of 2): The "hung_task" shows a long-time uninterruptible slept task, but most often, it's blocked on a mutex acquired by another task. Without dumping such a task, investigating the root cause of the hung task problem is very difficult. This introduce task_struct::blocker_mutex to point the mutex lock which this task is waiting for. Since the mutex has "owner" information, we can find the owner task and dump it with hung tasks. Note: the owner can be changed while dumping the owner task, so this is "likely" the owner of the mutex. With this change, the hung task shows blocker task's info like below; INFO: task cat:115 blocked for more than 122 seconds. Not tainted 6.14.0-rc3-00003-ga8946be3de00 #156 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:cat state:D stack:13432 pid:115 tgid:115 ppid:106 task_flags:0x400100 flags:0x00000002 Call Trace: __schedule+0x731/0x960 ? schedule_preempt_disabled+0x54/0xa0 schedule+0xb7/0x140 ? __mutex_lock+0x51b/0xa60 ? 
__mutex_lock+0x51b/0xa60 schedule_preempt_disabled+0x54/0xa0 __mutex_lock+0x51b/0xa60 read_dummy+0x23/0x70 full_proxy_read+0x6a/0xc0 vfs_read+0xc2/0x340 ? __pfx_direct_file_splice_eof+0x10/0x10 ? do_sendfile+0x1bd/0x2e0 ksys_read+0x76/0xe0 do_syscall_64+0xe3/0x1c0 ? exc_page_fault+0xa9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4840cd RSP: 002b:00007ffe99071828 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd RDX: 0000000000001000 RSI: 00007ffe99071870 RDI: 0000000000000003 RBP: 00007ffe99071870 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000 R13: 00000000132fd3a0 R14: 0000000000000001 R15: ffffffffffffffff INFO: task cat:115 is blocked on a mutex likely owned by task cat:114. task:cat state:S stack:13432 pid:114 tgid:114 ppid:106 task_flags:0x400100 flags:0x00000002 Call Trace: __schedule+0x731/0x960 ? schedule_timeout+0xa8/0x120 schedule+0xb7/0x140 schedule_timeout+0xa8/0x120 ? __pfx_process_timeout+0x10/0x10 msleep_interruptible+0x3e/0x60 read_dummy+0x2d/0x70 full_proxy_read+0x6a/0xc0 vfs_read+0xc2/0x340 ? __pfx_direct_file_splice_eof+0x10/0x10 ? do_sendfile+0x1bd/0x2e0 ksys_read+0x76/0xe0 do_syscall_64+0xe3/0x1c0 ? exc_page_fault+0xa9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4840cd RSP: 002b:00007ffe3e0147b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd RDX: 0000000000001000 RSI: 00007ffe3e014800 RDI: 0000000000000003 RBP: 00007ffe3e014800 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000 R13: 000000001a0a93a0 R14: 0000000000000001 R15: ffffffffffffffff [akpm@linux-foundation.org: implement debug_show_blocker() in C rather than in CPP] Link: https://lkml.kernel.org/r/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com Link: https://lkml.kernel.org/r/174046695384.2194069.16796289525958195643.stgit@mhiramat.tok.corp.google.com Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Waiman Long Reviewed-by: Lance Yang Reviewed-by: Sergey Senozhatsky Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: Kent Overstreet Cc: Steven Rostedt Cc: Tomasz Figa Cc: Will Deacon Cc: Yongliang Gao Signed-off-by: Andrew Morton --- include/linux/mutex.h | 2 ++ include/linux/sched.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 2bf91b57591b..2143d05116be 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -202,4 +202,6 @@ DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T)) DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T)) DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0) +extern unsigned long mutex_get_owner(struct mutex *lock); + #endif /* __LINUX_MUTEX_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c15365a30c0..1419d94c8e87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1217,6 +1217,10 @@ struct task_struct { struct mutex_waiter *blocked_on; #endif +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + struct mutex *blocker_mutex; +#endif + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif -- cgit v1.2.3 From 8ee065a6fdd285387d0eca5e1139ffb182a6e626 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 17 Mar 2025 20:11:10 +0200 Subject: resource: split DEFINE_RES_NAMED_DESC() out of DEFINE_RES_NAMED() 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "resource: Split and use DEFINE_RES*() macros", v2. Replace open coded variants of DEFINE_RES*() macros. Note, there are many more possibilities over the kernel and even in reources.c, however the latter contains not so trivial leftovers. That's why the examples cover only straightforward conversions. This patch (of 4): In some cases it would be useful to supply predefined descriptor of the resource. For this, introduce DEFINE_RES_NAMED_DESC() macro. While at it, provide DEFINE_RES() that takes only start, size, and flags. Link: https://lkml.kernel.org/r/20250317181412.1560630-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20250317181412.1560630-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Andrew Morton --- include/linux/ioport.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5385349f0b8a..e8b2d6aa4013 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -154,15 +154,20 @@ enum { }; /* helpers to define resources */ -#define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ +#define DEFINE_RES_NAMED_DESC(_start, _size, _name, _flags, _desc) \ (struct resource) { \ .start = (_start), \ .end = (_start) + (_size) - 1, \ .name = (_name), \ .flags = (_flags), \ - .desc = IORES_DESC_NONE, \ + .desc = (_desc), \ } +#define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ + DEFINE_RES_NAMED_DESC(_start, _size, _name, _flags, IORES_DESC_NONE) +#define DEFINE_RES(_start, _size, _flags) \ + DEFINE_RES_NAMED(_start, _size, NULL, _flags) + #define DEFINE_RES_IO_NAMED(_start, _size, _name) \ DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_IO) #define DEFINE_RES_IO(_start, _size) \ -- cgit v1.2.3 From 24fe172b50b5749c315349e740e4a09e3a0165d5 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 24 Mar 2025 14:56:01 -0700 Subject: objtool: Fix up some outdated references to ENTRY/ENDPROC ENTRY and ENDPROC were deprecated years ago and replaced with SYM_FUNC_{START,END}. Fix up a few outdated references in the objtool documentation and comments. Also fix a few typos. Suggested-by: Brendan Jackman Suggested-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/5eb7e06e1a0e87aaeda8d583ab060e7638a6ea8e.1742852846.git.jpoimboe@kernel.org --- include/linux/linkage.h | 4 ---- include/linux/objtool.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 5c8865bb59d9..b11660b706c5 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -134,10 +134,6 @@ .size name, .-name #endif -/* If symbol 'name' is treated as a subroutine (gets called, and returns) - * then please use ENDPROC to mark 'name' as STT_FUNC for the benefit of - * static analysis tools such as stack depth analyzer. - */ #ifndef ENDPROC /* deprecated, use SYM_FUNC_END */ #define ENDPROC(name) \ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 3ca965a2ddc8..366ad004d794 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -69,7 +69,7 @@ * In asm, there are two kinds of code: normal C-type callable functions and * the rest. The normal callable functions can be called by other code, and * don't do anything unusual with the stack. 
Such normal callable functions - * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * are annotated with SYM_FUNC_{START,END}. Most asm code falls in this * category. In this case, no special debugging annotations are needed because * objtool can automatically generate the ORC data for the ORC unwinder to read * at runtime. -- cgit v1.2.3 From 6aa63a4ec947f350d1a2f9f6aba8591a2455d192 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 24 Mar 2025 21:05:15 -0700 Subject: iommu: Sort out domain user data When DMA/MSI cookies were made first-class citizens back in commit 46983fcd67ac ("iommu: Pull IOVA cookie management into the core"), there was no real need to further expose the two different cookie types. However, now that IOMMUFD wants to add a third type of MSI-mapping cookie, we do have a nicely compelling reason to properly dismabiguate things at the domain level beyond just vaguely guessing from the domain type. Meanwhile, we also effectively have another "cookie" in the form of the anonymous union for other user data, which isn't much better in terms of being vague and unenforced. The fact is that all these cookie types are mutually exclusive, in the sense that combining them makes zero sense and/or would be catastrophic (iommu_set_fault_handler() on an SVA domain, anyone?) - the only combination which *might* be reasonable is perhaps a fault handler and an MSI cookie, but nobody's doing that at the moment, so let's rule it out as well for the sake of being clear and robust. To that end, we pull DMA and MSI cookies apart a little more, mostly to clear up the ambiguity at domain teardown, then for clarity (and to save a little space), move them into the union, whose ownership we can then properly describe and enforce entirely unambiguously. [nicolinc: rebase on latest tree; use prefix IOMMU_COOKIE_; merge unions in iommu_domain; add IOMMU_COOKIE_IOMMUFD for iommufd_hwpt] Link: https://patch.msgid.link/r/1ace9076c95204bbe193ee77499d395f15f44b23.1742871535.git.nicolinc@nvidia.com Signed-off-by: Robin Murphy Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e93d2e918599..06cc14e9993d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -41,6 +41,7 @@ struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; +struct iommu_dma_msi_cookie; struct iommu_fault_param; struct iommufd_ctx; struct iommufd_viommu; @@ -165,6 +166,15 @@ struct iommu_domain_geometry { bool force_aperture; /* DMA only allowed in mappable range? 
*/ }; +enum iommu_domain_cookie_type { + IOMMU_COOKIE_NONE, + IOMMU_COOKIE_DMA_IOVA, + IOMMU_COOKIE_DMA_MSI, + IOMMU_COOKIE_FAULT_HANDLER, + IOMMU_COOKIE_SVA, + IOMMU_COOKIE_IOMMUFD, +}; + /* Domain feature flags */ #define __IOMMU_DOMAIN_PAGING (1U << 0) /* Support for iommu_map/unmap */ #define __IOMMU_DOMAIN_DMA_API (1U << 1) /* Domain for use in DMA-API @@ -211,12 +221,12 @@ struct iommu_domain_geometry { struct iommu_domain { unsigned type; + enum iommu_domain_cookie_type cookie_type; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; const struct iommu_ops *owner; /* Whose domain_alloc we came from */ unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; - struct iommu_dma_cookie *iova_cookie; int (*iopf_handler)(struct iopf_group *group); #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) @@ -224,10 +234,10 @@ struct iommu_domain { phys_addr_t msi_addr); #endif - union { /* Pointer usable by owner of the domain */ - struct iommufd_hw_pagetable *iommufd_hwpt; /* iommufd */ - }; - union { /* Fault handler */ + union { /* cookie */ + struct iommu_dma_cookie *iova_cookie; + struct iommu_dma_msi_cookie *msi_cookie; + struct iommufd_hw_pagetable *iommufd_hwpt; struct { iommu_fault_handler_t handler; void *handler_token; -- cgit v1.2.3 From 06d54f00f3f5a29cbf43410ac93ee2dd89e3b711 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 24 Mar 2025 21:05:17 -0700 Subject: iommu: Drop sw_msi from iommu_domain There are only two sw_msi implementations in the entire system, thus it's not very necessary to have an sw_msi pointer. Instead, check domain->cookie_type to call the two sw_msi implementations directly from the core code. Link: https://patch.msgid.link/r/7ded87c871afcbaac665b71354de0a335087bf0f.1742871535.git.nicolinc@nvidia.com Suggested-by: Robin Murphy Reviewed-by: Robin Murphy Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 06cc14e9993d..e01c855ae8a7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -229,11 +229,6 @@ struct iommu_domain { struct iommu_domain_geometry geometry; int (*iopf_handler)(struct iopf_group *group); -#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) - int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, - phys_addr_t msi_addr); -#endif - union { /* cookie */ struct iommu_dma_cookie *iova_cookie; struct iommu_dma_msi_cookie *msi_cookie; @@ -254,16 +249,6 @@ struct iommu_domain { }; }; -static inline void iommu_domain_set_sw_msi( - struct iommu_domain *domain, - int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, - phys_addr_t msi_addr)) -{ -#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) - domain->sw_msi = sw_msi; -#endif -} - static inline bool iommu_is_dma_domain(struct iommu_domain *domain) { return domain->type & __IOMMU_DOMAIN_DMA_API; -- cgit v1.2.3 From 2fb69c602d57f77483b8dcdd12d17408a09f76fe Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:33 -0700 Subject: iommufd: Support pasid attach/replace This extends the below APIs to support PASID. Device drivers to manage pasid attach/replace/detach. 
int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid, u32 *pt_id); int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid, u32 *pt_id); void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid); The pasid operations share underlying attach/replace/detach infrastructure with the device operations, but still have some different implications: - no reserved region per pasid otherwise SVA architecture is already broken (CPU address space doesn't count device reserved regions); - accordingly no sw_msi trick; Cache coherency enforcement is still applied to pasid operations since it is about memory accesses post page table walking (no matter the walk is per RID or per PASID). Link: https://patch.msgid.link/r/20250321171940.7213-12-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Signed-off-by: Kevin Tian Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 60eff9272551..34b6e6ca4bfa 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -54,9 +55,11 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id); void iommufd_device_unbind(struct iommufd_device *idev); -int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); -int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id); -void iommufd_device_detach(struct iommufd_device *idev); +int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id); +int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id); +void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid); struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev); u32 iommufd_device_to_id(struct iommufd_device *idev); -- cgit v1.2.3 From dbc5f37b4f8ad833132f77c1f67e68bb11ca9b9e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:36 -0700 Subject: iommufd: Allow allocating PASID-compatible domain The underlying infrastructure has supported the PASID attach and related enforcement per the requirement of the IOMMU_HWPT_ALLOC_PASID flag. This extends iommufd to support PASID compatible domain requested by userspace. Link: https://patch.msgid.link/r/20250321171940.7213-15-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 8719d4f5d618..6901804ec736 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -393,6 +393,9 @@ struct iommu_vfio_ioas { * Any domain attached to the non-PASID part of the * device must also be flagged, otherwise attaching a * PASID will blocked. + * For the user that wants to attach PASID, ioas is + * not recommended for both the non-PASID part + * and PASID part of the device. * If IOMMU does not support PASID it will return * error (-EOPNOTSUPP). 
*/ -- cgit v1.2.3 From 7fe6b987166b901efc5c6fce5fe853c9ebb835be Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:39 -0700 Subject: ida: Add ida_find_first_range() There is no helpers for user to check if a given ID is allocated or not, neither a helper to loop all the allocated IDs in an IDA and do something for cleanup. With the two needs, a helper to get the lowest allocated ID of a range and two variants based on it. Caller can check if a given ID is allocated or not by: bool ida_exists(struct ida *ida, unsigned int id) Caller can iterate all allocated IDs by: int id; while ((id = ida_find_first(&pasid_ida)) >= 0) { //anything to do with the allocated ID ida_free(pasid_ida, pasid); } Link: https://patch.msgid.link/r/20250321180143.8468-2-yi.l.liu@intel.com Cc: Matthew Wilcox (Oracle) Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: Matthew Wilcox (Oracle) Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/idr.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/idr.h b/include/linux/idr.h index da5f5fa4a3a6..718f9b1b91af 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -257,6 +257,7 @@ struct ida { int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); void ida_free(struct ida *, unsigned int id); void ida_destroy(struct ida *ida); +int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max); /** * ida_alloc() - Allocate an unused ID. @@ -328,4 +329,14 @@ static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); } + +static inline bool ida_exists(struct ida *ida, unsigned int id) +{ + return ida_find_first_range(ida, id, id) == id; +} + +static inline int ida_find_first(struct ida *ida) +{ + return ida_find_first_range(ida, 0, ~0); +} #endif /* __IDR_H__ */ -- cgit v1.2.3 From 290641346d0d1eaf400c4f968d5b2cd91f483733 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:40 -0700 Subject: vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices This adds pasid_at|de]tach_ioas ops for attaching hwpt to pasid of a device and the helpers for it. For now, only vfio-pci supports pasid attach/detach. Link: https://patch.msgid.link/r/20250321180143.8468-3-yi.l.liu@intel.com Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/vfio.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 000a6cab2d31..707b00772ce1 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -67,6 +67,7 @@ struct vfio_device { struct inode *inode; #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; + struct ida pasids; u8 iommufd_attached:1; #endif u8 cdev_opened:1; @@ -91,6 +92,8 @@ struct vfio_device { * bound iommufd. Undo in unbind_iommufd if @detach_ioas is not * called. 
* @detach_ioas: Opposite of attach_ioas + * @pasid_attach_ioas: The pasid variation of attach_ioas + * @pasid_detach_ioas: Opposite of pasid_attach_ioas * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device * @read: Perform read(2) on device file descriptor @@ -115,6 +118,9 @@ struct vfio_device_ops { void (*unbind_iommufd)(struct vfio_device *vdev); int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); void (*detach_ioas)(struct vfio_device *vdev); + int (*pasid_attach_ioas)(struct vfio_device *vdev, u32 pasid, + u32 *pt_id); + void (*pasid_detach_ioas)(struct vfio_device *vdev, u32 pasid); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -139,6 +145,10 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev, void vfio_iommufd_physical_unbind(struct vfio_device *vdev); int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev); +int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev, + u32 pasid, u32 *pt_id); +void vfio_iommufd_physical_pasid_detach_ioas(struct vfio_device *vdev, + u32 pasid); int vfio_iommufd_emulated_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx, u32 *out_device_id); void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); @@ -166,6 +176,10 @@ vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx) ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) #define vfio_iommufd_physical_detach_ioas \ ((void (*)(struct vfio_device *vdev)) NULL) +#define vfio_iommufd_physical_pasid_attach_ioas \ + ((int (*)(struct vfio_device *vdev, u32 pasid, u32 *pt_id)) NULL) +#define vfio_iommufd_physical_pasid_detach_ioas \ + ((void (*)(struct vfio_device *vdev, u32 pasid)) NULL) #define vfio_iommufd_emulated_bind \ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ u32 *out_device_id)) NULL) -- cgit v1.2.3 From ad744ed5dd8b70e9256fc1ff18aaaffeedf5f21e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:41 -0700 Subject: vfio: VFIO_DEVICE_[AT|DE]TACH_IOMMUFD_PT support pasid This extends the VFIO_DEVICE_[AT|DE]TACH_IOMMUFD_PT ioctls to attach/detach a given pasid of a vfio device to/from an IOAS/HWPT. Link: https://patch.msgid.link/r/20250321180143.8468-4-yi.l.liu@intel.com Reviewed-by: Alex Williamson Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/uapi/linux/vfio.h | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index c8dbf8219c4f..6899da70b929 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -931,29 +931,34 @@ struct vfio_device_bind_iommufd { * VFIO_DEVICE_ATTACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 19, * struct vfio_device_attach_iommufd_pt) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Flags for attach. * @pt_id: Input the target id which can represent an ioas or a hwpt * allocated via iommufd subsystem. * Output the input ioas id or the attached hwpt id which could * be the specified hwpt itself or a hwpt automatically created * for the specified ioas by kernel during the attachment. 
+ * @pasid: The pasid to be attached, only meaningful when + * VFIO_DEVICE_ATTACH_PASID is set in @flags * * Associate the device with an address space within the bound iommufd. * Undo by VFIO_DEVICE_DETACH_IOMMUFD_PT or device fd close. This is only * allowed on cdev fds. * - * If a vfio device is currently attached to a valid hw_pagetable, without doing - * a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl - * passing in another hw_pagetable (hwpt) id is allowed. This action, also known - * as a hw_pagetable replacement, will replace the device's currently attached - * hw_pagetable with a new hw_pagetable corresponding to the given pt_id. + * If a vfio device or a pasid of this device is currently attached to a valid + * hw_pagetable (hwpt), without doing a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second + * VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl passing in another hwpt id is allowed. + * This action, also known as a hw_pagetable replacement, will replace the + * currently attached hwpt of the device or the pasid of this device with a new + * hwpt corresponding to the given pt_id. * * Return: 0 on success, -errno on failure. */ struct vfio_device_attach_iommufd_pt { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_ATTACH_PASID (1 << 0) __u32 pt_id; + __u32 pasid; }; #define VFIO_DEVICE_ATTACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 19) @@ -962,17 +967,21 @@ struct vfio_device_attach_iommufd_pt { * VFIO_DEVICE_DETACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 20, * struct vfio_device_detach_iommufd_pt) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Flags for detach. + * @pasid: The pasid to be detached, only meaningful when + * VFIO_DEVICE_DETACH_PASID is set in @flags * - * Remove the association of the device and its current associated address - * space. After it, the device should be in a blocking DMA state. This is only - * allowed on cdev fds. + * Remove the association of the device or a pasid of the device and its current + * associated address space. After it, the device or the pasid should be in a + * blocking DMA state. This is only allowed on cdev fds. * * Return: 0 on success, -errno on failure. */ struct vfio_device_detach_iommufd_pt { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_DETACH_PASID (1 << 0) + __u32 pasid; }; #define VFIO_DEVICE_DETACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 20) -- cgit v1.2.3 From 9e6ec8cf64e2973f0ec74f09023988cabd218426 Mon Sep 17 00:00:00 2001 From: xueqin Luo Date: Thu, 6 Feb 2025 16:14:36 +0800 Subject: thermal: core: Remove duplicate struct declaration The struct thermal_zone_device is already declared on line 32, so the duplicate declaration has been removed. 
Fixes: b1ae92dcfa8e ("thermal: core: Make struct thermal_zone_device definition internal") Signed-off-by: xueqin Luo Link: https://lore.kernel.org/r/20250206081436.51785-1-luoxueqin@kylinos.cn Signed-off-by: Daniel Lezcano --- include/linux/thermal.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 69f9bedd0ee8..0b5ed6821080 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -86,8 +86,6 @@ struct thermal_trip { #define THERMAL_TRIP_PRIV_TO_INT(_val_) (uintptr_t)(_val_) #define THERMAL_INT_TO_TRIP_PRIV(_val_) (void *)(uintptr_t)(_val_) -struct thermal_zone_device; - struct cooling_spec { unsigned long upper; /* Highest cooling state */ unsigned long lower; /* Lowest cooling state */ -- cgit v1.2.3 From d073a571e68f42414f8f06f01b59f52224538a83 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sat, 26 Oct 2024 10:13:56 -0700 Subject: asm-generic: Always define Elf_Rel and Elf_Rela These definitions are useful for relocating the kernel image as well, regardless of the type of relocations used for modules. Signed-off-by: Samuel Holland Link: https://lore.kernel.org/r/20241026171441.3047904-5-samuel.holland@sifive.com Signed-off-by: Palmer Dabbelt --- include/asm-generic/module.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/asm-generic/module.h b/include/asm-generic/module.h index 98e1541b72b7..a8622501b975 100644 --- a/include/asm-generic/module.h +++ b/include/asm-generic/module.h @@ -19,12 +19,8 @@ struct mod_arch_specific #define Elf_Dyn Elf64_Dyn #define Elf_Ehdr Elf64_Ehdr #define Elf_Addr Elf64_Addr -#ifdef CONFIG_MODULES_USE_ELF_REL #define Elf_Rel Elf64_Rel -#endif -#ifdef CONFIG_MODULES_USE_ELF_RELA #define Elf_Rela Elf64_Rela -#endif #define ELF_R_TYPE(X) ELF64_R_TYPE(X) #define ELF_R_SYM(X) ELF64_R_SYM(X) @@ -36,12 +32,8 @@ struct mod_arch_specific #define Elf_Dyn Elf32_Dyn #define Elf_Ehdr Elf32_Ehdr #define Elf_Addr Elf32_Addr -#ifdef CONFIG_MODULES_USE_ELF_REL #define Elf_Rel Elf32_Rel -#endif -#ifdef CONFIG_MODULES_USE_ELF_RELA #define Elf_Rela Elf32_Rela -#endif #define ELF_R_TYPE(X) ELF32_R_TYPE(X) #define ELF_R_SYM(X) ELF32_R_SYM(X) #endif -- cgit v1.2.3 From 923936efeb74b3f42e5ad283a0b9110bda102601 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 28 Mar 2025 01:01:19 +0800 Subject: iomap: Fix conflicting values of iomap flags IOMAP_F_ATOMIC_BIO mistakenly took the same value as of IOMAP_F_SIZE_CHANGED in patch '370a6de7651b ("iomap: rework IOMAP atomic flags")'. Let's fix this and let's also create some more space for filesystem reported flags to avoid this in future. This patch makes the core iomap flags to start from bit 15, moving downwards. Note that "flags" member within struct iomap is of type u16. 
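A stand-alone sanity check illustrates the fixed layout. The values are copied from the hunk below; the asserts are a host-side illustration only, not kernel code, and the first one is exactly the condition the buggy layout violated (both flags were 1U << 8).

    #include <assert.h>
    #include <stdint.h>

    #define IOMAP_F_ATOMIC_BIO      (1U << 8)
    #define IOMAP_F_PRIVATE         (1U << 12)
    #define IOMAP_F_SIZE_CHANGED    (1U << 14)
    #define IOMAP_F_STALE           (1U << 15)

    static_assert((IOMAP_F_ATOMIC_BIO & IOMAP_F_SIZE_CHANGED) == 0,
                  "fs-reported and core flags must not overlap");
    static_assert((IOMAP_F_SIZE_CHANGED | IOMAP_F_STALE | IOMAP_F_PRIVATE |
                   IOMAP_F_ATOMIC_BIO) <= UINT16_MAX,
                  "all iomap flags must fit the u16 'flags' member");

    int main(void) { return 0; }
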
Fixes: 370a6de7651b ("iomap: rework IOMAP atomic flags") Signed-off-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20250327170119.61045-1-ritesh.list@gmail.com Reviewed-by: John Garry Signed-off-by: Christian Brauner --- include/linux/iomap.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 02fe001feebb..68416b135151 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -78,6 +78,11 @@ struct vm_fault; #define IOMAP_F_ANON_WRITE (1U << 7) #define IOMAP_F_ATOMIC_BIO (1U << 8) +/* + * Flag reserved for file system specific usage + */ +#define IOMAP_F_PRIVATE (1U << 12) + /* * Flags set by the core iomap code during operations: * @@ -88,14 +93,8 @@ struct vm_fault; * range it covers needs to be remapped by the high level before the operation * can proceed. */ -#define IOMAP_F_SIZE_CHANGED (1U << 8) -#define IOMAP_F_STALE (1U << 9) - -/* - * Flags from 0x1000 up are for file system specific usage: - */ -#define IOMAP_F_PRIVATE (1U << 12) - +#define IOMAP_F_SIZE_CHANGED (1U << 14) +#define IOMAP_F_STALE (1U << 15) /* * Magic value for addr: -- cgit v1.2.3 From 079a206f51f0c183be96a2f78f183dee42df1d4a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 21 Mar 2025 16:40:47 +0200 Subject: seq_buf: Mark binary printing functions with __printf() attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Binary printing functions are using printf() type of format, and compiler is not happy about them as is: lib/seq_buf.c:162:17: error: function ‘seq_buf_bprintf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] Fix the compilation errors by adding __printf() attribute. Signed-off-by: Andy Shevchenko Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250321144822.324050-2-andriy.shevchenko@linux.intel.com Signed-off-by: Petr Mladek --- include/linux/seq_buf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index fe41da005970..52791e070506 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -167,8 +167,8 @@ extern int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, const void *buf, size_t len, bool ascii); #ifdef CONFIG_BINARY_PRINTF -extern int -seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); +__printf(2, 0) +int seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); #endif void seq_buf_do_printk(struct seq_buf *s, const char *lvl); -- cgit v1.2.3 From 6b2c1e30ad6846624d935a7ea98dae60458126b8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 21 Mar 2025 16:40:48 +0200 Subject: seq_file: Mark binary printing functions with __printf() attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Binary printing functions are using printf() type of format, and compiler is not happy about them as is: fs/seq_file.c:418:35: error: function ‘seq_bprintf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] Fix the compilation errors by adding __printf() attribute. 
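For reference, the annotation pattern itself can be shown stand-alone. This is a generic userspace illustration of the __printf(fmt, 0) form used for functions that do not take varargs directly; the kernel's binary printf helpers take a pre-captured u32 argument block rather than a va_list, but the attribute form is the same.

    #include <stdarg.h>
    #include <stdio.h>

    #define __printf(a, b) __attribute__((format(printf, a, b)))

    /* fmt is argument 2, no varargs to check: __printf(2, 0) */
    __printf(2, 0)
    static void log_vprintf(int level, const char *fmt, va_list args)
    {
            (void)level;
            vprintf(fmt, args);
    }

    /* variadic front end: __printf(2, 3) makes call sites checked */
    __printf(2, 3)
    static void log_printf(int level, const char *fmt, ...)
    {
            va_list ap;

            va_start(ap, fmt);
            log_vprintf(level, fmt, ap);
            va_end(ap);
    }

    int main(void)
    {
            log_printf(0, "%s has %d entries\n", "table", 3); /* -Wformat checked */
            return 0;
    }
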
Signed-off-by: Andy Shevchenko Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250321144822.324050-3-andriy.shevchenko@linux.intel.com Signed-off-by: Petr Mladek --- include/linux/seq_file.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 2fb266ea69fa..d6ebf0596510 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -181,6 +181,7 @@ int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); #ifdef CONFIG_BINARY_PRINTF +__printf(2, 0) void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary); #endif -- cgit v1.2.3 From 196a062641fe68d9bfe0ad36b6cd7628c99ad22c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 21 Mar 2025 16:40:49 +0200 Subject: tracing: Mark binary printing functions with __printf() attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Binary printing functions are using printf() type of format, and compiler is not happy about them as is: kernel/trace/trace.c:3292:9: error: function ‘trace_vbprintk’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] kernel/trace/trace_seq.c:182:9: error: function ‘trace_seq_bprintf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] Fix the compilation errors by adding __printf() attribute. While at it, move existing __printf() attributes from the implementations to the declarations. IT also fixes incorrect attribute parameters that are used for trace_array_printk(). Signed-off-by: Andy Shevchenko Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250321144822.324050-4-andriy.shevchenko@linux.intel.com Signed-off-by: Petr Mladek --- include/linux/trace.h | 4 ++-- include/linux/trace_seq.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/trace.h b/include/linux/trace.h index fdcd76b7be83..7eaad857dee0 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -72,8 +72,8 @@ static inline int unregister_ftrace_export(struct trace_export *export) static inline void trace_printk_init_buffers(void) { } -static inline int trace_array_printk(struct trace_array *tr, unsigned long ip, - const char *fmt, ...) +static inline __printf(3, 4) +int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { return 0; } diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 1ef95c0287f0..a93ed5ac3226 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -88,8 +88,8 @@ extern __printf(2, 3) void trace_seq_printf(struct trace_seq *s, const char *fmt, ...); extern __printf(2, 0) void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args); -extern void -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern __printf(2, 0) +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int trace_print_seq(struct seq_file *m, struct trace_seq *s); extern int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt); @@ -113,8 +113,8 @@ static inline __printf(2, 3) void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
{ } -static inline void -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +static inline __printf(2, 0) +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { } -- cgit v1.2.3 From 7bf819aa992faee980610e9021aec0c38aac53be Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 21 Mar 2025 16:40:50 +0200 Subject: vsnprintf: Mark binary printing functions with __printf() attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Binary printf() functions are using printf() type of format, and compiler is not happy about them as is: lib/vsprintf.c:3130:47: error: function ‘vbin_printf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] lib/vsprintf.c:3298:33: error: function ‘bstr_printf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] Fix the compilation errors by adding __printf() attribute. Signed-off-by: Andy Shevchenko Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250321144822.324050-5-andriy.shevchenko@linux.intel.com Signed-off-by: Petr Mladek --- include/linux/string.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index 0403a4ca4c11..01621ad0f598 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -336,8 +336,8 @@ int __sysfs_match_string(const char * const *array, size_t n, const char *s); #define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s) #ifdef CONFIG_BINARY_PRINTF -int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); -int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); +__printf(3, 0) int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); +__printf(3, 0) int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, -- cgit v1.2.3 From 803f97298e7de9242eb677a1351dcafbbcc9117e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:42 -0700 Subject: iommufd: Extend IOMMU_GET_HW_INFO to report PASID capability PASID usage requires PASID support in both device and IOMMU. Since the iommu drivers always enable the PASID capability for the device if it is supported, this extends the IOMMU_GET_HW_INFO to report the PASID capability to userspace. Also, enhances the selftest accordingly. 
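A minimal userspace sketch of consuming the new field follows. Obtaining the iommufd file descriptor and the bound device id (for example via the VFIO cdev bind flow) is assumed and not shown; the structure usage follows the usual iommufd convention of passing the structure size, and the capability and field names come from the uapi header changed below.

    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/iommufd.h>

    /* Sketch: report PASID width and PCI PASID capabilities for dev_id. */
    static int print_pasid_info(int iommufd_fd, __u32 dev_id)
    {
            struct iommu_hw_info info;

            memset(&info, 0, sizeof(info));
            info.size = sizeof(info);
            info.dev_id = dev_id;

            if (ioctl(iommufd_fd, IOMMU_GET_HW_INFO, &info))
                    return -1;

            if (!info.out_max_pasid_log2) {
                    printf("no PASID support\n");
                    return 0;
            }
            printf("PASID width: %u bits, exec: %s, priv: %s\n",
                   (unsigned)info.out_max_pasid_log2,
                   (info.out_capabilities & IOMMU_HW_CAP_PCI_PASID_EXEC) ? "yes" : "no",
                   (info.out_capabilities & IOMMU_HW_CAP_PCI_PASID_PRIV) ? "yes" : "no");
            return 0;
    }
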
Link: https://patch.msgid.link/r/20250321180143.8468-5-yi.l.liu@intel.com Cc: Bjorn Helgaas Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao #aarch64 platform Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/pci-ats.h | 3 +++ include/uapi/linux/iommufd.h | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index 0e8b74e63767..75c6c86cf09d 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -42,6 +42,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features); void pci_disable_pasid(struct pci_dev *pdev); int pci_pasid_features(struct pci_dev *pdev); int pci_max_pasids(struct pci_dev *pdev); +int pci_pasid_status(struct pci_dev *pdev); #else /* CONFIG_PCI_PASID */ static inline int pci_enable_pasid(struct pci_dev *pdev, int features) { return -EINVAL; } @@ -50,6 +51,8 @@ static inline int pci_pasid_features(struct pci_dev *pdev) { return -EINVAL; } static inline int pci_max_pasids(struct pci_dev *pdev) { return -EINVAL; } +static inline int pci_pasid_status(struct pci_dev *pdev) +{ return -EINVAL; } #endif /* CONFIG_PCI_PASID */ #endif /* LINUX_PCI_ATS_H */ diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 6901804ec736..e2c04e58a997 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -612,9 +612,17 @@ enum iommu_hw_info_type { * IOMMU_HWPT_GET_DIRTY_BITMAP * IOMMU_HWPT_SET_DIRTY_TRACKING * + * @IOMMU_HW_CAP_PCI_PASID_EXEC: Execute Permission Supported, user ignores it + * when the struct + * iommu_hw_info::out_max_pasid_log2 is zero. + * @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it + * when the struct + * iommu_hw_info::out_max_pasid_log2 is zero. */ enum iommufd_hw_capabilities { IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, + IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, + IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, }; /** @@ -630,6 +638,9 @@ enum iommufd_hw_capabilities { * iommu_hw_info_type. * @out_capabilities: Output the generic iommu capability info type as defined * in the enum iommu_hw_capabilities. + * @out_max_pasid_log2: Output the width of PASIDs. 0 means no PASID support. + * PCI devices turn to out_capabilities to check if the + * specific capabilities is supported or not. * @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -653,7 +664,8 @@ struct iommu_hw_info { __u32 data_len; __aligned_u64 data_uptr; __u32 out_data_type; - __u32 __reserved; + __u8 out_max_pasid_log2; + __u8 __reserved[3]; __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) -- cgit v1.2.3 From 858c9c10c123b7b04bba12c689db675c18d48bda Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 28 Mar 2025 18:46:54 +0700 Subject: iommufd: Fix iommu_vevent_header tables markup Stephen Rothwell reports htmldocs warnings on iommufd_vevent_header tables: Documentation/userspace-api/iommufd:323: ./include/uapi/linux/iommufd.h:1048: CRITICAL: Unexpected section title or transition. ------------------------------------------------------------------------- [docutils] WARNING: kernel-doc './scripts/kernel-doc -rst -enable-lineno -sphinx-version 8.1.3 ./include/uapi/linux/iommufd.h' processing failed with: Documentation/userspace-api/iommufd:323: ./include/uapi/linux/iommufd.h:1048: (SEVERE/4) Unexpected section title or transition. 
------------------------------------------------------------------------- These are because Sphinx confuses the tables for section headings. Fix the table markup to squash away above warnings. Fixes: e36ba5ab808e ("iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC") Link: https://patch.msgid.link/r/20250328114654.55840-1-bagasdotme@gmail.com Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250318213359.5dc56fd1@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index e2c04e58a997..f29b6c44655e 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -1045,21 +1045,26 @@ enum iommu_veventq_flag { * [0, INT_MAX] where the following index of INT_MAX is 0 * * Each iommufd_vevent_header reports a sequence index of the following vEVENT: - * ------------------------------------------------------------------------- + * + * +----------------------+-------+----------------------+-------+---+-------+ * | header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| dataN | - * ------------------------------------------------------------------------- + * +----------------------+-------+----------------------+-------+---+-------+ + * * And this sequence index is expected to be monotonic to the sequence index of * the previous vEVENT. If two adjacent sequence indexes has a delta larger than * 1, it means that delta - 1 number of vEVENTs has lost, e.g. two lost vEVENTs: - * ------------------------------------------------------------------------- + * + * +-----+----------------------+-------+----------------------+-------+-----+ * | ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... | - * ------------------------------------------------------------------------- + * +-----+----------------------+-------+----------------------+-------+-----+ + * * If a vEVENT lost at the tail of the vEVENTQ and there is no following vEVENT * providing the next sequence index, an IOMMU_VEVENTQ_FLAG_LOST_EVENTS header * would be added to the tail, and no data would follow this header: - * --------------------------------------------------------------------------- + * + * +--+----------------------+-------+-----------------------------------------+ * |..| header3 {sequence=3} | data3 | header4 {flags=LOST_EVENTS, sequence=4} | - * --------------------------------------------------------------------------- + * +--+----------------------+-------+-----------------------------------------+ */ struct iommufd_vevent_header { __u32 flags; @@ -1117,9 +1122,11 @@ struct iommu_vevent_arm_smmuv3 { * * Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by * a type-specific data structure, in a normal case: - * ------------------------------------------------------------- - * || header0 | data0 | header1 | data1 | ... | headerN | dataN || - * ------------------------------------------------------------- + * + * +-+---------+-------+---------+-------+-----+---------+-------+-+ + * | | header0 | data0 | header1 | data1 | ... | headerN | dataN | | + * +-+---------+-------+---------+-------+-----+---------+-------+-+ + * * unless a tailing IOMMU_VEVENTQ_FLAG_LOST_EVENTS header is logged (refer to * struct iommufd_vevent_header). 
*/ -- cgit v1.2.3 From a30d4ff8193ef768dbb524824c7aa07c5486a63a Mon Sep 17 00:00:00 2001 From: Nir Lichtman Date: Tue, 4 Feb 2025 05:47:41 +0000 Subject: kdb: remove usage of static environment buffer Problem: The set environment variable logic uses a static "heap" like buffer to store the values of the variables, and they are never freed, on top of that this is redundant since the kernel supplies allocation facilities which are even used also in this file. Solution: Remove the weird static buffer logic and use kmalloc instead, call kfree when overriding an existing variable. Signed-off-by: Nir Lichtman Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20250204054741.GB1219827@lichtman.org Signed-off-by: Daniel Thompson --- include/linux/kdb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 905a2e2f45f6..ecbf819deeca 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -104,7 +104,7 @@ extern int kdb_initial_cpu; #define KDB_NOENVVALUE (-6) #define KDB_NOTIMP (-7) #define KDB_ENVFULL (-8) -#define KDB_ENVBUFFULL (-9) +#define KDB_KMALLOCFAILED (-9) #define KDB_TOOMANYBPT (-10) #define KDB_TOOMANYDBREGS (-11) #define KDB_DUPBPT (-12) -- cgit v1.2.3 From ebf695f129367ed4b26df6baec2ea7fc50c9e6f0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Mar 2025 17:51:15 +0800 Subject: ublk: add segment parameter IO split is usually bad in io_uring world, since -EAGAIN is caused and IO handling may have to fallback to io-wq, this way does hurt performance. ublk starts to support zero copy recently, for avoiding unnecessary IO split, ublk driver's segment limit should be aligned with backend device's segment limit. Another reason is that io_buffer_register_bvec() needs to allocate bvecs, which number is aligned with ublk request segment number, so that big memory allocation can be avoided by setting reasonable max_segments limit. So add segment parameter for providing ublk server chance to align segment limit with backend, and keep it reasonable from implementation viewpoint. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250327095123.179113-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 7255b36b5cf6..583b86681c93 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -410,6 +410,29 @@ struct ublk_param_dma_align { __u8 pad[4]; }; +#define UBLK_MIN_SEGMENT_SIZE 4096 +/* + * If any one of the three segment parameter is set as 0, the behavior is + * undefined. + */ +struct ublk_param_segment { + /* + * seg_boundary_mask + 1 needs to be power_of_2(), and the sum has + * to be >= UBLK_MIN_SEGMENT_SIZE(4096) + */ + __u64 seg_boundary_mask; + + /* + * max_segment_size could be override by virt_boundary_mask, so be + * careful when setting both. 
+ * + * max_segment_size has to be >= UBLK_MIN_SEGMENT_SIZE(4096) + */ + __u32 max_segment_size; + __u16 max_segments; + __u8 pad[2]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -423,6 +446,7 @@ struct ublk_params { #define UBLK_PARAM_TYPE_DEVT (1 << 2) #define UBLK_PARAM_TYPE_ZONED (1 << 3) #define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4) +#define UBLK_PARAM_TYPE_SEGMENT (1 << 5) __u32 types; /* types of parameter included */ struct ublk_param_basic basic; @@ -430,6 +454,7 @@ struct ublk_params { struct ublk_param_devt devt; struct ublk_param_zoned zoned; struct ublk_param_dma_align dma; + struct ublk_param_segment seg; }; #endif -- cgit v1.2.3 From 0f6439f61a6e2ddc92b98362c6d1afc210f56a90 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 22 Jan 2025 13:55:27 -0800 Subject: fuse: add kernel-enforced timeout option for requests There are situations where fuse servers can become unresponsive or stuck, for example if the server is deadlocked. Currently, there's no good way to detect if a server is stuck and needs to be killed manually. This commit adds an option for enforcing a timeout (in seconds) for requests where if the timeout elapses without the server responding to the request, the connection will be automatically aborted. Please note that these timeouts are not 100% precise. For example, the request may take roughly an extra FUSE_TIMEOUT_TIMER_FREQ seconds beyond the requested timeout due to internal implementation, in order to mitigate overhead. [SzM: Bump the API version number] Signed-off-by: Joanne Koong Reviewed-by: Jeff Layton Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 5e0eb41d967e..5ec43ecbceb7 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -229,6 +229,9 @@ * - FUSE_URING_IN_OUT_HEADER_SZ * - FUSE_URING_OP_IN_OUT_SZ * - enum fuse_uring_cmd + * + * 7.43 + * - add FUSE_REQUEST_TIMEOUT */ #ifndef _LINUX_FUSE_H @@ -264,7 +267,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 42 +#define FUSE_KERNEL_MINOR_VERSION 43 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -435,6 +438,8 @@ struct fuse_file_lock { * of the request ID indicates resend requests * FUSE_ALLOW_IDMAP: allow creation of idmapped mounts * FUSE_OVER_IO_URING: Indicate that client supports io-uring + * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests. 
+ * init_out.request_timeout contains the timeout (in secs) */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -477,11 +482,11 @@ struct fuse_file_lock { #define FUSE_PASSTHROUGH (1ULL << 37) #define FUSE_NO_EXPORT_SUPPORT (1ULL << 38) #define FUSE_HAS_RESEND (1ULL << 39) - /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP #define FUSE_ALLOW_IDMAP (1ULL << 40) #define FUSE_OVER_IO_URING (1ULL << 41) +#define FUSE_REQUEST_TIMEOUT (1ULL << 42) /** * CUSE INIT request/reply flags @@ -909,7 +914,8 @@ struct fuse_init_out { uint16_t map_alignment; uint32_t flags2; uint32_t max_stack_depth; - uint32_t unused[6]; + uint16_t request_timeout; + uint16_t unused[11]; }; #define CUSE_INIT_INFO_MAX 4096 -- cgit v1.2.3 From 296e16961817e8e5f574661febc608ac0c0c0108 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 08:55:47 +0100 Subject: io_uring: hide caches sqes from drivers There is now an io_uring private part of cmd async_data, move saved sqe into it. Drivers are accessing it via struct io_uring_cmd::cmd. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ecbe078dd57acefdbc4366d083327086c0879378.1743357121.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index e6723fa95160..0634a3de1782 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -21,7 +21,6 @@ struct io_uring_cmd { struct io_uring_cmd_data { void *op_data; - struct io_uring_sqe sqes[2]; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) -- cgit v1.2.3 From f278b6d5bb465c7fd66f3d103812947e55b376ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Mar 2025 07:59:46 +0000 Subject: Revert "tcp: avoid atomic operations on sk->sk_rmem_alloc" This reverts commit 0de2a5c4b824da2205658ebebb99a55c43cdf60f. I forgot that a TCP socket could receive messages in its error queue. sock_queue_err_skb() can be called without socket lock being held, and changes sk->sk_rmem_alloc. The fact that skbs in error queue are limited by sk->sk_rcvbuf means that error messages can be dropped if socket receive queues are full, which is an orthogonal issue. In future kernels, we could use a separate sk->sk_error_mem_alloc counter specifically for the error queue. 
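To make the race concrete, a simplified sketch (not the exact kernel code) of the two updates to sk->sk_rmem_alloc that can now run concurrently; the reverted helper did a plain read-modify-write under the socket lock, while the error-queue path charges the socket atomically and without that lock:

	/* Path A: receive path, socket lock held (the reverted tcp_skb_set_owner_r()) */
	atomic_set(&sk->sk_rmem_alloc,
		   atomic_read(&sk->sk_rmem_alloc) + skb->truesize);

	/* Path B: error queue via sock_queue_err_skb(), may run with no socket lock */
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);

	/* If B executes between A's read and A's set, B's charge is lost and
	 * the receive-memory accounting goes inconsistent. */
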
Fixes: 0de2a5c4b824 ("tcp: avoid atomic operations on sk->sk_rmem_alloc") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250331075946.31960-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index df04dc09c519..4450c384ef17 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -779,7 +779,6 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); -void tcp_sock_rfree(struct sk_buff *skb); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, @@ -2899,18 +2898,4 @@ enum skb_drop_reason tcp_inbound_hash(struct sock *sk, const void *saddr, const void *daddr, int family, int dif, int sdif); -/* version of skb_set_owner_r() avoiding one atomic_add() */ -static inline void tcp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) -{ - skb_orphan(skb); - skb->sk = sk; - skb->destructor = tcp_sock_rfree; - - sock_owned_by_me(sk); - atomic_set(&sk->sk_rmem_alloc, - atomic_read(&sk->sk_rmem_alloc) + skb->truesize); - - sk_forward_alloc_add(sk, -skb->truesize); -} - #endif /* _TCP_H */ -- cgit v1.2.3 From 09f37f2d7b21ff35b8b533f9ab8cfad2fe8f72f6 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 31 Mar 2025 21:26:44 -0700 Subject: sched/smt: Always inline sched_smt_active() sched_smt_active() can be called from noinstr code, so it should always be inlined. The CONFIG_SCHED_SMT version already has __always_inline. Do the same for its !CONFIG_SCHED_SMT counterpart. Fixes the following warning: vmlinux.o: error: objtool: intel_idle_ibrs+0x13: call to sched_smt_active() leaves .noinstr.text section Fixes: 321a874a7ef8 ("sched/smt: Expose sched_smt_present static key") Reported-by: kernel test robot Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/1d03907b0a247cf7fb5c1d518de378864f603060.1743481539.git.jpoimboe@kernel.org Closes: https://lore.kernel.org/r/202503311434.lyw2Tveh-lkp@intel.com/ --- include/linux/sched/smt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index fb1e295e7e63..166b19af956f 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -12,7 +12,7 @@ static __always_inline bool sched_smt_active(void) return static_branch_likely(&sched_smt_present); } #else -static inline bool sched_smt_active(void) { return false; } +static __always_inline bool sched_smt_active(void) { return false; } #endif void arch_smt_update(void); -- cgit v1.2.3 From 9ac50f7311dc8b39e355582f14c1e82da47a8196 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 31 Mar 2025 21:26:45 -0700 Subject: context_tracking: Always inline ct_{nmi,irq}_{enter,exit}() Thanks to CONFIG_DEBUG_SECTION_MISMATCH, empty functions can be generated out of line. These can be called from noinstr code, so make sure they're always inlined. 
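The constraint being enforced in these always-inline patches is that anything reachable from noinstr code must either live in .noinstr.text or be fully inlined. A hedged sketch of the pattern (example function names, not real kernel symbols):

	/* With CONFIG_DEBUG_SECTION_MISMATCH the compiler may emit even an
	 * empty "static inline" stub out of line, producing a call from
	 * .noinstr.text into normal .text, which objtool flags. Forcing the
	 * stub inline removes the call instruction entirely. */
	static __always_inline void example_stub(void) { }

	noinstr void example_entry(void)
	{
		example_stub();		/* no out-of-line call is generated */
	}
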
Fixes the following warnings: vmlinux.o: warning: objtool: irqentry_nmi_enter+0xa2: call to ct_nmi_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_nmi_exit+0x16: call to ct_nmi_exit() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_exit+0x78: call to ct_irq_exit() leaves .noinstr.text section Fixes: 6f0e6c1598b1 ("context_tracking: Take IRQ eqs entrypoints over RCU") Reported-by: Randy Dunlap Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Frederic Weisbecker Cc: Paul E. McKenney Cc: Linus Torvalds Link: https://lore.kernel.org/r/8509bce3f536bcd4ae7af3a2cf6930d48c5e631a.1743481539.git.jpoimboe@kernel.org Closes: https://lore.kernel.org/d1eca076-fdde-484a-b33e-70e0d167c36d@infradead.org --- include/linux/context_tracking_irq.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/context_tracking_irq.h b/include/linux/context_tracking_irq.h index c50b5670c4a5..197916ee91a4 100644 --- a/include/linux/context_tracking_irq.h +++ b/include/linux/context_tracking_irq.h @@ -10,12 +10,12 @@ void ct_irq_exit_irqson(void); void ct_nmi_enter(void); void ct_nmi_exit(void); #else -static inline void ct_irq_enter(void) { } -static inline void ct_irq_exit(void) { } +static __always_inline void ct_irq_enter(void) { } +static __always_inline void ct_irq_exit(void) { } static inline void ct_irq_enter_irqson(void) { } static inline void ct_irq_exit_irqson(void) { } -static inline void ct_nmi_enter(void) { } -static inline void ct_nmi_exit(void) { } +static __always_inline void ct_nmi_enter(void) { } +static __always_inline void ct_nmi_exit(void) { } #endif #endif -- cgit v1.2.3 From 6309a5c43b0dc629851f25b2e5ef8beff61d08e5 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 31 Mar 2025 21:26:46 -0700 Subject: rcu-tasks: Always inline rcu_irq_work_resched() Thanks to CONFIG_DEBUG_SECTION_MISMATCH, empty functions can be generated out of line. rcu_irq_work_resched() can be called from noinstr code, so make sure it's always inlined. Fixes: 564506495ca9 ("rcu/context-tracking: Move deferred nocb resched to context tracking") Reported-by: Randy Dunlap Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Frederic Weisbecker Cc: Paul E. 
McKenney Cc: Linus Torvalds Link: https://lore.kernel.org/r/e84f15f013c07e4c410d972e75620c53b62c1b3e.1743481539.git.jpoimboe@kernel.org Closes: https://lore.kernel.org/d1eca076-fdde-484a-b33e-70e0d167c36d@infradead.org --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f8159f8a7d73..120536f4c6eb 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -132,7 +132,7 @@ static inline void rcu_sysrq_end(void) { } #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) void rcu_irq_work_resched(void); #else -static inline void rcu_irq_work_resched(void) { } +static __always_inline void rcu_irq_work_resched(void) { } #endif #ifdef CONFIG_RCU_NOCB_CPU -- cgit v1.2.3 From c2004b6efb1c891b80bef186771a3668fc25f6b3 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Mon, 3 Mar 2025 23:36:37 +0100 Subject: rtc: mt6397: drop unused defines RTC_NUM_YEARS has never been used, the other defines are not used since commit 34bbdc12d04e ("rtc: mt6359: Add RTC hardware range and add support for start-year") Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250303223637.1135362-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- include/linux/mfd/mt6397/rtc.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/mfd/mt6397/rtc.h b/include/linux/mfd/mt6397/rtc.h index 068ae1c0f0e8..27883af44f87 100644 --- a/include/linux/mfd/mt6397/rtc.h +++ b/include/linux/mfd/mt6397/rtc.h @@ -60,11 +60,6 @@ #define RTC_PDN2 0x002e #define RTC_PDN2_PWRON_ALARM BIT(4) -#define RTC_MIN_YEAR 1968 -#define RTC_BASE_YEAR 1900 -#define RTC_NUM_YEARS 128 -#define RTC_MIN_YEAR_OFFSET (RTC_MIN_YEAR - RTC_BASE_YEAR) - #define MTK_RTC_POLL_DELAY_US 10 #define MTK_RTC_POLL_TIMEOUT (jiffies_to_usecs(HZ)) -- cgit v1.2.3 From b98072af60a7ce19214c80f10a7daab924c69182 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 8 Jan 2025 00:48:21 -0700 Subject: mm/hugetlb_vmemmap: fix memory loads ordering Using x86_64 as an example, for a 32KB struct page[] area describing a 2MB hugeTLB, HVO reduces the area to 4KB by the following steps: 1. Split the (r/w vmemmap) PMD mapping the area into 512 (r/w) PTEs; 2. For the 8 PTEs mapping the area, remap PTE 1-7 to the page mapped by PTE 0, and at the same time change the permission from r/w to r/o; 3. Free the pages PTE 1-7 used to map, hence the reduction from 32KB to 4KB. However, the following race can happen due to improperly memory loads ordering: CPU 1 (HVO) CPU 2 (speculative PFN walker) page_ref_freeze() synchronize_rcu() rcu_read_lock() page_is_fake_head() is false vmemmap_remap_pte() XXX: struct page[] becomes r/o page_ref_unfreeze() page_ref_count() is not zero atomic_add_unless(&page->_refcount) XXX: try to modify r/o struct page[] Specifically, page_is_fake_head() must be ordered after page_ref_count() on CPU 2 so that it can only return true for this case, to avoid the later attempt to modify r/o struct page[]. This patch adds the missing memory barrier and makes the tests on page_is_fake_head() and page_ref_count() done in the proper order. 
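As a worked sketch of the pairing (simplified, not compilable as-is; it assumes page_ref_unfreeze() publishes the thawed refcount with release semantics via atomic_set_release(), which is what the new acquire load pairs with):

	/* CPU 1 (HVO), simplified */
	page_ref_freeze(head, 1);		/* _refcount -> 0 */
	/* ... split the PMD, remap the PTEs read-only, free the tail pages ... */
	page_ref_unfreeze(head, 1);		/* release: r/o remap visible before _refcount != 0 */

	/* CPU 2 (speculative PFN walker), simplified page_count_writable() */
	if (atomic_read_acquire(&page->_refcount) == u)	/* acquire: pairs with the release above */
		return false;				/* frozen (u is typically 0): leave r/o pages alone */
	return page_fixed_fake_head(page) == page;	/* fake-head test now ordered after the refcount test */
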
Link: https://lkml.kernel.org/r/20250108074822.722696-1-yuzhao@google.com Fixes: bd225530a4c7 ("mm/hugetlb_vmemmap: fix race with speculative PFN walkers") Signed-off-by: Yu Zhao Reported-by: Will Deacon Closes: https://lore.kernel.org/20241128142028.GA3506@willie-the-truck/ Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Acked-by: Will Deacon Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 37 +++++++++++++++++++++++++++++++++++++ include/linux/page_ref.h | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5bd9492a66ee..e6a21b62dcce 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -226,11 +226,48 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page } return page; } + +static __always_inline bool page_count_writable(const struct page *page, int u) +{ + if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) + return true; + + /* + * The refcount check is ordered before the fake-head check to prevent + * the following race: + * CPU 1 (HVO) CPU 2 (speculative PFN walker) + * + * page_ref_freeze() + * synchronize_rcu() + * rcu_read_lock() + * page_is_fake_head() is false + * vmemmap_remap_pte() + * XXX: struct page[] becomes r/o + * + * page_ref_unfreeze() + * page_ref_count() is not zero + * + * atomic_add_unless(&page->_refcount) + * XXX: try to modify r/o struct page[] + * + * The refcount check also prevents modification attempts to other (r/o) + * tail pages that are not fake heads. + */ + if (atomic_read_acquire(&page->_refcount) == u) + return false; + + return page_fixed_fake_head(page) == page; +} #else static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } + +static inline bool page_count_writable(const struct page *page, int u) +{ + return true; +} #endif static __always_inline int page_is_fake_head(const struct page *page) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 8c236c651d1d..544150d1d5fd 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -234,7 +234,7 @@ static inline bool page_ref_add_unless(struct page *page, int nr, int u) rcu_read_lock(); /* avoid writing to the vmemmap area being remapped */ - if (!page_is_fake_head(page) && page_ref_count(page) != u) + if (page_count_writable(page, u)) ret = atomic_add_unless(&page->_refcount, nr, u); rcu_read_unlock(); -- cgit v1.2.3 From f21bb37afbba0878c8d417cd861e43d014119845 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Tue, 25 Feb 2025 11:45:51 +0800 Subject: mm: pgtable: make generic tlb_remove_table() use struct ptdesc Patch series "remove tlb_remove_page_ptdesc()", v2. As suggested by Peter Zijlstra below [1], this series aims to remove tlb_remove_page_ptdesc(). : Fundamentally tlb_remove_page() is about removing *pages* as from a PTE, : there should not be a page-table anywhere near here *ever*. : : Yes, some architectures use tlb_remove_page() for page-tables too, but : that is more or less an implementation detail that can be fixed. After this series, all architectures use tlb_remove_table() or tlb_remove_ptdesc() to remove the page table pages. In the future, once all architectures using tlb_remove_table() have also converted to using struct ptdesc (eg. powerpc), it may be possible to use only tlb_remove_ptdesc(). 
[1] https://lore.kernel.org/linux-mm/20250103111457.GC22934@noisy.programming.kicks-ass.net/ This patch (of 6): Now only arm will call tlb_remove_ptdesc()/tlb_remove_table() when CONFIG_MMU_GATHER_TABLE_FREE is disabled. In this case, the type of the table parameter is actually struct ptdesc * instead of struct page *. Since struct ptdesc still overlaps with struct page and has not been separated from it, forcing the table parameter to struct page * will not cause any problems at this time. But this is definitely incorrect and needs to be fixed. So just like the generic __tlb_remove_table(), let generic tlb_remove_table() use struct ptdesc by default when CONFIG_MMU_GATHER_TABLE_FREE is disabled. Link: https://lkml.kernel.org/r/cover.1740454179.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/5be8c3ab7bd68510bf0db4cf84010f4dfe372917.1740454179.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Kevin Brodsky Cc: Alexandre Ghiti Cc: "Aneesh Kumar K.V" Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: Hugh Dickens Cc: Jann Horn Cc: Matthew Wilcow (Oracle) Cc: "Mike Rapoport (IBM)" Cc: Muchun Song Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Rik van Riel Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index d1adfba8387e..e27d24dd8d5e 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -227,10 +227,10 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); */ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) { - struct page *page = (struct page *)table; + struct ptdesc *ptdesc = (struct ptdesc *)table; - pagetable_dtor(page_ptdesc(page)); - tlb_remove_page(tlb, page); + pagetable_dtor(ptdesc); + tlb_remove_page(tlb, ptdesc_page(ptdesc)); } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ -- cgit v1.2.3 From 1a03c275a3ad4e47d479a63037b72f2b305f0c13 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Tue, 25 Feb 2025 11:45:52 +0800 Subject: mm: pgtable: change pt parameter of tlb_remove_ptdesc() to struct ptdesc* All callers of tlb_remove_ptdesc() pass it a pointer of struct ptdesc, so let's change the pt parameter from void * to struct ptdesc * to perform a type safety check. 
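As an illustration of what a caller looks like after this change, a hedged sketch of an architecture's PTE-table free helper (hypothetical function name; it assumes pgtable_t is a struct page * on that architecture):

	static inline void example_pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte)
	{
		/* Convert the page to its ptdesc and hand it to the table-aware
		 * path, which runs the page-table destructor and frees the page
		 * once the TLB flush has completed. */
		tlb_remove_ptdesc(tlb, page_ptdesc(pte));
	}
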
Link: https://lkml.kernel.org/r/60bb44299cf2d731df6592e446e7f694054d0dbe.1740454179.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Originally-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Cc: Alexandre Ghiti Cc: "Aneesh Kumar K.V" Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: Hugh Dickens Cc: Jann Horn Cc: Matthew Wilcow (Oracle) Cc: "Mike Rapoport (IBM)" Cc: Muchun Song Cc: Nicholas Piggin Cc: Rik van Riel Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index e27d24dd8d5e..bf845019af36 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -493,7 +493,7 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) return tlb_remove_page_size(tlb, page, PAGE_SIZE); } -static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt) +static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) { tlb_remove_table(tlb, pt); } -- cgit v1.2.3 From 02d9e1a2048e47d39733f0ced71ce8e8fee3e56d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Tue, 25 Feb 2025 11:45:56 +0800 Subject: mm: pgtable: remove tlb_remove_page_ptdesc() The tlb_remove_ptdesc()/tlb_remove_table() is specially designed for page table pages, and now all architectures have been converted to use it to remove page table pages. So let's remove tlb_remove_page_ptdesc(), it currently has no users and should not be used for page table pages. Link: https://lkml.kernel.org/r/3df04c8494339073b71be4acb2d92e108ecd1b60.1740454179.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Suggested-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Cc: Alexandre Ghiti Cc: "Aneesh Kumar K.V" Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: Hugh Dickens Cc: Jann Horn Cc: Matthew Wilcow (Oracle) Cc: "Mike Rapoport (IBM)" Cc: Muchun Song Cc: Nicholas Piggin Cc: Rik van Riel Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index bf845019af36..88a42973fa47 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -498,12 +498,6 @@ static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) tlb_remove_table(tlb, pt); } -/* Like tlb_remove_ptdesc, but for page-like page directories. */ -static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) -{ - tlb_remove_page(tlb, ptdesc_page(pt)); -} - static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { -- cgit v1.2.3 From 5796d3967c0956734bd1249f76989ca80da0225b Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Wed, 5 Mar 2025 02:17:05 +0000 Subject: mseal sysmap: kernel config and header change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mseal system mappings", v9. As discussed during mseal() upstream process [1], mseal() protects the VMAs of a given virtual memory range against modifications, such as the read/write (RW) and no-execute (NX) bits. For complete descriptions of memory sealing, please see mseal.rst [2]. The mseal() is useful to mitigate memory corruption issues where a corrupted pointer is passed to a memory management system. 
For example, such an attacker primitive can break control-flow integrity guarantees since read-only memory that is supposed to be trusted can become writable or .text pages can get remapped. The system mappings are readonly only, memory sealing can protect them from ever changing to writable or unmmap/remapped as different attributes. System mappings such as vdso, vvar, vvar_vclock, vectors (arm compat-mode), sigpage (arm compat-mode), are created by the kernel during program initialization, and could be sealed after creation. Unlike the aforementioned mappings, the uprobe mapping is not established during program startup. However, its lifetime is the same as the process's lifetime [3]. It could be sealed from creation. The vsyscall on x86-64 uses a special address (0xffffffffff600000), which is outside the mm managed range. This means mprotect, munmap, and mremap won't work on the vsyscall. Since sealing doesn't enhance the vsyscall's security, it is skipped in this patch. If we ever seal the vsyscall, it is probably only for decorative purpose, i.e. showing the 'sl' flag in the /proc/pid/smaps. For this patch, it is ignored. It is important to note that the CHECKPOINT_RESTORE feature (CRIU) may alter the system mappings during restore operations. UML(User Mode Linux) and gVisor, rr are also known to change the vdso/vvar mappings. Consequently, this feature cannot be universally enabled across all systems. As such, CONFIG_MSEAL_SYSTEM_MAPPINGS is disabled by default. To support mseal of system mappings, architectures must define CONFIG_ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS and update their special mappings calls to pass mseal flag. Additionally, architectures must confirm they do not unmap/remap system mappings during the process lifetime. The existence of this flag for an architecture implies that it does not require the remapping of thest system mappings during process lifetime, so sealing these mappings is safe from a kernel perspective. This version covers x86-64 and arm64 archiecture as minimum viable feature. While no specific CPU hardware features are required for enable this feature on an archiecture, memory sealing requires a 64-bit kernel. Other architectures can choose whether or not to adopt this feature. Currently, I'm not aware of any instances in the kernel code that actively munmap/mremap a system mapping without a request from userspace. The PPC does call munmap when _install_special_mapping fails for vdso; however, it's uncertain if this will ever fail for PPC - this needs to be investigated by PPC in the future [4]. The UML kernel can add this support when KUnit tests require it [5]. In this version, we've improved the handling of system mapping sealing from previous versions, instead of modifying the _install_special_mapping function itself, which would affect all architectures, we now call _install_special_mapping with a sealing flag only within the specific architecture that requires it. This targeted approach offers two key advantages: 1) It limits the code change's impact to the necessary architectures, and 2) It aligns with the software architecture by keeping the core memory management within the mm layer, while delegating the decision of sealing system mappings to the individual architecture, which is particularly relevant since 32-bit architectures never require sealing. Prior to this patch series, we explored sealing special mappings from userspace using glibc's dynamic linker. 
This approach revealed several issues: - The PT_LOAD header may report an incorrect length for vdso, (smaller than its actual size). The dynamic linker, which relies on PT_LOAD information to determine mapping size, would then split and partially seal the vdso mapping. Since each architecture has its own vdso/vvar code, fixing this in the kernel would require going through each archiecture. Our initial goal was to enable sealing readonly mappings, e.g. .text, across all architectures, sealing vdso from kernel since creation appears to be simpler than sealing vdso at glibc. - The [vvar] mapping header only contains address information, not length information. Similar issues might exist for other special mappings. - Mappings like uprobe are not covered by the dynamic linker, and there is no effective solution for them. This feature's security enhancements will benefit ChromeOS, Android, and other high security systems. Testing: This feature was tested on ChromeOS and Android for both x86-64 and ARM64. - Enable sealing and verify vdso/vvar, sigpage, vector are sealed properly, i.e. "sl" shown in the smaps for those mappings, and mremap is blocked. - Passing various automation tests (e.g. pre-checkin) on ChromeOS and Android to ensure the sealing doesn't affect the functionality of Chromebook and Android phone. I also tested the feature on Ubuntu on x86-64: - With config disabled, vdso/vvar is not sealed, - with config enabled, vdso/vvar is sealed, and booting up Ubuntu is OK, normal operations such as browsing the web, open/edit doc are OK. Link: https://lore.kernel.org/all/20240415163527.626541-1-jeffxu@chromium.org/ [1] Link: Documentation/userspace-api/mseal.rst [2] Link: https://lore.kernel.org/all/CABi2SkU9BRUnqf70-nksuMCQ+yyiWjo3fM4XkRkL-NrCZxYAyg@mail.gmail.com/ [3] Link: https://lore.kernel.org/all/CABi2SkV6JJwJeviDLsq9N4ONvQ=EFANsiWkgiEOjyT9TQSt+HA@mail.gmail.com/ [4] Link: https://lore.kernel.org/all/202502251035.239B85A93@keescook/ [5] This patch (of 7): Provide infrastructure to mseal system mappings. Establish two kernel configs (CONFIG_MSEAL_SYSTEM_MAPPINGS, ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS) and VM_SEALED_SYSMAP macro for future patches. Link: https://lkml.kernel.org/r/20250305021711.3867874-1-jeffxu@google.com Link: https://lkml.kernel.org/r/20250305021711.3867874-2-jeffxu@google.com Signed-off-by: Jeff Xu Reviewed-by: Kees Cook Reviewed-by: Liam R. Howlett Reviewed-by: Lorenzo Stoakes Cc: Adhemerval Zanella Cc: Alexander Mikhalitsyn Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Anna-Maria Behnsen Cc: Ard Biesheuvel Cc: Benjamin Berg Cc: Christoph Hellwig Cc: Dave Hansen Cc: David Rientjes Cc: David S. Miller Cc: Elliot Hughes Cc: Florian Faineli Cc: Greg Ungerer Cc: Guenter Roeck Cc: Heiko Carstens Cc: Helge Deller Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Ingo Molnar Cc: Jann Horn Cc: Jason A. 
Donenfeld Cc: Johannes Berg Cc: Jorge Lucangeli Obes Cc: Linus Waleij Cc: Mark Rutland Cc: Matthew Wilcow (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Oleg Nesterov Cc: Pedro Falcato Cc: Peter Xu Cc: Randy Dunlap Cc: Stephen Röttger Cc: Thomas Weißschuh Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 32ba0e33422b..778f5de6a12e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4236,4 +4236,14 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); + +/* + * mseal of userspace process's system mappings. + */ +#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS +#define VM_SEALED_SYSMAP VM_SEALED +#else +#define VM_SEALED_SYSMAP VM_NONE +#endif + #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From e2a33a2a3258794891cdd6ca4b1318da6594d157 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Mar 2025 11:26:06 -0400 Subject: lib/sort.c: add _nonatomic() variants with cond_resched() bcachefs calls sort() during recovery to sort all keys it found in the journal, and this may be very large - gigabytes on large machines. This has been causing "task blocked" warnings, so needs a cond_resched(). [kent.overstreet@linux.dev: fix kerneldoc] Link: https://lkml.kernel.org/r/cgsr5a447pxqomc4gvznsp5yroqmif4omd7o5lsr2swifjhoic@yzjjrx2bvrq7 Link: https://lkml.kernel.org/r/20250326152606.2594920-1-kent.overstreet@linux.dev Signed-off-by: Kent Overstreet Cc: Kuan-Wei Chiu Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/sort.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/sort.h b/include/linux/sort.h index e163287ac6c1..8e5603b10941 100644 --- a/include/linux/sort.h +++ b/include/linux/sort.h @@ -13,4 +13,15 @@ void sort(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func); +/* Versions that periodically call cond_resched(): */ + +void sort_r_nonatomic(void *base, size_t num, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv); + +void sort_nonatomic(void *base, size_t num, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func); + #endif -- cgit v1.2.3 From 149974fdb8e186a1c72b87cee806373624a8a375 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 25 Mar 2025 21:51:51 +0800 Subject: block: add for_each_mp_bvec() Add helper of for_each_mp_bvec() for io_uring to import fixed kernel buffer. 
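A hedged usage sketch (hypothetical helper) showing how for_each_mp_bvec() differs from for_each_bvec(): the multi-page variant yields whole bvec segments rather than splitting them into single pages:

	static size_t example_count_segments(const struct bio_vec *bvec,
					     unsigned int nr_bytes)
	{
		struct bvec_iter start = { .bi_size = nr_bytes };
		struct bvec_iter iter;
		struct bio_vec bv;
		size_t segs = 0;

		for_each_mp_bvec(bv, bvec, iter, start)
			segs++;		/* one iteration per multi-page segment */

		return segs;
	}
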
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250325135155.935398-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/bvec.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ba8f52d48b94..204b22a99c4b 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -184,6 +184,12 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv, ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) +#define for_each_mp_bvec(bvl, bio_vec, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1); \ + bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) + /* for iterating one bio from start to end */ #define BVEC_ITER_ALL_INIT (struct bvec_iter) \ { \ -- cgit v1.2.3 From 4c975fd700022c90e61a46326e3444e08317876e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:43 -0700 Subject: net: hold instance lock during NETDEV_REGISTER/UP Callers of inetdev_init can come from several places with inconsistent expectation about netdev instance lock. Grab instance lock during REGISTER (plus UP). Also solve the inconsistency with UNREGISTER where it was locked only during move netns path. WARNING: CPU: 10 PID: 1479 at ./include/net/netdev_lock.h:54 __netdev_update_features+0x65f/0xca0 __warn+0x81/0x180 __netdev_update_features+0x65f/0xca0 report_bug+0x156/0x180 handle_bug+0x4f/0x90 exc_invalid_op+0x13/0x60 asm_exc_invalid_op+0x16/0x20 __netdev_update_features+0x65f/0xca0 netif_disable_lro+0x30/0x1d0 inetdev_init+0x12f/0x1f0 inetdev_event+0x48b/0x870 notifier_call_chain+0x38/0xf0 register_netdevice+0x741/0x8b0 register_netdev+0x1f/0x40 mlx5e_probe+0x4e3/0x8e0 [mlx5_core] auxiliary_bus_probe+0x3f/0x90 really_probe+0xc3/0x3a0 __driver_probe_device+0x80/0x150 driver_probe_device+0x1f/0x90 __device_attach_driver+0x7d/0x100 bus_for_each_drv+0x80/0xd0 __device_attach+0xb4/0x1c0 bus_probe_device+0x91/0xa0 device_add+0x657/0x870 Reviewed-by: Jakub Kicinski Reported-by: Cosmin Ratiu Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations") Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-3-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fa79145518d1..cf3b6445817b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4192,7 +4192,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, int netif_set_alias(struct net_device *dev, const char *alias, size_t len); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); -int netif_change_net_namespace(struct net_device *dev, struct net *net, +int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex, struct netlink_ext_ack *extack); int dev_change_net_namespace(struct net_device *dev, struct net *net, -- cgit v1.2.3 From 8965c160b8f7333df895321c8aa6bad4a7175f2b Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:44 -0700 Subject: net: use netif_disable_lro in ipv6_add_dev ipv6_add_dev might call dev_disable_lro which unconditionally grabs instance lock, so it will deadlock during NETDEV_REGISTER. 
Switch to netif_disable_lro. Make sure all callers hold the instance lock as well. Cc: Cosmin Ratiu Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations") Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-4-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/net/ip.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 8a48ade24620..47ed6d23853d 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -667,14 +667,6 @@ static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, memcpy(buf, &naddr, sizeof(naddr)); } -#if IS_MODULE(CONFIG_IPV6) -#define EXPORT_IPV6_MOD(X) EXPORT_SYMBOL(X) -#define EXPORT_IPV6_MOD_GPL(X) EXPORT_SYMBOL_GPL(X) -#else -#define EXPORT_IPV6_MOD(X) -#define EXPORT_IPV6_MOD_GPL(X) -#endif - #if IS_ENABLED(CONFIG_IPV6) #include #endif @@ -694,6 +686,14 @@ static __inline__ void inet_reset_saddr(struct sock *sk) #endif +#if IS_MODULE(CONFIG_IPV6) +#define EXPORT_IPV6_MOD(X) EXPORT_SYMBOL(X) +#define EXPORT_IPV6_MOD_GPL(X) EXPORT_SYMBOL_GPL(X) +#else +#define EXPORT_IPV6_MOD(X) +#define EXPORT_IPV6_MOD_GPL(X) +#endif + static inline unsigned int ipv4_addr_hash(__be32 ip) { return (__force unsigned int) ip; -- cgit v1.2.3 From 1901066aab7654f4a225ac29354a564d891d0c1a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:46 -0700 Subject: netdevsim: add dummy device notifiers In order to exercise and verify notifiers' locking assumptions, register dummy notifiers (via register_netdevice_notifier_dev_net). Share notifier event handler that enforces the assumptions with lock_debug.c (rename and export rtnl_net_debug_event as netdev_debug_event). Add ops lock asserts to netdev_debug_event. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-6-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/net/netdev_lock.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index 1c0c9a94cc22..c316b551df8d 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -98,4 +98,7 @@ static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, &qdisc_xmit_lock_key); \ } +int netdev_debug_event(struct notifier_block *nb, unsigned long event, + void *ptr); + #endif -- cgit v1.2.3 From ec304b70d46bd2ed66541c5b57b63276529e05b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:34:04 -0700 Subject: net: move mp dev config validation to __net_mp_open_rxq() devmem code performs a number of safety checks to avoid having to reimplement all of them in the drivers. Move those to __net_mp_open_rxq() and reuse that function for binding to make sure that io_uring ZC also benefits from them. While at it rename the queue ID variable to rxq_idx in __net_mp_open_rxq(), we touch most of the relevant lines. The XArray insertion is reordered after the netdev_rx_queue_restart() call, otherwise we'd need to duplicate the queue index check or risk inserting an invalid pointer. The XArray allocation failures should be extremely rare. 
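A rough sketch of the shared open path described above (assumed shape only, not the actual implementation; the signature is from the header change below, the body just illustrates the ordering: validate first, restart the queue, and let callers publish their binding only on success):

	int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
			      const struct pp_memory_provider_params *p,
			      struct netlink_ext_ack *extack)
	{
		struct netdev_rx_queue *rxq;
		int err;

		if (rxq_idx >= dev->real_num_rx_queues) {
			NL_SET_ERR_MSG(extack, "rx queue index out of range");
			return -EINVAL;
		}

		rxq = __netif_get_rx_queue(dev, rxq_idx);
		if (rxq->mp_params.mp_ops)	/* a provider is already bound */
			return -EEXIST;

		rxq->mp_params = *p;
		err = netdev_rx_queue_restart(dev, rxq_idx);
		if (err)
			memset(&rxq->mp_params, 0, sizeof(rxq->mp_params));
		/* callers (devmem XArray insert, io_uring) record the binding
		 * only when this returns 0, so a failed restart leaves no stale entry */
		return err;
	}
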
Reviewed-by: Mina Almasry Acked-by: Stanislav Fomichev Fixes: 6e18ed929d3b ("net: add helpers for setting a memory provider on an rx queue") Link: https://patch.msgid.link/20250403013405.2827250-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index b3e665897767..ada4f968960a 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -6,6 +6,7 @@ #include struct netdev_rx_queue; +struct netlink_ext_ack; struct sk_buff; struct memory_provider_ops { @@ -24,8 +25,13 @@ void net_mp_niov_clear_page_pool(struct net_iov *niov); int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *p); +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack); void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *old_p); +void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *old_p); /** * net_mp_netmem_place_in_cache() - give a netmem to a page pool -- cgit v1.2.3 From 825dfab23bca520629a9e5a21ba5b03aaccc75f2 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:28:55 +0100 Subject: irqdomain: Rename irq_set_default_host() to irq_set_default_domain() Naming interrupt domains host is confusing at best and the irqdomain code uses both domain and host inconsistently. Therefore rename irq_set_default_host() to irq_set_default_domain(). Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-3-jirislaby@kernel.org --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 33ff41eef8f7..4b5c495b5710 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -352,7 +352,7 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, void *host_data); struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); -void irq_set_default_host(struct irq_domain *host); +void irq_set_default_domain(struct irq_domain *domain); struct irq_domain *irq_get_default_host(void); int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node, -- cgit v1.2.3 From 0a27ea384c82e70d16e40adbaebeb3725f7e6342 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:28:56 +0100 Subject: irqdomain: Rename irq_get_default_host() to irq_get_default_domain() Naming interrupt domains host is confusing at best and the irqdomain code uses both domain and host inconsistently. Therefore rename irq_get_default_host() to irq_get_default_domain(). 
Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-4-jirislaby@kernel.org --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 4b5c495b5710..e9ab95fbc5a9 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -353,7 +353,7 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); void irq_set_default_domain(struct irq_domain *domain); -struct irq_domain *irq_get_default_host(void); +struct irq_domain *irq_get_default_domain(void); int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity); -- cgit v1.2.3 From d2705d33885e3a19f727dff2521fb7d5b1fc5cda Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:28:57 +0100 Subject: irqdomain: Stop using 'host' for domain It is confusing to see 'host' and 'domain' to be used as 'domain'. Given this header is all about domains, switch the remaining 'host' uses to 'domain'. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-5-jirislaby@kernel.org --- include/linux/irqdomain.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e9ab95fbc5a9..bb7111105296 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -72,7 +72,7 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, /** * struct irq_domain_ops - Methods for irq_domain objects - * @match: Match an interrupt controller device node to a host, returns + * @match: Match an interrupt controller device node to a domain, returns * 1 on a match * @select: Match an interrupt controller fw specification. It is more generic * than @match as it receives a complete struct irq_fwspec. Therefore, @@ -454,7 +454,7 @@ static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod return IS_ERR(d) ? NULL : d; } -unsigned int irq_create_direct_mapping(struct irq_domain *host); +unsigned int irq_create_direct_mapping(struct irq_domain *domain); #endif static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, @@ -507,7 +507,7 @@ static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fw return IS_ERR(d) ? 
NULL : d; } -void irq_domain_remove(struct irq_domain *host); +void irq_domain_remove(struct irq_domain *domain); int irq_domain_associate(struct irq_domain *domain, unsigned int irq, irq_hw_number_t hwirq); @@ -515,16 +515,16 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count); -unsigned int irq_create_mapping_affinity(struct irq_domain *host, +unsigned int irq_create_mapping_affinity(struct irq_domain *domain, irq_hw_number_t hwirq, const struct irq_affinity_desc *affinity); unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); void irq_dispose_mapping(unsigned int virq); -static inline unsigned int irq_create_mapping(struct irq_domain *host, +static inline unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - return irq_create_mapping_affinity(host, hwirq, NULL); + return irq_create_mapping_affinity(domain, hwirq, NULL); } struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, -- cgit v1.2.3 From 8fa7292fee5c5240402371ea89ab285ec856c916 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 Apr 2025 10:17:26 +0200 Subject: treewide: Switch/rename to timer_delete[_sync]() timer_delete[_sync]() replaces del_timer[_sync](). Convert the whole tree over and remove the historical wrapper inlines. Conversion was done with coccinelle plus manual fixups where necessary. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/timer.h | 36 +----------------------------------- include/net/sctp/sctp.h | 2 +- 2 files changed, 2 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/linux/timer.h b/include/linux/timer.h index e67ecd1cbc97..10596d7c3a34 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -30,7 +30,7 @@ * * @TIMER_IRQSAFE: An irqsafe timer is executed with IRQ disabled and * it's safe to wait for the completion of the running instance from - * IRQ handlers, for example, by calling del_timer_sync(). + * IRQ handlers, for example, by calling timer_delete_sync(). * * Note: The irq disabled callback execution is a special case for * workqueue locking issues. It's not meant for executing random crap @@ -168,40 +168,6 @@ extern int timer_delete(struct timer_list *timer); extern int timer_shutdown_sync(struct timer_list *timer); extern int timer_shutdown(struct timer_list *timer); -/** - * del_timer_sync - Delete a pending timer and wait for a running callback - * @timer: The timer to be deleted - * - * See timer_delete_sync() for detailed explanation. - * - * Do not use in new code. Use timer_delete_sync() instead. - * - * Returns: - * * %0 - The timer was not pending - * * %1 - The timer was pending and deactivated - */ -static inline int del_timer_sync(struct timer_list *timer) -{ - return timer_delete_sync(timer); -} - -/** - * del_timer - Delete a pending timer - * @timer: The timer to be deleted - * - * See timer_delete() for detailed explanation. - * - * Do not use in new code. Use timer_delete() instead. 
- * - * Returns: - * * %0 - The timer was not pending - * * %1 - The timer was pending and deactivated - */ -static inline int del_timer(struct timer_list *timer) -{ - return timer_delete(timer); -} - extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 84e6b9fd5610..d8da764cf6de 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -636,7 +636,7 @@ static inline void sctp_transport_pl_reset(struct sctp_transport *t) } } else { if (t->pl.state != SCTP_PL_DISABLED) { - if (del_timer(&t->probe_timer)) + if (timer_delete(&t->probe_timer)) sctp_transport_put(t); t->pl.state = SCTP_PL_DISABLED; } -- cgit v1.2.3 From 9779489a31d77a7b9cb6f20d2d2caced4e29dbe6 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:10 +0100 Subject: hrtimers: Delete hrtimer_init() hrtimer_init() is now unused. Delete it. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/003722f60c7a2a4f8d4ed24fb741aa313b7e5136.1738746927.git.namcao@linutronix.de --- include/linux/hrtimer.h | 2 -- include/linux/hrtimer_types.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 88e078871158..1adcba3ddd76 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -231,8 +231,6 @@ static inline enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) /* Exported timer functions: */ /* Initialize timers: */ -extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, - enum hrtimer_mode mode); extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_setup_on_stack(struct hrtimer *timer, diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index ad66a3081735..7c5b27daa89d 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -34,7 +34,7 @@ enum hrtimer_restart { * @is_hard: Set if hrtimer will be expired in hard interrupt context * even on RT. * - * The hrtimer structure must be initialized by hrtimer_init() + * The hrtimer structure must be initialized by hrtimer_setup() */ struct hrtimer { struct timerqueue_node node; -- cgit v1.2.3 From 04257da0c99c9d4ff7c5bb93046482e1f7d34938 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:16 +0100 Subject: hrtimers: Make callback function pointer private Make the struct hrtimer::function field private, to prevent users from changing this field in an unsafe way. hrtimer_update_function() should be used if the callback function needs to be changed. 
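For driver authors adapting to the private field, a short hedged sketch of the intended pattern (hypothetical callbacks; hrtimer_setup() supplies the initial callback and hrtimer_update_function() is the accessor for changing it later, since timer->function can no longer be written directly):

	static struct hrtimer mytimer;

	static enum hrtimer_restart example_cb(struct hrtimer *t)
	{
		return HRTIMER_NORESTART;
	}

	static enum hrtimer_restart example_other_cb(struct hrtimer *t)
	{
		return HRTIMER_NORESTART;
	}

	static void example_init_and_retarget(void)
	{
		/* callback supplied at setup time */
		hrtimer_setup(&mytimer, example_cb, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

		/* later changes go through the accessor, not mytimer.function = ... */
		hrtimer_update_function(&mytimer, example_other_cb);
	}
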
Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/7d0e6e0c5c59a64a9bea940051aac05d750bc0c2.1738746927.git.namcao@linutronix.de --- include/linux/hrtimer_types.h | 2 +- include/trace/events/timer.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 7c5b27daa89d..8fbbb6bdf7a1 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -39,7 +39,7 @@ enum hrtimer_restart { struct hrtimer { struct timerqueue_node node; ktime_t _softexpires; - enum hrtimer_restart (*function)(struct hrtimer *); + enum hrtimer_restart (*__private function)(struct hrtimer *); struct hrtimer_clock_base *base; u8 state; u8 is_rel; diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 1ef58a04fc57..f8c906be4cd0 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -235,7 +235,7 @@ TRACE_EVENT(hrtimer_start, TP_fast_assign( __entry->hrtimer = hrtimer; - __entry->function = hrtimer->function; + __entry->function = ACCESS_PRIVATE(hrtimer, function); __entry->expires = hrtimer_get_expires(hrtimer); __entry->softexpires = hrtimer_get_softexpires(hrtimer); __entry->mode = mode; @@ -271,7 +271,7 @@ TRACE_EVENT(hrtimer_expire_entry, TP_fast_assign( __entry->hrtimer = hrtimer; __entry->now = *now; - __entry->function = hrtimer->function; + __entry->function = ACCESS_PRIVATE(hrtimer, function); ), TP_printk("hrtimer=%p function=%ps now=%llu", -- cgit v1.2.3 From 244132c4e5777fe0a4544ef23afba0d9a50e5ec5 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:55:21 +0100 Subject: tracing/timers: Rename the hrtimer_init event to hrtimer_setup The function hrtimer_init() doesn't exist anymore. It was replaced by hrtimer_setup(). Thus, rename the hrtimer_init trace event to hrtimer_setup to keep it consistent. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/cba84c3d853c5258aa3a262363a6eac08e2c7afc.1738746927.git.namcao@linutronix.de --- include/trace/events/timer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index f8c906be4cd0..1641ae3e6ca0 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -185,12 +185,12 @@ TRACE_EVENT(timer_base_idle, { HRTIMER_MODE_REL_PINNED_HARD, "REL|PINNED|HARD" }) /** - * hrtimer_init - called when the hrtimer is initialized + * hrtimer_setup - called when the hrtimer is initialized * @hrtimer: pointer to struct hrtimer * @clockid: the hrtimers clock * @mode: the hrtimers mode */ -TRACE_EVENT(hrtimer_init, +TRACE_EVENT(hrtimer_setup, TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid, enum hrtimer_mode mode), -- cgit v1.2.3